From 652142e189ee6d1b6bdb9292f7c858870647e721 Mon Sep 17 00:00:00 2001 From: Brian Fiete Date: Thu, 2 Jun 2022 17:55:17 -0700 Subject: [PATCH] Added TCMalloc and JEMalloc projects --- BeefRT/JEMalloc/.appveyor.yml | 41 + BeefRT/JEMalloc/.autom4te.cfg | 3 + BeefRT/JEMalloc/COPYING | 27 + BeefRT/JEMalloc/INSTALL.md | 424 ++ BeefRT/JEMalloc/Makefile.in | 762 +++ BeefRT/JEMalloc/README | 20 + BeefRT/JEMalloc/VERSION | 1 + BeefRT/JEMalloc/autogen.sh | 17 + BeefRT/JEMalloc/bin/jemalloc-config.in | 83 + BeefRT/JEMalloc/bin/jemalloc.sh.in | 9 + BeefRT/JEMalloc/bin/jeprof.in | 5723 +++++++++++++++++ BeefRT/JEMalloc/build-aux/install-sh | 250 + BeefRT/JEMalloc/configure.ac | 2669 ++++++++ .../jemalloc/internal/activity_callback.h | 23 + .../include/jemalloc/internal/arena_externs.h | 121 + .../jemalloc/internal/arena_inlines_a.h | 24 + .../jemalloc/internal/arena_inlines_b.h | 550 ++ .../include/jemalloc/internal/arena_stats.h | 114 + .../include/jemalloc/internal/arena_structs.h | 101 + .../include/jemalloc/internal/arena_types.h | 58 + .../include/jemalloc/internal/assert.h | 56 + .../include/jemalloc/internal/atomic.h | 107 + .../include/jemalloc/internal/atomic_c11.h | 97 + .../jemalloc/internal/atomic_gcc_atomic.h | 129 + .../jemalloc/internal/atomic_gcc_sync.h | 195 + .../include/jemalloc/internal/atomic_msvc.h | 158 + .../internal/background_thread_externs.h | 33 + .../internal/background_thread_inlines.h | 48 + .../internal/background_thread_structs.h | 66 + .../JEMalloc/include/jemalloc/internal/base.h | 110 + .../JEMalloc/include/jemalloc/internal/bin.h | 82 + .../include/jemalloc/internal/bin_info.h | 50 + .../include/jemalloc/internal/bin_stats.h | 57 + .../include/jemalloc/internal/bin_types.h | 17 + .../include/jemalloc/internal/bit_util.h | 422 ++ .../include/jemalloc/internal/bitmap.h | 368 ++ .../include/jemalloc/internal/buf_writer.h | 32 + .../include/jemalloc/internal/cache_bin.h | 670 ++ .../JEMalloc/include/jemalloc/internal/ckh.h | 101 + .../include/jemalloc/internal/counter.h | 34 + .../JEMalloc/include/jemalloc/internal/ctl.h | 159 + .../include/jemalloc/internal/decay.h | 186 + .../JEMalloc/include/jemalloc/internal/div.h | 41 + .../include/jemalloc/internal/ecache.h | 55 + .../include/jemalloc/internal/edata.h | 698 ++ .../include/jemalloc/internal/edata_cache.h | 49 + .../include/jemalloc/internal/ehooks.h | 412 ++ .../JEMalloc/include/jemalloc/internal/emap.h | 357 + .../include/jemalloc/internal/emitter.h | 510 ++ .../JEMalloc/include/jemalloc/internal/eset.h | 77 + .../include/jemalloc/internal/exp_grow.h | 50 + .../include/jemalloc/internal/extent.h | 137 + .../include/jemalloc/internal/extent_dss.h | 26 + .../include/jemalloc/internal/extent_mmap.h | 10 + .../JEMalloc/include/jemalloc/internal/fb.h | 373 ++ .../JEMalloc/include/jemalloc/internal/fxp.h | 126 + .../JEMalloc/include/jemalloc/internal/hash.h | 320 + .../JEMalloc/include/jemalloc/internal/hook.h | 163 + .../JEMalloc/include/jemalloc/internal/hpa.h | 182 + .../include/jemalloc/internal/hpa_hooks.h | 17 + .../include/jemalloc/internal/hpa_opts.h | 74 + .../include/jemalloc/internal/hpdata.h | 413 ++ .../include/jemalloc/internal/inspect.h | 40 + .../internal/jemalloc_internal_decls.h | 108 + .../internal/jemalloc_internal_defs.h | 428 ++ .../internal/jemalloc_internal_defs.h.in | 427 ++ .../internal/jemalloc_internal_externs.h | 75 + .../internal/jemalloc_internal_includes.h | 84 + .../internal/jemalloc_internal_inlines_a.h | 122 + .../internal/jemalloc_internal_inlines_b.h | 103 + .../internal/jemalloc_internal_inlines_c.h | 340 + .../internal/jemalloc_internal_macros.h | 111 + .../internal/jemalloc_internal_types.h | 130 + .../jemalloc/internal/jemalloc_preamble.h | 263 + .../jemalloc/internal/jemalloc_preamble.h.in | 263 + .../include/jemalloc/internal/large_externs.h | 24 + .../include/jemalloc/internal/lockedint.h | 204 + .../JEMalloc/include/jemalloc/internal/log.h | 115 + .../include/jemalloc/internal/malloc_io.h | 105 + .../include/jemalloc/internal/mpsc_queue.h | 134 + .../include/jemalloc/internal/mutex.h | 319 + .../include/jemalloc/internal/mutex_prof.h | 117 + .../include/jemalloc/internal/nstime.h | 73 + .../JEMalloc/include/jemalloc/internal/pa.h | 243 + .../JEMalloc/include/jemalloc/internal/pac.h | 179 + .../include/jemalloc/internal/pages.h | 119 + .../JEMalloc/include/jemalloc/internal/pai.h | 95 + .../JEMalloc/include/jemalloc/internal/peak.h | 37 + .../include/jemalloc/internal/peak_event.h | 24 + .../JEMalloc/include/jemalloc/internal/ph.h | 520 ++ .../jemalloc/internal/private_namespace.sh | 5 + .../jemalloc/internal/private_symbols.awk | 52 + .../jemalloc/internal/private_symbols.sh | 51 + .../jemalloc/internal/private_symbols_jet.awk | 52 + .../JEMalloc/include/jemalloc/internal/prng.h | 168 + .../include/jemalloc/internal/prof_data.h | 37 + .../include/jemalloc/internal/prof_externs.h | 95 + .../include/jemalloc/internal/prof_hook.h | 21 + .../include/jemalloc/internal/prof_inlines.h | 261 + .../include/jemalloc/internal/prof_log.h | 22 + .../include/jemalloc/internal/prof_recent.h | 23 + .../include/jemalloc/internal/prof_stats.h | 17 + .../include/jemalloc/internal/prof_structs.h | 221 + .../include/jemalloc/internal/prof_sys.h | 30 + .../include/jemalloc/internal/prof_types.h | 75 + .../include/jemalloc/internal/psset.h | 131 + .../jemalloc/internal/public_namespace.h | 22 + .../jemalloc/internal/public_namespace.sh | 6 + .../jemalloc/internal/public_symbols.txt | 22 + .../jemalloc/internal/public_unnamespace.h | 22 + .../jemalloc/internal/public_unnamespace.sh | 6 + .../JEMalloc/include/jemalloc/internal/ql.h | 197 + .../JEMalloc/include/jemalloc/internal/qr.h | 140 + .../include/jemalloc/internal/quantum.h | 87 + .../JEMalloc/include/jemalloc/internal/rb.h | 1856 ++++++ .../include/jemalloc/internal/rtree.h | 554 ++ .../include/jemalloc/internal/rtree_tsd.h | 62 + .../include/jemalloc/internal/safety_check.h | 31 + .../JEMalloc/include/jemalloc/internal/san.h | 191 + .../include/jemalloc/internal/san_bump.h | 52 + .../JEMalloc/include/jemalloc/internal/sc.h | 357 + .../JEMalloc/include/jemalloc/internal/sec.h | 120 + .../include/jemalloc/internal/sec_opts.h | 59 + .../JEMalloc/include/jemalloc/internal/seq.h | 55 + .../include/jemalloc/internal/slab_data.h | 12 + .../include/jemalloc/internal/smoothstep.h | 232 + .../include/jemalloc/internal/smoothstep.sh | 101 + .../JEMalloc/include/jemalloc/internal/spin.h | 40 + .../include/jemalloc/internal/stats.h | 54 + .../JEMalloc/include/jemalloc/internal/sz.h | 371 ++ .../jemalloc/internal/tcache_externs.h | 75 + .../jemalloc/internal/tcache_inlines.h | 193 + .../jemalloc/internal/tcache_structs.h | 68 + .../include/jemalloc/internal/tcache_types.h | 35 + .../include/jemalloc/internal/test_hooks.h | 24 + .../include/jemalloc/internal/thread_event.h | 301 + .../include/jemalloc/internal/ticker.h | 175 + .../JEMalloc/include/jemalloc/internal/tsd.h | 518 ++ .../include/jemalloc/internal/tsd_generic.h | 182 + .../internal/tsd_malloc_thread_cleanup.h | 61 + .../include/jemalloc/internal/tsd_tls.h | 60 + .../include/jemalloc/internal/tsd_types.h | 10 + .../include/jemalloc/internal/tsd_win.h | 139 + .../include/jemalloc/internal/typed_list.h | 55 + .../JEMalloc/include/jemalloc/internal/util.h | 123 + .../include/jemalloc/internal/witness.h | 378 ++ BeefRT/JEMalloc/include/jemalloc/jemalloc.h | 462 ++ BeefRT/JEMalloc/include/jemalloc/jemalloc.sh | 27 + .../JEMalloc/include/jemalloc/jemalloc_defs.h | 55 + .../include/jemalloc/jemalloc_defs.h.in | 54 + .../include/jemalloc/jemalloc_macros.h | 149 + .../include/jemalloc/jemalloc_macros.h.in | 149 + .../include/jemalloc/jemalloc_mangle.h | 66 + .../include/jemalloc/jemalloc_mangle.sh | 45 + .../include/jemalloc/jemalloc_mangle_jet.h | 66 + .../include/jemalloc/jemalloc_protos.h | 71 + .../include/jemalloc/jemalloc_protos.h.in | 71 + .../include/jemalloc/jemalloc_protos_jet.h | 71 + .../include/jemalloc/jemalloc_rename.h | 29 + .../include/jemalloc/jemalloc_rename.sh | 22 + .../include/jemalloc/jemalloc_typedefs.h | 77 + .../include/jemalloc/jemalloc_typedefs.h.in | 77 + .../include/msvc_compat/C99/stdbool.h | 20 + .../JEMalloc/include/msvc_compat/C99/stdint.h | 247 + BeefRT/JEMalloc/include/msvc_compat/strings.h | 58 + .../include/msvc_compat/windows_extra.h | 6 + BeefRT/JEMalloc/jemalloc.pc.in | 12 + BeefRT/JEMalloc/jemalloc.sln | 58 + BeefRT/JEMalloc/jemalloc.vcxproj | 464 ++ BeefRT/JEMalloc/jemalloc.vcxproj.filters | 197 + BeefRT/JEMalloc/src/arena.c | 1891 ++++++ BeefRT/JEMalloc/src/background_thread.c | 820 +++ BeefRT/JEMalloc/src/base.c | 529 ++ BeefRT/JEMalloc/src/bin.c | 69 + BeefRT/JEMalloc/src/bin_info.c | 30 + BeefRT/JEMalloc/src/bitmap.c | 120 + BeefRT/JEMalloc/src/buf_writer.c | 144 + BeefRT/JEMalloc/src/cache_bin.c | 99 + BeefRT/JEMalloc/src/ckh.c | 569 ++ BeefRT/JEMalloc/src/counter.c | 30 + BeefRT/JEMalloc/src/ctl.c | 4414 +++++++++++++ BeefRT/JEMalloc/src/decay.c | 295 + BeefRT/JEMalloc/src/div.c | 55 + BeefRT/JEMalloc/src/ecache.c | 35 + BeefRT/JEMalloc/src/edata.c | 6 + BeefRT/JEMalloc/src/edata_cache.c | 154 + BeefRT/JEMalloc/src/ehooks.c | 275 + BeefRT/JEMalloc/src/emap.c | 386 ++ BeefRT/JEMalloc/src/eset.c | 282 + BeefRT/JEMalloc/src/exp_grow.c | 8 + BeefRT/JEMalloc/src/extent.c | 1326 ++++ BeefRT/JEMalloc/src/extent_dss.c | 277 + BeefRT/JEMalloc/src/extent_mmap.c | 41 + BeefRT/JEMalloc/src/fxp.c | 124 + BeefRT/JEMalloc/src/hook.c | 195 + BeefRT/JEMalloc/src/hpa.c | 1044 +++ BeefRT/JEMalloc/src/hpa_hooks.c | 63 + BeefRT/JEMalloc/src/hpdata.c | 325 + BeefRT/JEMalloc/src/inspect.c | 77 + BeefRT/JEMalloc/src/jemalloc.c | 4476 +++++++++++++ BeefRT/JEMalloc/src/jemalloc_cpp.cpp | 254 + BeefRT/JEMalloc/src/large.c | 322 + BeefRT/JEMalloc/src/log.c | 78 + BeefRT/JEMalloc/src/malloc_io.c | 697 ++ BeefRT/JEMalloc/src/mutex.c | 228 + BeefRT/JEMalloc/src/nstime.c | 289 + BeefRT/JEMalloc/src/pa.c | 277 + BeefRT/JEMalloc/src/pa_extra.c | 191 + BeefRT/JEMalloc/src/pac.c | 587 ++ BeefRT/JEMalloc/src/pages.c | 824 +++ BeefRT/JEMalloc/src/pai.c | 31 + BeefRT/JEMalloc/src/peak_event.c | 82 + BeefRT/JEMalloc/src/prof.c | 789 +++ BeefRT/JEMalloc/src/prof_data.c | 1447 +++++ BeefRT/JEMalloc/src/prof_log.c | 717 +++ BeefRT/JEMalloc/src/prof_recent.c | 600 ++ BeefRT/JEMalloc/src/prof_stats.c | 57 + BeefRT/JEMalloc/src/prof_sys.c | 669 ++ BeefRT/JEMalloc/src/psset.c | 385 ++ BeefRT/JEMalloc/src/rtree.c | 261 + BeefRT/JEMalloc/src/safety_check.c | 36 + BeefRT/JEMalloc/src/san.c | 208 + BeefRT/JEMalloc/src/san_bump.c | 104 + BeefRT/JEMalloc/src/sc.c | 306 + BeefRT/JEMalloc/src/sec.c | 422 ++ BeefRT/JEMalloc/src/stats.c | 1973 ++++++ BeefRT/JEMalloc/src/sz.c | 114 + BeefRT/JEMalloc/src/tcache.c | 1101 ++++ BeefRT/JEMalloc/src/test_hooks.c | 12 + BeefRT/JEMalloc/src/thread_event.c | 343 + BeefRT/JEMalloc/src/ticker.c | 32 + BeefRT/JEMalloc/src/ticker.py | 15 + BeefRT/JEMalloc/src/tsd.c | 549 ++ BeefRT/JEMalloc/src/witness.c | 122 + BeefRT/JEMalloc/src/zone.c | 469 ++ BeefRT/TCMalloc/TCMalloc.cpp | 27 + BeefRT/TCMalloc/TCMalloc.vcxproj | 760 +++ BeefRT/TCMalloc/TCMalloc.vcxproj.filters | 247 + BeefRT/gperftools/src/tcmalloc.cc | 4 +- BeefRT/gperftools/src/thread_cache.cc | 1 - BeefRT/gperftools/src/windows/config.h | 10 +- .../src/windows/gperftools/tcmalloc.h | 4 +- 242 files changed, 67746 insertions(+), 6 deletions(-) create mode 100644 BeefRT/JEMalloc/.appveyor.yml create mode 100644 BeefRT/JEMalloc/.autom4te.cfg create mode 100644 BeefRT/JEMalloc/COPYING create mode 100644 BeefRT/JEMalloc/INSTALL.md create mode 100644 BeefRT/JEMalloc/Makefile.in create mode 100644 BeefRT/JEMalloc/README create mode 100644 BeefRT/JEMalloc/VERSION create mode 100644 BeefRT/JEMalloc/autogen.sh create mode 100644 BeefRT/JEMalloc/bin/jemalloc-config.in create mode 100644 BeefRT/JEMalloc/bin/jemalloc.sh.in create mode 100644 BeefRT/JEMalloc/bin/jeprof.in create mode 100644 BeefRT/JEMalloc/build-aux/install-sh create mode 100644 BeefRT/JEMalloc/configure.ac create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/activity_callback.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/arena_externs.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/arena_inlines_a.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/arena_inlines_b.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/arena_stats.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/arena_structs.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/arena_types.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/assert.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/atomic.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/atomic_c11.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/atomic_gcc_atomic.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/atomic_gcc_sync.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/atomic_msvc.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/background_thread_externs.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/background_thread_inlines.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/background_thread_structs.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/base.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/bin.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/bin_info.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/bin_stats.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/bin_types.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/bit_util.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/bitmap.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/buf_writer.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/cache_bin.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/ckh.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/counter.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/ctl.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/decay.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/div.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/ecache.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/edata.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/edata_cache.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/ehooks.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/emap.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/emitter.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/eset.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/exp_grow.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/extent.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/extent_dss.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/extent_mmap.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/fb.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/fxp.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/hash.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/hook.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/hpa.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/hpa_hooks.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/hpa_opts.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/hpdata.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/inspect.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_decls.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_defs.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_externs.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_includes.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_a.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_b.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_macros.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_types.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_preamble.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_preamble.h.in create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/large_externs.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/lockedint.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/log.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/malloc_io.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/mpsc_queue.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/mutex.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/mutex_prof.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/nstime.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/pa.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/pac.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/pages.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/pai.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/peak.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/peak_event.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/ph.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/private_namespace.sh create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/private_symbols.awk create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/private_symbols.sh create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/private_symbols_jet.awk create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/prng.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/prof_data.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/prof_externs.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/prof_hook.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/prof_inlines.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/prof_log.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/prof_recent.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/prof_stats.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/prof_structs.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/prof_sys.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/prof_types.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/psset.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/public_namespace.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/public_namespace.sh create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/public_symbols.txt create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/public_unnamespace.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/public_unnamespace.sh create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/ql.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/qr.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/quantum.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/rb.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/rtree.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/rtree_tsd.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/safety_check.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/san.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/san_bump.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/sc.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/sec.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/sec_opts.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/seq.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/slab_data.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/smoothstep.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/smoothstep.sh create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/spin.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/stats.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/sz.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/tcache_externs.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/tcache_inlines.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/tcache_structs.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/tcache_types.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/test_hooks.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/thread_event.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/ticker.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/tsd.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/tsd_generic.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/tsd_malloc_thread_cleanup.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/tsd_tls.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/tsd_types.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/tsd_win.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/typed_list.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/util.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/internal/witness.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc.sh create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc_defs.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc_defs.h.in create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc_macros.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc_macros.h.in create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc_mangle.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc_mangle.sh create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc_mangle_jet.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc_protos.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc_protos.h.in create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc_protos_jet.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc_rename.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc_rename.sh create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc_typedefs.h create mode 100644 BeefRT/JEMalloc/include/jemalloc/jemalloc_typedefs.h.in create mode 100644 BeefRT/JEMalloc/include/msvc_compat/C99/stdbool.h create mode 100644 BeefRT/JEMalloc/include/msvc_compat/C99/stdint.h create mode 100644 BeefRT/JEMalloc/include/msvc_compat/strings.h create mode 100644 BeefRT/JEMalloc/include/msvc_compat/windows_extra.h create mode 100644 BeefRT/JEMalloc/jemalloc.pc.in create mode 100644 BeefRT/JEMalloc/jemalloc.sln create mode 100644 BeefRT/JEMalloc/jemalloc.vcxproj create mode 100644 BeefRT/JEMalloc/jemalloc.vcxproj.filters create mode 100644 BeefRT/JEMalloc/src/arena.c create mode 100644 BeefRT/JEMalloc/src/background_thread.c create mode 100644 BeefRT/JEMalloc/src/base.c create mode 100644 BeefRT/JEMalloc/src/bin.c create mode 100644 BeefRT/JEMalloc/src/bin_info.c create mode 100644 BeefRT/JEMalloc/src/bitmap.c create mode 100644 BeefRT/JEMalloc/src/buf_writer.c create mode 100644 BeefRT/JEMalloc/src/cache_bin.c create mode 100644 BeefRT/JEMalloc/src/ckh.c create mode 100644 BeefRT/JEMalloc/src/counter.c create mode 100644 BeefRT/JEMalloc/src/ctl.c create mode 100644 BeefRT/JEMalloc/src/decay.c create mode 100644 BeefRT/JEMalloc/src/div.c create mode 100644 BeefRT/JEMalloc/src/ecache.c create mode 100644 BeefRT/JEMalloc/src/edata.c create mode 100644 BeefRT/JEMalloc/src/edata_cache.c create mode 100644 BeefRT/JEMalloc/src/ehooks.c create mode 100644 BeefRT/JEMalloc/src/emap.c create mode 100644 BeefRT/JEMalloc/src/eset.c create mode 100644 BeefRT/JEMalloc/src/exp_grow.c create mode 100644 BeefRT/JEMalloc/src/extent.c create mode 100644 BeefRT/JEMalloc/src/extent_dss.c create mode 100644 BeefRT/JEMalloc/src/extent_mmap.c create mode 100644 BeefRT/JEMalloc/src/fxp.c create mode 100644 BeefRT/JEMalloc/src/hook.c create mode 100644 BeefRT/JEMalloc/src/hpa.c create mode 100644 BeefRT/JEMalloc/src/hpa_hooks.c create mode 100644 BeefRT/JEMalloc/src/hpdata.c create mode 100644 BeefRT/JEMalloc/src/inspect.c create mode 100644 BeefRT/JEMalloc/src/jemalloc.c create mode 100644 BeefRT/JEMalloc/src/jemalloc_cpp.cpp create mode 100644 BeefRT/JEMalloc/src/large.c create mode 100644 BeefRT/JEMalloc/src/log.c create mode 100644 BeefRT/JEMalloc/src/malloc_io.c create mode 100644 BeefRT/JEMalloc/src/mutex.c create mode 100644 BeefRT/JEMalloc/src/nstime.c create mode 100644 BeefRT/JEMalloc/src/pa.c create mode 100644 BeefRT/JEMalloc/src/pa_extra.c create mode 100644 BeefRT/JEMalloc/src/pac.c create mode 100644 BeefRT/JEMalloc/src/pages.c create mode 100644 BeefRT/JEMalloc/src/pai.c create mode 100644 BeefRT/JEMalloc/src/peak_event.c create mode 100644 BeefRT/JEMalloc/src/prof.c create mode 100644 BeefRT/JEMalloc/src/prof_data.c create mode 100644 BeefRT/JEMalloc/src/prof_log.c create mode 100644 BeefRT/JEMalloc/src/prof_recent.c create mode 100644 BeefRT/JEMalloc/src/prof_stats.c create mode 100644 BeefRT/JEMalloc/src/prof_sys.c create mode 100644 BeefRT/JEMalloc/src/psset.c create mode 100644 BeefRT/JEMalloc/src/rtree.c create mode 100644 BeefRT/JEMalloc/src/safety_check.c create mode 100644 BeefRT/JEMalloc/src/san.c create mode 100644 BeefRT/JEMalloc/src/san_bump.c create mode 100644 BeefRT/JEMalloc/src/sc.c create mode 100644 BeefRT/JEMalloc/src/sec.c create mode 100644 BeefRT/JEMalloc/src/stats.c create mode 100644 BeefRT/JEMalloc/src/sz.c create mode 100644 BeefRT/JEMalloc/src/tcache.c create mode 100644 BeefRT/JEMalloc/src/test_hooks.c create mode 100644 BeefRT/JEMalloc/src/thread_event.c create mode 100644 BeefRT/JEMalloc/src/ticker.c create mode 100644 BeefRT/JEMalloc/src/ticker.py create mode 100644 BeefRT/JEMalloc/src/tsd.c create mode 100644 BeefRT/JEMalloc/src/witness.c create mode 100644 BeefRT/JEMalloc/src/zone.c create mode 100644 BeefRT/TCMalloc/TCMalloc.cpp create mode 100644 BeefRT/TCMalloc/TCMalloc.vcxproj create mode 100644 BeefRT/TCMalloc/TCMalloc.vcxproj.filters diff --git a/BeefRT/JEMalloc/.appveyor.yml b/BeefRT/JEMalloc/.appveyor.yml new file mode 100644 index 00000000..d31f9aed --- /dev/null +++ b/BeefRT/JEMalloc/.appveyor.yml @@ -0,0 +1,41 @@ +version: '{build}' + +environment: + matrix: + - MSYSTEM: MINGW64 + CPU: x86_64 + MSVC: amd64 + CONFIG_FLAGS: --enable-debug + - MSYSTEM: MINGW64 + CPU: x86_64 + CONFIG_FLAGS: --enable-debug + - MSYSTEM: MINGW32 + CPU: i686 + MSVC: x86 + CONFIG_FLAGS: --enable-debug + - MSYSTEM: MINGW32 + CPU: i686 + CONFIG_FLAGS: --enable-debug + - MSYSTEM: MINGW64 + CPU: x86_64 + MSVC: amd64 + - MSYSTEM: MINGW64 + CPU: x86_64 + - MSYSTEM: MINGW32 + CPU: i686 + MSVC: x86 + - MSYSTEM: MINGW32 + CPU: i686 + +install: + - set PATH=c:\msys64\%MSYSTEM%\bin;c:\msys64\usr\bin;%PATH% + - if defined MSVC call "c:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" %MSVC% + - if defined MSVC pacman --noconfirm -Rsc mingw-w64-%CPU%-gcc gcc + +build_script: + - bash -c "autoconf" + - bash -c "./configure $CONFIG_FLAGS" + - mingw32-make + - file lib/jemalloc.dll + - mingw32-make tests + - mingw32-make -k check diff --git a/BeefRT/JEMalloc/.autom4te.cfg b/BeefRT/JEMalloc/.autom4te.cfg new file mode 100644 index 00000000..fe2424db --- /dev/null +++ b/BeefRT/JEMalloc/.autom4te.cfg @@ -0,0 +1,3 @@ +begin-language: "Autoconf-without-aclocal-m4" +args: --no-cache +end-language: "Autoconf-without-aclocal-m4" diff --git a/BeefRT/JEMalloc/COPYING b/BeefRT/JEMalloc/COPYING new file mode 100644 index 00000000..3b7fd358 --- /dev/null +++ b/BeefRT/JEMalloc/COPYING @@ -0,0 +1,27 @@ +Unless otherwise specified, files in the jemalloc source distribution are +subject to the following license: +-------------------------------------------------------------------------------- +Copyright (C) 2002-present Jason Evans . +All rights reserved. +Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved. +Copyright (C) 2009-present Facebook, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice(s), + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice(s), + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +-------------------------------------------------------------------------------- diff --git a/BeefRT/JEMalloc/INSTALL.md b/BeefRT/JEMalloc/INSTALL.md new file mode 100644 index 00000000..90da718d --- /dev/null +++ b/BeefRT/JEMalloc/INSTALL.md @@ -0,0 +1,424 @@ +Building and installing a packaged release of jemalloc can be as simple as +typing the following while in the root directory of the source tree: + + ./configure + make + make install + +If building from unpackaged developer sources, the simplest command sequence +that might work is: + + ./autogen.sh + make + make install + +You can uninstall the installed build artifacts like this: + + make uninstall + +Notes: + - "autoconf" needs to be installed + - Documentation is built by the default target only when xsltproc is +available. Build will warn but not stop if the dependency is missing. + + +## Advanced configuration + +The 'configure' script supports numerous options that allow control of which +functionality is enabled, where jemalloc is installed, etc. Optionally, pass +any of the following arguments (not a definitive list) to 'configure': + +* `--help` + + Print a definitive list of options. + +* `--prefix=` + + Set the base directory in which to install. For example: + + ./configure --prefix=/usr/local + + will cause files to be installed into /usr/local/include, /usr/local/lib, + and /usr/local/man. + +* `--with-version=(..--g|VERSION)` + + The VERSION file is mandatory for successful configuration, and the + following steps are taken to assure its presence: + 1) If --with-version=..--g is specified, + generate VERSION using the specified value. + 2) If --with-version is not specified in either form and the source + directory is inside a git repository, try to generate VERSION via 'git + describe' invocations that pattern-match release tags. + 3) If VERSION is missing, generate it with a bogus version: + 0.0.0-0-g0000000000000000000000000000000000000000 + + Note that --with-version=VERSION bypasses (1) and (2), which simplifies + VERSION configuration when embedding a jemalloc release into another + project's git repository. + +* `--with-rpath=` + + Embed one or more library paths, so that libjemalloc can find the libraries + it is linked to. This works only on ELF-based systems. + +* `--with-mangling=` + + Mangle public symbols specified in which is a comma-separated list of + name:mangled pairs. + + For example, to use ld's --wrap option as an alternative method for + overriding libc's malloc implementation, specify something like: + + --with-mangling=malloc:__wrap_malloc,free:__wrap_free[...] + + Note that mangling happens prior to application of the prefix specified by + --with-jemalloc-prefix, and mangled symbols are then ignored when applying + the prefix. + +* `--with-jemalloc-prefix=` + + Prefix all public APIs with . For example, if is + "prefix_", API changes like the following occur: + + malloc() --> prefix_malloc() + malloc_conf --> prefix_malloc_conf + /etc/malloc.conf --> /etc/prefix_malloc.conf + MALLOC_CONF --> PREFIX_MALLOC_CONF + + This makes it possible to use jemalloc at the same time as the system + allocator, or even to use multiple copies of jemalloc simultaneously. + + By default, the prefix is "", except on OS X, where it is "je_". On OS X, + jemalloc overlays the default malloc zone, but makes no attempt to actually + replace the "malloc", "calloc", etc. symbols. + +* `--without-export` + + Don't export public APIs. This can be useful when building jemalloc as a + static library, or to avoid exporting public APIs when using the zone + allocator on OSX. + +* `--with-private-namespace=` + + Prefix all library-private APIs with je_. For shared libraries, + symbol visibility mechanisms prevent these symbols from being exported, but + for static libraries, naming collisions are a real possibility. By + default, is empty, which results in a symbol prefix of je_ . + +* `--with-install-suffix=` + + Append to the base name of all installed files, such that multiple + versions of jemalloc can coexist in the same installation directory. For + example, libjemalloc.so.0 becomes libjemalloc.so.0. + +* `--with-malloc-conf=` + + Embed `` as a run-time options string that is processed prior to + the malloc_conf global variable, the /etc/malloc.conf symlink, and the + MALLOC_CONF environment variable. For example, to change the default decay + time to 30 seconds: + + --with-malloc-conf=decay_ms:30000 + +* `--enable-debug` + + Enable assertions and validation code. This incurs a substantial + performance hit, but is very useful during application development. + +* `--disable-stats` + + Disable statistics gathering functionality. See the "opt.stats_print" + option documentation for usage details. + +* `--enable-prof` + + Enable heap profiling and leak detection functionality. See the "opt.prof" + option documentation for usage details. When enabled, there are several + approaches to backtracing, and the configure script chooses the first one + in the following list that appears to function correctly: + + + libunwind (requires --enable-prof-libunwind) + + libgcc (unless --disable-prof-libgcc) + + gcc intrinsics (unless --disable-prof-gcc) + +* `--enable-prof-libunwind` + + Use the libunwind library (http://www.nongnu.org/libunwind/) for stack + backtracing. + +* `--disable-prof-libgcc` + + Disable the use of libgcc's backtracing functionality. + +* `--disable-prof-gcc` + + Disable the use of gcc intrinsics for backtracing. + +* `--with-static-libunwind=` + + Statically link against the specified libunwind.a rather than dynamically + linking with -lunwind. + +* `--disable-fill` + + Disable support for junk/zero filling of memory. See the "opt.junk" and + "opt.zero" option documentation for usage details. + +* `--disable-zone-allocator` + + Disable zone allocator for Darwin. This means jemalloc won't be hooked as + the default allocator on OSX/iOS. + +* `--enable-utrace` + + Enable utrace(2)-based allocation tracing. This feature is not broadly + portable (FreeBSD has it, but Linux and OS X do not). + +* `--enable-xmalloc` + + Enable support for optional immediate termination due to out-of-memory + errors, as is commonly implemented by "xmalloc" wrapper function for malloc. + See the "opt.xmalloc" option documentation for usage details. + +* `--enable-lazy-lock` + + Enable code that wraps pthread_create() to detect when an application + switches from single-threaded to multi-threaded mode, so that it can avoid + mutex locking/unlocking operations while in single-threaded mode. In + practice, this feature usually has little impact on performance unless + thread-specific caching is disabled. + +* `--disable-cache-oblivious` + + Disable cache-oblivious large allocation alignment by default, for large + allocation requests with no alignment constraints. If this feature is + disabled, all large allocations are page-aligned as an implementation + artifact, which can severely harm CPU cache utilization. However, the + cache-oblivious layout comes at the cost of one extra page per large + allocation, which in the most extreme case increases physical memory usage + for the 16 KiB size class to 20 KiB. + +* `--disable-syscall` + + Disable use of syscall(2) rather than {open,read,write,close}(2). This is + intended as a workaround for systems that place security limitations on + syscall(2). + +* `--disable-cxx` + + Disable C++ integration. This will cause new and delete operator + implementations to be omitted. + +* `--with-xslroot=` + + Specify where to find DocBook XSL stylesheets when building the + documentation. + +* `--with-lg-page=` + + Specify the base 2 log of the allocator page size, which must in turn be at + least as large as the system page size. By default the configure script + determines the host's page size and sets the allocator page size equal to + the system page size, so this option need not be specified unless the + system page size may change between configuration and execution, e.g. when + cross compiling. + +* `--with-lg-hugepage=` + + Specify the base 2 log of the system huge page size. This option is useful + when cross compiling, or when overriding the default for systems that do + not explicitly support huge pages. + +* `--with-lg-quantum=` + + Specify the base 2 log of the minimum allocation alignment. jemalloc needs + to know the minimum alignment that meets the following C standard + requirement (quoted from the April 12, 2011 draft of the C11 standard): + + > The pointer returned if the allocation succeeds is suitably aligned so + that it may be assigned to a pointer to any type of object with a + fundamental alignment requirement and then used to access such an object + or an array of such objects in the space allocated [...] + + This setting is architecture-specific, and although jemalloc includes known + safe values for the most commonly used modern architectures, there is a + wrinkle related to GNU libc (glibc) that may impact your choice of + . On most modern architectures, this mandates 16-byte + alignment (=4), but the glibc developers chose not to meet this + requirement for performance reasons. An old discussion can be found at + . Unlike glibc, + jemalloc does follow the C standard by default (caveat: jemalloc + technically cheats for size classes smaller than the quantum), but the fact + that Linux systems already work around this allocator noncompliance means + that it is generally safe in practice to let jemalloc's minimum alignment + follow glibc's lead. If you specify `--with-lg-quantum=3` during + configuration, jemalloc will provide additional size classes that are not + 16-byte-aligned (24, 40, and 56). + +* `--with-lg-vaddr=` + + Specify the number of significant virtual address bits. By default, the + configure script attempts to detect virtual address size on those platforms + where it knows how, and picks a default otherwise. This option may be + useful when cross-compiling. + +* `--disable-initial-exec-tls` + + Disable the initial-exec TLS model for jemalloc's internal thread-local + storage (on those platforms that support explicit settings). This can allow + jemalloc to be dynamically loaded after program startup (e.g. using dlopen). + Note that in this case, there will be two malloc implementations operating + in the same process, which will almost certainly result in confusing runtime + crashes if pointers leak from one implementation to the other. + +* `--disable-libdl` + + Disable the usage of libdl, namely dlsym(3) which is required by the lazy + lock option. This can allow building static binaries. + +The following environment variables (not a definitive list) impact configure's +behavior: + +* `CFLAGS="?"` +* `CXXFLAGS="?"` + + Pass these flags to the C/C++ compiler. Any flags set by the configure + script are prepended, which means explicitly set flags generally take + precedence. Take care when specifying flags such as -Werror, because + configure tests may be affected in undesirable ways. + +* `EXTRA_CFLAGS="?"` +* `EXTRA_CXXFLAGS="?"` + + Append these flags to CFLAGS/CXXFLAGS, without passing them to the + compiler(s) during configuration. This makes it possible to add flags such + as -Werror, while allowing the configure script to determine what other + flags are appropriate for the specified configuration. + +* `CPPFLAGS="?"` + + Pass these flags to the C preprocessor. Note that CFLAGS is not passed to + 'cpp' when 'configure' is looking for include files, so you must use + CPPFLAGS instead if you need to help 'configure' find header files. + +* `LD_LIBRARY_PATH="?"` + + 'ld' uses this colon-separated list to find libraries. + +* `LDFLAGS="?"` + + Pass these flags when linking. + +* `PATH="?"` + + 'configure' uses this to find programs. + +In some cases it may be necessary to work around configuration results that do +not match reality. For example, Linux 4.5 added support for the MADV_FREE flag +to madvise(2), which can cause problems if building on a host with MADV_FREE +support and deploying to a target without. To work around this, use a cache +file to override the relevant configuration variable defined in configure.ac, +e.g.: + + echo "je_cv_madv_free=no" > config.cache && ./configure -C + + +## Advanced compilation + +To build only parts of jemalloc, use the following targets: + + build_lib_shared + build_lib_static + build_lib + build_doc_html + build_doc_man + build_doc + +To install only parts of jemalloc, use the following targets: + + install_bin + install_include + install_lib_shared + install_lib_static + install_lib_pc + install_lib + install_doc_html + install_doc_man + install_doc + +To clean up build results to varying degrees, use the following make targets: + + clean + distclean + relclean + + +## Advanced installation + +Optionally, define make variables when invoking make, including (not +exclusively): + +* `INCLUDEDIR="?"` + + Use this as the installation prefix for header files. + +* `LIBDIR="?"` + + Use this as the installation prefix for libraries. + +* `MANDIR="?"` + + Use this as the installation prefix for man pages. + +* `DESTDIR="?"` + + Prepend DESTDIR to INCLUDEDIR, LIBDIR, DATADIR, and MANDIR. This is useful + when installing to a different path than was specified via --prefix. + +* `CC="?"` + + Use this to invoke the C compiler. + +* `CFLAGS="?"` + + Pass these flags to the compiler. + +* `CPPFLAGS="?"` + + Pass these flags to the C preprocessor. + +* `LDFLAGS="?"` + + Pass these flags when linking. + +* `PATH="?"` + + Use this to search for programs used during configuration and building. + + +## Development + +If you intend to make non-trivial changes to jemalloc, use the 'autogen.sh' +script rather than 'configure'. This re-generates 'configure', enables +configuration dependency rules, and enables re-generation of automatically +generated source files. + +The build system supports using an object directory separate from the source +tree. For example, you can create an 'obj' directory, and from within that +directory, issue configuration and build commands: + + autoconf + mkdir obj + cd obj + ../configure --enable-autogen + make + + +## Documentation + +The manual page is generated in both html and roff formats. Any web browser +can be used to view the html manual. The roff manual page can be formatted +prior to installation via the following command: + + nroff -man -t doc/jemalloc.3 diff --git a/BeefRT/JEMalloc/Makefile.in b/BeefRT/JEMalloc/Makefile.in new file mode 100644 index 00000000..1193cd85 --- /dev/null +++ b/BeefRT/JEMalloc/Makefile.in @@ -0,0 +1,762 @@ +# Clear out all vpaths, then set just one (default vpath) for the main build +# directory. +vpath +vpath % . + +# Clear the default suffixes, so that built-in rules are not used. +.SUFFIXES : + +SHELL := /bin/sh + +CC := @CC@ +CXX := @CXX@ + +# Configuration parameters. +DESTDIR = +BINDIR := $(DESTDIR)@BINDIR@ +INCLUDEDIR := $(DESTDIR)@INCLUDEDIR@ +LIBDIR := $(DESTDIR)@LIBDIR@ +DATADIR := $(DESTDIR)@DATADIR@ +MANDIR := $(DESTDIR)@MANDIR@ +srcroot := @srcroot@ +objroot := @objroot@ +abs_srcroot := @abs_srcroot@ +abs_objroot := @abs_objroot@ + +# Build parameters. +CPPFLAGS := @CPPFLAGS@ -I$(objroot)include -I$(srcroot)include +CONFIGURE_CFLAGS := @CONFIGURE_CFLAGS@ +SPECIFIED_CFLAGS := @SPECIFIED_CFLAGS@ +EXTRA_CFLAGS := @EXTRA_CFLAGS@ +CFLAGS := $(strip $(CONFIGURE_CFLAGS) $(SPECIFIED_CFLAGS) $(EXTRA_CFLAGS)) +CONFIGURE_CXXFLAGS := @CONFIGURE_CXXFLAGS@ +SPECIFIED_CXXFLAGS := @SPECIFIED_CXXFLAGS@ +EXTRA_CXXFLAGS := @EXTRA_CXXFLAGS@ +CXXFLAGS := $(strip $(CONFIGURE_CXXFLAGS) $(SPECIFIED_CXXFLAGS) $(EXTRA_CXXFLAGS)) +LDFLAGS := @LDFLAGS@ +EXTRA_LDFLAGS := @EXTRA_LDFLAGS@ +LIBS := @LIBS@ +RPATH_EXTRA := @RPATH_EXTRA@ +SO := @so@ +IMPORTLIB := @importlib@ +O := @o@ +A := @a@ +EXE := @exe@ +LIBPREFIX := @libprefix@ +REV := @rev@ +install_suffix := @install_suffix@ +ABI := @abi@ +XSLTPROC := @XSLTPROC@ +XSLROOT := @XSLROOT@ +AUTOCONF := @AUTOCONF@ +_RPATH = @RPATH@ +RPATH = $(if $(1),$(call _RPATH,$(1))) +cfghdrs_in := $(addprefix $(srcroot),@cfghdrs_in@) +cfghdrs_out := @cfghdrs_out@ +cfgoutputs_in := $(addprefix $(srcroot),@cfgoutputs_in@) +cfgoutputs_out := @cfgoutputs_out@ +enable_autogen := @enable_autogen@ +enable_doc := @enable_doc@ +enable_shared := @enable_shared@ +enable_static := @enable_static@ +enable_prof := @enable_prof@ +enable_zone_allocator := @enable_zone_allocator@ +enable_experimental_smallocx := @enable_experimental_smallocx@ +MALLOC_CONF := @JEMALLOC_CPREFIX@MALLOC_CONF +link_whole_archive := @link_whole_archive@ +DSO_LDFLAGS = @DSO_LDFLAGS@ +SOREV = @SOREV@ +PIC_CFLAGS = @PIC_CFLAGS@ +CTARGET = @CTARGET@ +LDTARGET = @LDTARGET@ +TEST_LD_MODE = @TEST_LD_MODE@ +MKLIB = @MKLIB@ +AR = @AR@ +ARFLAGS = @ARFLAGS@ +DUMP_SYMS = @DUMP_SYMS@ +AWK := @AWK@ +CC_MM = @CC_MM@ +LM := @LM@ +INSTALL = @INSTALL@ + +ifeq (macho, $(ABI)) +TEST_LIBRARY_PATH := DYLD_FALLBACK_LIBRARY_PATH="$(objroot)lib" +else +ifeq (pecoff, $(ABI)) +TEST_LIBRARY_PATH := PATH="$(PATH):$(objroot)lib" +else +TEST_LIBRARY_PATH := +endif +endif + +LIBJEMALLOC := $(LIBPREFIX)jemalloc$(install_suffix) + +# Lists of files. +BINS := $(objroot)bin/jemalloc-config $(objroot)bin/jemalloc.sh $(objroot)bin/jeprof +C_HDRS := $(objroot)include/jemalloc/jemalloc$(install_suffix).h +C_SRCS := $(srcroot)src/jemalloc.c \ + $(srcroot)src/arena.c \ + $(srcroot)src/background_thread.c \ + $(srcroot)src/base.c \ + $(srcroot)src/bin.c \ + $(srcroot)src/bin_info.c \ + $(srcroot)src/bitmap.c \ + $(srcroot)src/buf_writer.c \ + $(srcroot)src/cache_bin.c \ + $(srcroot)src/ckh.c \ + $(srcroot)src/counter.c \ + $(srcroot)src/ctl.c \ + $(srcroot)src/decay.c \ + $(srcroot)src/div.c \ + $(srcroot)src/ecache.c \ + $(srcroot)src/edata.c \ + $(srcroot)src/edata_cache.c \ + $(srcroot)src/ehooks.c \ + $(srcroot)src/emap.c \ + $(srcroot)src/eset.c \ + $(srcroot)src/exp_grow.c \ + $(srcroot)src/extent.c \ + $(srcroot)src/extent_dss.c \ + $(srcroot)src/extent_mmap.c \ + $(srcroot)src/fxp.c \ + $(srcroot)src/san.c \ + $(srcroot)src/san_bump.c \ + $(srcroot)src/hook.c \ + $(srcroot)src/hpa.c \ + $(srcroot)src/hpa_hooks.c \ + $(srcroot)src/hpdata.c \ + $(srcroot)src/inspect.c \ + $(srcroot)src/large.c \ + $(srcroot)src/log.c \ + $(srcroot)src/malloc_io.c \ + $(srcroot)src/mutex.c \ + $(srcroot)src/nstime.c \ + $(srcroot)src/pa.c \ + $(srcroot)src/pa_extra.c \ + $(srcroot)src/pai.c \ + $(srcroot)src/pac.c \ + $(srcroot)src/pages.c \ + $(srcroot)src/peak_event.c \ + $(srcroot)src/prof.c \ + $(srcroot)src/prof_data.c \ + $(srcroot)src/prof_log.c \ + $(srcroot)src/prof_recent.c \ + $(srcroot)src/prof_stats.c \ + $(srcroot)src/prof_sys.c \ + $(srcroot)src/psset.c \ + $(srcroot)src/rtree.c \ + $(srcroot)src/safety_check.c \ + $(srcroot)src/sc.c \ + $(srcroot)src/sec.c \ + $(srcroot)src/stats.c \ + $(srcroot)src/sz.c \ + $(srcroot)src/tcache.c \ + $(srcroot)src/test_hooks.c \ + $(srcroot)src/thread_event.c \ + $(srcroot)src/ticker.c \ + $(srcroot)src/tsd.c \ + $(srcroot)src/witness.c +ifeq ($(enable_zone_allocator), 1) +C_SRCS += $(srcroot)src/zone.c +endif +ifeq ($(IMPORTLIB),$(SO)) +STATIC_LIBS := $(objroot)lib/$(LIBJEMALLOC).$(A) +endif +ifdef PIC_CFLAGS +STATIC_LIBS += $(objroot)lib/$(LIBJEMALLOC)_pic.$(A) +else +STATIC_LIBS += $(objroot)lib/$(LIBJEMALLOC)_s.$(A) +endif +DSOS := $(objroot)lib/$(LIBJEMALLOC).$(SOREV) +ifneq ($(SOREV),$(SO)) +DSOS += $(objroot)lib/$(LIBJEMALLOC).$(SO) +endif +ifeq (1, $(link_whole_archive)) +LJEMALLOC := -Wl,--whole-archive -L$(objroot)lib -l$(LIBJEMALLOC) -Wl,--no-whole-archive +else +LJEMALLOC := $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) +endif +PC := $(objroot)jemalloc.pc +DOCS_XML := $(objroot)doc/jemalloc$(install_suffix).xml +DOCS_HTML := $(DOCS_XML:$(objroot)%.xml=$(objroot)%.html) +DOCS_MAN3 := $(DOCS_XML:$(objroot)%.xml=$(objroot)%.3) +DOCS := $(DOCS_HTML) $(DOCS_MAN3) +C_TESTLIB_SRCS := $(srcroot)test/src/btalloc.c $(srcroot)test/src/btalloc_0.c \ + $(srcroot)test/src/btalloc_1.c $(srcroot)test/src/math.c \ + $(srcroot)test/src/mtx.c $(srcroot)test/src/sleep.c \ + $(srcroot)test/src/SFMT.c $(srcroot)test/src/test.c \ + $(srcroot)test/src/thd.c $(srcroot)test/src/timer.c +ifeq (1, $(link_whole_archive)) +C_UTIL_INTEGRATION_SRCS := +C_UTIL_CPP_SRCS := +else +C_UTIL_INTEGRATION_SRCS := $(srcroot)src/nstime.c $(srcroot)src/malloc_io.c \ + $(srcroot)src/ticker.c +C_UTIL_CPP_SRCS := $(srcroot)src/nstime.c $(srcroot)src/malloc_io.c +endif +TESTS_UNIT := \ + $(srcroot)test/unit/a0.c \ + $(srcroot)test/unit/arena_decay.c \ + $(srcroot)test/unit/arena_reset.c \ + $(srcroot)test/unit/atomic.c \ + $(srcroot)test/unit/background_thread.c \ + $(srcroot)test/unit/background_thread_enable.c \ + $(srcroot)test/unit/base.c \ + $(srcroot)test/unit/batch_alloc.c \ + $(srcroot)test/unit/binshard.c \ + $(srcroot)test/unit/bitmap.c \ + $(srcroot)test/unit/bit_util.c \ + $(srcroot)test/unit/buf_writer.c \ + $(srcroot)test/unit/cache_bin.c \ + $(srcroot)test/unit/ckh.c \ + $(srcroot)test/unit/counter.c \ + $(srcroot)test/unit/decay.c \ + $(srcroot)test/unit/div.c \ + $(srcroot)test/unit/double_free.c \ + $(srcroot)test/unit/edata_cache.c \ + $(srcroot)test/unit/emitter.c \ + $(srcroot)test/unit/extent_quantize.c \ + ${srcroot}test/unit/fb.c \ + $(srcroot)test/unit/fork.c \ + ${srcroot}test/unit/fxp.c \ + ${srcroot}test/unit/san.c \ + ${srcroot}test/unit/san_bump.c \ + $(srcroot)test/unit/hash.c \ + $(srcroot)test/unit/hook.c \ + $(srcroot)test/unit/hpa.c \ + $(srcroot)test/unit/hpa_background_thread.c \ + $(srcroot)test/unit/hpdata.c \ + $(srcroot)test/unit/huge.c \ + $(srcroot)test/unit/inspect.c \ + $(srcroot)test/unit/junk.c \ + $(srcroot)test/unit/junk_alloc.c \ + $(srcroot)test/unit/junk_free.c \ + $(srcroot)test/unit/log.c \ + $(srcroot)test/unit/mallctl.c \ + $(srcroot)test/unit/malloc_conf_2.c \ + $(srcroot)test/unit/malloc_io.c \ + $(srcroot)test/unit/math.c \ + $(srcroot)test/unit/mpsc_queue.c \ + $(srcroot)test/unit/mq.c \ + $(srcroot)test/unit/mtx.c \ + $(srcroot)test/unit/nstime.c \ + $(srcroot)test/unit/oversize_threshold.c \ + $(srcroot)test/unit/pa.c \ + $(srcroot)test/unit/pack.c \ + $(srcroot)test/unit/pages.c \ + $(srcroot)test/unit/peak.c \ + $(srcroot)test/unit/ph.c \ + $(srcroot)test/unit/prng.c \ + $(srcroot)test/unit/prof_accum.c \ + $(srcroot)test/unit/prof_active.c \ + $(srcroot)test/unit/prof_gdump.c \ + $(srcroot)test/unit/prof_hook.c \ + $(srcroot)test/unit/prof_idump.c \ + $(srcroot)test/unit/prof_log.c \ + $(srcroot)test/unit/prof_mdump.c \ + $(srcroot)test/unit/prof_recent.c \ + $(srcroot)test/unit/prof_reset.c \ + $(srcroot)test/unit/prof_stats.c \ + $(srcroot)test/unit/prof_tctx.c \ + $(srcroot)test/unit/prof_thread_name.c \ + $(srcroot)test/unit/prof_sys_thread_name.c \ + $(srcroot)test/unit/psset.c \ + $(srcroot)test/unit/ql.c \ + $(srcroot)test/unit/qr.c \ + $(srcroot)test/unit/rb.c \ + $(srcroot)test/unit/retained.c \ + $(srcroot)test/unit/rtree.c \ + $(srcroot)test/unit/safety_check.c \ + $(srcroot)test/unit/sc.c \ + $(srcroot)test/unit/sec.c \ + $(srcroot)test/unit/seq.c \ + $(srcroot)test/unit/SFMT.c \ + $(srcroot)test/unit/size_check.c \ + $(srcroot)test/unit/size_classes.c \ + $(srcroot)test/unit/slab.c \ + $(srcroot)test/unit/smoothstep.c \ + $(srcroot)test/unit/spin.c \ + $(srcroot)test/unit/stats.c \ + $(srcroot)test/unit/stats_print.c \ + $(srcroot)test/unit/sz.c \ + $(srcroot)test/unit/tcache_max.c \ + $(srcroot)test/unit/test_hooks.c \ + $(srcroot)test/unit/thread_event.c \ + $(srcroot)test/unit/ticker.c \ + $(srcroot)test/unit/tsd.c \ + $(srcroot)test/unit/uaf.c \ + $(srcroot)test/unit/witness.c \ + $(srcroot)test/unit/zero.c \ + $(srcroot)test/unit/zero_realloc_abort.c \ + $(srcroot)test/unit/zero_realloc_free.c \ + $(srcroot)test/unit/zero_realloc_alloc.c \ + $(srcroot)test/unit/zero_reallocs.c +ifeq (@enable_prof@, 1) +TESTS_UNIT += \ + $(srcroot)test/unit/arena_reset_prof.c \ + $(srcroot)test/unit/batch_alloc_prof.c +endif +TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \ + $(srcroot)test/integration/allocated.c \ + $(srcroot)test/integration/extent.c \ + $(srcroot)test/integration/malloc.c \ + $(srcroot)test/integration/mallocx.c \ + $(srcroot)test/integration/MALLOCX_ARENA.c \ + $(srcroot)test/integration/overflow.c \ + $(srcroot)test/integration/posix_memalign.c \ + $(srcroot)test/integration/rallocx.c \ + $(srcroot)test/integration/sdallocx.c \ + $(srcroot)test/integration/slab_sizes.c \ + $(srcroot)test/integration/thread_arena.c \ + $(srcroot)test/integration/thread_tcache_enabled.c \ + $(srcroot)test/integration/xallocx.c +ifeq (@enable_experimental_smallocx@, 1) +TESTS_INTEGRATION += \ + $(srcroot)test/integration/smallocx.c +endif +ifeq (@enable_cxx@, 1) +CPP_SRCS := $(srcroot)src/jemalloc_cpp.cpp +TESTS_INTEGRATION_CPP := $(srcroot)test/integration/cpp/basic.cpp \ + $(srcroot)test/integration/cpp/infallible_new_true.cpp \ + $(srcroot)test/integration/cpp/infallible_new_false.cpp +else +CPP_SRCS := +TESTS_INTEGRATION_CPP := +endif +TESTS_ANALYZE := $(srcroot)test/analyze/prof_bias.c \ + $(srcroot)test/analyze/rand.c \ + $(srcroot)test/analyze/sizes.c +TESTS_STRESS := $(srcroot)test/stress/batch_alloc.c \ + $(srcroot)test/stress/fill_flush.c \ + $(srcroot)test/stress/hookbench.c \ + $(srcroot)test/stress/large_microbench.c \ + $(srcroot)test/stress/mallctl.c \ + $(srcroot)test/stress/microbench.c + + +TESTS := $(TESTS_UNIT) $(TESTS_INTEGRATION) $(TESTS_INTEGRATION_CPP) \ + $(TESTS_ANALYZE) $(TESTS_STRESS) + +PRIVATE_NAMESPACE_HDRS := $(objroot)include/jemalloc/internal/private_namespace.h $(objroot)include/jemalloc/internal/private_namespace_jet.h +PRIVATE_NAMESPACE_GEN_HDRS := $(PRIVATE_NAMESPACE_HDRS:%.h=%.gen.h) +C_SYM_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.sym.$(O)) +C_SYMS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.sym) +C_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.$(O)) +CPP_OBJS := $(CPP_SRCS:$(srcroot)%.cpp=$(objroot)%.$(O)) +C_PIC_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.pic.$(O)) +CPP_PIC_OBJS := $(CPP_SRCS:$(srcroot)%.cpp=$(objroot)%.pic.$(O)) +C_JET_SYM_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.jet.sym.$(O)) +C_JET_SYMS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.jet.sym) +C_JET_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.jet.$(O)) +C_TESTLIB_UNIT_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.unit.$(O)) +C_TESTLIB_INTEGRATION_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.integration.$(O)) +C_UTIL_INTEGRATION_OBJS := $(C_UTIL_INTEGRATION_SRCS:$(srcroot)%.c=$(objroot)%.integration.$(O)) +C_TESTLIB_ANALYZE_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.analyze.$(O)) +C_TESTLIB_STRESS_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.stress.$(O)) +C_TESTLIB_OBJS := $(C_TESTLIB_UNIT_OBJS) $(C_TESTLIB_INTEGRATION_OBJS) \ + $(C_UTIL_INTEGRATION_OBJS) $(C_TESTLIB_ANALYZE_OBJS) \ + $(C_TESTLIB_STRESS_OBJS) + +TESTS_UNIT_OBJS := $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%.$(O)) +TESTS_INTEGRATION_OBJS := $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%.$(O)) +TESTS_INTEGRATION_CPP_OBJS := $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%.$(O)) +TESTS_ANALYZE_OBJS := $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%.$(O)) +TESTS_STRESS_OBJS := $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%.$(O)) +TESTS_OBJS := $(TESTS_UNIT_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_ANALYZE_OBJS) \ + $(TESTS_STRESS_OBJS) +TESTS_CPP_OBJS := $(TESTS_INTEGRATION_CPP_OBJS) + +.PHONY: all dist build_doc_html build_doc_man build_doc +.PHONY: install_bin install_include install_lib +.PHONY: install_doc_html install_doc_man install_doc install +.PHONY: tests check clean distclean relclean + +.SECONDARY : $(PRIVATE_NAMESPACE_GEN_HDRS) $(TESTS_OBJS) $(TESTS_CPP_OBJS) + +# Default target. +all: build_lib + +dist: build_doc + +$(objroot)doc/%$(install_suffix).html : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/html.xsl +ifneq ($(XSLROOT),) + $(XSLTPROC) -o $@ $(objroot)doc/html.xsl $< +else +ifeq ($(wildcard $(DOCS_HTML)),) + @echo "

Missing xsltproc. Doc not built.

" > $@ +endif + @echo "Missing xsltproc. "$@" not (re)built." +endif + +$(objroot)doc/%$(install_suffix).3 : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/manpages.xsl +ifneq ($(XSLROOT),) + $(XSLTPROC) -o $@ $(objroot)doc/manpages.xsl $< +# The -o option (output filename) of xsltproc may not work (it uses the +# in the .xml file). Manually add the suffix if so. + ifneq ($(install_suffix),) + @if [ -f $(objroot)doc/jemalloc.3 ]; then \ + mv $(objroot)doc/jemalloc.3 $(objroot)doc/jemalloc$(install_suffix).3 ; \ + fi + endif +else +ifeq ($(wildcard $(DOCS_MAN3)),) + @echo "Missing xsltproc. Doc not built." > $@ +endif + @echo "Missing xsltproc. "$@" not (re)built." +endif + +build_doc_html: $(DOCS_HTML) +build_doc_man: $(DOCS_MAN3) +build_doc: $(DOCS) + +# +# Include generated dependency files. +# +ifdef CC_MM +-include $(C_SYM_OBJS:%.$(O)=%.d) +-include $(C_OBJS:%.$(O)=%.d) +-include $(CPP_OBJS:%.$(O)=%.d) +-include $(C_PIC_OBJS:%.$(O)=%.d) +-include $(CPP_PIC_OBJS:%.$(O)=%.d) +-include $(C_JET_SYM_OBJS:%.$(O)=%.d) +-include $(C_JET_OBJS:%.$(O)=%.d) +-include $(C_TESTLIB_OBJS:%.$(O)=%.d) +-include $(TESTS_OBJS:%.$(O)=%.d) +-include $(TESTS_CPP_OBJS:%.$(O)=%.d) +endif + +$(C_SYM_OBJS): $(objroot)src/%.sym.$(O): $(srcroot)src/%.c +$(C_SYM_OBJS): CPPFLAGS += -DJEMALLOC_NO_PRIVATE_NAMESPACE +$(C_SYMS): $(objroot)src/%.sym: $(objroot)src/%.sym.$(O) +$(C_OBJS): $(objroot)src/%.$(O): $(srcroot)src/%.c +$(CPP_OBJS): $(objroot)src/%.$(O): $(srcroot)src/%.cpp +$(C_PIC_OBJS): $(objroot)src/%.pic.$(O): $(srcroot)src/%.c +$(C_PIC_OBJS): CFLAGS += $(PIC_CFLAGS) +$(CPP_PIC_OBJS): $(objroot)src/%.pic.$(O): $(srcroot)src/%.cpp +$(CPP_PIC_OBJS): CXXFLAGS += $(PIC_CFLAGS) +$(C_JET_SYM_OBJS): $(objroot)src/%.jet.sym.$(O): $(srcroot)src/%.c +$(C_JET_SYM_OBJS): CPPFLAGS += -DJEMALLOC_JET -DJEMALLOC_NO_PRIVATE_NAMESPACE +$(C_JET_SYMS): $(objroot)src/%.jet.sym: $(objroot)src/%.jet.sym.$(O) +$(C_JET_OBJS): $(objroot)src/%.jet.$(O): $(srcroot)src/%.c +$(C_JET_OBJS): CPPFLAGS += -DJEMALLOC_JET +$(C_TESTLIB_UNIT_OBJS): $(objroot)test/src/%.unit.$(O): $(srcroot)test/src/%.c +$(C_TESTLIB_UNIT_OBJS): CPPFLAGS += -DJEMALLOC_UNIT_TEST +$(C_TESTLIB_INTEGRATION_OBJS): $(objroot)test/src/%.integration.$(O): $(srcroot)test/src/%.c +$(C_TESTLIB_INTEGRATION_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_TEST +$(C_UTIL_INTEGRATION_OBJS): $(objroot)src/%.integration.$(O): $(srcroot)src/%.c +$(C_TESTLIB_ANALYZE_OBJS): $(objroot)test/src/%.analyze.$(O): $(srcroot)test/src/%.c +$(C_TESTLIB_ANALYZE_OBJS): CPPFLAGS += -DJEMALLOC_ANALYZE_TEST +$(C_TESTLIB_STRESS_OBJS): $(objroot)test/src/%.stress.$(O): $(srcroot)test/src/%.c +$(C_TESTLIB_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST -DJEMALLOC_STRESS_TESTLIB +$(C_TESTLIB_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include +$(TESTS_UNIT_OBJS): CPPFLAGS += -DJEMALLOC_UNIT_TEST +$(TESTS_INTEGRATION_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_TEST +$(TESTS_INTEGRATION_CPP_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_CPP_TEST +$(TESTS_ANALYZE_OBJS): CPPFLAGS += -DJEMALLOC_ANALYZE_TEST +$(TESTS_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST +$(TESTS_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.c +$(TESTS_CPP_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.cpp +$(TESTS_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include +$(TESTS_CPP_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include +ifneq ($(IMPORTLIB),$(SO)) +$(CPP_OBJS) $(C_SYM_OBJS) $(C_OBJS) $(C_JET_SYM_OBJS) $(C_JET_OBJS): CPPFLAGS += -DDLLEXPORT +endif + +# Dependencies. +ifndef CC_MM +HEADER_DIRS = $(srcroot)include/jemalloc/internal \ + $(objroot)include/jemalloc $(objroot)include/jemalloc/internal +HEADERS = $(filter-out $(PRIVATE_NAMESPACE_HDRS),$(wildcard $(foreach dir,$(HEADER_DIRS),$(dir)/*.h))) +$(C_SYM_OBJS) $(C_OBJS) $(CPP_OBJS) $(C_PIC_OBJS) $(CPP_PIC_OBJS) $(C_JET_SYM_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS) $(TESTS_CPP_OBJS): $(HEADERS) +$(TESTS_OBJS) $(TESTS_CPP_OBJS): $(objroot)test/include/test/jemalloc_test.h +endif + +$(C_OBJS) $(CPP_OBJS) $(C_PIC_OBJS) $(CPP_PIC_OBJS) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_INTEGRATION_CPP_OBJS): $(objroot)include/jemalloc/internal/private_namespace.h +$(C_JET_OBJS) $(C_TESTLIB_UNIT_OBJS) $(C_TESTLIB_ANALYZE_OBJS) $(C_TESTLIB_STRESS_OBJS) $(TESTS_UNIT_OBJS) $(TESTS_ANALYZE_OBJS) $(TESTS_STRESS_OBJS): $(objroot)include/jemalloc/internal/private_namespace_jet.h + +$(C_SYM_OBJS) $(C_OBJS) $(C_PIC_OBJS) $(C_JET_SYM_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS): %.$(O): + @mkdir -p $(@D) + $(CC) $(CFLAGS) -c $(CPPFLAGS) $(CTARGET) $< +ifdef CC_MM + @$(CC) -MM $(CPPFLAGS) -MT $@ -o $(@:%.$(O)=%.d) $< +endif + +$(C_SYMS): %.sym: + @mkdir -p $(@D) + $(DUMP_SYMS) $< | $(AWK) -f $(objroot)include/jemalloc/internal/private_symbols.awk > $@ + +$(C_JET_SYMS): %.sym: + @mkdir -p $(@D) + $(DUMP_SYMS) $< | $(AWK) -f $(objroot)include/jemalloc/internal/private_symbols_jet.awk > $@ + +$(objroot)include/jemalloc/internal/private_namespace.gen.h: $(C_SYMS) + $(SHELL) $(srcroot)include/jemalloc/internal/private_namespace.sh $^ > $@ + +$(objroot)include/jemalloc/internal/private_namespace_jet.gen.h: $(C_JET_SYMS) + $(SHELL) $(srcroot)include/jemalloc/internal/private_namespace.sh $^ > $@ + +%.h: %.gen.h + @if ! `cmp -s $< $@` ; then echo "cp $< $@"; cp $< $@ ; fi + +$(CPP_OBJS) $(CPP_PIC_OBJS) $(TESTS_CPP_OBJS): %.$(O): + @mkdir -p $(@D) + $(CXX) $(CXXFLAGS) -c $(CPPFLAGS) $(CTARGET) $< +ifdef CC_MM + @$(CXX) -MM $(CPPFLAGS) -MT $@ -o $(@:%.$(O)=%.d) $< +endif + +ifneq ($(SOREV),$(SO)) +%.$(SO) : %.$(SOREV) + @mkdir -p $(@D) + ln -sf $( $(srcroot)config.stamp.in + +$(objroot)config.stamp : $(cfgoutputs_in) $(cfghdrs_in) $(srcroot)configure + ./$(objroot)config.status + @touch $@ + +# There must be some action in order for make to re-read Makefile when it is +# out of date. +$(cfgoutputs_out) $(cfghdrs_out) : $(objroot)config.stamp + @true +endif diff --git a/BeefRT/JEMalloc/README b/BeefRT/JEMalloc/README new file mode 100644 index 00000000..3a6e0d27 --- /dev/null +++ b/BeefRT/JEMalloc/README @@ -0,0 +1,20 @@ +jemalloc is a general purpose malloc(3) implementation that emphasizes +fragmentation avoidance and scalable concurrency support. jemalloc first came +into use as the FreeBSD libc allocator in 2005, and since then it has found its +way into numerous applications that rely on its predictable behavior. In 2010 +jemalloc development efforts broadened to include developer support features +such as heap profiling and extensive monitoring/tuning hooks. Modern jemalloc +releases continue to be integrated back into FreeBSD, and therefore versatility +remains critical. Ongoing development efforts trend toward making jemalloc +among the best allocators for a broad range of demanding applications, and +eliminating/mitigating weaknesses that have practical repercussions for real +world applications. + +The COPYING file contains copyright and licensing information. + +The INSTALL file contains information on how to configure, build, and install +jemalloc. + +The ChangeLog file contains a brief summary of changes for each release. + +URL: http://jemalloc.net/ diff --git a/BeefRT/JEMalloc/VERSION b/BeefRT/JEMalloc/VERSION new file mode 100644 index 00000000..1dcfea03 --- /dev/null +++ b/BeefRT/JEMalloc/VERSION @@ -0,0 +1 @@ +5.3.0-0-g54eaed1d8b56b1aa528be3bdd1877e59c56fa90c diff --git a/BeefRT/JEMalloc/autogen.sh b/BeefRT/JEMalloc/autogen.sh new file mode 100644 index 00000000..75f32da6 --- /dev/null +++ b/BeefRT/JEMalloc/autogen.sh @@ -0,0 +1,17 @@ +#!/bin/sh + +for i in autoconf; do + echo "$i" + $i + if [ $? -ne 0 ]; then + echo "Error $? in $i" + exit 1 + fi +done + +echo "./configure --enable-autogen $@" +./configure --enable-autogen $@ +if [ $? -ne 0 ]; then + echo "Error $? in ./configure" + exit 1 +fi diff --git a/BeefRT/JEMalloc/bin/jemalloc-config.in b/BeefRT/JEMalloc/bin/jemalloc-config.in new file mode 100644 index 00000000..80eca2e6 --- /dev/null +++ b/BeefRT/JEMalloc/bin/jemalloc-config.in @@ -0,0 +1,83 @@ +#!/bin/sh + +usage() { + cat < +Options: + --help | -h : Print usage. + --version : Print jemalloc version. + --revision : Print shared library revision number. + --config : Print configure options used to build jemalloc. + --prefix : Print installation directory prefix. + --bindir : Print binary installation directory. + --datadir : Print data installation directory. + --includedir : Print include installation directory. + --libdir : Print library installation directory. + --mandir : Print manual page installation directory. + --cc : Print compiler used to build jemalloc. + --cflags : Print compiler flags used to build jemalloc. + --cppflags : Print preprocessor flags used to build jemalloc. + --cxxflags : Print C++ compiler flags used to build jemalloc. + --ldflags : Print library flags used to build jemalloc. + --libs : Print libraries jemalloc was linked against. +EOF +} + +prefix="@prefix@" +exec_prefix="@exec_prefix@" + +case "$1" in +--help | -h) + usage + exit 0 + ;; +--version) + echo "@jemalloc_version@" + ;; +--revision) + echo "@rev@" + ;; +--config) + echo "@CONFIG@" + ;; +--prefix) + echo "@PREFIX@" + ;; +--bindir) + echo "@BINDIR@" + ;; +--datadir) + echo "@DATADIR@" + ;; +--includedir) + echo "@INCLUDEDIR@" + ;; +--libdir) + echo "@LIBDIR@" + ;; +--mandir) + echo "@MANDIR@" + ;; +--cc) + echo "@CC@" + ;; +--cflags) + echo "@CFLAGS@" + ;; +--cppflags) + echo "@CPPFLAGS@" + ;; +--cxxflags) + echo "@CXXFLAGS@" + ;; +--ldflags) + echo "@LDFLAGS@ @EXTRA_LDFLAGS@" + ;; +--libs) + echo "@LIBS@" + ;; +*) + usage + exit 1 +esac diff --git a/BeefRT/JEMalloc/bin/jemalloc.sh.in b/BeefRT/JEMalloc/bin/jemalloc.sh.in new file mode 100644 index 00000000..cdf36737 --- /dev/null +++ b/BeefRT/JEMalloc/bin/jemalloc.sh.in @@ -0,0 +1,9 @@ +#!/bin/sh + +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ + +@LD_PRELOAD_VAR@=${libdir}/libjemalloc.@SOREV@ +export @LD_PRELOAD_VAR@ +exec "$@" diff --git a/BeefRT/JEMalloc/bin/jeprof.in b/BeefRT/JEMalloc/bin/jeprof.in new file mode 100644 index 00000000..dbf6252b --- /dev/null +++ b/BeefRT/JEMalloc/bin/jeprof.in @@ -0,0 +1,5723 @@ +#! /usr/bin/env perl + +# Copyright (c) 1998-2007, Google Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# --- +# Program for printing the profile generated by common/profiler.cc, +# or by the heap profiler (common/debugallocation.cc) +# +# The profile contains a sequence of entries of the form: +# +# This program parses the profile, and generates user-readable +# output. +# +# Examples: +# +# % tools/jeprof "program" "profile" +# Enters "interactive" mode +# +# % tools/jeprof --text "program" "profile" +# Generates one line per procedure +# +# % tools/jeprof --gv "program" "profile" +# Generates annotated call-graph and displays via "gv" +# +# % tools/jeprof --gv --focus=Mutex "program" "profile" +# Restrict to code paths that involve an entry that matches "Mutex" +# +# % tools/jeprof --gv --focus=Mutex --ignore=string "program" "profile" +# Restrict to code paths that involve an entry that matches "Mutex" +# and does not match "string" +# +# % tools/jeprof --list=IBF_CheckDocid "program" "profile" +# Generates disassembly listing of all routines with at least one +# sample that match the --list= pattern. The listing is +# annotated with the flat and cumulative sample counts at each line. +# +# % tools/jeprof --disasm=IBF_CheckDocid "program" "profile" +# Generates disassembly listing of all routines with at least one +# sample that match the --disasm= pattern. The listing is +# annotated with the flat and cumulative sample counts at each PC value. +# +# TODO: Use color to indicate files? + +use strict; +use warnings; +use Getopt::Long; +use Cwd; + +my $JEPROF_VERSION = "@jemalloc_version@"; +my $PPROF_VERSION = "2.0"; + +# These are the object tools we use which can come from a +# user-specified location using --tools, from the JEPROF_TOOLS +# environment variable, or from the environment. +my %obj_tool_map = ( + "objdump" => "objdump", + "nm" => "nm", + "addr2line" => "addr2line", + "c++filt" => "c++filt", + ## ConfigureObjTools may add architecture-specific entries: + #"nm_pdb" => "nm-pdb", # for reading windows (PDB-format) executables + #"addr2line_pdb" => "addr2line-pdb", # ditto + #"otool" => "otool", # equivalent of objdump on OS X +); +# NOTE: these are lists, so you can put in commandline flags if you want. +my @DOT = ("dot"); # leave non-absolute, since it may be in /usr/local +my @GV = ("gv"); +my @EVINCE = ("evince"); # could also be xpdf or perhaps acroread +my @KCACHEGRIND = ("kcachegrind"); +my @PS2PDF = ("ps2pdf"); +# These are used for dynamic profiles +my @URL_FETCHER = ("curl", "-s", "--fail"); + +# These are the web pages that servers need to support for dynamic profiles +my $HEAP_PAGE = "/pprof/heap"; +my $PROFILE_PAGE = "/pprof/profile"; # must support cgi-param "?seconds=#" +my $PMUPROFILE_PAGE = "/pprof/pmuprofile(?:\\?.*)?"; # must support cgi-param + # ?seconds=#&event=x&period=n +my $GROWTH_PAGE = "/pprof/growth"; +my $CONTENTION_PAGE = "/pprof/contention"; +my $WALL_PAGE = "/pprof/wall(?:\\?.*)?"; # accepts options like namefilter +my $FILTEREDPROFILE_PAGE = "/pprof/filteredprofile(?:\\?.*)?"; +my $CENSUSPROFILE_PAGE = "/pprof/censusprofile(?:\\?.*)?"; # must support cgi-param + # "?seconds=#", + # "?tags_regexp=#" and + # "?type=#". +my $SYMBOL_PAGE = "/pprof/symbol"; # must support symbol lookup via POST +my $PROGRAM_NAME_PAGE = "/pprof/cmdline"; + +# These are the web pages that can be named on the command line. +# All the alternatives must begin with /. +my $PROFILES = "($HEAP_PAGE|$PROFILE_PAGE|$PMUPROFILE_PAGE|" . + "$GROWTH_PAGE|$CONTENTION_PAGE|$WALL_PAGE|" . + "$FILTEREDPROFILE_PAGE|$CENSUSPROFILE_PAGE)"; + +# default binary name +my $UNKNOWN_BINARY = "(unknown)"; + +# There is a pervasive dependency on the length (in hex characters, +# i.e., nibbles) of an address, distinguishing between 32-bit and +# 64-bit profiles. To err on the safe size, default to 64-bit here: +my $address_length = 16; + +my $dev_null = "/dev/null"; +if (! -e $dev_null && $^O =~ /MSWin/) { # $^O is the OS perl was built for + $dev_null = "nul"; +} + +# A list of paths to search for shared object files +my @prefix_list = (); + +# Special routine name that should not have any symbols. +# Used as separator to parse "addr2line -i" output. +my $sep_symbol = '_fini'; +my $sep_address = undef; + +##### Argument parsing ##### + +sub usage_string { + return < + is a space separated list of profile names. +jeprof [options] + is a list of profile files where each file contains + the necessary symbol mappings as well as profile data (likely generated + with --raw). +jeprof [options] + is a remote form. Symbols are obtained from host:port$SYMBOL_PAGE + + Each name can be: + /path/to/profile - a path to a profile file + host:port[/] - a location of a service to get profile from + + The / can be $HEAP_PAGE, $PROFILE_PAGE, /pprof/pmuprofile, + $GROWTH_PAGE, $CONTENTION_PAGE, /pprof/wall, + $CENSUSPROFILE_PAGE, or /pprof/filteredprofile. + For instance: + jeprof http://myserver.com:80$HEAP_PAGE + If / is omitted, the service defaults to $PROFILE_PAGE (cpu profiling). +jeprof --symbols + Maps addresses to symbol names. In this mode, stdin should be a + list of library mappings, in the same format as is found in the heap- + and cpu-profile files (this loosely matches that of /proc/self/maps + on linux), followed by a list of hex addresses to map, one per line. + + For more help with querying remote servers, including how to add the + necessary server-side support code, see this filename (or one like it): + + /usr/doc/gperftools-$PPROF_VERSION/pprof_remote_servers.html + +Options: + --cum Sort by cumulative data + --base= Subtract from before display + --interactive Run in interactive mode (interactive "help" gives help) [default] + --seconds= Length of time for dynamic profiles [default=30 secs] + --add_lib= Read additional symbols and line info from the given library + --lib_prefix= Comma separated list of library path prefixes + +Reporting Granularity: + --addresses Report at address level + --lines Report at source line level + --functions Report at function level [default] + --files Report at source file level + +Output type: + --text Generate text report + --callgrind Generate callgrind format to stdout + --gv Generate Postscript and display + --evince Generate PDF and display + --web Generate SVG and display + --list= Generate source listing of matching routines + --disasm= Generate disassembly of matching routines + --symbols Print demangled symbol names found at given addresses + --dot Generate DOT file to stdout + --ps Generate Postcript to stdout + --pdf Generate PDF to stdout + --svg Generate SVG to stdout + --gif Generate GIF to stdout + --raw Generate symbolized jeprof data (useful with remote fetch) + --collapsed Generate collapsed stacks for building flame graphs + (see http://www.brendangregg.com/flamegraphs.html) + +Heap-Profile Options: + --inuse_space Display in-use (mega)bytes [default] + --inuse_objects Display in-use objects + --alloc_space Display allocated (mega)bytes + --alloc_objects Display allocated objects + --show_bytes Display space in bytes + --drop_negative Ignore negative differences + +Contention-profile options: + --total_delay Display total delay at each region [default] + --contentions Display number of delays at each region + --mean_delay Display mean delay at each region + +Call-graph Options: + --nodecount= Show at most so many nodes [default=80] + --nodefraction= Hide nodes below *total [default=.005] + --edgefraction= Hide edges below *total [default=.001] + --maxdegree= Max incoming/outgoing edges per node [default=8] + --focus= Focus on backtraces with nodes matching + --thread= Show profile for thread + --ignore= Ignore backtraces with nodes matching + --scale= Set GV scaling [default=0] + --heapcheck Make nodes with non-0 object counts + (i.e. direct leak generators) more visible + --retain= Retain only nodes that match + --exclude= Exclude all nodes that match + +Miscellaneous: + --tools=[,...] \$PATH for object tool pathnames + --test Run unit tests + --help This message + --version Version information + --debug-syms-by-id (Linux only) Find debug symbol files by build ID as well as by name + +Environment Variables: + JEPROF_TMPDIR Profiles directory. Defaults to \$HOME/jeprof + JEPROF_TOOLS Prefix for object tools pathnames + +Examples: + +jeprof /bin/ls ls.prof + Enters "interactive" mode +jeprof --text /bin/ls ls.prof + Outputs one line per procedure +jeprof --web /bin/ls ls.prof + Displays annotated call-graph in web browser +jeprof --gv /bin/ls ls.prof + Displays annotated call-graph via 'gv' +jeprof --gv --focus=Mutex /bin/ls ls.prof + Restricts to code paths including a .*Mutex.* entry +jeprof --gv --focus=Mutex --ignore=string /bin/ls ls.prof + Code paths including Mutex but not string +jeprof --list=getdir /bin/ls ls.prof + (Per-line) annotated source listing for getdir() +jeprof --disasm=getdir /bin/ls ls.prof + (Per-PC) annotated disassembly for getdir() + +jeprof http://localhost:1234/ + Enters "interactive" mode +jeprof --text localhost:1234 + Outputs one line per procedure for localhost:1234 +jeprof --raw localhost:1234 > ./local.raw +jeprof --text ./local.raw + Fetches a remote profile for later analysis and then + analyzes it in text mode. +EOF +} + +sub version_string { + return < \$main::opt_help, + "version!" => \$main::opt_version, + "cum!" => \$main::opt_cum, + "base=s" => \$main::opt_base, + "seconds=i" => \$main::opt_seconds, + "add_lib=s" => \$main::opt_lib, + "lib_prefix=s" => \$main::opt_lib_prefix, + "functions!" => \$main::opt_functions, + "lines!" => \$main::opt_lines, + "addresses!" => \$main::opt_addresses, + "files!" => \$main::opt_files, + "text!" => \$main::opt_text, + "callgrind!" => \$main::opt_callgrind, + "list=s" => \$main::opt_list, + "disasm=s" => \$main::opt_disasm, + "symbols!" => \$main::opt_symbols, + "gv!" => \$main::opt_gv, + "evince!" => \$main::opt_evince, + "web!" => \$main::opt_web, + "dot!" => \$main::opt_dot, + "ps!" => \$main::opt_ps, + "pdf!" => \$main::opt_pdf, + "svg!" => \$main::opt_svg, + "gif!" => \$main::opt_gif, + "raw!" => \$main::opt_raw, + "collapsed!" => \$main::opt_collapsed, + "interactive!" => \$main::opt_interactive, + "nodecount=i" => \$main::opt_nodecount, + "nodefraction=f" => \$main::opt_nodefraction, + "edgefraction=f" => \$main::opt_edgefraction, + "maxdegree=i" => \$main::opt_maxdegree, + "focus=s" => \$main::opt_focus, + "thread=s" => \$main::opt_thread, + "ignore=s" => \$main::opt_ignore, + "scale=i" => \$main::opt_scale, + "heapcheck" => \$main::opt_heapcheck, + "retain=s" => \$main::opt_retain, + "exclude=s" => \$main::opt_exclude, + "inuse_space!" => \$main::opt_inuse_space, + "inuse_objects!" => \$main::opt_inuse_objects, + "alloc_space!" => \$main::opt_alloc_space, + "alloc_objects!" => \$main::opt_alloc_objects, + "show_bytes!" => \$main::opt_show_bytes, + "drop_negative!" => \$main::opt_drop_negative, + "total_delay!" => \$main::opt_total_delay, + "contentions!" => \$main::opt_contentions, + "mean_delay!" => \$main::opt_mean_delay, + "tools=s" => \$main::opt_tools, + "test!" => \$main::opt_test, + "debug!" => \$main::opt_debug, + "debug-syms-by-id!" => \$main::opt_debug_syms_by_id, + # Undocumented flags used only by unittests: + "test_stride=i" => \$main::opt_test_stride, + ) || usage("Invalid option(s)"); + + # Deal with the standard --help and --version + if ($main::opt_help) { + print usage_string(); + exit(0); + } + + if ($main::opt_version) { + print version_string(); + exit(0); + } + + # Disassembly/listing/symbols mode requires address-level info + if ($main::opt_disasm || $main::opt_list || $main::opt_symbols) { + $main::opt_functions = 0; + $main::opt_lines = 0; + $main::opt_addresses = 1; + $main::opt_files = 0; + } + + # Check heap-profiling flags + if ($main::opt_inuse_space + + $main::opt_inuse_objects + + $main::opt_alloc_space + + $main::opt_alloc_objects > 1) { + usage("Specify at most on of --inuse/--alloc options"); + } + + # Check output granularities + my $grains = + $main::opt_functions + + $main::opt_lines + + $main::opt_addresses + + $main::opt_files + + 0; + if ($grains > 1) { + usage("Only specify one output granularity option"); + } + if ($grains == 0) { + $main::opt_functions = 1; + } + + # Check output modes + my $modes = + $main::opt_text + + $main::opt_callgrind + + ($main::opt_list eq '' ? 0 : 1) + + ($main::opt_disasm eq '' ? 0 : 1) + + ($main::opt_symbols == 0 ? 0 : 1) + + $main::opt_gv + + $main::opt_evince + + $main::opt_web + + $main::opt_dot + + $main::opt_ps + + $main::opt_pdf + + $main::opt_svg + + $main::opt_gif + + $main::opt_raw + + $main::opt_collapsed + + $main::opt_interactive + + 0; + if ($modes > 1) { + usage("Only specify one output mode"); + } + if ($modes == 0) { + if (-t STDOUT) { # If STDOUT is a tty, activate interactive mode + $main::opt_interactive = 1; + } else { + $main::opt_text = 1; + } + } + + if ($main::opt_test) { + RunUnitTests(); + # Should not return + exit(1); + } + + # Binary name and profile arguments list + $main::prog = ""; + @main::pfile_args = (); + + # Remote profiling without a binary (using $SYMBOL_PAGE instead) + if (@ARGV > 0) { + if (IsProfileURL($ARGV[0])) { + $main::use_symbol_page = 1; + } elsif (IsSymbolizedProfileFile($ARGV[0])) { + $main::use_symbolized_profile = 1; + $main::prog = $UNKNOWN_BINARY; # will be set later from the profile file + } + } + + if ($main::use_symbol_page || $main::use_symbolized_profile) { + # We don't need a binary! + my %disabled = ('--lines' => $main::opt_lines, + '--disasm' => $main::opt_disasm); + for my $option (keys %disabled) { + usage("$option cannot be used without a binary") if $disabled{$option}; + } + # Set $main::prog later... + scalar(@ARGV) || usage("Did not specify profile file"); + } elsif ($main::opt_symbols) { + # --symbols needs a binary-name (to run nm on, etc) but not profiles + $main::prog = shift(@ARGV) || usage("Did not specify program"); + } else { + $main::prog = shift(@ARGV) || usage("Did not specify program"); + scalar(@ARGV) || usage("Did not specify profile file"); + } + + # Parse profile file/location arguments + foreach my $farg (@ARGV) { + if ($farg =~ m/(.*)\@([0-9]+)(|\/.*)$/ ) { + my $machine = $1; + my $num_machines = $2; + my $path = $3; + for (my $i = 0; $i < $num_machines; $i++) { + unshift(@main::pfile_args, "$i.$machine$path"); + } + } else { + unshift(@main::pfile_args, $farg); + } + } + + if ($main::use_symbol_page) { + unless (IsProfileURL($main::pfile_args[0])) { + error("The first profile should be a remote form to use $SYMBOL_PAGE\n"); + } + CheckSymbolPage(); + $main::prog = FetchProgramName(); + } elsif (!$main::use_symbolized_profile) { # may not need objtools! + ConfigureObjTools($main::prog) + } + + # Break the opt_lib_prefix into the prefix_list array + @prefix_list = split (',', $main::opt_lib_prefix); + + # Remove trailing / from the prefixes, in the list to prevent + # searching things like /my/path//lib/mylib.so + foreach (@prefix_list) { + s|/+$||; + } + + # Flag to prevent us from trying over and over to use + # elfutils if it's not installed (used only with + # --debug-syms-by-id option). + $main::gave_up_on_elfutils = 0; +} + +sub FilterAndPrint { + my ($profile, $symbols, $libs, $thread) = @_; + + # Get total data in profile + my $total = TotalProfile($profile); + + # Remove uniniteresting stack items + $profile = RemoveUninterestingFrames($symbols, $profile); + + # Focus? + if ($main::opt_focus ne '') { + $profile = FocusProfile($symbols, $profile, $main::opt_focus); + } + + # Ignore? + if ($main::opt_ignore ne '') { + $profile = IgnoreProfile($symbols, $profile, $main::opt_ignore); + } + + my $calls = ExtractCalls($symbols, $profile); + + # Reduce profiles to required output granularity, and also clean + # each stack trace so a given entry exists at most once. + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + # Print + if (!$main::opt_interactive) { + if ($main::opt_disasm) { + PrintDisassembly($libs, $flat, $cumulative, $main::opt_disasm); + } elsif ($main::opt_list) { + PrintListing($total, $libs, $flat, $cumulative, $main::opt_list, 0); + } elsif ($main::opt_text) { + # Make sure the output is empty when have nothing to report + # (only matters when --heapcheck is given but we must be + # compatible with old branches that did not pass --heapcheck always): + if ($total != 0) { + printf("Total%s: %s %s\n", + (defined($thread) ? " (t$thread)" : ""), + Unparse($total), Units()); + } + PrintText($symbols, $flat, $cumulative, -1); + } elsif ($main::opt_raw) { + PrintSymbolizedProfile($symbols, $profile, $main::prog); + } elsif ($main::opt_collapsed) { + PrintCollapsedStacks($symbols, $profile); + } elsif ($main::opt_callgrind) { + PrintCallgrind($calls); + } else { + if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { + if ($main::opt_gv) { + RunGV(TempName($main::next_tmpfile, "ps"), ""); + } elsif ($main::opt_evince) { + RunEvince(TempName($main::next_tmpfile, "pdf"), ""); + } elsif ($main::opt_web) { + my $tmp = TempName($main::next_tmpfile, "svg"); + RunWeb($tmp); + # The command we run might hand the file name off + # to an already running browser instance and then exit. + # Normally, we'd remove $tmp on exit (right now), + # but fork a child to remove $tmp a little later, so that the + # browser has time to load it first. + delete $main::tempnames{$tmp}; + if (fork() == 0) { + sleep 5; + unlink($tmp); + exit(0); + } + } + } else { + cleanup(); + exit(1); + } + } + } else { + InteractiveMode($profile, $symbols, $libs, $total); + } +} + +sub Main() { + Init(); + $main::collected_profile = undef; + @main::profile_files = (); + $main::op_time = time(); + + # Printing symbols is special and requires a lot less info that most. + if ($main::opt_symbols) { + PrintSymbols(*STDIN); # Get /proc/maps and symbols output from stdin + return; + } + + # Fetch all profile data + FetchDynamicProfiles(); + + # this will hold symbols that we read from the profile files + my $symbol_map = {}; + + # Read one profile, pick the last item on the list + my $data = ReadProfile($main::prog, pop(@main::profile_files)); + my $profile = $data->{profile}; + my $pcs = $data->{pcs}; + my $libs = $data->{libs}; # Info about main program and shared libraries + $symbol_map = MergeSymbols($symbol_map, $data->{symbols}); + + # Add additional profiles, if available. + if (scalar(@main::profile_files) > 0) { + foreach my $pname (@main::profile_files) { + my $data2 = ReadProfile($main::prog, $pname); + $profile = AddProfile($profile, $data2->{profile}); + $pcs = AddPcs($pcs, $data2->{pcs}); + $symbol_map = MergeSymbols($symbol_map, $data2->{symbols}); + } + } + + # Subtract base from profile, if specified + if ($main::opt_base ne '') { + my $base = ReadProfile($main::prog, $main::opt_base); + $profile = SubtractProfile($profile, $base->{profile}); + $pcs = AddPcs($pcs, $base->{pcs}); + $symbol_map = MergeSymbols($symbol_map, $base->{symbols}); + } + + # Collect symbols + my $symbols; + if ($main::use_symbolized_profile) { + $symbols = FetchSymbols($pcs, $symbol_map); + } elsif ($main::use_symbol_page) { + $symbols = FetchSymbols($pcs); + } else { + # TODO(csilvers): $libs uses the /proc/self/maps data from profile1, + # which may differ from the data from subsequent profiles, especially + # if they were run on different machines. Use appropriate libs for + # each pc somehow. + $symbols = ExtractSymbols($libs, $pcs); + } + + if (!defined($main::opt_thread)) { + FilterAndPrint($profile, $symbols, $libs); + } + if (defined($data->{threads})) { + foreach my $thread (sort { $a <=> $b } keys(%{$data->{threads}})) { + if (defined($main::opt_thread) && + ($main::opt_thread eq '*' || $main::opt_thread == $thread)) { + my $thread_profile = $data->{threads}{$thread}; + FilterAndPrint($thread_profile, $symbols, $libs, $thread); + } + } + } + + cleanup(); + exit(0); +} + +##### Entry Point ##### + +Main(); + +# Temporary code to detect if we're running on a Goobuntu system. +# These systems don't have the right stuff installed for the special +# Readline libraries to work, so as a temporary workaround, we default +# to using the normal stdio code, rather than the fancier readline-based +# code +sub ReadlineMightFail { + if (-e '/lib/libtermcap.so.2') { + return 0; # libtermcap exists, so readline should be okay + } else { + return 1; + } +} + +sub RunGV { + my $fname = shift; + my $bg = shift; # "" or " &" if we should run in background + if (!system(ShellEscape(@GV, "--version") . " >$dev_null 2>&1")) { + # Options using double dash are supported by this gv version. + # Also, turn on noantialias to better handle bug in gv for + # postscript files with large dimensions. + # TODO: Maybe we should not pass the --noantialias flag + # if the gv version is known to work properly without the flag. + system(ShellEscape(@GV, "--scale=$main::opt_scale", "--noantialias", $fname) + . $bg); + } else { + # Old gv version - only supports options that use single dash. + print STDERR ShellEscape(@GV, "-scale", $main::opt_scale) . "\n"; + system(ShellEscape(@GV, "-scale", "$main::opt_scale", $fname) . $bg); + } +} + +sub RunEvince { + my $fname = shift; + my $bg = shift; # "" or " &" if we should run in background + system(ShellEscape(@EVINCE, $fname) . $bg); +} + +sub RunWeb { + my $fname = shift; + print STDERR "Loading web page file:///$fname\n"; + + if (`uname` =~ /Darwin/) { + # OS X: open will use standard preference for SVG files. + system("/usr/bin/open", $fname); + return; + } + + # Some kind of Unix; try generic symlinks, then specific browsers. + # (Stop once we find one.) + # Works best if the browser is already running. + my @alt = ( + "/etc/alternatives/gnome-www-browser", + "/etc/alternatives/x-www-browser", + "google-chrome", + "firefox", + ); + foreach my $b (@alt) { + if (system($b, $fname) == 0) { + return; + } + } + + print STDERR "Could not load web browser.\n"; +} + +sub RunKcachegrind { + my $fname = shift; + my $bg = shift; # "" or " &" if we should run in background + print STDERR "Starting '@KCACHEGRIND " . $fname . $bg . "'\n"; + system(ShellEscape(@KCACHEGRIND, $fname) . $bg); +} + + +##### Interactive helper routines ##### + +sub InteractiveMode { + $| = 1; # Make output unbuffered for interactive mode + my ($orig_profile, $symbols, $libs, $total) = @_; + + print STDERR "Welcome to jeprof! For help, type 'help'.\n"; + + # Use ReadLine if it's installed and input comes from a console. + if ( -t STDIN && + !ReadlineMightFail() && + defined(eval {require Term::ReadLine}) ) { + my $term = new Term::ReadLine 'jeprof'; + while ( defined ($_ = $term->readline('(jeprof) '))) { + $term->addhistory($_) if /\S/; + if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) { + last; # exit when we get an interactive command to quit + } + } + } else { # don't have readline + while (1) { + print STDERR "(jeprof) "; + $_ = ; + last if ! defined $_ ; + s/\r//g; # turn windows-looking lines into unix-looking lines + + # Save some flags that might be reset by InteractiveCommand() + my $save_opt_lines = $main::opt_lines; + + if (!InteractiveCommand($orig_profile, $symbols, $libs, $total, $_)) { + last; # exit when we get an interactive command to quit + } + + # Restore flags + $main::opt_lines = $save_opt_lines; + } + } +} + +# Takes two args: orig profile, and command to run. +# Returns 1 if we should keep going, or 0 if we were asked to quit +sub InteractiveCommand { + my($orig_profile, $symbols, $libs, $total, $command) = @_; + $_ = $command; # just to make future m//'s easier + if (!defined($_)) { + print STDERR "\n"; + return 0; + } + if (m/^\s*quit/) { + return 0; + } + if (m/^\s*help/) { + InteractiveHelpMessage(); + return 1; + } + # Clear all the mode options -- mode is controlled by "$command" + $main::opt_text = 0; + $main::opt_callgrind = 0; + $main::opt_disasm = 0; + $main::opt_list = 0; + $main::opt_gv = 0; + $main::opt_evince = 0; + $main::opt_cum = 0; + + if (m/^\s*(text|top)(\d*)\s*(.*)/) { + $main::opt_text = 1; + + my $line_limit = ($2 ne "") ? int($2) : 10; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($3); + + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintText($symbols, $flat, $cumulative, $line_limit); + return 1; + } + if (m/^\s*callgrind\s*([^ \n]*)/) { + $main::opt_callgrind = 1; + + # Get derived profiles + my $calls = ExtractCalls($symbols, $orig_profile); + my $filename = $1; + if ( $1 eq '' ) { + $filename = TempName($main::next_tmpfile, "callgrind"); + } + PrintCallgrind($calls, $filename); + if ( $1 eq '' ) { + RunKcachegrind($filename, " & "); + $main::next_tmpfile++; + } + + return 1; + } + if (m/^\s*(web)?list\s*(.+)/) { + my $html = (defined($1) && ($1 eq "web")); + $main::opt_list = 1; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($2); + + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintListing($total, $libs, $flat, $cumulative, $routine, $html); + return 1; + } + if (m/^\s*disasm\s*(.+)/) { + $main::opt_disasm = 1; + + my $routine; + my $ignore; + ($routine, $ignore) = ParseInteractiveArgs($1); + + # Process current profile to account for various settings + my $profile = ProcessProfile($total, $orig_profile, $symbols, "", $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + PrintDisassembly($libs, $flat, $cumulative, $routine); + return 1; + } + if (m/^\s*(gv|web|evince)\s*(.*)/) { + $main::opt_gv = 0; + $main::opt_evince = 0; + $main::opt_web = 0; + if ($1 eq "gv") { + $main::opt_gv = 1; + } elsif ($1 eq "evince") { + $main::opt_evince = 1; + } elsif ($1 eq "web") { + $main::opt_web = 1; + } + + my $focus; + my $ignore; + ($focus, $ignore) = ParseInteractiveArgs($2); + + # Process current profile to account for various settings + my $profile = ProcessProfile($total, $orig_profile, $symbols, + $focus, $ignore); + my $reduced = ReduceProfile($symbols, $profile); + + # Get derived profiles + my $flat = FlatProfile($reduced); + my $cumulative = CumulativeProfile($reduced); + + if (PrintDot($main::prog, $symbols, $profile, $flat, $cumulative, $total)) { + if ($main::opt_gv) { + RunGV(TempName($main::next_tmpfile, "ps"), " &"); + } elsif ($main::opt_evince) { + RunEvince(TempName($main::next_tmpfile, "pdf"), " &"); + } elsif ($main::opt_web) { + RunWeb(TempName($main::next_tmpfile, "svg")); + } + $main::next_tmpfile++; + } + return 1; + } + if (m/^\s*$/) { + return 1; + } + print STDERR "Unknown command: try 'help'.\n"; + return 1; +} + + +sub ProcessProfile { + my $total_count = shift; + my $orig_profile = shift; + my $symbols = shift; + my $focus = shift; + my $ignore = shift; + + # Process current profile to account for various settings + my $profile = $orig_profile; + printf("Total: %s %s\n", Unparse($total_count), Units()); + if ($focus ne '') { + $profile = FocusProfile($symbols, $profile, $focus); + my $focus_count = TotalProfile($profile); + printf("After focusing on '%s': %s %s of %s (%0.1f%%)\n", + $focus, + Unparse($focus_count), Units(), + Unparse($total_count), ($focus_count*100.0) / $total_count); + } + if ($ignore ne '') { + $profile = IgnoreProfile($symbols, $profile, $ignore); + my $ignore_count = TotalProfile($profile); + printf("After ignoring '%s': %s %s of %s (%0.1f%%)\n", + $ignore, + Unparse($ignore_count), Units(), + Unparse($total_count), + ($ignore_count*100.0) / $total_count); + } + + return $profile; +} + +sub InteractiveHelpMessage { + print STDERR <{$k}; + my @addrs = split(/\n/, $k); + if ($#addrs >= 0) { + my $depth = $#addrs + 1; + # int(foo / 2**32) is the only reliable way to get rid of bottom + # 32 bits on both 32- and 64-bit systems. + print pack('L*', $count & 0xFFFFFFFF, int($count / 2**32)); + print pack('L*', $depth & 0xFFFFFFFF, int($depth / 2**32)); + + foreach my $full_addr (@addrs) { + my $addr = $full_addr; + $addr =~ s/0x0*//; # strip off leading 0x, zeroes + if (length($addr) > 16) { + print STDERR "Invalid address in profile: $full_addr\n"; + next; + } + my $low_addr = substr($addr, -8); # get last 8 hex chars + my $high_addr = substr($addr, -16, 8); # get up to 8 more hex chars + print pack('L*', hex('0x' . $low_addr), hex('0x' . $high_addr)); + } + } + } +} + +# Print symbols and profile data +sub PrintSymbolizedProfile { + my $symbols = shift; + my $profile = shift; + my $prog = shift; + + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + + print '--- ', $symbol_marker, "\n"; + if (defined($prog)) { + print 'binary=', $prog, "\n"; + } + while (my ($pc, $name) = each(%{$symbols})) { + my $sep = ' '; + print '0x', $pc; + # We have a list of function names, which include the inlined + # calls. They are separated (and terminated) by --, which is + # illegal in function names. + for (my $j = 2; $j <= $#{$name}; $j += 3) { + print $sep, $name->[$j]; + $sep = '--'; + } + print "\n"; + } + print '---', "\n"; + + my $profile_marker; + if ($main::profile_type eq 'heap') { + $HEAP_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } elsif ($main::profile_type eq 'growth') { + $GROWTH_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } elsif ($main::profile_type eq 'contention') { + $CONTENTION_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } else { # elsif ($main::profile_type eq 'cpu') + $PROFILE_PAGE =~ m,[^/]+$,; # matches everything after the last slash + $profile_marker = $&; + } + + print '--- ', $profile_marker, "\n"; + if (defined($main::collected_profile)) { + # if used with remote fetch, simply dump the collected profile to output. + open(SRC, "<$main::collected_profile"); + while () { + print $_; + } + close(SRC); + } else { + # --raw/http: For everything to work correctly for non-remote profiles, we + # would need to extend PrintProfileData() to handle all possible profile + # types, re-enable the code that is currently disabled in ReadCPUProfile() + # and FixCallerAddresses(), and remove the remote profile dumping code in + # the block above. + die "--raw/http: jeprof can only dump remote profiles for --raw\n"; + # dump a cpu-format profile to standard out + PrintProfileData($profile); + } +} + +# Print text output +sub PrintText { + my $symbols = shift; + my $flat = shift; + my $cumulative = shift; + my $line_limit = shift; + + my $total = TotalProfile($flat); + + # Which profile to sort by? + my $s = $main::opt_cum ? $cumulative : $flat; + + my $running_sum = 0; + my $lines = 0; + foreach my $k (sort { GetEntry($s, $b) <=> GetEntry($s, $a) || $a cmp $b } + keys(%{$cumulative})) { + my $f = GetEntry($flat, $k); + my $c = GetEntry($cumulative, $k); + $running_sum += $f; + + my $sym = $k; + if (exists($symbols->{$k})) { + $sym = $symbols->{$k}->[0] . " " . $symbols->{$k}->[1]; + if ($main::opt_addresses) { + $sym = $k . " " . $sym; + } + } + + if ($f != 0 || $c != 0) { + printf("%8s %6s %6s %8s %6s %s\n", + Unparse($f), + Percent($f, $total), + Percent($running_sum, $total), + Unparse($c), + Percent($c, $total), + $sym); + } + $lines++; + last if ($line_limit >= 0 && $lines >= $line_limit); + } +} + +# Callgrind format has a compression for repeated function and file +# names. You show the name the first time, and just use its number +# subsequently. This can cut down the file to about a third or a +# quarter of its uncompressed size. $key and $val are the key/value +# pair that would normally be printed by callgrind; $map is a map from +# value to number. +sub CompressedCGName { + my($key, $val, $map) = @_; + my $idx = $map->{$val}; + # For very short keys, providing an index hurts rather than helps. + if (length($val) <= 3) { + return "$key=$val\n"; + } elsif (defined($idx)) { + return "$key=($idx)\n"; + } else { + # scalar(keys $map) gives the number of items in the map. + $idx = scalar(keys(%{$map})) + 1; + $map->{$val} = $idx; + return "$key=($idx) $val\n"; + } +} + +# Print the call graph in a way that's suiteable for callgrind. +sub PrintCallgrind { + my $calls = shift; + my $filename; + my %filename_to_index_map; + my %fnname_to_index_map; + + if ($main::opt_interactive) { + $filename = shift; + print STDERR "Writing callgrind file to '$filename'.\n" + } else { + $filename = "&STDOUT"; + } + open(CG, ">$filename"); + printf CG ("events: Hits\n\n"); + foreach my $call ( map { $_->[0] } + sort { $a->[1] cmp $b ->[1] || + $a->[2] <=> $b->[2] } + map { /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/; + [$_, $1, $2] } + keys %$calls ) { + my $count = int($calls->{$call}); + $call =~ /([^:]+):(\d+):([^ ]+)( -> ([^:]+):(\d+):(.+))?/; + my ( $caller_file, $caller_line, $caller_function, + $callee_file, $callee_line, $callee_function ) = + ( $1, $2, $3, $5, $6, $7 ); + + # TODO(csilvers): for better compression, collect all the + # caller/callee_files and functions first, before printing + # anything, and only compress those referenced more than once. + printf CG CompressedCGName("fl", $caller_file, \%filename_to_index_map); + printf CG CompressedCGName("fn", $caller_function, \%fnname_to_index_map); + if (defined $6) { + printf CG CompressedCGName("cfl", $callee_file, \%filename_to_index_map); + printf CG CompressedCGName("cfn", $callee_function, \%fnname_to_index_map); + printf CG ("calls=$count $callee_line\n"); + } + printf CG ("$caller_line $count\n\n"); + } +} + +# Print disassembly for all all routines that match $main::opt_disasm +sub PrintDisassembly { + my $libs = shift; + my $flat = shift; + my $cumulative = shift; + my $disasm_opts = shift; + + my $total = TotalProfile($flat); + + foreach my $lib (@{$libs}) { + my $symbol_table = GetProcedureBoundaries($lib->[0], $disasm_opts); + my $offset = AddressSub($lib->[1], $lib->[3]); + foreach my $routine (sort ByName keys(%{$symbol_table})) { + my $start_addr = $symbol_table->{$routine}->[0]; + my $end_addr = $symbol_table->{$routine}->[1]; + # See if there are any samples in this routine + my $length = hex(AddressSub($end_addr, $start_addr)); + my $addr = AddressAdd($start_addr, $offset); + for (my $i = 0; $i < $length; $i++) { + if (defined($cumulative->{$addr})) { + PrintDisassembledFunction($lib->[0], $offset, + $routine, $flat, $cumulative, + $start_addr, $end_addr, $total); + last; + } + $addr = AddressInc($addr); + } + } + } +} + +# Return reference to array of tuples of the form: +# [start_address, filename, linenumber, instruction, limit_address] +# E.g., +# ["0x806c43d", "/foo/bar.cc", 131, "ret", "0x806c440"] +sub Disassemble { + my $prog = shift; + my $offset = shift; + my $start_addr = shift; + my $end_addr = shift; + + my $objdump = $obj_tool_map{"objdump"}; + my $cmd = ShellEscape($objdump, "-C", "-d", "-l", "--no-show-raw-insn", + "--start-address=0x$start_addr", + "--stop-address=0x$end_addr", $prog); + open(OBJDUMP, "$cmd |") || error("$cmd: $!\n"); + my @result = (); + my $filename = ""; + my $linenumber = -1; + my $last = ["", "", "", ""]; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + chop; + if (m|\s*([^:\s]+):(\d+)\s*$|) { + # Location line of the form: + # : + $filename = $1; + $linenumber = $2; + } elsif (m/^ +([0-9a-f]+):\s*(.*)/) { + # Disassembly line -- zero-extend address to full length + my $addr = HexExtend($1); + my $k = AddressAdd($addr, $offset); + $last->[4] = $k; # Store ending address for previous instruction + $last = [$k, $filename, $linenumber, $2, $end_addr]; + push(@result, $last); + } + } + close(OBJDUMP); + return @result; +} + +# The input file should contain lines of the form /proc/maps-like +# output (same format as expected from the profiles) or that looks +# like hex addresses (like "0xDEADBEEF"). We will parse all +# /proc/maps output, and for all the hex addresses, we will output +# "short" symbol names, one per line, in the same order as the input. +sub PrintSymbols { + my $maps_and_symbols_file = shift; + + # ParseLibraries expects pcs to be in a set. Fine by us... + my @pclist = (); # pcs in sorted order + my $pcs = {}; + my $map = ""; + foreach my $line (<$maps_and_symbols_file>) { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + if ($line =~ /\b(0x[0-9a-f]+)\b/i) { + push(@pclist, HexExtend($1)); + $pcs->{$pclist[-1]} = 1; + } else { + $map .= $line; + } + } + + my $libs = ParseLibraries($main::prog, $map, $pcs); + my $symbols = ExtractSymbols($libs, $pcs); + + foreach my $pc (@pclist) { + # ->[0] is the shortname, ->[2] is the full name + print(($symbols->{$pc}->[0] || "??") . "\n"); + } +} + + +# For sorting functions by name +sub ByName { + return ShortFunctionName($a) cmp ShortFunctionName($b); +} + +# Print source-listing for all all routines that match $list_opts +sub PrintListing { + my $total = shift; + my $libs = shift; + my $flat = shift; + my $cumulative = shift; + my $list_opts = shift; + my $html = shift; + + my $output = \*STDOUT; + my $fname = ""; + + if ($html) { + # Arrange to write the output to a temporary file + $fname = TempName($main::next_tmpfile, "html"); + $main::next_tmpfile++; + if (!open(TEMP, ">$fname")) { + print STDERR "$fname: $!\n"; + return; + } + $output = \*TEMP; + print $output HtmlListingHeader(); + printf $output ("
%s
Total: %s %s
\n", + $main::prog, Unparse($total), Units()); + } + + my $listed = 0; + foreach my $lib (@{$libs}) { + my $symbol_table = GetProcedureBoundaries($lib->[0], $list_opts); + my $offset = AddressSub($lib->[1], $lib->[3]); + foreach my $routine (sort ByName keys(%{$symbol_table})) { + # Print if there are any samples in this routine + my $start_addr = $symbol_table->{$routine}->[0]; + my $end_addr = $symbol_table->{$routine}->[1]; + my $length = hex(AddressSub($end_addr, $start_addr)); + my $addr = AddressAdd($start_addr, $offset); + for (my $i = 0; $i < $length; $i++) { + if (defined($cumulative->{$addr})) { + $listed += PrintSource( + $lib->[0], $offset, + $routine, $flat, $cumulative, + $start_addr, $end_addr, + $html, + $output); + last; + } + $addr = AddressInc($addr); + } + } + } + + if ($html) { + if ($listed > 0) { + print $output HtmlListingFooter(); + close($output); + RunWeb($fname); + } else { + close($output); + unlink($fname); + } + } +} + +sub HtmlListingHeader { + return <<'EOF'; + + + +Pprof listing + + + + +EOF +} + +sub HtmlListingFooter { + return <<'EOF'; + + +EOF +} + +sub HtmlEscape { + my $text = shift; + $text =~ s/&/&/g; + $text =~ s//>/g; + return $text; +} + +# Returns the indentation of the line, if it has any non-whitespace +# characters. Otherwise, returns -1. +sub Indentation { + my $line = shift; + if (m/^(\s*)\S/) { + return length($1); + } else { + return -1; + } +} + +# If the symbol table contains inlining info, Disassemble() may tag an +# instruction with a location inside an inlined function. But for +# source listings, we prefer to use the location in the function we +# are listing. So use MapToSymbols() to fetch full location +# information for each instruction and then pick out the first +# location from a location list (location list contains callers before +# callees in case of inlining). +# +# After this routine has run, each entry in $instructions contains: +# [0] start address +# [1] filename for function we are listing +# [2] line number for function we are listing +# [3] disassembly +# [4] limit address +# [5] most specific filename (may be different from [1] due to inlining) +# [6] most specific line number (may be different from [2] due to inlining) +sub GetTopLevelLineNumbers { + my ($lib, $offset, $instructions) = @_; + my $pcs = []; + for (my $i = 0; $i <= $#{$instructions}; $i++) { + push(@{$pcs}, $instructions->[$i]->[0]); + } + my $symbols = {}; + MapToSymbols($lib, $offset, $pcs, $symbols); + for (my $i = 0; $i <= $#{$instructions}; $i++) { + my $e = $instructions->[$i]; + push(@{$e}, $e->[1]); + push(@{$e}, $e->[2]); + my $addr = $e->[0]; + my $sym = $symbols->{$addr}; + if (defined($sym)) { + if ($#{$sym} >= 2 && $sym->[1] =~ m/^(.*):(\d+)$/) { + $e->[1] = $1; # File name + $e->[2] = $2; # Line number + } + } + } +} + +# Print source-listing for one routine +sub PrintSource { + my $prog = shift; + my $offset = shift; + my $routine = shift; + my $flat = shift; + my $cumulative = shift; + my $start_addr = shift; + my $end_addr = shift; + my $html = shift; + my $output = shift; + + # Disassemble all instructions (just to get line numbers) + my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr); + GetTopLevelLineNumbers($prog, $offset, \@instructions); + + # Hack 1: assume that the first source file encountered in the + # disassembly contains the routine + my $filename = undef; + for (my $i = 0; $i <= $#instructions; $i++) { + if ($instructions[$i]->[2] >= 0) { + $filename = $instructions[$i]->[1]; + last; + } + } + if (!defined($filename)) { + print STDERR "no filename found in $routine\n"; + return 0; + } + + # Hack 2: assume that the largest line number from $filename is the + # end of the procedure. This is typically safe since if P1 contains + # an inlined call to P2, then P2 usually occurs earlier in the + # source file. If this does not work, we might have to compute a + # density profile or just print all regions we find. + my $lastline = 0; + for (my $i = 0; $i <= $#instructions; $i++) { + my $f = $instructions[$i]->[1]; + my $l = $instructions[$i]->[2]; + if (($f eq $filename) && ($l > $lastline)) { + $lastline = $l; + } + } + + # Hack 3: assume the first source location from "filename" is the start of + # the source code. + my $firstline = 1; + for (my $i = 0; $i <= $#instructions; $i++) { + if ($instructions[$i]->[1] eq $filename) { + $firstline = $instructions[$i]->[2]; + last; + } + } + + # Hack 4: Extend last line forward until its indentation is less than + # the indentation we saw on $firstline + my $oldlastline = $lastline; + { + if (!open(FILE, "<$filename")) { + print STDERR "$filename: $!\n"; + return 0; + } + my $l = 0; + my $first_indentation = -1; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + $l++; + my $indent = Indentation($_); + if ($l >= $firstline) { + if ($first_indentation < 0 && $indent >= 0) { + $first_indentation = $indent; + last if ($first_indentation == 0); + } + } + if ($l >= $lastline && $indent >= 0) { + if ($indent >= $first_indentation) { + $lastline = $l+1; + } else { + last; + } + } + } + close(FILE); + } + + # Assign all samples to the range $firstline,$lastline, + # Hack 4: If an instruction does not occur in the range, its samples + # are moved to the next instruction that occurs in the range. + my $samples1 = {}; # Map from line number to flat count + my $samples2 = {}; # Map from line number to cumulative count + my $running1 = 0; # Unassigned flat counts + my $running2 = 0; # Unassigned cumulative counts + my $total1 = 0; # Total flat counts + my $total2 = 0; # Total cumulative counts + my %disasm = (); # Map from line number to disassembly + my $running_disasm = ""; # Unassigned disassembly + my $skip_marker = "---\n"; + if ($html) { + $skip_marker = ""; + for (my $l = $firstline; $l <= $lastline; $l++) { + $disasm{$l} = ""; + } + } + my $last_dis_filename = ''; + my $last_dis_linenum = -1; + my $last_touched_line = -1; # To detect gaps in disassembly for a line + foreach my $e (@instructions) { + # Add up counts for all address that fall inside this instruction + my $c1 = 0; + my $c2 = 0; + for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) { + $c1 += GetEntry($flat, $a); + $c2 += GetEntry($cumulative, $a); + } + + if ($html) { + my $dis = sprintf(" %6s %6s \t\t%8s: %s ", + HtmlPrintNumber($c1), + HtmlPrintNumber($c2), + UnparseAddress($offset, $e->[0]), + CleanDisassembly($e->[3])); + + # Append the most specific source line associated with this instruction + if (length($dis) < 80) { $dis .= (' ' x (80 - length($dis))) }; + $dis = HtmlEscape($dis); + my $f = $e->[5]; + my $l = $e->[6]; + if ($f ne $last_dis_filename) { + $dis .= sprintf("%s:%d", + HtmlEscape(CleanFileName($f)), $l); + } elsif ($l ne $last_dis_linenum) { + # De-emphasize the unchanged file name portion + $dis .= sprintf("%s" . + ":%d", + HtmlEscape(CleanFileName($f)), $l); + } else { + # De-emphasize the entire location + $dis .= sprintf("%s:%d", + HtmlEscape(CleanFileName($f)), $l); + } + $last_dis_filename = $f; + $last_dis_linenum = $l; + $running_disasm .= $dis; + $running_disasm .= "\n"; + } + + $running1 += $c1; + $running2 += $c2; + $total1 += $c1; + $total2 += $c2; + my $file = $e->[1]; + my $line = $e->[2]; + if (($file eq $filename) && + ($line >= $firstline) && + ($line <= $lastline)) { + # Assign all accumulated samples to this line + AddEntry($samples1, $line, $running1); + AddEntry($samples2, $line, $running2); + $running1 = 0; + $running2 = 0; + if ($html) { + if ($line != $last_touched_line && $disasm{$line} ne '') { + $disasm{$line} .= "\n"; + } + $disasm{$line} .= $running_disasm; + $running_disasm = ''; + $last_touched_line = $line; + } + } + } + + # Assign any leftover samples to $lastline + AddEntry($samples1, $lastline, $running1); + AddEntry($samples2, $lastline, $running2); + if ($html) { + if ($lastline != $last_touched_line && $disasm{$lastline} ne '') { + $disasm{$lastline} .= "\n"; + } + $disasm{$lastline} .= $running_disasm; + } + + if ($html) { + printf $output ( + "

%s

%s\n
\n" .
+      "Total:%6s %6s (flat / cumulative %s)\n",
+      HtmlEscape(ShortFunctionName($routine)),
+      HtmlEscape(CleanFileName($filename)),
+      Unparse($total1),
+      Unparse($total2),
+      Units());
+  } else {
+    printf $output (
+      "ROUTINE ====================== %s in %s\n" .
+      "%6s %6s Total %s (flat / cumulative)\n",
+      ShortFunctionName($routine),
+      CleanFileName($filename),
+      Unparse($total1),
+      Unparse($total2),
+      Units());
+  }
+  if (!open(FILE, "<$filename")) {
+    print STDERR "$filename: $!\n";
+    return 0;
+  }
+  my $l = 0;
+  while () {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    $l++;
+    if ($l >= $firstline - 5 &&
+        (($l <= $oldlastline + 5) || ($l <= $lastline))) {
+      chop;
+      my $text = $_;
+      if ($l == $firstline) { print $output $skip_marker; }
+      my $n1 = GetEntry($samples1, $l);
+      my $n2 = GetEntry($samples2, $l);
+      if ($html) {
+        # Emit a span that has one of the following classes:
+        #    livesrc -- has samples
+        #    deadsrc -- has disassembly, but with no samples
+        #    nop     -- has no matching disasembly
+        # Also emit an optional span containing disassembly.
+        my $dis = $disasm{$l};
+        my $asm = "";
+        if (defined($dis) && $dis ne '') {
+          $asm = "" . $dis . "";
+        }
+        my $source_class = (($n1 + $n2 > 0)
+                            ? "livesrc"
+                            : (($asm ne "") ? "deadsrc" : "nop"));
+        printf $output (
+          "%5d " .
+          "%6s %6s %s%s\n",
+          $l, $source_class,
+          HtmlPrintNumber($n1),
+          HtmlPrintNumber($n2),
+          HtmlEscape($text),
+          $asm);
+      } else {
+        printf $output(
+          "%6s %6s %4d: %s\n",
+          UnparseAlt($n1),
+          UnparseAlt($n2),
+          $l,
+          $text);
+      }
+      if ($l == $lastline)  { print $output $skip_marker; }
+    };
+  }
+  close(FILE);
+  if ($html) {
+    print $output "
\n"; + } + return 1; +} + +# Return the source line for the specified file/linenumber. +# Returns undef if not found. +sub SourceLine { + my $file = shift; + my $line = shift; + + # Look in cache + if (!defined($main::source_cache{$file})) { + if (100 < scalar keys(%main::source_cache)) { + # Clear the cache when it gets too big + $main::source_cache = (); + } + + # Read all lines from the file + if (!open(FILE, "<$file")) { + print STDERR "$file: $!\n"; + $main::source_cache{$file} = []; # Cache the negative result + return undef; + } + my $lines = []; + push(@{$lines}, ""); # So we can use 1-based line numbers as indices + while () { + push(@{$lines}, $_); + } + close(FILE); + + # Save the lines in the cache + $main::source_cache{$file} = $lines; + } + + my $lines = $main::source_cache{$file}; + if (($line < 0) || ($line > $#{$lines})) { + return undef; + } else { + return $lines->[$line]; + } +} + +# Print disassembly for one routine with interspersed source if available +sub PrintDisassembledFunction { + my $prog = shift; + my $offset = shift; + my $routine = shift; + my $flat = shift; + my $cumulative = shift; + my $start_addr = shift; + my $end_addr = shift; + my $total = shift; + + # Disassemble all instructions + my @instructions = Disassemble($prog, $offset, $start_addr, $end_addr); + + # Make array of counts per instruction + my @flat_count = (); + my @cum_count = (); + my $flat_total = 0; + my $cum_total = 0; + foreach my $e (@instructions) { + # Add up counts for all address that fall inside this instruction + my $c1 = 0; + my $c2 = 0; + for (my $a = $e->[0]; $a lt $e->[4]; $a = AddressInc($a)) { + $c1 += GetEntry($flat, $a); + $c2 += GetEntry($cumulative, $a); + } + push(@flat_count, $c1); + push(@cum_count, $c2); + $flat_total += $c1; + $cum_total += $c2; + } + + # Print header with total counts + printf("ROUTINE ====================== %s\n" . + "%6s %6s %s (flat, cumulative) %.1f%% of total\n", + ShortFunctionName($routine), + Unparse($flat_total), + Unparse($cum_total), + Units(), + ($cum_total * 100.0) / $total); + + # Process instructions in order + my $current_file = ""; + for (my $i = 0; $i <= $#instructions; ) { + my $e = $instructions[$i]; + + # Print the new file name whenever we switch files + if ($e->[1] ne $current_file) { + $current_file = $e->[1]; + my $fname = $current_file; + $fname =~ s|^\./||; # Trim leading "./" + + # Shorten long file names + if (length($fname) >= 58) { + $fname = "..." . substr($fname, -55); + } + printf("-------------------- %s\n", $fname); + } + + # TODO: Compute range of lines to print together to deal with + # small reorderings. + my $first_line = $e->[2]; + my $last_line = $first_line; + my %flat_sum = (); + my %cum_sum = (); + for (my $l = $first_line; $l <= $last_line; $l++) { + $flat_sum{$l} = 0; + $cum_sum{$l} = 0; + } + + # Find run of instructions for this range of source lines + my $first_inst = $i; + while (($i <= $#instructions) && + ($instructions[$i]->[2] >= $first_line) && + ($instructions[$i]->[2] <= $last_line)) { + $e = $instructions[$i]; + $flat_sum{$e->[2]} += $flat_count[$i]; + $cum_sum{$e->[2]} += $cum_count[$i]; + $i++; + } + my $last_inst = $i - 1; + + # Print source lines + for (my $l = $first_line; $l <= $last_line; $l++) { + my $line = SourceLine($current_file, $l); + if (!defined($line)) { + $line = "?\n"; + next; + } else { + $line =~ s/^\s+//; + } + printf("%6s %6s %5d: %s", + UnparseAlt($flat_sum{$l}), + UnparseAlt($cum_sum{$l}), + $l, + $line); + } + + # Print disassembly + for (my $x = $first_inst; $x <= $last_inst; $x++) { + my $e = $instructions[$x]; + printf("%6s %6s %8s: %6s\n", + UnparseAlt($flat_count[$x]), + UnparseAlt($cum_count[$x]), + UnparseAddress($offset, $e->[0]), + CleanDisassembly($e->[3])); + } + } +} + +# Print DOT graph +sub PrintDot { + my $prog = shift; + my $symbols = shift; + my $raw = shift; + my $flat = shift; + my $cumulative = shift; + my $overall_total = shift; + + # Get total + my $local_total = TotalProfile($flat); + my $nodelimit = int($main::opt_nodefraction * $local_total); + my $edgelimit = int($main::opt_edgefraction * $local_total); + my $nodecount = $main::opt_nodecount; + + # Find nodes to include + my @list = (sort { abs(GetEntry($cumulative, $b)) <=> + abs(GetEntry($cumulative, $a)) + || $a cmp $b } + keys(%{$cumulative})); + my $last = $nodecount - 1; + if ($last > $#list) { + $last = $#list; + } + while (($last >= 0) && + (abs(GetEntry($cumulative, $list[$last])) <= $nodelimit)) { + $last--; + } + if ($last < 0) { + print STDERR "No nodes to print\n"; + return 0; + } + + if ($nodelimit > 0 || $edgelimit > 0) { + printf STDERR ("Dropping nodes with <= %s %s; edges with <= %s abs(%s)\n", + Unparse($nodelimit), Units(), + Unparse($edgelimit), Units()); + } + + # Open DOT output file + my $output; + my $escaped_dot = ShellEscape(@DOT); + my $escaped_ps2pdf = ShellEscape(@PS2PDF); + if ($main::opt_gv) { + my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "ps")); + $output = "| $escaped_dot -Tps2 >$escaped_outfile"; + } elsif ($main::opt_evince) { + my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "pdf")); + $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - $escaped_outfile"; + } elsif ($main::opt_ps) { + $output = "| $escaped_dot -Tps2"; + } elsif ($main::opt_pdf) { + $output = "| $escaped_dot -Tps2 | $escaped_ps2pdf - -"; + } elsif ($main::opt_web || $main::opt_svg) { + # We need to post-process the SVG, so write to a temporary file always. + my $escaped_outfile = ShellEscape(TempName($main::next_tmpfile, "svg")); + $output = "| $escaped_dot -Tsvg >$escaped_outfile"; + } elsif ($main::opt_gif) { + $output = "| $escaped_dot -Tgif"; + } else { + $output = ">&STDOUT"; + } + open(DOT, $output) || error("$output: $!\n"); + + # Title + printf DOT ("digraph \"%s; %s %s\" {\n", + $prog, + Unparse($overall_total), + Units()); + if ($main::opt_pdf) { + # The output is more printable if we set the page size for dot. + printf DOT ("size=\"8,11\"\n"); + } + printf DOT ("node [width=0.375,height=0.25];\n"); + + # Print legend + printf DOT ("Legend [shape=box,fontsize=24,shape=plaintext," . + "label=\"%s\\l%s\\l%s\\l%s\\l%s\\l\"];\n", + $prog, + sprintf("Total %s: %s", Units(), Unparse($overall_total)), + sprintf("Focusing on: %s", Unparse($local_total)), + sprintf("Dropped nodes with <= %s abs(%s)", + Unparse($nodelimit), Units()), + sprintf("Dropped edges with <= %s %s", + Unparse($edgelimit), Units()) + ); + + # Print nodes + my %node = (); + my $nextnode = 1; + foreach my $a (@list[0..$last]) { + # Pick font size + my $f = GetEntry($flat, $a); + my $c = GetEntry($cumulative, $a); + + my $fs = 8; + if ($local_total > 0) { + $fs = 8 + (50.0 * sqrt(abs($f * 1.0 / $local_total))); + } + + $node{$a} = $nextnode++; + my $sym = $a; + $sym =~ s/\s+/\\n/g; + $sym =~ s/::/\\n/g; + + # Extra cumulative info to print for non-leaves + my $extra = ""; + if ($f != $c) { + $extra = sprintf("\\rof %s (%s)", + Unparse($c), + Percent($c, $local_total)); + } + my $style = ""; + if ($main::opt_heapcheck) { + if ($f > 0) { + # make leak-causing nodes more visible (add a background) + $style = ",style=filled,fillcolor=gray" + } elsif ($f < 0) { + # make anti-leak-causing nodes (which almost never occur) + # stand out as well (triple border) + $style = ",peripheries=3" + } + } + + printf DOT ("N%d [label=\"%s\\n%s (%s)%s\\r" . + "\",shape=box,fontsize=%.1f%s];\n", + $node{$a}, + $sym, + Unparse($f), + Percent($f, $local_total), + $extra, + $fs, + $style, + ); + } + + # Get edges and counts per edge + my %edge = (); + my $n; + my $fullname_to_shortname_map = {}; + FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map); + foreach my $k (keys(%{$raw})) { + # TODO: omit low %age edges + $n = $raw->{$k}; + my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k); + for (my $i = 1; $i <= $#translated; $i++) { + my $src = $translated[$i]; + my $dst = $translated[$i-1]; + #next if ($src eq $dst); # Avoid self-edges? + if (exists($node{$src}) && exists($node{$dst})) { + my $edge_label = "$src\001$dst"; + if (!exists($edge{$edge_label})) { + $edge{$edge_label} = 0; + } + $edge{$edge_label} += $n; + } + } + } + + # Print edges (process in order of decreasing counts) + my %indegree = (); # Number of incoming edges added per node so far + my %outdegree = (); # Number of outgoing edges added per node so far + foreach my $e (sort { $edge{$b} <=> $edge{$a} } keys(%edge)) { + my @x = split(/\001/, $e); + $n = $edge{$e}; + + # Initialize degree of kept incoming and outgoing edges if necessary + my $src = $x[0]; + my $dst = $x[1]; + if (!exists($outdegree{$src})) { $outdegree{$src} = 0; } + if (!exists($indegree{$dst})) { $indegree{$dst} = 0; } + + my $keep; + if ($indegree{$dst} == 0) { + # Keep edge if needed for reachability + $keep = 1; + } elsif (abs($n) <= $edgelimit) { + # Drop if we are below --edgefraction + $keep = 0; + } elsif ($outdegree{$src} >= $main::opt_maxdegree || + $indegree{$dst} >= $main::opt_maxdegree) { + # Keep limited number of in/out edges per node + $keep = 0; + } else { + $keep = 1; + } + + if ($keep) { + $outdegree{$src}++; + $indegree{$dst}++; + + # Compute line width based on edge count + my $fraction = abs($local_total ? (3 * ($n / $local_total)) : 0); + if ($fraction > 1) { $fraction = 1; } + my $w = $fraction * 2; + if ($w < 1 && ($main::opt_web || $main::opt_svg)) { + # SVG output treats line widths < 1 poorly. + $w = 1; + } + + # Dot sometimes segfaults if given edge weights that are too large, so + # we cap the weights at a large value + my $edgeweight = abs($n) ** 0.7; + if ($edgeweight > 100000) { $edgeweight = 100000; } + $edgeweight = int($edgeweight); + + my $style = sprintf("setlinewidth(%f)", $w); + if ($x[1] =~ m/\(inline\)/) { + $style .= ",dashed"; + } + + # Use a slightly squashed function of the edge count as the weight + printf DOT ("N%s -> N%s [label=%s, weight=%d, style=\"%s\"];\n", + $node{$x[0]}, + $node{$x[1]}, + Unparse($n), + $edgeweight, + $style); + } + } + + print DOT ("}\n"); + close(DOT); + + if ($main::opt_web || $main::opt_svg) { + # Rewrite SVG to be more usable inside web browser. + RewriteSvg(TempName($main::next_tmpfile, "svg")); + } + + return 1; +} + +sub RewriteSvg { + my $svgfile = shift; + + open(SVG, $svgfile) || die "open temp svg: $!"; + my @svg = ; + close(SVG); + unlink $svgfile; + my $svg = join('', @svg); + + # Dot's SVG output is + # + # + # + # ... + # + # + # + # Change it to + # + # + # $svg_javascript + # + # + # ... + # + # + # + + # Fix width, height; drop viewBox. + $svg =~ s/(?s) above first + my $svg_javascript = SvgJavascript(); + my $viewport = "\n"; + $svg =~ s/ above . + $svg =~ s/(.*)(<\/svg>)/$1<\/g>$2/; + $svg =~ s/$svgfile") || die "open $svgfile: $!"; + print SVG $svg; + close(SVG); + } +} + +sub SvgJavascript { + return <<'EOF'; + +EOF +} + +# Provides a map from fullname to shortname for cases where the +# shortname is ambiguous. The symlist has both the fullname and +# shortname for all symbols, which is usually fine, but sometimes -- +# such as overloaded functions -- two different fullnames can map to +# the same shortname. In that case, we use the address of the +# function to disambiguate the two. This function fills in a map that +# maps fullnames to modified shortnames in such cases. If a fullname +# is not present in the map, the 'normal' shortname provided by the +# symlist is the appropriate one to use. +sub FillFullnameToShortnameMap { + my $symbols = shift; + my $fullname_to_shortname_map = shift; + my $shortnames_seen_once = {}; + my $shortnames_seen_more_than_once = {}; + + foreach my $symlist (values(%{$symbols})) { + # TODO(csilvers): deal with inlined symbols too. + my $shortname = $symlist->[0]; + my $fullname = $symlist->[2]; + if ($fullname !~ /<[0-9a-fA-F]+>$/) { # fullname doesn't end in an address + next; # the only collisions we care about are when addresses differ + } + if (defined($shortnames_seen_once->{$shortname}) && + $shortnames_seen_once->{$shortname} ne $fullname) { + $shortnames_seen_more_than_once->{$shortname} = 1; + } else { + $shortnames_seen_once->{$shortname} = $fullname; + } + } + + foreach my $symlist (values(%{$symbols})) { + my $shortname = $symlist->[0]; + my $fullname = $symlist->[2]; + # TODO(csilvers): take in a list of addresses we care about, and only + # store in the map if $symlist->[1] is in that list. Saves space. + next if defined($fullname_to_shortname_map->{$fullname}); + if (defined($shortnames_seen_more_than_once->{$shortname})) { + if ($fullname =~ /<0*([^>]*)>$/) { # fullname has address at end of it + $fullname_to_shortname_map->{$fullname} = "$shortname\@$1"; + } + } + } +} + +# Return a small number that identifies the argument. +# Multiple calls with the same argument will return the same number. +# Calls with different arguments will return different numbers. +sub ShortIdFor { + my $key = shift; + my $id = $main::uniqueid{$key}; + if (!defined($id)) { + $id = keys(%main::uniqueid) + 1; + $main::uniqueid{$key} = $id; + } + return $id; +} + +# Translate a stack of addresses into a stack of symbols +sub TranslateStack { + my $symbols = shift; + my $fullname_to_shortname_map = shift; + my $k = shift; + + my @addrs = split(/\n/, $k); + my @result = (); + for (my $i = 0; $i <= $#addrs; $i++) { + my $a = $addrs[$i]; + + # Skip large addresses since they sometimes show up as fake entries on RH9 + if (length($a) > 8 && $a gt "7fffffffffffffff") { + next; + } + + if ($main::opt_disasm || $main::opt_list) { + # We want just the address for the key + push(@result, $a); + next; + } + + my $symlist = $symbols->{$a}; + if (!defined($symlist)) { + $symlist = [$a, "", $a]; + } + + # We can have a sequence of symbols for a particular entry + # (more than one symbol in the case of inlining). Callers + # come before callees in symlist, so walk backwards since + # the translated stack should contain callees before callers. + for (my $j = $#{$symlist}; $j >= 2; $j -= 3) { + my $func = $symlist->[$j-2]; + my $fileline = $symlist->[$j-1]; + my $fullfunc = $symlist->[$j]; + if (defined($fullname_to_shortname_map->{$fullfunc})) { + $func = $fullname_to_shortname_map->{$fullfunc}; + } + if ($j > 2) { + $func = "$func (inline)"; + } + + # Do not merge nodes corresponding to Callback::Run since that + # causes confusing cycles in dot display. Instead, we synthesize + # a unique name for this frame per caller. + if ($func =~ m/Callback.*::Run$/) { + my $caller = ($i > 0) ? $addrs[$i-1] : 0; + $func = "Run#" . ShortIdFor($caller); + } + + if ($main::opt_addresses) { + push(@result, "$a $func $fileline"); + } elsif ($main::opt_lines) { + if ($func eq '??' && $fileline eq '??:0') { + push(@result, "$a"); + } else { + push(@result, "$func $fileline"); + } + } elsif ($main::opt_functions) { + if ($func eq '??') { + push(@result, "$a"); + } else { + push(@result, $func); + } + } elsif ($main::opt_files) { + if ($fileline eq '??:0' || $fileline eq '') { + push(@result, "$a"); + } else { + my $f = $fileline; + $f =~ s/:\d+$//; + push(@result, $f); + } + } else { + push(@result, $a); + last; # Do not print inlined info + } + } + } + + # print join(",", @addrs), " => ", join(",", @result), "\n"; + return @result; +} + +# Generate percent string for a number and a total +sub Percent { + my $num = shift; + my $tot = shift; + if ($tot != 0) { + return sprintf("%.1f%%", $num * 100.0 / $tot); + } else { + return ($num == 0) ? "nan" : (($num > 0) ? "+inf" : "-inf"); + } +} + +# Generate pretty-printed form of number +sub Unparse { + my $num = shift; + if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { + if ($main::opt_inuse_objects || $main::opt_alloc_objects) { + return sprintf("%d", $num); + } else { + if ($main::opt_show_bytes) { + return sprintf("%d", $num); + } else { + return sprintf("%.1f", $num / 1048576.0); + } + } + } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) { + return sprintf("%.3f", $num / 1e9); # Convert nanoseconds to seconds + } else { + return sprintf("%d", $num); + } +} + +# Alternate pretty-printed form: 0 maps to "." +sub UnparseAlt { + my $num = shift; + if ($num == 0) { + return "."; + } else { + return Unparse($num); + } +} + +# Alternate pretty-printed form: 0 maps to "" +sub HtmlPrintNumber { + my $num = shift; + if ($num == 0) { + return ""; + } else { + return Unparse($num); + } +} + +# Return output units +sub Units { + if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { + if ($main::opt_inuse_objects || $main::opt_alloc_objects) { + return "objects"; + } else { + if ($main::opt_show_bytes) { + return "B"; + } else { + return "MB"; + } + } + } elsif ($main::profile_type eq 'contention' && !$main::opt_contentions) { + return "seconds"; + } else { + return "samples"; + } +} + +##### Profile manipulation code ##### + +# Generate flattened profile: +# If count is charged to stack [a,b,c,d], in generated profile, +# it will be charged to [a] +sub FlatProfile { + my $profile = shift; + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + if ($#addrs >= 0) { + AddEntry($result, $addrs[0], $count); + } + } + return $result; +} + +# Generate cumulative profile: +# If count is charged to stack [a,b,c,d], in generated profile, +# it will be charged to [a], [b], [c], [d] +sub CumulativeProfile { + my $profile = shift; + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + foreach my $a (@addrs) { + AddEntry($result, $a, $count); + } + } + return $result; +} + +# If the second-youngest PC on the stack is always the same, returns +# that pc. Otherwise, returns undef. +sub IsSecondPcAlwaysTheSame { + my $profile = shift; + + my $second_pc = undef; + foreach my $k (keys(%{$profile})) { + my @addrs = split(/\n/, $k); + if ($#addrs < 1) { + return undef; + } + if (not defined $second_pc) { + $second_pc = $addrs[1]; + } else { + if ($second_pc ne $addrs[1]) { + return undef; + } + } + } + return $second_pc; +} + +sub ExtractSymbolNameInlineStack { + my $symbols = shift; + my $address = shift; + + my @stack = (); + + if (exists $symbols->{$address}) { + my @localinlinestack = @{$symbols->{$address}}; + for (my $i = $#localinlinestack; $i > 0; $i-=3) { + my $file = $localinlinestack[$i-1]; + my $fn = $localinlinestack[$i-0]; + + if ($file eq "?" || $file eq ":0") { + $file = "??:0"; + } + if ($fn eq '??') { + # If we can't get the symbol name, at least use the file information. + $fn = $file; + } + my $suffix = "[inline]"; + if ($i == 2) { + $suffix = ""; + } + push (@stack, $fn.$suffix); + } + } + else { + # If we can't get a symbol name, at least fill in the address. + push (@stack, $address); + } + + return @stack; +} + +sub ExtractSymbolLocation { + my $symbols = shift; + my $address = shift; + # 'addr2line' outputs "??:0" for unknown locations; we do the + # same to be consistent. + my $location = "??:0:unknown"; + if (exists $symbols->{$address}) { + my $file = $symbols->{$address}->[1]; + if ($file eq "?") { + $file = "??:0" + } + $location = $file . ":" . $symbols->{$address}->[0]; + } + return $location; +} + +# Extracts a graph of calls. +sub ExtractCalls { + my $symbols = shift; + my $profile = shift; + + my $calls = {}; + while( my ($stack_trace, $count) = each %$profile ) { + my @address = split(/\n/, $stack_trace); + my $destination = ExtractSymbolLocation($symbols, $address[0]); + AddEntry($calls, $destination, $count); + for (my $i = 1; $i <= $#address; $i++) { + my $source = ExtractSymbolLocation($symbols, $address[$i]); + my $call = "$source -> $destination"; + AddEntry($calls, $call, $count); + $destination = $source; + } + } + + return $calls; +} + +sub FilterFrames { + my $symbols = shift; + my $profile = shift; + + if ($main::opt_retain eq '' && $main::opt_exclude eq '') { + return $profile; + } + + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + my @path = (); + foreach my $a (@addrs) { + my $sym; + if (exists($symbols->{$a})) { + $sym = $symbols->{$a}->[0]; + } else { + $sym = $a; + } + if ($main::opt_retain ne '' && $sym !~ m/$main::opt_retain/) { + next; + } + if ($main::opt_exclude ne '' && $sym =~ m/$main::opt_exclude/) { + next; + } + push(@path, $a); + } + if (scalar(@path) > 0) { + my $reduced_path = join("\n", @path); + AddEntry($result, $reduced_path, $count); + } + } + + return $result; +} + +sub PrintCollapsedStacks { + my $symbols = shift; + my $profile = shift; + + while (my ($stack_trace, $count) = each %$profile) { + my @address = split(/\n/, $stack_trace); + my @names = reverse ( map { ExtractSymbolNameInlineStack($symbols, $_) } @address ); + printf("%s %d\n", join(";", @names), $count); + } +} + +sub RemoveUninterestingFrames { + my $symbols = shift; + my $profile = shift; + + # List of function names to skip + my %skip = (); + my $skip_regexp = 'NOMATCH'; + if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') { + foreach my $name ('@JEMALLOC_PREFIX@calloc', + 'cfree', + '@JEMALLOC_PREFIX@malloc', + 'newImpl', + 'void* newImpl', + '@JEMALLOC_PREFIX@free', + '@JEMALLOC_PREFIX@memalign', + '@JEMALLOC_PREFIX@posix_memalign', + '@JEMALLOC_PREFIX@aligned_alloc', + 'pvalloc', + '@JEMALLOC_PREFIX@valloc', + '@JEMALLOC_PREFIX@realloc', + '@JEMALLOC_PREFIX@mallocx', + '@JEMALLOC_PREFIX@rallocx', + '@JEMALLOC_PREFIX@xallocx', + '@JEMALLOC_PREFIX@dallocx', + '@JEMALLOC_PREFIX@sdallocx', + '@JEMALLOC_PREFIX@sdallocx_noflags', + 'tc_calloc', + 'tc_cfree', + 'tc_malloc', + 'tc_free', + 'tc_memalign', + 'tc_posix_memalign', + 'tc_pvalloc', + 'tc_valloc', + 'tc_realloc', + 'tc_new', + 'tc_delete', + 'tc_newarray', + 'tc_deletearray', + 'tc_new_nothrow', + 'tc_newarray_nothrow', + 'do_malloc', + '::do_malloc', # new name -- got moved to an unnamed ns + '::do_malloc_or_cpp_alloc', + 'DoSampledAllocation', + 'simple_alloc::allocate', + '__malloc_alloc_template::allocate', + '__builtin_delete', + '__builtin_new', + '__builtin_vec_delete', + '__builtin_vec_new', + 'operator new', + 'operator new[]', + # The entry to our memory-allocation routines on OS X + 'malloc_zone_malloc', + 'malloc_zone_calloc', + 'malloc_zone_valloc', + 'malloc_zone_realloc', + 'malloc_zone_memalign', + 'malloc_zone_free', + # These mark the beginning/end of our custom sections + '__start_google_malloc', + '__stop_google_malloc', + '__start_malloc_hook', + '__stop_malloc_hook') { + $skip{$name} = 1; + $skip{"_" . $name} = 1; # Mach (OS X) adds a _ prefix to everything + } + # TODO: Remove TCMalloc once everything has been + # moved into the tcmalloc:: namespace and we have flushed + # old code out of the system. + $skip_regexp = "TCMalloc|^tcmalloc::"; + } elsif ($main::profile_type eq 'contention') { + foreach my $vname ('base::RecordLockProfileData', + 'base::SubmitMutexProfileData', + 'base::SubmitSpinLockProfileData', + 'Mutex::Unlock', + 'Mutex::UnlockSlow', + 'Mutex::ReaderUnlock', + 'MutexLock::~MutexLock', + 'SpinLock::Unlock', + 'SpinLock::SlowUnlock', + 'SpinLockHolder::~SpinLockHolder') { + $skip{$vname} = 1; + } + } elsif ($main::profile_type eq 'cpu') { + # Drop signal handlers used for CPU profile collection + # TODO(dpeng): this should not be necessary; it's taken + # care of by the general 2nd-pc mechanism below. + foreach my $name ('ProfileData::Add', # historical + 'ProfileData::prof_handler', # historical + 'CpuProfiler::prof_handler', + '__FRAME_END__', + '__pthread_sighandler', + '__restore') { + $skip{$name} = 1; + } + } else { + # Nothing skipped for unknown types + } + + if ($main::profile_type eq 'cpu') { + # If all the second-youngest program counters are the same, + # this STRONGLY suggests that it is an artifact of measurement, + # i.e., stack frames pushed by the CPU profiler signal handler. + # Hence, we delete them. + # (The topmost PC is read from the signal structure, not from + # the stack, so it does not get involved.) + while (my $second_pc = IsSecondPcAlwaysTheSame($profile)) { + my $result = {}; + my $func = ''; + if (exists($symbols->{$second_pc})) { + $second_pc = $symbols->{$second_pc}->[0]; + } + print STDERR "Removing $second_pc from all stack traces.\n"; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + splice @addrs, 1, 1; + my $reduced_path = join("\n", @addrs); + AddEntry($result, $reduced_path, $count); + } + $profile = $result; + } + } + + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + my @path = (); + foreach my $a (@addrs) { + if (exists($symbols->{$a})) { + my $func = $symbols->{$a}->[0]; + if ($skip{$func} || ($func =~ m/$skip_regexp/)) { + # Throw away the portion of the backtrace seen so far, under the + # assumption that previous frames were for functions internal to the + # allocator. + @path = (); + next; + } + } + push(@path, $a); + } + my $reduced_path = join("\n", @path); + AddEntry($result, $reduced_path, $count); + } + + $result = FilterFrames($symbols, $result); + + return $result; +} + +# Reduce profile to granularity given by user +sub ReduceProfile { + my $symbols = shift; + my $profile = shift; + my $result = {}; + my $fullname_to_shortname_map = {}; + FillFullnameToShortnameMap($symbols, $fullname_to_shortname_map); + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @translated = TranslateStack($symbols, $fullname_to_shortname_map, $k); + my @path = (); + my %seen = (); + $seen{''} = 1; # So that empty keys are skipped + foreach my $e (@translated) { + # To avoid double-counting due to recursion, skip a stack-trace + # entry if it has already been seen + if (!$seen{$e}) { + $seen{$e} = 1; + push(@path, $e); + } + } + my $reduced_path = join("\n", @path); + AddEntry($result, $reduced_path, $count); + } + return $result; +} + +# Does the specified symbol array match the regexp? +sub SymbolMatches { + my $sym = shift; + my $re = shift; + if (defined($sym)) { + for (my $i = 0; $i < $#{$sym}; $i += 3) { + if ($sym->[$i] =~ m/$re/ || $sym->[$i+1] =~ m/$re/) { + return 1; + } + } + } + return 0; +} + +# Focus only on paths involving specified regexps +sub FocusProfile { + my $symbols = shift; + my $profile = shift; + my $focus = shift; + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + foreach my $a (@addrs) { + # Reply if it matches either the address/shortname/fileline + if (($a =~ m/$focus/) || SymbolMatches($symbols->{$a}, $focus)) { + AddEntry($result, $k, $count); + last; + } + } + } + return $result; +} + +# Focus only on paths not involving specified regexps +sub IgnoreProfile { + my $symbols = shift; + my $profile = shift; + my $ignore = shift; + my $result = {}; + foreach my $k (keys(%{$profile})) { + my $count = $profile->{$k}; + my @addrs = split(/\n/, $k); + my $matched = 0; + foreach my $a (@addrs) { + # Reply if it matches either the address/shortname/fileline + if (($a =~ m/$ignore/) || SymbolMatches($symbols->{$a}, $ignore)) { + $matched = 1; + last; + } + } + if (!$matched) { + AddEntry($result, $k, $count); + } + } + return $result; +} + +# Get total count in profile +sub TotalProfile { + my $profile = shift; + my $result = 0; + foreach my $k (keys(%{$profile})) { + $result += $profile->{$k}; + } + return $result; +} + +# Add A to B +sub AddProfile { + my $A = shift; + my $B = shift; + + my $R = {}; + # add all keys in A + foreach my $k (keys(%{$A})) { + my $v = $A->{$k}; + AddEntry($R, $k, $v); + } + # add all keys in B + foreach my $k (keys(%{$B})) { + my $v = $B->{$k}; + AddEntry($R, $k, $v); + } + return $R; +} + +# Merges symbol maps +sub MergeSymbols { + my $A = shift; + my $B = shift; + + my $R = {}; + foreach my $k (keys(%{$A})) { + $R->{$k} = $A->{$k}; + } + if (defined($B)) { + foreach my $k (keys(%{$B})) { + $R->{$k} = $B->{$k}; + } + } + return $R; +} + + +# Add A to B +sub AddPcs { + my $A = shift; + my $B = shift; + + my $R = {}; + # add all keys in A + foreach my $k (keys(%{$A})) { + $R->{$k} = 1 + } + # add all keys in B + foreach my $k (keys(%{$B})) { + $R->{$k} = 1 + } + return $R; +} + +# Subtract B from A +sub SubtractProfile { + my $A = shift; + my $B = shift; + + my $R = {}; + foreach my $k (keys(%{$A})) { + my $v = $A->{$k} - GetEntry($B, $k); + if ($v < 0 && $main::opt_drop_negative) { + $v = 0; + } + AddEntry($R, $k, $v); + } + if (!$main::opt_drop_negative) { + # Take care of when subtracted profile has more entries + foreach my $k (keys(%{$B})) { + if (!exists($A->{$k})) { + AddEntry($R, $k, 0 - $B->{$k}); + } + } + } + return $R; +} + +# Get entry from profile; zero if not present +sub GetEntry { + my $profile = shift; + my $k = shift; + if (exists($profile->{$k})) { + return $profile->{$k}; + } else { + return 0; + } +} + +# Add entry to specified profile +sub AddEntry { + my $profile = shift; + my $k = shift; + my $n = shift; + if (!exists($profile->{$k})) { + $profile->{$k} = 0; + } + $profile->{$k} += $n; +} + +# Add a stack of entries to specified profile, and add them to the $pcs +# list. +sub AddEntries { + my $profile = shift; + my $pcs = shift; + my $stack = shift; + my $count = shift; + my @k = (); + + foreach my $e (split(/\s+/, $stack)) { + my $pc = HexExtend($e); + $pcs->{$pc} = 1; + push @k, $pc; + } + AddEntry($profile, (join "\n", @k), $count); +} + +##### Code to profile a server dynamically ##### + +sub CheckSymbolPage { + my $url = SymbolPageURL(); + my $command = ShellEscape(@URL_FETCHER, $url); + open(SYMBOL, "$command |") or error($command); + my $line = ; + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + close(SYMBOL); + unless (defined($line)) { + error("$url doesn't exist\n"); + } + + if ($line =~ /^num_symbols:\s+(\d+)$/) { + if ($1 == 0) { + error("Stripped binary. No symbols available.\n"); + } + } else { + error("Failed to get the number of symbols from $url\n"); + } +} + +sub IsProfileURL { + my $profile_name = shift; + if (-f $profile_name) { + printf STDERR "Using local file $profile_name.\n"; + return 0; + } + return 1; +} + +sub ParseProfileURL { + my $profile_name = shift; + + if (!defined($profile_name) || $profile_name eq "") { + return (); + } + + # Split profile URL - matches all non-empty strings, so no test. + $profile_name =~ m,^(https?://)?([^/]+)(.*?)(/|$PROFILES)?$,; + + my $proto = $1 || "http://"; + my $hostport = $2; + my $prefix = $3; + my $profile = $4 || "/"; + + my $host = $hostport; + $host =~ s/:.*//; + + my $baseurl = "$proto$hostport$prefix"; + return ($host, $baseurl, $profile); +} + +# We fetch symbols from the first profile argument. +sub SymbolPageURL { + my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]); + return "$baseURL$SYMBOL_PAGE"; +} + +sub FetchProgramName() { + my ($host, $baseURL, $path) = ParseProfileURL($main::pfile_args[0]); + my $url = "$baseURL$PROGRAM_NAME_PAGE"; + my $command_line = ShellEscape(@URL_FETCHER, $url); + open(CMDLINE, "$command_line |") or error($command_line); + my $cmdline = ; + $cmdline =~ s/\r//g; # turn windows-looking lines into unix-looking lines + close(CMDLINE); + error("Failed to get program name from $url\n") unless defined($cmdline); + $cmdline =~ s/\x00.+//; # Remove argv[1] and latters. + $cmdline =~ s!\n!!g; # Remove LFs. + return $cmdline; +} + +# Gee, curl's -L (--location) option isn't reliable at least +# with its 7.12.3 version. Curl will forget to post data if +# there is a redirection. This function is a workaround for +# curl. Redirection happens on borg hosts. +sub ResolveRedirectionForCurl { + my $url = shift; + my $command_line = ShellEscape(@URL_FETCHER, "--head", $url); + open(CMDLINE, "$command_line |") or error($command_line); + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + if (/^Location: (.*)/) { + $url = $1; + } + } + close(CMDLINE); + return $url; +} + +# Add a timeout flat to URL_FETCHER. Returns a new list. +sub AddFetchTimeout { + my $timeout = shift; + my @fetcher = @_; + if (defined($timeout)) { + if (join(" ", @fetcher) =~ m/\bcurl -s/) { + push(@fetcher, "--max-time", sprintf("%d", $timeout)); + } elsif (join(" ", @fetcher) =~ m/\brpcget\b/) { + push(@fetcher, sprintf("--deadline=%d", $timeout)); + } + } + return @fetcher; +} + +# Reads a symbol map from the file handle name given as $1, returning +# the resulting symbol map. Also processes variables relating to symbols. +# Currently, the only variable processed is 'binary=' which updates +# $main::prog to have the correct program name. +sub ReadSymbols { + my $in = shift; + my $map = {}; + while (<$in>) { + s/\r//g; # turn windows-looking lines into unix-looking lines + # Removes all the leading zeroes from the symbols, see comment below. + if (m/^0x0*([0-9a-f]+)\s+(.+)/) { + $map->{$1} = $2; + } elsif (m/^---/) { + last; + } elsif (m/^([a-z][^=]*)=(.*)$/ ) { + my ($variable, $value) = ($1, $2); + for ($variable, $value) { + s/^\s+//; + s/\s+$//; + } + if ($variable eq "binary") { + if ($main::prog ne $UNKNOWN_BINARY && $main::prog ne $value) { + printf STDERR ("Warning: Mismatched binary name '%s', using '%s'.\n", + $main::prog, $value); + } + $main::prog = $value; + } else { + printf STDERR ("Ignoring unknown variable in symbols list: " . + "'%s' = '%s'\n", $variable, $value); + } + } + } + return $map; +} + +sub URLEncode { + my $str = shift; + $str =~ s/([^A-Za-z0-9\-_.!~*'()])/ sprintf "%%%02x", ord $1 /eg; + return $str; +} + +sub AppendSymbolFilterParams { + my $url = shift; + my @params = (); + if ($main::opt_retain ne '') { + push(@params, sprintf("retain=%s", URLEncode($main::opt_retain))); + } + if ($main::opt_exclude ne '') { + push(@params, sprintf("exclude=%s", URLEncode($main::opt_exclude))); + } + if (scalar @params > 0) { + $url = sprintf("%s?%s", $url, join("&", @params)); + } + return $url; +} + +# Fetches and processes symbols to prepare them for use in the profile output +# code. If the optional 'symbol_map' arg is not given, fetches symbols from +# $SYMBOL_PAGE for all PC values found in profile. Otherwise, the raw symbols +# are assumed to have already been fetched into 'symbol_map' and are simply +# extracted and processed. +sub FetchSymbols { + my $pcset = shift; + my $symbol_map = shift; + + my %seen = (); + my @pcs = grep { !$seen{$_}++ } keys(%$pcset); # uniq + + if (!defined($symbol_map)) { + my $post_data = join("+", sort((map {"0x" . "$_"} @pcs))); + + open(POSTFILE, ">$main::tmpfile_sym"); + print POSTFILE $post_data; + close(POSTFILE); + + my $url = SymbolPageURL(); + + my $command_line; + if (join(" ", @URL_FETCHER) =~ m/\bcurl -s/) { + $url = ResolveRedirectionForCurl($url); + $url = AppendSymbolFilterParams($url); + $command_line = ShellEscape(@URL_FETCHER, "-d", "\@$main::tmpfile_sym", + $url); + } else { + $url = AppendSymbolFilterParams($url); + $command_line = (ShellEscape(@URL_FETCHER, "--post", $url) + . " < " . ShellEscape($main::tmpfile_sym)); + } + # We use c++filt in case $SYMBOL_PAGE gives us mangled symbols. + my $escaped_cppfilt = ShellEscape($obj_tool_map{"c++filt"}); + open(SYMBOL, "$command_line | $escaped_cppfilt |") or error($command_line); + $symbol_map = ReadSymbols(*SYMBOL{IO}); + close(SYMBOL); + } + + my $symbols = {}; + foreach my $pc (@pcs) { + my $fullname; + # For 64 bits binaries, symbols are extracted with 8 leading zeroes. + # Then /symbol reads the long symbols in as uint64, and outputs + # the result with a "0x%08llx" format which get rid of the zeroes. + # By removing all the leading zeroes in both $pc and the symbols from + # /symbol, the symbols match and are retrievable from the map. + my $shortpc = $pc; + $shortpc =~ s/^0*//; + # Each line may have a list of names, which includes the function + # and also other functions it has inlined. They are separated (in + # PrintSymbolizedProfile), by --, which is illegal in function names. + my $fullnames; + if (defined($symbol_map->{$shortpc})) { + $fullnames = $symbol_map->{$shortpc}; + } else { + $fullnames = "0x" . $pc; # Just use addresses + } + my $sym = []; + $symbols->{$pc} = $sym; + foreach my $fullname (split("--", $fullnames)) { + my $name = ShortFunctionName($fullname); + push(@{$sym}, $name, "?", $fullname); + } + } + return $symbols; +} + +sub BaseName { + my $file_name = shift; + $file_name =~ s!^.*/!!; # Remove directory name + return $file_name; +} + +sub MakeProfileBaseName { + my ($binary_name, $profile_name) = @_; + my ($host, $baseURL, $path) = ParseProfileURL($profile_name); + my $binary_shortname = BaseName($binary_name); + return sprintf("%s.%s.%s", + $binary_shortname, $main::op_time, $host); +} + +sub FetchDynamicProfile { + my $binary_name = shift; + my $profile_name = shift; + my $fetch_name_only = shift; + my $encourage_patience = shift; + + if (!IsProfileURL($profile_name)) { + return $profile_name; + } else { + my ($host, $baseURL, $path) = ParseProfileURL($profile_name); + if ($path eq "" || $path eq "/") { + # Missing type specifier defaults to cpu-profile + $path = $PROFILE_PAGE; + } + + my $profile_file = MakeProfileBaseName($binary_name, $profile_name); + + my $url = "$baseURL$path"; + my $fetch_timeout = undef; + if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE/) { + if ($path =~ m/[?]/) { + $url .= "&"; + } else { + $url .= "?"; + } + $url .= sprintf("seconds=%d", $main::opt_seconds); + $fetch_timeout = $main::opt_seconds * 1.01 + 60; + # Set $profile_type for consumption by PrintSymbolizedProfile. + $main::profile_type = 'cpu'; + } else { + # For non-CPU profiles, we add a type-extension to + # the target profile file name. + my $suffix = $path; + $suffix =~ s,/,.,g; + $profile_file .= $suffix; + # Set $profile_type for consumption by PrintSymbolizedProfile. + if ($path =~ m/$HEAP_PAGE/) { + $main::profile_type = 'heap'; + } elsif ($path =~ m/$GROWTH_PAGE/) { + $main::profile_type = 'growth'; + } elsif ($path =~ m/$CONTENTION_PAGE/) { + $main::profile_type = 'contention'; + } + } + + my $profile_dir = $ENV{"JEPROF_TMPDIR"} || ($ENV{HOME} . "/jeprof"); + if (! -d $profile_dir) { + mkdir($profile_dir) + || die("Unable to create profile directory $profile_dir: $!\n"); + } + my $tmp_profile = "$profile_dir/.tmp.$profile_file"; + my $real_profile = "$profile_dir/$profile_file"; + + if ($fetch_name_only > 0) { + return $real_profile; + } + + my @fetcher = AddFetchTimeout($fetch_timeout, @URL_FETCHER); + my $cmd = ShellEscape(@fetcher, $url) . " > " . ShellEscape($tmp_profile); + if ($path =~ m/$PROFILE_PAGE|$PMUPROFILE_PAGE|$CENSUSPROFILE_PAGE/){ + print STDERR "Gathering CPU profile from $url for $main::opt_seconds seconds to\n ${real_profile}\n"; + if ($encourage_patience) { + print STDERR "Be patient...\n"; + } + } else { + print STDERR "Fetching $path profile from $url to\n ${real_profile}\n"; + } + + (system($cmd) == 0) || error("Failed to get profile: $cmd: $!\n"); + (system("mv", $tmp_profile, $real_profile) == 0) || error("Unable to rename profile\n"); + print STDERR "Wrote profile to $real_profile\n"; + $main::collected_profile = $real_profile; + return $main::collected_profile; + } +} + +# Collect profiles in parallel +sub FetchDynamicProfiles { + my $items = scalar(@main::pfile_args); + my $levels = log($items) / log(2); + + if ($items == 1) { + $main::profile_files[0] = FetchDynamicProfile($main::prog, $main::pfile_args[0], 0, 1); + } else { + # math rounding issues + if ((2 ** $levels) < $items) { + $levels++; + } + my $count = scalar(@main::pfile_args); + for (my $i = 0; $i < $count; $i++) { + $main::profile_files[$i] = FetchDynamicProfile($main::prog, $main::pfile_args[$i], 1, 0); + } + print STDERR "Fetching $count profiles, Be patient...\n"; + FetchDynamicProfilesRecurse($levels, 0, 0); + $main::collected_profile = join(" \\\n ", @main::profile_files); + } +} + +# Recursively fork a process to get enough processes +# collecting profiles +sub FetchDynamicProfilesRecurse { + my $maxlevel = shift; + my $level = shift; + my $position = shift; + + if (my $pid = fork()) { + $position = 0 | ($position << 1); + TryCollectProfile($maxlevel, $level, $position); + wait; + } else { + $position = 1 | ($position << 1); + TryCollectProfile($maxlevel, $level, $position); + cleanup(); + exit(0); + } +} + +# Collect a single profile +sub TryCollectProfile { + my $maxlevel = shift; + my $level = shift; + my $position = shift; + + if ($level >= ($maxlevel - 1)) { + if ($position < scalar(@main::pfile_args)) { + FetchDynamicProfile($main::prog, $main::pfile_args[$position], 0, 0); + } + } else { + FetchDynamicProfilesRecurse($maxlevel, $level+1, $position); + } +} + +##### Parsing code ##### + +# Provide a small streaming-read module to handle very large +# cpu-profile files. Stream in chunks along a sliding window. +# Provides an interface to get one 'slot', correctly handling +# endian-ness differences. A slot is one 32-bit or 64-bit word +# (depending on the input profile). We tell endianness and bit-size +# for the profile by looking at the first 8 bytes: in cpu profiles, +# the second slot is always 3 (we'll accept anything that's not 0). +BEGIN { + package CpuProfileStream; + + sub new { + my ($class, $file, $fname) = @_; + my $self = { file => $file, + base => 0, + stride => 512 * 1024, # must be a multiple of bitsize/8 + slots => [], + unpack_code => "", # N for big-endian, V for little + perl_is_64bit => 1, # matters if profile is 64-bit + }; + bless $self, $class; + # Let unittests adjust the stride + if ($main::opt_test_stride > 0) { + $self->{stride} = $main::opt_test_stride; + } + # Read the first two slots to figure out bitsize and endianness. + my $slots = $self->{slots}; + my $str; + read($self->{file}, $str, 8); + # Set the global $address_length based on what we see here. + # 8 is 32-bit (8 hexadecimal chars); 16 is 64-bit (16 hexadecimal chars). + $address_length = ($str eq (chr(0)x8)) ? 16 : 8; + if ($address_length == 8) { + if (substr($str, 6, 2) eq chr(0)x2) { + $self->{unpack_code} = 'V'; # Little-endian. + } elsif (substr($str, 4, 2) eq chr(0)x2) { + $self->{unpack_code} = 'N'; # Big-endian + } else { + ::error("$fname: header size >= 2**16\n"); + } + @$slots = unpack($self->{unpack_code} . "*", $str); + } else { + # If we're a 64-bit profile, check if we're a 64-bit-capable + # perl. Otherwise, each slot will be represented as a float + # instead of an int64, losing precision and making all the + # 64-bit addresses wrong. We won't complain yet, but will + # later if we ever see a value that doesn't fit in 32 bits. + my $has_q = 0; + eval { $has_q = pack("Q", "1") ? 1 : 1; }; + if (!$has_q) { + $self->{perl_is_64bit} = 0; + } + read($self->{file}, $str, 8); + if (substr($str, 4, 4) eq chr(0)x4) { + # We'd love to use 'Q', but it's a) not universal, b) not endian-proof. + $self->{unpack_code} = 'V'; # Little-endian. + } elsif (substr($str, 0, 4) eq chr(0)x4) { + $self->{unpack_code} = 'N'; # Big-endian + } else { + ::error("$fname: header size >= 2**32\n"); + } + my @pair = unpack($self->{unpack_code} . "*", $str); + # Since we know one of the pair is 0, it's fine to just add them. + @$slots = (0, $pair[0] + $pair[1]); + } + return $self; + } + + # Load more data when we access slots->get(X) which is not yet in memory. + sub overflow { + my ($self) = @_; + my $slots = $self->{slots}; + $self->{base} += $#$slots + 1; # skip over data we're replacing + my $str; + read($self->{file}, $str, $self->{stride}); + if ($address_length == 8) { # the 32-bit case + # This is the easy case: unpack provides 32-bit unpacking primitives. + @$slots = unpack($self->{unpack_code} . "*", $str); + } else { + # We need to unpack 32 bits at a time and combine. + my @b32_values = unpack($self->{unpack_code} . "*", $str); + my @b64_values = (); + for (my $i = 0; $i < $#b32_values; $i += 2) { + # TODO(csilvers): if this is a 32-bit perl, the math below + # could end up in a too-large int, which perl will promote + # to a double, losing necessary precision. Deal with that. + # Right now, we just die. + my ($lo, $hi) = ($b32_values[$i], $b32_values[$i+1]); + if ($self->{unpack_code} eq 'N') { # big-endian + ($lo, $hi) = ($hi, $lo); + } + my $value = $lo + $hi * (2**32); + if (!$self->{perl_is_64bit} && # check value is exactly represented + (($value % (2**32)) != $lo || int($value / (2**32)) != $hi)) { + ::error("Need a 64-bit perl to process this 64-bit profile.\n"); + } + push(@b64_values, $value); + } + @$slots = @b64_values; + } + } + + # Access the i-th long in the file (logically), or -1 at EOF. + sub get { + my ($self, $idx) = @_; + my $slots = $self->{slots}; + while ($#$slots >= 0) { + if ($idx < $self->{base}) { + # The only time we expect a reference to $slots[$i - something] + # after referencing $slots[$i] is reading the very first header. + # Since $stride > |header|, that shouldn't cause any lookback + # errors. And everything after the header is sequential. + print STDERR "Unexpected look-back reading CPU profile"; + return -1; # shrug, don't know what better to return + } elsif ($idx > $self->{base} + $#$slots) { + $self->overflow(); + } else { + return $slots->[$idx - $self->{base}]; + } + } + # If we get here, $slots is [], which means we've reached EOF + return -1; # unique since slots is supposed to hold unsigned numbers + } +} + +# Reads the top, 'header' section of a profile, and returns the last +# line of the header, commonly called a 'header line'. The header +# section of a profile consists of zero or more 'command' lines that +# are instructions to jeprof, which jeprof executes when reading the +# header. All 'command' lines start with a %. After the command +# lines is the 'header line', which is a profile-specific line that +# indicates what type of profile it is, and perhaps other global +# information about the profile. For instance, here's a header line +# for a heap profile: +# heap profile: 53: 38236 [ 5525: 1284029] @ heapprofile +# For historical reasons, the CPU profile does not contain a text- +# readable header line. If the profile looks like a CPU profile, +# this function returns "". If no header line could be found, this +# function returns undef. +# +# The following commands are recognized: +# %warn -- emit the rest of this line to stderr, prefixed by 'WARNING:' +# +# The input file should be in binmode. +sub ReadProfileHeader { + local *PROFILE = shift; + my $firstchar = ""; + my $line = ""; + read(PROFILE, $firstchar, 1); + seek(PROFILE, -1, 1); # unread the firstchar + if ($firstchar !~ /[[:print:]]/) { # is not a text character + return ""; + } + while (defined($line = )) { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + if ($line =~ /^%warn\s+(.*)/) { # 'warn' command + # Note this matches both '%warn blah\n' and '%warn\n'. + print STDERR "WARNING: $1\n"; # print the rest of the line + } elsif ($line =~ /^%/) { + print STDERR "Ignoring unknown command from profile header: $line"; + } else { + # End of commands, must be the header line. + return $line; + } + } + return undef; # got to EOF without seeing a header line +} + +sub IsSymbolizedProfileFile { + my $file_name = shift; + if (!(-e $file_name) || !(-r $file_name)) { + return 0; + } + # Check if the file contains a symbol-section marker. + open(TFILE, "<$file_name"); + binmode TFILE; + my $firstline = ReadProfileHeader(*TFILE); + close(TFILE); + if (!$firstline) { + return 0; + } + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + return $firstline =~ /^--- *$symbol_marker/; +} + +# Parse profile generated by common/profiler.cc and return a reference +# to a map: +# $result->{version} Version number of profile file +# $result->{period} Sampling period (in microseconds) +# $result->{profile} Profile object +# $result->{threads} Map of thread IDs to profile objects +# $result->{map} Memory map info from profile +# $result->{pcs} Hash of all PC values seen, key is hex address +sub ReadProfile { + my $prog = shift; + my $fname = shift; + my $result; # return value + + $CONTENTION_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $contention_marker = $&; + $GROWTH_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $growth_marker = $&; + $SYMBOL_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $symbol_marker = $&; + $PROFILE_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $profile_marker = $&; + $HEAP_PAGE =~ m,[^/]+$,; # matches everything after the last slash + my $heap_marker = $&; + + # Look at first line to see if it is a heap or a CPU profile. + # CPU profile may start with no header at all, and just binary data + # (starting with \0\0\0\0) -- in that case, don't try to read the + # whole firstline, since it may be gigabytes(!) of data. + open(PROFILE, "<$fname") || error("$fname: $!\n"); + binmode PROFILE; # New perls do UTF-8 processing + my $header = ReadProfileHeader(*PROFILE); + if (!defined($header)) { # means "at EOF" + error("Profile is empty.\n"); + } + + my $symbols; + if ($header =~ m/^--- *$symbol_marker/o) { + # Verify that the user asked for a symbolized profile + if (!$main::use_symbolized_profile) { + # we have both a binary and symbolized profiles, abort + error("FATAL ERROR: Symbolized profile\n $fname\ncannot be used with " . + "a binary arg. Try again without passing\n $prog\n"); + } + # Read the symbol section of the symbolized profile file. + $symbols = ReadSymbols(*PROFILE{IO}); + # Read the next line to get the header for the remaining profile. + $header = ReadProfileHeader(*PROFILE) || ""; + } + + if ($header =~ m/^--- *($heap_marker|$growth_marker)/o) { + # Skip "--- ..." line for profile types that have their own headers. + $header = ReadProfileHeader(*PROFILE) || ""; + } + + $main::profile_type = ''; + + if ($header =~ m/^heap profile:.*$growth_marker/o) { + $main::profile_type = 'growth'; + $result = ReadHeapProfile($prog, *PROFILE, $header); + } elsif ($header =~ m/^heap profile:/) { + $main::profile_type = 'heap'; + $result = ReadHeapProfile($prog, *PROFILE, $header); + } elsif ($header =~ m/^heap/) { + $main::profile_type = 'heap'; + $result = ReadThreadedHeapProfile($prog, $fname, $header); + } elsif ($header =~ m/^--- *$contention_marker/o) { + $main::profile_type = 'contention'; + $result = ReadSynchProfile($prog, *PROFILE); + } elsif ($header =~ m/^--- *Stacks:/) { + print STDERR + "Old format contention profile: mistakenly reports " . + "condition variable signals as lock contentions.\n"; + $main::profile_type = 'contention'; + $result = ReadSynchProfile($prog, *PROFILE); + } elsif ($header =~ m/^--- *$profile_marker/) { + # the binary cpu profile data starts immediately after this line + $main::profile_type = 'cpu'; + $result = ReadCPUProfile($prog, $fname, *PROFILE); + } else { + if (defined($symbols)) { + # a symbolized profile contains a format we don't recognize, bail out + error("$fname: Cannot recognize profile section after symbols.\n"); + } + # no ascii header present -- must be a CPU profile + $main::profile_type = 'cpu'; + $result = ReadCPUProfile($prog, $fname, *PROFILE); + } + + close(PROFILE); + + # if we got symbols along with the profile, return those as well + if (defined($symbols)) { + $result->{symbols} = $symbols; + } + + return $result; +} + +# Subtract one from caller pc so we map back to call instr. +# However, don't do this if we're reading a symbolized profile +# file, in which case the subtract-one was done when the file +# was written. +# +# We apply the same logic to all readers, though ReadCPUProfile uses an +# independent implementation. +sub FixCallerAddresses { + my $stack = shift; + # --raw/http: Always subtract one from pc's, because PrintSymbolizedProfile() + # dumps unadjusted profiles. + { + $stack =~ /(\s)/; + my $delimiter = $1; + my @addrs = split(' ', $stack); + my @fixedaddrs; + $#fixedaddrs = $#addrs; + if ($#addrs >= 0) { + $fixedaddrs[0] = $addrs[0]; + } + for (my $i = 1; $i <= $#addrs; $i++) { + $fixedaddrs[$i] = AddressSub($addrs[$i], "0x1"); + } + return join $delimiter, @fixedaddrs; + } +} + +# CPU profile reader +sub ReadCPUProfile { + my $prog = shift; + my $fname = shift; # just used for logging + local *PROFILE = shift; + my $version; + my $period; + my $i; + my $profile = {}; + my $pcs = {}; + + # Parse string into array of slots. + my $slots = CpuProfileStream->new(*PROFILE, $fname); + + # Read header. The current header version is a 5-element structure + # containing: + # 0: header count (always 0) + # 1: header "words" (after this one: 3) + # 2: format version (0) + # 3: sampling period (usec) + # 4: unused padding (always 0) + if ($slots->get(0) != 0 ) { + error("$fname: not a profile file, or old format profile file\n"); + } + $i = 2 + $slots->get(1); + $version = $slots->get(2); + $period = $slots->get(3); + # Do some sanity checking on these header values. + if ($version > (2**32) || $period > (2**32) || $i > (2**32) || $i < 5) { + error("$fname: not a profile file, or corrupted profile file\n"); + } + + # Parse profile + while ($slots->get($i) != -1) { + my $n = $slots->get($i++); + my $d = $slots->get($i++); + if ($d > (2**16)) { # TODO(csilvers): what's a reasonable max-stack-depth? + my $addr = sprintf("0%o", $i * ($address_length == 8 ? 4 : 8)); + print STDERR "At index $i (address $addr):\n"; + error("$fname: stack trace depth >= 2**32\n"); + } + if ($slots->get($i) == 0) { + # End of profile data marker + $i += $d; + last; + } + + # Make key out of the stack entries + my @k = (); + for (my $j = 0; $j < $d; $j++) { + my $pc = $slots->get($i+$j); + # Subtract one from caller pc so we map back to call instr. + $pc--; + $pc = sprintf("%0*x", $address_length, $pc); + $pcs->{$pc} = 1; + push @k, $pc; + } + + AddEntry($profile, (join "\n", @k), $n); + $i += $d; + } + + # Parse map + my $map = ''; + seek(PROFILE, $i * 4, 0); + read(PROFILE, $map, (stat PROFILE)[7]); + + my $r = {}; + $r->{version} = $version; + $r->{period} = $period; + $r->{profile} = $profile; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; + + return $r; +} + +sub HeapProfileIndex { + my $index = 1; + if ($main::opt_inuse_space) { + $index = 1; + } elsif ($main::opt_inuse_objects) { + $index = 0; + } elsif ($main::opt_alloc_space) { + $index = 3; + } elsif ($main::opt_alloc_objects) { + $index = 2; + } + return $index; +} + +sub ReadMappedLibraries { + my $fh = shift; + my $map = ""; + # Read the /proc/self/maps data + while (<$fh>) { + s/\r//g; # turn windows-looking lines into unix-looking lines + $map .= $_; + } + return $map; +} + +sub ReadMemoryMap { + my $fh = shift; + my $map = ""; + # Read /proc/self/maps data as formatted by DumpAddressMap() + my $buildvar = ""; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + # Parse "build=" specification if supplied + if (m/^\s*build=(.*)\n/) { + $buildvar = $1; + } + + # Expand "$build" variable if available + $_ =~ s/\$build\b/$buildvar/g; + + $map .= $_; + } + return $map; +} + +sub AdjustSamples { + my ($sample_adjustment, $sampling_algorithm, $n1, $s1, $n2, $s2) = @_; + if ($sample_adjustment) { + if ($sampling_algorithm == 2) { + # Remote-heap version 2 + # The sampling frequency is the rate of a Poisson process. + # This means that the probability of sampling an allocation of + # size X with sampling rate Y is 1 - exp(-X/Y) + if ($n1 != 0) { + my $ratio = (($s1*1.0)/$n1)/($sample_adjustment); + my $scale_factor = 1/(1 - exp(-$ratio)); + $n1 *= $scale_factor; + $s1 *= $scale_factor; + } + if ($n2 != 0) { + my $ratio = (($s2*1.0)/$n2)/($sample_adjustment); + my $scale_factor = 1/(1 - exp(-$ratio)); + $n2 *= $scale_factor; + $s2 *= $scale_factor; + } + } else { + # Remote-heap version 1 + my $ratio; + $ratio = (($s1*1.0)/$n1)/($sample_adjustment); + if ($ratio < 1) { + $n1 /= $ratio; + $s1 /= $ratio; + } + $ratio = (($s2*1.0)/$n2)/($sample_adjustment); + if ($ratio < 1) { + $n2 /= $ratio; + $s2 /= $ratio; + } + } + } + return ($n1, $s1, $n2, $s2); +} + +sub ReadHeapProfile { + my $prog = shift; + local *PROFILE = shift; + my $header = shift; + + my $index = HeapProfileIndex(); + + # Find the type of this profile. The header line looks like: + # heap profile: 1246: 8800744 [ 1246: 8800744] @ /266053 + # There are two pairs , the first inuse objects/space, and the + # second allocated objects/space. This is followed optionally by a profile + # type, and if that is present, optionally by a sampling frequency. + # For remote heap profiles (v1): + # The interpretation of the sampling frequency is that the profiler, for + # each sample, calculates a uniformly distributed random integer less than + # the given value, and records the next sample after that many bytes have + # been allocated. Therefore, the expected sample interval is half of the + # given frequency. By default, if not specified, the expected sample + # interval is 128KB. Only remote-heap-page profiles are adjusted for + # sample size. + # For remote heap profiles (v2): + # The sampling frequency is the rate of a Poisson process. This means that + # the probability of sampling an allocation of size X with sampling rate Y + # is 1 - exp(-X/Y) + # For version 2, a typical header line might look like this: + # heap profile: 1922: 127792360 [ 1922: 127792360] @ _v2/524288 + # the trailing number (524288) is the sampling rate. (Version 1 showed + # double the 'rate' here) + my $sampling_algorithm = 0; + my $sample_adjustment = 0; + chomp($header); + my $type = "unknown"; + if ($header =~ m"^heap profile:\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\](\s*@\s*([^/]*)(/(\d+))?)?") { + if (defined($6) && ($6 ne '')) { + $type = $6; + my $sample_period = $8; + # $type is "heapprofile" for profiles generated by the + # heap-profiler, and either "heap" or "heap_v2" for profiles + # generated by sampling directly within tcmalloc. It can also + # be "growth" for heap-growth profiles. The first is typically + # found for profiles generated locally, and the others for + # remote profiles. + if (($type eq "heapprofile") || ($type !~ /heap/) ) { + # No need to adjust for the sampling rate with heap-profiler-derived data + $sampling_algorithm = 0; + } elsif ($type =~ /_v2/) { + $sampling_algorithm = 2; # version 2 sampling + if (defined($sample_period) && ($sample_period ne '')) { + $sample_adjustment = int($sample_period); + } + } else { + $sampling_algorithm = 1; # version 1 sampling + if (defined($sample_period) && ($sample_period ne '')) { + $sample_adjustment = int($sample_period)/2; + } + } + } else { + # We detect whether or not this is a remote-heap profile by checking + # that the total-allocated stats ($n2,$s2) are exactly the + # same as the in-use stats ($n1,$s1). It is remotely conceivable + # that a non-remote-heap profile may pass this check, but it is hard + # to imagine how that could happen. + # In this case it's so old it's guaranteed to be remote-heap version 1. + my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4); + if (($n1 == $n2) && ($s1 == $s2)) { + # This is likely to be a remote-heap based sample profile + $sampling_algorithm = 1; + } + } + } + + if ($sampling_algorithm > 0) { + # For remote-heap generated profiles, adjust the counts and sizes to + # account for the sample rate (we sample once every 128KB by default). + if ($sample_adjustment == 0) { + # Turn on profile adjustment. + $sample_adjustment = 128*1024; + print STDERR "Adjusting heap profiles for 1-in-128KB sampling rate\n"; + } else { + printf STDERR ("Adjusting heap profiles for 1-in-%d sampling rate\n", + $sample_adjustment); + } + if ($sampling_algorithm > 1) { + # We don't bother printing anything for the original version (version 1) + printf STDERR "Heap version $sampling_algorithm\n"; + } + } + + my $profile = {}; + my $pcs = {}; + my $map = ""; + + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + if (/^MAPPED_LIBRARIES:/) { + $map .= ReadMappedLibraries(*PROFILE); + last; + } + + if (/^--- Memory map:/) { + $map .= ReadMemoryMap(*PROFILE); + last; + } + + # Read entry of the form: + # : [: ] @ a1 a2 a3 ... an + s/^\s*//; + s/\s*$//; + if (m/^\s*(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]\s+@\s+(.*)$/) { + my $stack = $5; + my ($n1, $s1, $n2, $s2) = ($1, $2, $3, $4); + my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm, + $n1, $s1, $n2, $s2); + AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]); + } + } + + my $r = {}; + $r->{version} = "heap"; + $r->{period} = 1; + $r->{profile} = $profile; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; + return $r; +} + +sub ReadThreadedHeapProfile { + my ($prog, $fname, $header) = @_; + + my $index = HeapProfileIndex(); + my $sampling_algorithm = 0; + my $sample_adjustment = 0; + chomp($header); + my $type = "unknown"; + # Assuming a very specific type of header for now. + if ($header =~ m"^heap_v2/(\d+)") { + $type = "_v2"; + $sampling_algorithm = 2; + $sample_adjustment = int($1); + } + if ($type ne "_v2" || !defined($sample_adjustment)) { + die "Threaded heap profiles require v2 sampling with a sample rate\n"; + } + + my $profile = {}; + my $thread_profiles = {}; + my $pcs = {}; + my $map = ""; + my $stack = ""; + + while () { + s/\r//g; + if (/^MAPPED_LIBRARIES:/) { + $map .= ReadMappedLibraries(*PROFILE); + last; + } + + if (/^--- Memory map:/) { + $map .= ReadMemoryMap(*PROFILE); + last; + } + + # Read entry of the form: + # @ a1 a2 ... an + # t*: : [: ] + # t1: : [: ] + # ... + # tn: : [: ] + s/^\s*//; + s/\s*$//; + if (m/^@\s+(.*)$/) { + $stack = $1; + } elsif (m/^\s*(t(\*|\d+)):\s+(\d+):\s+(\d+)\s+\[\s*(\d+):\s+(\d+)\]$/) { + if ($stack eq "") { + # Still in the header, so this is just a per-thread summary. + next; + } + my $thread = $2; + my ($n1, $s1, $n2, $s2) = ($3, $4, $5, $6); + my @counts = AdjustSamples($sample_adjustment, $sampling_algorithm, + $n1, $s1, $n2, $s2); + if ($thread eq "*") { + AddEntries($profile, $pcs, FixCallerAddresses($stack), $counts[$index]); + } else { + if (!exists($thread_profiles->{$thread})) { + $thread_profiles->{$thread} = {}; + } + AddEntries($thread_profiles->{$thread}, $pcs, + FixCallerAddresses($stack), $counts[$index]); + } + } + } + + my $r = {}; + $r->{version} = "heap"; + $r->{period} = 1; + $r->{profile} = $profile; + $r->{threads} = $thread_profiles; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; + return $r; +} + +sub ReadSynchProfile { + my $prog = shift; + local *PROFILE = shift; + my $header = shift; + + my $map = ''; + my $profile = {}; + my $pcs = {}; + my $sampling_period = 1; + my $cyclespernanosec = 2.8; # Default assumption for old binaries + my $seen_clockrate = 0; + my $line; + + my $index = 0; + if ($main::opt_total_delay) { + $index = 0; + } elsif ($main::opt_contentions) { + $index = 1; + } elsif ($main::opt_mean_delay) { + $index = 2; + } + + while ( $line = ) { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + if ( $line =~ /^\s*(\d+)\s+(\d+) \@\s*(.*?)\s*$/ ) { + my ($cycles, $count, $stack) = ($1, $2, $3); + + # Convert cycles to nanoseconds + $cycles /= $cyclespernanosec; + + # Adjust for sampling done by application + $cycles *= $sampling_period; + $count *= $sampling_period; + + my @values = ($cycles, $count, $cycles / $count); + AddEntries($profile, $pcs, FixCallerAddresses($stack), $values[$index]); + + } elsif ( $line =~ /^(slow release).*thread \d+ \@\s*(.*?)\s*$/ || + $line =~ /^\s*(\d+) \@\s*(.*?)\s*$/ ) { + my ($cycles, $stack) = ($1, $2); + if ($cycles !~ /^\d+$/) { + next; + } + + # Convert cycles to nanoseconds + $cycles /= $cyclespernanosec; + + # Adjust for sampling done by application + $cycles *= $sampling_period; + + AddEntries($profile, $pcs, FixCallerAddresses($stack), $cycles); + + } elsif ( $line =~ m/^([a-z][^=]*)=(.*)$/ ) { + my ($variable, $value) = ($1,$2); + for ($variable, $value) { + s/^\s+//; + s/\s+$//; + } + if ($variable eq "cycles/second") { + $cyclespernanosec = $value / 1e9; + $seen_clockrate = 1; + } elsif ($variable eq "sampling period") { + $sampling_period = $value; + } elsif ($variable eq "ms since reset") { + # Currently nothing is done with this value in jeprof + # So we just silently ignore it for now + } elsif ($variable eq "discarded samples") { + # Currently nothing is done with this value in jeprof + # So we just silently ignore it for now + } else { + printf STDERR ("Ignoring unnknown variable in /contention output: " . + "'%s' = '%s'\n",$variable,$value); + } + } else { + # Memory map entry + $map .= $line; + } + } + + if (!$seen_clockrate) { + printf STDERR ("No cycles/second entry in profile; Guessing %.1f GHz\n", + $cyclespernanosec); + } + + my $r = {}; + $r->{version} = 0; + $r->{period} = $sampling_period; + $r->{profile} = $profile; + $r->{libs} = ParseLibraries($prog, $map, $pcs); + $r->{pcs} = $pcs; + return $r; +} + +# Given a hex value in the form "0x1abcd" or "1abcd", return either +# "0001abcd" or "000000000001abcd", depending on the current (global) +# address length. +sub HexExtend { + my $addr = shift; + + $addr =~ s/^(0x)?0*//; + my $zeros_needed = $address_length - length($addr); + if ($zeros_needed < 0) { + printf STDERR "Warning: address $addr is longer than address length $address_length\n"; + return $addr; + } + return ("0" x $zeros_needed) . $addr; +} + +##### Symbol extraction ##### + +# Aggressively search the lib_prefix values for the given library +# If all else fails, just return the name of the library unmodified. +# If the lib_prefix is "/my/path,/other/path" and $file is "/lib/dir/mylib.so" +# it will search the following locations in this order, until it finds a file: +# /my/path/lib/dir/mylib.so +# /other/path/lib/dir/mylib.so +# /my/path/dir/mylib.so +# /other/path/dir/mylib.so +# /my/path/mylib.so +# /other/path/mylib.so +# /lib/dir/mylib.so (returned as last resort) +sub FindLibrary { + my $file = shift; + my $suffix = $file; + + # Search for the library as described above + do { + foreach my $prefix (@prefix_list) { + my $fullpath = $prefix . $suffix; + if (-e $fullpath) { + return $fullpath; + } + } + } while ($suffix =~ s|^/[^/]+/|/|); + return $file; +} + +# Return path to library with debugging symbols. +# For libc libraries, the copy in /usr/lib/debug contains debugging symbols +sub DebuggingLibrary { + my $file = shift; + + if ($file !~ m|^/|) { + return undef; + } + + # Find debug symbol file if it's named after the library's name. + + if (-f "/usr/lib/debug$file") { + if($main::opt_debug) { print STDERR "found debug info for $file in /usr/lib/debug$file\n"; } + return "/usr/lib/debug$file"; + } elsif (-f "/usr/lib/debug$file.debug") { + if($main::opt_debug) { print STDERR "found debug info for $file in /usr/lib/debug$file.debug\n"; } + return "/usr/lib/debug$file.debug"; + } + + if(!$main::opt_debug_syms_by_id) { + if($main::opt_debug) { print STDERR "no debug symbols found for $file\n" }; + return undef; + } + + # Find debug file if it's named after the library's build ID. + + my $readelf = ''; + if (!$main::gave_up_on_elfutils) { + $readelf = qx/eu-readelf -n ${file}/; + if ($?) { + print STDERR "Cannot run eu-readelf. To use --debug-syms-by-id you must be on Linux, with elfutils installed.\n"; + $main::gave_up_on_elfutils = 1; + return undef; + } + my $buildID = $1 if $readelf =~ /Build ID: ([A-Fa-f0-9]+)/s; + if (defined $buildID && length $buildID > 0) { + my $symbolFile = '/usr/lib/debug/.build-id/' . substr($buildID, 0, 2) . '/' . substr($buildID, 2) . '.debug'; + if (-e $symbolFile) { + if($main::opt_debug) { print STDERR "found debug symbol file $symbolFile for $file\n" }; + return $symbolFile; + } else { + if($main::opt_debug) { print STDERR "no debug symbol file found for $file, build ID: $buildID\n" }; + return undef; + } + } + } + + if($main::opt_debug) { print STDERR "no debug symbols found for $file, build ID unknown\n" }; + return undef; +} + + +# Parse text section header of a library using objdump +sub ParseTextSectionHeaderFromObjdump { + my $lib = shift; + + my $size = undef; + my $vma; + my $file_offset; + # Get objdump output from the library file to figure out how to + # map between mapped addresses and addresses in the library. + my $cmd = ShellEscape($obj_tool_map{"objdump"}, "-h", $lib); + open(OBJDUMP, "$cmd |") || error("$cmd: $!\n"); + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + # Idx Name Size VMA LMA File off Algn + # 10 .text 00104b2c 420156f0 420156f0 000156f0 2**4 + # For 64-bit objects, VMA and LMA will be 16 hex digits, size and file + # offset may still be 8. But AddressSub below will still handle that. + my @x = split; + if (($#x >= 6) && ($x[1] eq '.text')) { + $size = $x[2]; + $vma = $x[3]; + $file_offset = $x[5]; + last; + } + } + close(OBJDUMP); + + if (!defined($size)) { + return undef; + } + + my $r = {}; + $r->{size} = $size; + $r->{vma} = $vma; + $r->{file_offset} = $file_offset; + + return $r; +} + +# Parse text section header of a library using otool (on OS X) +sub ParseTextSectionHeaderFromOtool { + my $lib = shift; + + my $size = undef; + my $vma = undef; + my $file_offset = undef; + # Get otool output from the library file to figure out how to + # map between mapped addresses and addresses in the library. + my $command = ShellEscape($obj_tool_map{"otool"}, "-l", $lib); + open(OTOOL, "$command |") || error("$command: $!\n"); + my $cmd = ""; + my $sectname = ""; + my $segname = ""; + foreach my $line () { + $line =~ s/\r//g; # turn windows-looking lines into unix-looking lines + # Load command <#> + # cmd LC_SEGMENT + # [...] + # Section + # sectname __text + # segname __TEXT + # addr 0x000009f8 + # size 0x00018b9e + # offset 2552 + # align 2^2 (4) + # We will need to strip off the leading 0x from the hex addresses, + # and convert the offset into hex. + if ($line =~ /Load command/) { + $cmd = ""; + $sectname = ""; + $segname = ""; + } elsif ($line =~ /Section/) { + $sectname = ""; + $segname = ""; + } elsif ($line =~ /cmd (\w+)/) { + $cmd = $1; + } elsif ($line =~ /sectname (\w+)/) { + $sectname = $1; + } elsif ($line =~ /segname (\w+)/) { + $segname = $1; + } elsif (!(($cmd eq "LC_SEGMENT" || $cmd eq "LC_SEGMENT_64") && + $sectname eq "__text" && + $segname eq "__TEXT")) { + next; + } elsif ($line =~ /\baddr 0x([0-9a-fA-F]+)/) { + $vma = $1; + } elsif ($line =~ /\bsize 0x([0-9a-fA-F]+)/) { + $size = $1; + } elsif ($line =~ /\boffset ([0-9]+)/) { + $file_offset = sprintf("%016x", $1); + } + if (defined($vma) && defined($size) && defined($file_offset)) { + last; + } + } + close(OTOOL); + + if (!defined($vma) || !defined($size) || !defined($file_offset)) { + return undef; + } + + my $r = {}; + $r->{size} = $size; + $r->{vma} = $vma; + $r->{file_offset} = $file_offset; + + return $r; +} + +sub ParseTextSectionHeader { + # obj_tool_map("otool") is only defined if we're in a Mach-O environment + if (defined($obj_tool_map{"otool"})) { + my $r = ParseTextSectionHeaderFromOtool(@_); + if (defined($r)){ + return $r; + } + } + # If otool doesn't work, or we don't have it, fall back to objdump + return ParseTextSectionHeaderFromObjdump(@_); +} + +# Split /proc/pid/maps dump into a list of libraries +sub ParseLibraries { + return if $main::use_symbol_page; # We don't need libraries info. + my $prog = Cwd::abs_path(shift); + my $map = shift; + my $pcs = shift; + + my $result = []; + my $h = "[a-f0-9]+"; + my $zero_offset = HexExtend("0"); + + my $buildvar = ""; + foreach my $l (split("\n", $map)) { + if ($l =~ m/^\s*build=(.*)$/) { + $buildvar = $1; + } + + my $start; + my $finish; + my $offset; + my $lib; + if ($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+\.(so|dll|dylib|bundle)((\.\d+)+\w*(\.\d+){0,3})?)$/i) { + # Full line from /proc/self/maps. Example: + # 40000000-40015000 r-xp 00000000 03:01 12845071 /lib/ld-2.3.2.so + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = HexExtend($3); + $lib = $4; + $lib =~ s|\\|/|g; # turn windows-style paths into unix-style paths + } elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.so(\.\d+)*)/) { + # Cooked line from DumpAddressMap. Example: + # 40000000-40015000: /lib/ld-2.3.2.so + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = $zero_offset; + $lib = $3; + } elsif (($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+)$/i) && ($4 eq $prog)) { + # PIEs and address space randomization do not play well with our + # default assumption that main executable is at lowest + # addresses. So we're detecting main executable in + # /proc/self/maps as well. + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = HexExtend($3); + $lib = $4; + $lib =~ s|\\|/|g; # turn windows-style paths into unix-style paths + } + # FreeBSD 10.0 virtual memory map /proc/curproc/map as defined in + # function procfs_doprocmap (sys/fs/procfs/procfs_map.c) + # + # Example: + # 0x800600000 0x80061a000 26 0 0xfffff800035a0000 r-x 75 33 0x1004 COW NC vnode /libexec/ld-elf.s + # o.1 NCH -1 + elsif ($l =~ /^(0x$h)\s(0x$h)\s\d+\s\d+\s0x$h\sr-x\s\d+\s\d+\s0x\d+\s(COW|NCO)\s(NC|NNC)\svnode\s(\S+\.so(\.\d+)*)/) { + $start = HexExtend($1); + $finish = HexExtend($2); + $offset = $zero_offset; + $lib = FindLibrary($5); + + } else { + next; + } + + # Expand "$build" variable if available + $lib =~ s/\$build\b/$buildvar/g; + + $lib = FindLibrary($lib); + + # Check for pre-relocated libraries, which use pre-relocated symbol tables + # and thus require adjusting the offset that we'll use to translate + # VM addresses into symbol table addresses. + # Only do this if we're not going to fetch the symbol table from a + # debugging copy of the library. + if (!DebuggingLibrary($lib)) { + my $text = ParseTextSectionHeader($lib); + if (defined($text)) { + my $vma_offset = AddressSub($text->{vma}, $text->{file_offset}); + $offset = AddressAdd($offset, $vma_offset); + } + } + + if($main::opt_debug) { printf STDERR "$start:$finish ($offset) $lib\n"; } + push(@{$result}, [$lib, $start, $finish, $offset]); + } + + # Append special entry for additional library (not relocated) + if ($main::opt_lib ne "") { + my $text = ParseTextSectionHeader($main::opt_lib); + if (defined($text)) { + my $start = $text->{vma}; + my $finish = AddressAdd($start, $text->{size}); + + push(@{$result}, [$main::opt_lib, $start, $finish, $start]); + } + } + + # Append special entry for the main program. This covers + # 0..max_pc_value_seen, so that we assume pc values not found in one + # of the library ranges will be treated as coming from the main + # program binary. + my $min_pc = HexExtend("0"); + my $max_pc = $min_pc; # find the maximal PC value in any sample + foreach my $pc (keys(%{$pcs})) { + if (HexExtend($pc) gt $max_pc) { $max_pc = HexExtend($pc); } + } + push(@{$result}, [$prog, $min_pc, $max_pc, $zero_offset]); + + return $result; +} + +# Add two hex addresses of length $address_length. +# Run jeprof --test for unit test if this is changed. +sub AddressAdd { + my $addr1 = shift; + my $addr2 = shift; + my $sum; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $sum = (hex($addr1)+hex($addr2)) % (0x10000000 * 16); + return sprintf("%08x", $sum); + + } else { + # Do the addition in 7-nibble chunks to trivialize carry handling. + + if ($main::opt_debug and $main::opt_test) { + print STDERR "AddressAdd $addr1 + $addr2 = "; + } + + my $a1 = substr($addr1,-7); + $addr1 = substr($addr1,0,-7); + my $a2 = substr($addr2,-7); + $addr2 = substr($addr2,0,-7); + $sum = hex($a1) + hex($a2); + my $c = 0; + if ($sum > 0xfffffff) { + $c = 1; + $sum -= 0x10000000; + } + my $r = sprintf("%07x", $sum); + + $a1 = substr($addr1,-7); + $addr1 = substr($addr1,0,-7); + $a2 = substr($addr2,-7); + $addr2 = substr($addr2,0,-7); + $sum = hex($a1) + hex($a2) + $c; + $c = 0; + if ($sum > 0xfffffff) { + $c = 1; + $sum -= 0x10000000; + } + $r = sprintf("%07x", $sum) . $r; + + $sum = hex($addr1) + hex($addr2) + $c; + if ($sum > 0xff) { $sum -= 0x100; } + $r = sprintf("%02x", $sum) . $r; + + if ($main::opt_debug and $main::opt_test) { print STDERR "$r\n"; } + + return $r; + } +} + + +# Subtract two hex addresses of length $address_length. +# Run jeprof --test for unit test if this is changed. +sub AddressSub { + my $addr1 = shift; + my $addr2 = shift; + my $diff; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $diff = (hex($addr1)-hex($addr2)) % (0x10000000 * 16); + return sprintf("%08x", $diff); + + } else { + # Do the addition in 7-nibble chunks to trivialize borrow handling. + # if ($main::opt_debug) { print STDERR "AddressSub $addr1 - $addr2 = "; } + + my $a1 = hex(substr($addr1,-7)); + $addr1 = substr($addr1,0,-7); + my $a2 = hex(substr($addr2,-7)); + $addr2 = substr($addr2,0,-7); + my $b = 0; + if ($a2 > $a1) { + $b = 1; + $a1 += 0x10000000; + } + $diff = $a1 - $a2; + my $r = sprintf("%07x", $diff); + + $a1 = hex(substr($addr1,-7)); + $addr1 = substr($addr1,0,-7); + $a2 = hex(substr($addr2,-7)) + $b; + $addr2 = substr($addr2,0,-7); + $b = 0; + if ($a2 > $a1) { + $b = 1; + $a1 += 0x10000000; + } + $diff = $a1 - $a2; + $r = sprintf("%07x", $diff) . $r; + + $a1 = hex($addr1); + $a2 = hex($addr2) + $b; + if ($a2 > $a1) { $a1 += 0x100; } + $diff = $a1 - $a2; + $r = sprintf("%02x", $diff) . $r; + + # if ($main::opt_debug) { print STDERR "$r\n"; } + + return $r; + } +} + +# Increment a hex addresses of length $address_length. +# Run jeprof --test for unit test if this is changed. +sub AddressInc { + my $addr = shift; + my $sum; + + if ($address_length == 8) { + # Perl doesn't cope with wraparound arithmetic, so do it explicitly: + $sum = (hex($addr)+1) % (0x10000000 * 16); + return sprintf("%08x", $sum); + + } else { + # Do the addition in 7-nibble chunks to trivialize carry handling. + # We are always doing this to step through the addresses in a function, + # and will almost never overflow the first chunk, so we check for this + # case and exit early. + + # if ($main::opt_debug) { print STDERR "AddressInc $addr1 = "; } + + my $a1 = substr($addr,-7); + $addr = substr($addr,0,-7); + $sum = hex($a1) + 1; + my $r = sprintf("%07x", $sum); + if ($sum <= 0xfffffff) { + $r = $addr . $r; + # if ($main::opt_debug) { print STDERR "$r\n"; } + return HexExtend($r); + } else { + $r = "0000000"; + } + + $a1 = substr($addr,-7); + $addr = substr($addr,0,-7); + $sum = hex($a1) + 1; + $r = sprintf("%07x", $sum) . $r; + if ($sum <= 0xfffffff) { + $r = $addr . $r; + # if ($main::opt_debug) { print STDERR "$r\n"; } + return HexExtend($r); + } else { + $r = "00000000000000"; + } + + $sum = hex($addr) + 1; + if ($sum > 0xff) { $sum -= 0x100; } + $r = sprintf("%02x", $sum) . $r; + + # if ($main::opt_debug) { print STDERR "$r\n"; } + return $r; + } +} + +# Extract symbols for all PC values found in profile +sub ExtractSymbols { + my $libs = shift; + my $pcset = shift; + + my $symbols = {}; + + # Map each PC value to the containing library. To make this faster, + # we sort libraries by their starting pc value (highest first), and + # advance through the libraries as we advance the pc. Sometimes the + # addresses of libraries may overlap with the addresses of the main + # binary, so to make sure the libraries 'win', we iterate over the + # libraries in reverse order (which assumes the binary doesn't start + # in the middle of a library, which seems a fair assumption). + my @pcs = (sort { $a cmp $b } keys(%{$pcset})); # pcset is 0-extended strings + foreach my $lib (sort {$b->[1] cmp $a->[1]} @{$libs}) { + my $libname = $lib->[0]; + my $start = $lib->[1]; + my $finish = $lib->[2]; + my $offset = $lib->[3]; + + # Use debug library if it exists + my $debug_libname = DebuggingLibrary($libname); + if ($debug_libname) { + $libname = $debug_libname; + } + + # Get list of pcs that belong in this library. + my $contained = []; + my ($start_pc_index, $finish_pc_index); + # Find smallest finish_pc_index such that $finish < $pc[$finish_pc_index]. + for ($finish_pc_index = $#pcs + 1; $finish_pc_index > 0; + $finish_pc_index--) { + last if $pcs[$finish_pc_index - 1] le $finish; + } + # Find smallest start_pc_index such that $start <= $pc[$start_pc_index]. + for ($start_pc_index = $finish_pc_index; $start_pc_index > 0; + $start_pc_index--) { + last if $pcs[$start_pc_index - 1] lt $start; + } + # This keeps PC values higher than $pc[$finish_pc_index] in @pcs, + # in case there are overlaps in libraries and the main binary. + @{$contained} = splice(@pcs, $start_pc_index, + $finish_pc_index - $start_pc_index); + # Map to symbols + MapToSymbols($libname, AddressSub($start, $offset), $contained, $symbols); + } + + return $symbols; +} + +# Map list of PC values to symbols for a given image +sub MapToSymbols { + my $image = shift; + my $offset = shift; + my $pclist = shift; + my $symbols = shift; + + my $debug = 0; + + # Ignore empty binaries + if ($#{$pclist} < 0) { return; } + + # Figure out the addr2line command to use + my $addr2line = $obj_tool_map{"addr2line"}; + my $cmd = ShellEscape($addr2line, "-f", "-C", "-e", $image); + if (exists $obj_tool_map{"addr2line_pdb"}) { + $addr2line = $obj_tool_map{"addr2line_pdb"}; + $cmd = ShellEscape($addr2line, "--demangle", "-f", "-C", "-e", $image); + } + + # If "addr2line" isn't installed on the system at all, just use + # nm to get what info we can (function names, but not line numbers). + if (system(ShellEscape($addr2line, "--help") . " >$dev_null 2>&1") != 0) { + MapSymbolsWithNM($image, $offset, $pclist, $symbols); + return; + } + + # "addr2line -i" can produce a variable number of lines per input + # address, with no separator that allows us to tell when data for + # the next address starts. So we find the address for a special + # symbol (_fini) and interleave this address between all real + # addresses passed to addr2line. The name of this special symbol + # can then be used as a separator. + $sep_address = undef; # May be filled in by MapSymbolsWithNM() + my $nm_symbols = {}; + MapSymbolsWithNM($image, $offset, $pclist, $nm_symbols); + if (defined($sep_address)) { + # Only add " -i" to addr2line if the binary supports it. + # addr2line --help returns 0, but not if it sees an unknown flag first. + if (system("$cmd -i --help >$dev_null 2>&1") == 0) { + $cmd .= " -i"; + } else { + $sep_address = undef; # no need for sep_address if we don't support -i + } + } + + # Make file with all PC values with intervening 'sep_address' so + # that we can reliably detect the end of inlined function list + open(ADDRESSES, ">$main::tmpfile_sym") || error("$main::tmpfile_sym: $!\n"); + if ($debug) { print("---- $image ---\n"); } + for (my $i = 0; $i <= $#{$pclist}; $i++) { + # addr2line always reads hex addresses, and does not need '0x' prefix. + if ($debug) { printf STDERR ("%s\n", $pclist->[$i]); } + printf ADDRESSES ("%s\n", AddressSub($pclist->[$i], $offset)); + if (defined($sep_address)) { + printf ADDRESSES ("%s\n", $sep_address); + } + } + close(ADDRESSES); + if ($debug) { + print("----\n"); + system("cat", $main::tmpfile_sym); + print("----\n"); + system("$cmd < " . ShellEscape($main::tmpfile_sym)); + print("----\n"); + } + + open(SYMBOLS, "$cmd <" . ShellEscape($main::tmpfile_sym) . " |") + || error("$cmd: $!\n"); + my $count = 0; # Index in pclist + while () { + # Read fullfunction and filelineinfo from next pair of lines + s/\r?\n$//g; + my $fullfunction = $_; + $_ = ; + s/\r?\n$//g; + my $filelinenum = $_; + + if (defined($sep_address) && $fullfunction eq $sep_symbol) { + # Terminating marker for data for this address + $count++; + next; + } + + $filelinenum =~ s|\\|/|g; # turn windows-style paths into unix-style paths + + my $pcstr = $pclist->[$count]; + my $function = ShortFunctionName($fullfunction); + my $nms = $nm_symbols->{$pcstr}; + if (defined($nms)) { + if ($fullfunction eq '??') { + # nm found a symbol for us. + $function = $nms->[0]; + $fullfunction = $nms->[2]; + } else { + # MapSymbolsWithNM tags each routine with its starting address, + # useful in case the image has multiple occurrences of this + # routine. (It uses a syntax that resembles template parameters, + # that are automatically stripped out by ShortFunctionName().) + # addr2line does not provide the same information. So we check + # if nm disambiguated our symbol, and if so take the annotated + # (nm) version of the routine-name. TODO(csilvers): this won't + # catch overloaded, inlined symbols, which nm doesn't see. + # Better would be to do a check similar to nm's, in this fn. + if ($nms->[2] =~ m/^\Q$function\E/) { # sanity check it's the right fn + $function = $nms->[0]; + $fullfunction = $nms->[2]; + } + } + } + + # Prepend to accumulated symbols for pcstr + # (so that caller comes before callee) + my $sym = $symbols->{$pcstr}; + if (!defined($sym)) { + $sym = []; + $symbols->{$pcstr} = $sym; + } + unshift(@{$sym}, $function, $filelinenum, $fullfunction); + if ($debug) { printf STDERR ("%s => [%s]\n", $pcstr, join(" ", @{$sym})); } + if (!defined($sep_address)) { + # Inlining is off, so this entry ends immediately + $count++; + } + } + close(SYMBOLS); +} + +# Use nm to map the list of referenced PCs to symbols. Return true iff we +# are able to read procedure information via nm. +sub MapSymbolsWithNM { + my $image = shift; + my $offset = shift; + my $pclist = shift; + my $symbols = shift; + + # Get nm output sorted by increasing address + my $symbol_table = GetProcedureBoundaries($image, "."); + if (!%{$symbol_table}) { + return 0; + } + # Start addresses are already the right length (8 or 16 hex digits). + my @names = sort { $symbol_table->{$a}->[0] cmp $symbol_table->{$b}->[0] } + keys(%{$symbol_table}); + + if ($#names < 0) { + # No symbols: just use addresses + foreach my $pc (@{$pclist}) { + my $pcstr = "0x" . $pc; + $symbols->{$pc} = [$pcstr, "?", $pcstr]; + } + return 0; + } + + # Sort addresses so we can do a join against nm output + my $index = 0; + my $fullname = $names[0]; + my $name = ShortFunctionName($fullname); + foreach my $pc (sort { $a cmp $b } @{$pclist}) { + # Adjust for mapped offset + my $mpc = AddressSub($pc, $offset); + while (($index < $#names) && ($mpc ge $symbol_table->{$fullname}->[1])){ + $index++; + $fullname = $names[$index]; + $name = ShortFunctionName($fullname); + } + if ($mpc lt $symbol_table->{$fullname}->[1]) { + $symbols->{$pc} = [$name, "?", $fullname]; + } else { + my $pcstr = "0x" . $pc; + $symbols->{$pc} = [$pcstr, "?", $pcstr]; + } + } + return 1; +} + +sub ShortFunctionName { + my $function = shift; + while ($function =~ s/\([^()]*\)(\s*const)?//g) { } # Argument types + while ($function =~ s/<[^<>]*>//g) { } # Remove template arguments + $function =~ s/^.*\s+(\w+::)/$1/; # Remove leading type + return $function; +} + +# Trim overly long symbols found in disassembler output +sub CleanDisassembly { + my $d = shift; + while ($d =~ s/\([^()%]*\)(\s*const)?//g) { } # Argument types, not (%rax) + while ($d =~ s/(\w+)<[^<>]*>/$1/g) { } # Remove template arguments + return $d; +} + +# Clean file name for display +sub CleanFileName { + my ($f) = @_; + $f =~ s|^/proc/self/cwd/||; + $f =~ s|^\./||; + return $f; +} + +# Make address relative to section and clean up for display +sub UnparseAddress { + my ($offset, $address) = @_; + $address = AddressSub($address, $offset); + $address =~ s/^0x//; + $address =~ s/^0*//; + return $address; +} + +##### Miscellaneous ##### + +# Find the right versions of the above object tools to use. The +# argument is the program file being analyzed, and should be an ELF +# 32-bit or ELF 64-bit executable file. The location of the tools +# is determined by considering the following options in this order: +# 1) --tools option, if set +# 2) JEPROF_TOOLS environment variable, if set +# 3) the environment +sub ConfigureObjTools { + my $prog_file = shift; + + # Check for the existence of $prog_file because /usr/bin/file does not + # predictably return error status in prod. + (-e $prog_file) || error("$prog_file does not exist.\n"); + + my $file_type = undef; + if (-e "/usr/bin/file") { + # Follow symlinks (at least for systems where "file" supports that). + my $escaped_prog_file = ShellEscape($prog_file); + $file_type = `/usr/bin/file -L $escaped_prog_file 2>$dev_null || + /usr/bin/file $escaped_prog_file`; + } elsif ($^O == "MSWin32") { + $file_type = "MS Windows"; + } else { + print STDERR "WARNING: Can't determine the file type of $prog_file"; + } + + if ($file_type =~ /64-bit/) { + # Change $address_length to 16 if the program file is ELF 64-bit. + # We can't detect this from many (most?) heap or lock contention + # profiles, since the actual addresses referenced are generally in low + # memory even for 64-bit programs. + $address_length = 16; + } + + if ($file_type =~ /MS Windows/) { + # For windows, we provide a version of nm and addr2line as part of + # the opensource release, which is capable of parsing + # Windows-style PDB executables. It should live in the path, or + # in the same directory as jeprof. + $obj_tool_map{"nm_pdb"} = "nm-pdb"; + $obj_tool_map{"addr2line_pdb"} = "addr2line-pdb"; + } + + if ($file_type =~ /Mach-O/) { + # OS X uses otool to examine Mach-O files, rather than objdump. + $obj_tool_map{"otool"} = "otool"; + $obj_tool_map{"addr2line"} = "false"; # no addr2line + $obj_tool_map{"objdump"} = "false"; # no objdump + } + + # Go fill in %obj_tool_map with the pathnames to use: + foreach my $tool (keys %obj_tool_map) { + $obj_tool_map{$tool} = ConfigureTool($obj_tool_map{$tool}); + } +} + +# Returns the path of a caller-specified object tool. If --tools or +# JEPROF_TOOLS are specified, then returns the full path to the tool +# with that prefix. Otherwise, returns the path unmodified (which +# means we will look for it on PATH). +sub ConfigureTool { + my $tool = shift; + my $path; + + # --tools (or $JEPROF_TOOLS) is a comma separated list, where each + # item is either a) a pathname prefix, or b) a map of the form + # :. First we look for an entry of type (b) for our + # tool. If one is found, we use it. Otherwise, we consider all the + # pathname prefixes in turn, until one yields an existing file. If + # none does, we use a default path. + my $tools = $main::opt_tools || $ENV{"JEPROF_TOOLS"} || ""; + if ($tools =~ m/(,|^)\Q$tool\E:([^,]*)/) { + $path = $2; + # TODO(csilvers): sanity-check that $path exists? Hard if it's relative. + } elsif ($tools ne '') { + foreach my $prefix (split(',', $tools)) { + next if ($prefix =~ /:/); # ignore "tool:fullpath" entries in the list + if (-x $prefix . $tool) { + $path = $prefix . $tool; + last; + } + } + if (!$path) { + error("No '$tool' found with prefix specified by " . + "--tools (or \$JEPROF_TOOLS) '$tools'\n"); + } + } else { + # ... otherwise use the version that exists in the same directory as + # jeprof. If there's nothing there, use $PATH. + $0 =~ m,[^/]*$,; # this is everything after the last slash + my $dirname = $`; # this is everything up to and including the last slash + if (-x "$dirname$tool") { + $path = "$dirname$tool"; + } else { + $path = $tool; + } + } + if ($main::opt_debug) { print STDERR "Using '$path' for '$tool'.\n"; } + return $path; +} + +sub ShellEscape { + my @escaped_words = (); + foreach my $word (@_) { + my $escaped_word = $word; + if ($word =~ m![^a-zA-Z0-9/.,_=-]!) { # check for anything not in whitelist + $escaped_word =~ s/'/'\\''/; + $escaped_word = "'$escaped_word'"; + } + push(@escaped_words, $escaped_word); + } + return join(" ", @escaped_words); +} + +sub cleanup { + unlink($main::tmpfile_sym); + unlink(keys %main::tempnames); + + # We leave any collected profiles in $HOME/jeprof in case the user wants + # to look at them later. We print a message informing them of this. + if ((scalar(@main::profile_files) > 0) && + defined($main::collected_profile)) { + if (scalar(@main::profile_files) == 1) { + print STDERR "Dynamically gathered profile is in $main::collected_profile\n"; + } + print STDERR "If you want to investigate this profile further, you can do:\n"; + print STDERR "\n"; + print STDERR " jeprof \\\n"; + print STDERR " $main::prog \\\n"; + print STDERR " $main::collected_profile\n"; + print STDERR "\n"; + } +} + +sub sighandler { + cleanup(); + exit(1); +} + +sub error { + my $msg = shift; + print STDERR $msg; + cleanup(); + exit(1); +} + + +# Run $nm_command and get all the resulting procedure boundaries whose +# names match "$regexp" and returns them in a hashtable mapping from +# procedure name to a two-element vector of [start address, end address] +sub GetProcedureBoundariesViaNm { + my $escaped_nm_command = shift; # shell-escaped + my $regexp = shift; + + my $symbol_table = {}; + open(NM, "$escaped_nm_command |") || error("$escaped_nm_command: $!\n"); + my $last_start = "0"; + my $routine = ""; + while () { + s/\r//g; # turn windows-looking lines into unix-looking lines + if (m/^\s*([0-9a-f]+) (.) (..*)/) { + my $start_val = $1; + my $type = $2; + my $this_routine = $3; + + # It's possible for two symbols to share the same address, if + # one is a zero-length variable (like __start_google_malloc) or + # one symbol is a weak alias to another (like __libc_malloc). + # In such cases, we want to ignore all values except for the + # actual symbol, which in nm-speak has type "T". The logic + # below does this, though it's a bit tricky: what happens when + # we have a series of lines with the same address, is the first + # one gets queued up to be processed. However, it won't + # *actually* be processed until later, when we read a line with + # a different address. That means that as long as we're reading + # lines with the same address, we have a chance to replace that + # item in the queue, which we do whenever we see a 'T' entry -- + # that is, a line with type 'T'. If we never see a 'T' entry, + # we'll just go ahead and process the first entry (which never + # got touched in the queue), and ignore the others. + if ($start_val eq $last_start && $type =~ /t/i) { + # We are the 'T' symbol at this address, replace previous symbol. + $routine = $this_routine; + next; + } elsif ($start_val eq $last_start) { + # We're not the 'T' symbol at this address, so ignore us. + next; + } + + if ($this_routine eq $sep_symbol) { + $sep_address = HexExtend($start_val); + } + + # Tag this routine with the starting address in case the image + # has multiple occurrences of this routine. We use a syntax + # that resembles template parameters that are automatically + # stripped out by ShortFunctionName() + $this_routine .= "<$start_val>"; + + if (defined($routine) && $routine =~ m/$regexp/) { + $symbol_table->{$routine} = [HexExtend($last_start), + HexExtend($start_val)]; + } + $last_start = $start_val; + $routine = $this_routine; + } elsif (m/^Loaded image name: (.+)/) { + # The win32 nm workalike emits information about the binary it is using. + if ($main::opt_debug) { print STDERR "Using Image $1\n"; } + } elsif (m/^PDB file name: (.+)/) { + # The win32 nm workalike emits information about the pdb it is using. + if ($main::opt_debug) { print STDERR "Using PDB $1\n"; } + } + } + close(NM); + # Handle the last line in the nm output. Unfortunately, we don't know + # how big this last symbol is, because we don't know how big the file + # is. For now, we just give it a size of 0. + # TODO(csilvers): do better here. + if (defined($routine) && $routine =~ m/$regexp/) { + $symbol_table->{$routine} = [HexExtend($last_start), + HexExtend($last_start)]; + } + return $symbol_table; +} + +# Gets the procedure boundaries for all routines in "$image" whose names +# match "$regexp" and returns them in a hashtable mapping from procedure +# name to a two-element vector of [start address, end address]. +# Will return an empty map if nm is not installed or not working properly. +sub GetProcedureBoundaries { + my $image = shift; + my $regexp = shift; + + # If $image doesn't start with /, then put ./ in front of it. This works + # around an obnoxious bug in our probing of nm -f behavior. + # "nm -f $image" is supposed to fail on GNU nm, but if: + # + # a. $image starts with [BbSsPp] (for example, bin/foo/bar), AND + # b. you have a.out in your current directory (a not uncommon occurrence) + # + # then "nm -f $image" succeeds because -f only looks at the first letter of + # the argument, which looks valid because it's [BbSsPp], and then since + # there's no image provided, it looks for a.out and finds it. + # + # This regex makes sure that $image starts with . or /, forcing the -f + # parsing to fail since . and / are not valid formats. + $image =~ s#^[^/]#./$&#; + + # For libc libraries, the copy in /usr/lib/debug contains debugging symbols + my $debugging = DebuggingLibrary($image); + if ($debugging) { + $image = $debugging; + } + + my $nm = $obj_tool_map{"nm"}; + my $cppfilt = $obj_tool_map{"c++filt"}; + + # nm can fail for two reasons: 1) $image isn't a debug library; 2) nm + # binary doesn't support --demangle. In addition, for OS X we need + # to use the -f flag to get 'flat' nm output (otherwise we don't sort + # properly and get incorrect results). Unfortunately, GNU nm uses -f + # in an incompatible way. So first we test whether our nm supports + # --demangle and -f. + my $demangle_flag = ""; + my $cppfilt_flag = ""; + my $to_devnull = ">$dev_null 2>&1"; + if (system(ShellEscape($nm, "--demangle", $image) . $to_devnull) == 0) { + # In this mode, we do "nm --demangle " + $demangle_flag = "--demangle"; + $cppfilt_flag = ""; + } elsif (system(ShellEscape($cppfilt, $image) . $to_devnull) == 0) { + # In this mode, we do "nm | c++filt" + $cppfilt_flag = " | " . ShellEscape($cppfilt); + }; + my $flatten_flag = ""; + if (system(ShellEscape($nm, "-f", $image) . $to_devnull) == 0) { + $flatten_flag = "-f"; + } + + # Finally, in the case $imagie isn't a debug library, we try again with + # -D to at least get *exported* symbols. If we can't use --demangle, + # we use c++filt instead, if it exists on this system. + my @nm_commands = (ShellEscape($nm, "-n", $flatten_flag, $demangle_flag, + $image) . " 2>$dev_null $cppfilt_flag", + ShellEscape($nm, "-D", "-n", $flatten_flag, $demangle_flag, + $image) . " 2>$dev_null $cppfilt_flag", + # 6nm is for Go binaries + ShellEscape("6nm", "$image") . " 2>$dev_null | sort", + ); + + # If the executable is an MS Windows PDB-format executable, we'll + # have set up obj_tool_map("nm_pdb"). In this case, we actually + # want to use both unix nm and windows-specific nm_pdb, since + # PDB-format executables can apparently include dwarf .o files. + if (exists $obj_tool_map{"nm_pdb"}) { + push(@nm_commands, + ShellEscape($obj_tool_map{"nm_pdb"}, "--demangle", $image) + . " 2>$dev_null"); + } + + foreach my $nm_command (@nm_commands) { + my $symbol_table = GetProcedureBoundariesViaNm($nm_command, $regexp); + return $symbol_table if (%{$symbol_table}); + } + my $symbol_table = {}; + return $symbol_table; +} + + +# The test vectors for AddressAdd/Sub/Inc are 8-16-nibble hex strings. +# To make them more readable, we add underscores at interesting places. +# This routine removes the underscores, producing the canonical representation +# used by jeprof to represent addresses, particularly in the tested routines. +sub CanonicalHex { + my $arg = shift; + return join '', (split '_',$arg); +} + + +# Unit test for AddressAdd: +sub AddressAddUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressAddUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressAdd ($row->[0], $row->[1]); + if ($sum ne $row->[2]) { + printf STDERR "ERROR: %s != %s + %s = %s\n", $sum, + $row->[0], $row->[1], $row->[2]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressAdd 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressAdd (CanonicalHex($row->[0]), CanonicalHex($row->[1])); + my $expected = join '', (split '_',$row->[2]); + if ($sum ne CanonicalHex($row->[2])) { + printf STDERR "ERROR: %s != %s + %s = %s\n", $sum, + $row->[0], $row->[1], $row->[2]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressAdd 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Unit test for AddressSub: +sub AddressSubUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressSubUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressSub ($row->[0], $row->[1]); + if ($sum ne $row->[3]) { + printf STDERR "ERROR: %s != %s - %s = %s\n", $sum, + $row->[0], $row->[1], $row->[3]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressSub 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressSub (CanonicalHex($row->[0]), CanonicalHex($row->[1])); + if ($sum ne CanonicalHex($row->[3])) { + printf STDERR "ERROR: %s != %s - %s = %s\n", $sum, + $row->[0], $row->[1], $row->[3]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressSub 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Unit test for AddressInc: +sub AddressIncUnitTest { + my $test_data_8 = shift; + my $test_data_16 = shift; + my $error_count = 0; + my $fail_count = 0; + my $pass_count = 0; + # print STDERR "AddressIncUnitTest: ", 1+$#{$test_data_8}, " tests\n"; + + # First a few 8-nibble addresses. Note that this implementation uses + # plain old arithmetic, so a quick sanity check along with verifying what + # happens to overflow (we want it to wrap): + $address_length = 8; + foreach my $row (@{$test_data_8}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressInc ($row->[0]); + if ($sum ne $row->[4]) { + printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum, + $row->[0], $row->[4]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressInc 32-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count = $fail_count; + $fail_count = 0; + $pass_count = 0; + + # Now 16-nibble addresses. + $address_length = 16; + foreach my $row (@{$test_data_16}) { + if ($main::opt_debug and $main::opt_test) { print STDERR "@{$row}\n"; } + my $sum = AddressInc (CanonicalHex($row->[0])); + if ($sum ne CanonicalHex($row->[4])) { + printf STDERR "ERROR: %s != %s + 1 = %s\n", $sum, + $row->[0], $row->[4]; + ++$fail_count; + } else { + ++$pass_count; + } + } + printf STDERR "AddressInc 64-bit tests: %d passes, %d failures\n", + $pass_count, $fail_count; + $error_count += $fail_count; + + return $error_count; +} + + +# Driver for unit tests. +# Currently just the address add/subtract/increment routines for 64-bit. +sub RunUnitTests { + my $error_count = 0; + + # This is a list of tuples [a, b, a+b, a-b, a+1] + my $unit_test_data_8 = [ + [qw(aaaaaaaa 50505050 fafafafa 5a5a5a5a aaaaaaab)], + [qw(50505050 aaaaaaaa fafafafa a5a5a5a6 50505051)], + [qw(ffffffff aaaaaaaa aaaaaaa9 55555555 00000000)], + [qw(00000001 ffffffff 00000000 00000002 00000002)], + [qw(00000001 fffffff0 fffffff1 00000011 00000002)], + ]; + my $unit_test_data_16 = [ + # The implementation handles data in 7-nibble chunks, so those are the + # interesting boundaries. + [qw(aaaaaaaa 50505050 + 00_000000f_afafafa 00_0000005_a5a5a5a 00_000000a_aaaaaab)], + [qw(50505050 aaaaaaaa + 00_000000f_afafafa ff_ffffffa_5a5a5a6 00_0000005_0505051)], + [qw(ffffffff aaaaaaaa + 00_000001a_aaaaaa9 00_0000005_5555555 00_0000010_0000000)], + [qw(00000001 ffffffff + 00_0000010_0000000 ff_ffffff0_0000002 00_0000000_0000002)], + [qw(00000001 fffffff0 + 00_000000f_ffffff1 ff_ffffff0_0000011 00_0000000_0000002)], + + [qw(00_a00000a_aaaaaaa 50505050 + 00_a00000f_afafafa 00_a000005_a5a5a5a 00_a00000a_aaaaaab)], + [qw(0f_fff0005_0505050 aaaaaaaa + 0f_fff000f_afafafa 0f_ffefffa_5a5a5a6 0f_fff0005_0505051)], + [qw(00_000000f_fffffff 01_800000a_aaaaaaa + 01_800001a_aaaaaa9 fe_8000005_5555555 00_0000010_0000000)], + [qw(00_0000000_0000001 ff_fffffff_fffffff + 00_0000000_0000000 00_0000000_0000002 00_0000000_0000002)], + [qw(00_0000000_0000001 ff_fffffff_ffffff0 + ff_fffffff_ffffff1 00_0000000_0000011 00_0000000_0000002)], + ]; + + $error_count += AddressAddUnitTest($unit_test_data_8, $unit_test_data_16); + $error_count += AddressSubUnitTest($unit_test_data_8, $unit_test_data_16); + $error_count += AddressIncUnitTest($unit_test_data_8, $unit_test_data_16); + if ($error_count > 0) { + print STDERR $error_count, " errors: FAILED\n"; + } else { + print STDERR "PASS\n"; + } + exit ($error_count); +} diff --git a/BeefRT/JEMalloc/build-aux/install-sh b/BeefRT/JEMalloc/build-aux/install-sh new file mode 100644 index 00000000..ebc66913 --- /dev/null +++ b/BeefRT/JEMalloc/build-aux/install-sh @@ -0,0 +1,250 @@ +#! /bin/sh +# +# install - install a program, script, or datafile +# This comes from X11R5 (mit/util/scripts/install.sh). +# +# Copyright 1991 by the Massachusetts Institute of Technology +# +# Permission to use, copy, modify, distribute, and sell this software and its +# documentation for any purpose is hereby granted without fee, provided that +# the above copyright notice appear in all copies and that both that +# copyright notice and this permission notice appear in supporting +# documentation, and that the name of M.I.T. not be used in advertising or +# publicity pertaining to distribution of the software without specific, +# written prior permission. M.I.T. makes no representations about the +# suitability of this software for any purpose. It is provided "as is" +# without express or implied warranty. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. It can only install one file at a time, a restriction +# shared with many OS's install programs. + + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit="${DOITPROG-}" + + +# put in absolute paths if you don't have them in your path; or use env. vars. + +mvprog="${MVPROG-mv}" +cpprog="${CPPROG-cp}" +chmodprog="${CHMODPROG-chmod}" +chownprog="${CHOWNPROG-chown}" +chgrpprog="${CHGRPPROG-chgrp}" +stripprog="${STRIPPROG-strip}" +rmprog="${RMPROG-rm}" +mkdirprog="${MKDIRPROG-mkdir}" + +transformbasename="" +transform_arg="" +instcmd="$mvprog" +chmodcmd="$chmodprog 0755" +chowncmd="" +chgrpcmd="" +stripcmd="" +rmcmd="$rmprog -f" +mvcmd="$mvprog" +src="" +dst="" +dir_arg="" + +while [ x"$1" != x ]; do + case $1 in + -c) instcmd="$cpprog" + shift + continue;; + + -d) dir_arg=true + shift + continue;; + + -m) chmodcmd="$chmodprog $2" + shift + shift + continue;; + + -o) chowncmd="$chownprog $2" + shift + shift + continue;; + + -g) chgrpcmd="$chgrpprog $2" + shift + shift + continue;; + + -s) stripcmd="$stripprog" + shift + continue;; + + -t=*) transformarg=`echo $1 | sed 's/-t=//'` + shift + continue;; + + -b=*) transformbasename=`echo $1 | sed 's/-b=//'` + shift + continue;; + + *) if [ x"$src" = x ] + then + src=$1 + else + # this colon is to work around a 386BSD /bin/sh bug + : + dst=$1 + fi + shift + continue;; + esac +done + +if [ x"$src" = x ] +then + echo "install: no input file specified" + exit 1 +else + true +fi + +if [ x"$dir_arg" != x ]; then + dst=$src + src="" + + if [ -d $dst ]; then + instcmd=: + else + instcmd=mkdir + fi +else + +# Waiting for this to be detected by the "$instcmd $src $dsttmp" command +# might cause directories to be created, which would be especially bad +# if $src (and thus $dsttmp) contains '*'. + + if [ -f $src -o -d $src ] + then + true + else + echo "install: $src does not exist" + exit 1 + fi + + if [ x"$dst" = x ] + then + echo "install: no destination specified" + exit 1 + else + true + fi + +# If destination is a directory, append the input filename; if your system +# does not like double slashes in filenames, you may need to add some logic + + if [ -d $dst ] + then + dst="$dst"/`basename $src` + else + true + fi +fi + +## this sed command emulates the dirname command +dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` + +# Make sure that the destination directory exists. +# this part is taken from Noah Friedman's mkinstalldirs script + +# Skip lots of stat calls in the usual case. +if [ ! -d "$dstdir" ]; then +defaultIFS=' +' +IFS="${IFS-${defaultIFS}}" + +oIFS="${IFS}" +# Some sh's can't handle IFS=/ for some reason. +IFS='%' +set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'` +IFS="${oIFS}" + +pathcomp='' + +while [ $# -ne 0 ] ; do + pathcomp="${pathcomp}${1}" + shift + + if [ ! -d "${pathcomp}" ] ; + then + $mkdirprog "${pathcomp}" + else + true + fi + + pathcomp="${pathcomp}/" +done +fi + +if [ x"$dir_arg" != x ] +then + $doit $instcmd $dst && + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi +else + +# If we're going to rename the final executable, determine the name now. + + if [ x"$transformarg" = x ] + then + dstfile=`basename $dst` + else + dstfile=`basename $dst $transformbasename | + sed $transformarg`$transformbasename + fi + +# don't allow the sed command to completely eliminate the filename + + if [ x"$dstfile" = x ] + then + dstfile=`basename $dst` + else + true + fi + +# Make a temp file name in the proper directory. + + dsttmp=$dstdir/#inst.$$# + +# Move or copy the file name to the temp name + + $doit $instcmd $src $dsttmp && + + trap "rm -f ${dsttmp}" 0 && + +# and set any options; do chmod last to preserve setuid bits + +# If any of these fail, we abort the whole thing. If we want to +# ignore errors from any of these, just make sure not to ignore +# errors from the above "$doit $instcmd $src $dsttmp" command. + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi && + +# Now rename the file to the real destination. + + $doit $rmcmd -f $dstdir/$dstfile && + $doit $mvcmd $dsttmp $dstdir/$dstfile + +fi && + + +exit 0 diff --git a/BeefRT/JEMalloc/configure.ac b/BeefRT/JEMalloc/configure.ac new file mode 100644 index 00000000..f6d25f33 --- /dev/null +++ b/BeefRT/JEMalloc/configure.ac @@ -0,0 +1,2669 @@ +dnl Process this file with autoconf to produce a configure script. +AC_PREREQ(2.68) +AC_INIT([Makefile.in]) + +AC_CONFIG_AUX_DIR([build-aux]) + +dnl ============================================================================ +dnl Custom macro definitions. + +dnl JE_CONCAT_VVV(r, a, b) +dnl +dnl Set $r to the concatenation of $a and $b, with a space separating them iff +dnl both $a and $b are non-empty. +AC_DEFUN([JE_CONCAT_VVV], +if test "x[$]{$2}" = "x" -o "x[$]{$3}" = "x" ; then + $1="[$]{$2}[$]{$3}" +else + $1="[$]{$2} [$]{$3}" +fi +) + +dnl JE_APPEND_VS(a, b) +dnl +dnl Set $a to the concatenation of $a and b, with a space separating them iff +dnl both $a and b are non-empty. +AC_DEFUN([JE_APPEND_VS], + T_APPEND_V=$2 + JE_CONCAT_VVV($1, $1, T_APPEND_V) +) + +CONFIGURE_CFLAGS= +SPECIFIED_CFLAGS="${CFLAGS}" +dnl JE_CFLAGS_ADD(cflag) +dnl +dnl CFLAGS is the concatenation of CONFIGURE_CFLAGS and SPECIFIED_CFLAGS +dnl (ignoring EXTRA_CFLAGS, which does not impact configure tests. This macro +dnl appends to CONFIGURE_CFLAGS and regenerates CFLAGS. +AC_DEFUN([JE_CFLAGS_ADD], +[ +AC_MSG_CHECKING([whether compiler supports $1]) +T_CONFIGURE_CFLAGS="${CONFIGURE_CFLAGS}" +JE_APPEND_VS(CONFIGURE_CFLAGS, $1) +JE_CONCAT_VVV(CFLAGS, CONFIGURE_CFLAGS, SPECIFIED_CFLAGS) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM( +[[ +]], [[ + return 0; +]])], + [je_cv_cflags_added=$1] + AC_MSG_RESULT([yes]), + [je_cv_cflags_added=] + AC_MSG_RESULT([no]) + [CONFIGURE_CFLAGS="${T_CONFIGURE_CFLAGS}"] +) +JE_CONCAT_VVV(CFLAGS, CONFIGURE_CFLAGS, SPECIFIED_CFLAGS) +]) + +dnl JE_CFLAGS_SAVE() +dnl JE_CFLAGS_RESTORE() +dnl +dnl Save/restore CFLAGS. Nesting is not supported. +AC_DEFUN([JE_CFLAGS_SAVE], +SAVED_CONFIGURE_CFLAGS="${CONFIGURE_CFLAGS}" +) +AC_DEFUN([JE_CFLAGS_RESTORE], +CONFIGURE_CFLAGS="${SAVED_CONFIGURE_CFLAGS}" +JE_CONCAT_VVV(CFLAGS, CONFIGURE_CFLAGS, SPECIFIED_CFLAGS) +) + +CONFIGURE_CXXFLAGS= +SPECIFIED_CXXFLAGS="${CXXFLAGS}" +dnl JE_CXXFLAGS_ADD(cxxflag) +AC_DEFUN([JE_CXXFLAGS_ADD], +[ +AC_MSG_CHECKING([whether compiler supports $1]) +T_CONFIGURE_CXXFLAGS="${CONFIGURE_CXXFLAGS}" +JE_APPEND_VS(CONFIGURE_CXXFLAGS, $1) +JE_CONCAT_VVV(CXXFLAGS, CONFIGURE_CXXFLAGS, SPECIFIED_CXXFLAGS) +AC_LANG_PUSH([C++]) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM( +[[ +]], [[ + return 0; +]])], + [je_cv_cxxflags_added=$1] + AC_MSG_RESULT([yes]), + [je_cv_cxxflags_added=] + AC_MSG_RESULT([no]) + [CONFIGURE_CXXFLAGS="${T_CONFIGURE_CXXFLAGS}"] +) +AC_LANG_POP([C++]) +JE_CONCAT_VVV(CXXFLAGS, CONFIGURE_CXXFLAGS, SPECIFIED_CXXFLAGS) +]) + +dnl JE_COMPILABLE(label, hcode, mcode, rvar) +dnl +dnl Use AC_LINK_IFELSE() rather than AC_COMPILE_IFELSE() so that linker errors +dnl cause failure. +AC_DEFUN([JE_COMPILABLE], +[ +AC_CACHE_CHECK([whether $1 is compilable], + [$4], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([$2], + [$3])], + [$4=yes], + [$4=no])]) +]) + +dnl ============================================================================ + +CONFIG=`echo ${ac_configure_args} | sed -e 's#'"'"'\([^ ]*\)'"'"'#\1#g'` +AC_SUBST([CONFIG]) + +dnl Library revision. +rev=2 +AC_SUBST([rev]) + +srcroot=$srcdir +if test "x${srcroot}" = "x." ; then + srcroot="" +else + srcroot="${srcroot}/" +fi +AC_SUBST([srcroot]) +abs_srcroot="`cd \"${srcdir}\"; pwd`/" +AC_SUBST([abs_srcroot]) + +objroot="" +AC_SUBST([objroot]) +abs_objroot="`pwd`/" +AC_SUBST([abs_objroot]) + +dnl Munge install path variables. +case "$prefix" in + *\ * ) AC_MSG_ERROR([Prefix should not contain spaces]) ;; + "NONE" ) prefix="/usr/local" ;; +esac +case "$exec_prefix" in + *\ * ) AC_MSG_ERROR([Exec prefix should not contain spaces]) ;; + "NONE" ) exec_prefix=$prefix ;; +esac +PREFIX=$prefix +AC_SUBST([PREFIX]) +BINDIR=`eval echo $bindir` +BINDIR=`eval echo $BINDIR` +AC_SUBST([BINDIR]) +INCLUDEDIR=`eval echo $includedir` +INCLUDEDIR=`eval echo $INCLUDEDIR` +AC_SUBST([INCLUDEDIR]) +LIBDIR=`eval echo $libdir` +LIBDIR=`eval echo $LIBDIR` +AC_SUBST([LIBDIR]) +DATADIR=`eval echo $datadir` +DATADIR=`eval echo $DATADIR` +AC_SUBST([DATADIR]) +MANDIR=`eval echo $mandir` +MANDIR=`eval echo $MANDIR` +AC_SUBST([MANDIR]) + +dnl Support for building documentation. +AC_PATH_PROG([XSLTPROC], [xsltproc], [false], [$PATH]) +if test -d "/usr/share/xml/docbook/stylesheet/docbook-xsl" ; then + DEFAULT_XSLROOT="/usr/share/xml/docbook/stylesheet/docbook-xsl" +elif test -d "/usr/share/sgml/docbook/xsl-stylesheets" ; then + DEFAULT_XSLROOT="/usr/share/sgml/docbook/xsl-stylesheets" +else + dnl Documentation building will fail if this default gets used. + DEFAULT_XSLROOT="" +fi +AC_ARG_WITH([xslroot], + [AS_HELP_STRING([--with-xslroot=], [XSL stylesheet root path])], [ +if test "x$with_xslroot" = "xno" ; then + XSLROOT="${DEFAULT_XSLROOT}" +else + XSLROOT="${with_xslroot}" +fi +], + XSLROOT="${DEFAULT_XSLROOT}" +) +if test "x$XSLTPROC" = "xfalse" ; then + XSLROOT="" +fi +AC_SUBST([XSLROOT]) + +dnl If CFLAGS isn't defined, set CFLAGS to something reasonable. Otherwise, +dnl just prevent autoconf from molesting CFLAGS. +CFLAGS=$CFLAGS +AC_PROG_CC + +if test "x$GCC" != "xyes" ; then + AC_CACHE_CHECK([whether compiler is MSVC], + [je_cv_msvc], + [AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], + [ +#ifndef _MSC_VER + int fail[-1]; +#endif +])], + [je_cv_msvc=yes], + [je_cv_msvc=no])]) +fi + +dnl check if a cray prgenv wrapper compiler is being used +je_cv_cray_prgenv_wrapper="" +if test "x${PE_ENV}" != "x" ; then + case "${CC}" in + CC|cc) + je_cv_cray_prgenv_wrapper="yes" + ;; + *) + ;; + esac +fi + +AC_CACHE_CHECK([whether compiler is cray], + [je_cv_cray], + [AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], + [ +#ifndef _CRAYC + int fail[-1]; +#endif +])], + [je_cv_cray=yes], + [je_cv_cray=no])]) + +if test "x${je_cv_cray}" = "xyes" ; then + AC_CACHE_CHECK([whether cray compiler version is 8.4], + [je_cv_cray_84], + [AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], + [ +#if !(_RELEASE_MAJOR == 8 && _RELEASE_MINOR == 4) + int fail[-1]; +#endif +])], + [je_cv_cray_84=yes], + [je_cv_cray_84=no])]) +fi + +if test "x$GCC" = "xyes" ; then + JE_CFLAGS_ADD([-std=gnu11]) + if test "x$je_cv_cflags_added" = "x-std=gnu11" ; then + AC_DEFINE_UNQUOTED([JEMALLOC_HAS_RESTRICT], [ ], [ ]) + else + JE_CFLAGS_ADD([-std=gnu99]) + if test "x$je_cv_cflags_added" = "x-std=gnu99" ; then + AC_DEFINE_UNQUOTED([JEMALLOC_HAS_RESTRICT], [ ], [ ]) + fi + fi + JE_CFLAGS_ADD([-Werror=unknown-warning-option]) + JE_CFLAGS_ADD([-Wall]) + JE_CFLAGS_ADD([-Wextra]) + JE_CFLAGS_ADD([-Wshorten-64-to-32]) + JE_CFLAGS_ADD([-Wsign-compare]) + JE_CFLAGS_ADD([-Wundef]) + JE_CFLAGS_ADD([-Wno-format-zero-length]) + JE_CFLAGS_ADD([-Wpointer-arith]) + dnl This warning triggers on the use of the universal zero initializer, which + dnl is a very handy idiom for things like the tcache static initializer (which + dnl has lots of nested structs). See the discussion at. + dnl https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53119 + JE_CFLAGS_ADD([-Wno-missing-braces]) + dnl This one too. + JE_CFLAGS_ADD([-Wno-missing-field-initializers]) + JE_CFLAGS_ADD([-Wno-missing-attributes]) + JE_CFLAGS_ADD([-pipe]) + JE_CFLAGS_ADD([-g3]) +elif test "x$je_cv_msvc" = "xyes" ; then + CC="$CC -nologo" + JE_CFLAGS_ADD([-Zi]) + JE_CFLAGS_ADD([-MT]) + JE_CFLAGS_ADD([-W3]) + JE_CFLAGS_ADD([-FS]) + JE_APPEND_VS(CPPFLAGS, -I${srcdir}/include/msvc_compat) +fi +if test "x$je_cv_cray" = "xyes" ; then + dnl cray compiler 8.4 has an inlining bug + if test "x$je_cv_cray_84" = "xyes" ; then + JE_CFLAGS_ADD([-hipa2]) + JE_CFLAGS_ADD([-hnognu]) + fi + dnl ignore unreachable code warning + JE_CFLAGS_ADD([-hnomessage=128]) + dnl ignore redefinition of "malloc", "free", etc warning + JE_CFLAGS_ADD([-hnomessage=1357]) +fi +AC_SUBST([CONFIGURE_CFLAGS]) +AC_SUBST([SPECIFIED_CFLAGS]) +AC_SUBST([EXTRA_CFLAGS]) +AC_PROG_CPP + +AC_ARG_ENABLE([cxx], + [AS_HELP_STRING([--disable-cxx], [Disable C++ integration])], +if test "x$enable_cxx" = "xno" ; then + enable_cxx="0" +else + enable_cxx="1" +fi +, +enable_cxx="1" +) +if test "x$enable_cxx" = "x1" ; then + dnl Require at least c++14, which is the first version to support sized + dnl deallocation. C++ support is not compiled otherwise. + m4_include([m4/ax_cxx_compile_stdcxx.m4]) + AX_CXX_COMPILE_STDCXX([17], [noext], [optional]) + if test "x${HAVE_CXX17}" != "x1"; then + AX_CXX_COMPILE_STDCXX([14], [noext], [optional]) + fi + if test "x${HAVE_CXX14}" = "x1" -o "x${HAVE_CXX17}" = "x1"; then + JE_CXXFLAGS_ADD([-Wall]) + JE_CXXFLAGS_ADD([-Wextra]) + JE_CXXFLAGS_ADD([-g3]) + + SAVED_LIBS="${LIBS}" + JE_APPEND_VS(LIBS, -lstdc++) + JE_COMPILABLE([libstdc++ linkage], [ +#include +], [[ + int *arr = (int *)malloc(sizeof(int) * 42); + if (arr == NULL) + return 1; +]], [je_cv_libstdcxx]) + if test "x${je_cv_libstdcxx}" = "xno" ; then + LIBS="${SAVED_LIBS}" + fi + else + enable_cxx="0" + fi +fi +if test "x$enable_cxx" = "x1"; then + AC_DEFINE([JEMALLOC_ENABLE_CXX], [ ], [ ]) +fi +AC_SUBST([enable_cxx]) +AC_SUBST([CONFIGURE_CXXFLAGS]) +AC_SUBST([SPECIFIED_CXXFLAGS]) +AC_SUBST([EXTRA_CXXFLAGS]) + +AC_C_BIGENDIAN([ac_cv_big_endian=1], [ac_cv_big_endian=0]) +if test "x${ac_cv_big_endian}" = "x1" ; then + AC_DEFINE_UNQUOTED([JEMALLOC_BIG_ENDIAN], [ ], [ ]) +fi + +if test "x${je_cv_msvc}" = "xyes" -a "x${ac_cv_header_inttypes_h}" = "xno"; then + JE_APPEND_VS(CPPFLAGS, -I${srcdir}/include/msvc_compat/C99) +fi + +if test "x${je_cv_msvc}" = "xyes" ; then + LG_SIZEOF_PTR=LG_SIZEOF_PTR_WIN + AC_MSG_RESULT([Using a predefined value for sizeof(void *): 4 for 32-bit, 8 for 64-bit]) +else + AC_CHECK_SIZEOF([void *]) + if test "x${ac_cv_sizeof_void_p}" = "x8" ; then + LG_SIZEOF_PTR=3 + elif test "x${ac_cv_sizeof_void_p}" = "x4" ; then + LG_SIZEOF_PTR=2 + else + AC_MSG_ERROR([Unsupported pointer size: ${ac_cv_sizeof_void_p}]) + fi +fi +AC_DEFINE_UNQUOTED([LG_SIZEOF_PTR], [$LG_SIZEOF_PTR], [ ]) + +AC_CHECK_SIZEOF([int]) +if test "x${ac_cv_sizeof_int}" = "x8" ; then + LG_SIZEOF_INT=3 +elif test "x${ac_cv_sizeof_int}" = "x4" ; then + LG_SIZEOF_INT=2 +else + AC_MSG_ERROR([Unsupported int size: ${ac_cv_sizeof_int}]) +fi +AC_DEFINE_UNQUOTED([LG_SIZEOF_INT], [$LG_SIZEOF_INT], [ ]) + +AC_CHECK_SIZEOF([long]) +if test "x${ac_cv_sizeof_long}" = "x8" ; then + LG_SIZEOF_LONG=3 +elif test "x${ac_cv_sizeof_long}" = "x4" ; then + LG_SIZEOF_LONG=2 +else + AC_MSG_ERROR([Unsupported long size: ${ac_cv_sizeof_long}]) +fi +AC_DEFINE_UNQUOTED([LG_SIZEOF_LONG], [$LG_SIZEOF_LONG], [ ]) + +AC_CHECK_SIZEOF([long long]) +if test "x${ac_cv_sizeof_long_long}" = "x8" ; then + LG_SIZEOF_LONG_LONG=3 +elif test "x${ac_cv_sizeof_long_long}" = "x4" ; then + LG_SIZEOF_LONG_LONG=2 +else + AC_MSG_ERROR([Unsupported long long size: ${ac_cv_sizeof_long_long}]) +fi +AC_DEFINE_UNQUOTED([LG_SIZEOF_LONG_LONG], [$LG_SIZEOF_LONG_LONG], [ ]) + +AC_CHECK_SIZEOF([intmax_t]) +if test "x${ac_cv_sizeof_intmax_t}" = "x16" ; then + LG_SIZEOF_INTMAX_T=4 +elif test "x${ac_cv_sizeof_intmax_t}" = "x8" ; then + LG_SIZEOF_INTMAX_T=3 +elif test "x${ac_cv_sizeof_intmax_t}" = "x4" ; then + LG_SIZEOF_INTMAX_T=2 +else + AC_MSG_ERROR([Unsupported intmax_t size: ${ac_cv_sizeof_intmax_t}]) +fi +AC_DEFINE_UNQUOTED([LG_SIZEOF_INTMAX_T], [$LG_SIZEOF_INTMAX_T], [ ]) + +AC_CANONICAL_HOST +dnl CPU-specific settings. +CPU_SPINWAIT="" +case "${host_cpu}" in + i686|x86_64) + HAVE_CPU_SPINWAIT=1 + if test "x${je_cv_msvc}" = "xyes" ; then + AC_CACHE_VAL([je_cv_pause_msvc], + [JE_COMPILABLE([pause instruction MSVC], [], + [[_mm_pause(); return 0;]], + [je_cv_pause_msvc])]) + if test "x${je_cv_pause_msvc}" = "xyes" ; then + CPU_SPINWAIT='_mm_pause()' + fi + else + AC_CACHE_VAL([je_cv_pause], + [JE_COMPILABLE([pause instruction], [], + [[__asm__ volatile("pause"); return 0;]], + [je_cv_pause])]) + if test "x${je_cv_pause}" = "xyes" ; then + CPU_SPINWAIT='__asm__ volatile("pause")' + fi + fi + ;; + aarch64|arm*) + HAVE_CPU_SPINWAIT=1 + dnl isb is a better equivalent to the pause instruction on x86. + AC_CACHE_VAL([je_cv_isb], + [JE_COMPILABLE([isb instruction], [], + [[__asm__ volatile("isb"); return 0;]], + [je_cv_isb])]) + if test "x${je_cv_isb}" = "xyes" ; then + CPU_SPINWAIT='__asm__ volatile("isb")' + fi + ;; + *) + HAVE_CPU_SPINWAIT=0 + ;; +esac +AC_DEFINE_UNQUOTED([HAVE_CPU_SPINWAIT], [$HAVE_CPU_SPINWAIT], [ ]) +AC_DEFINE_UNQUOTED([CPU_SPINWAIT], [$CPU_SPINWAIT], [ ]) + +AC_ARG_WITH([lg_vaddr], + [AS_HELP_STRING([--with-lg-vaddr=], [Number of significant virtual address bits])], + [LG_VADDR="$with_lg_vaddr"], [LG_VADDR="detect"]) + +case "${host_cpu}" in + aarch64) + if test "x$LG_VADDR" = "xdetect"; then + AC_MSG_CHECKING([number of significant virtual address bits]) + if test "x${LG_SIZEOF_PTR}" = "x2" ; then + #aarch64 ILP32 + LG_VADDR=32 + else + #aarch64 LP64 + LG_VADDR=48 + fi + AC_MSG_RESULT([$LG_VADDR]) + fi + ;; + x86_64) + if test "x$LG_VADDR" = "xdetect"; then + AC_CACHE_CHECK([number of significant virtual address bits], + [je_cv_lg_vaddr], + AC_RUN_IFELSE([AC_LANG_PROGRAM( +[[ +#include +#ifdef _WIN32 +#include +#include +typedef unsigned __int32 uint32_t; +#else +#include +#endif +]], [[ + uint32_t r[[4]]; + uint32_t eax_in = 0x80000008U; +#ifdef _WIN32 + __cpuid((int *)r, (int)eax_in); +#else + asm volatile ("cpuid" + : "=a" (r[[0]]), "=b" (r[[1]]), "=c" (r[[2]]), "=d" (r[[3]]) + : "a" (eax_in), "c" (0) + ); +#endif + uint32_t eax_out = r[[0]]; + uint32_t vaddr = ((eax_out & 0x0000ff00U) >> 8); + FILE *f = fopen("conftest.out", "w"); + if (f == NULL) { + return 1; + } + if (vaddr > (sizeof(void *) << 3)) { + vaddr = sizeof(void *) << 3; + } + fprintf(f, "%u", vaddr); + fclose(f); + return 0; +]])], + [je_cv_lg_vaddr=`cat conftest.out`], + [je_cv_lg_vaddr=error], + [je_cv_lg_vaddr=57])) + if test "x${je_cv_lg_vaddr}" != "x" ; then + LG_VADDR="${je_cv_lg_vaddr}" + fi + if test "x${LG_VADDR}" != "xerror" ; then + AC_DEFINE_UNQUOTED([LG_VADDR], [$LG_VADDR], [ ]) + else + AC_MSG_ERROR([cannot determine number of significant virtual address bits]) + fi + fi + ;; + *) + if test "x$LG_VADDR" = "xdetect"; then + AC_MSG_CHECKING([number of significant virtual address bits]) + if test "x${LG_SIZEOF_PTR}" = "x3" ; then + LG_VADDR=64 + elif test "x${LG_SIZEOF_PTR}" = "x2" ; then + LG_VADDR=32 + elif test "x${LG_SIZEOF_PTR}" = "xLG_SIZEOF_PTR_WIN" ; then + LG_VADDR="(1U << (LG_SIZEOF_PTR_WIN+3))" + else + AC_MSG_ERROR([Unsupported lg(pointer size): ${LG_SIZEOF_PTR}]) + fi + AC_MSG_RESULT([$LG_VADDR]) + fi + ;; +esac +AC_DEFINE_UNQUOTED([LG_VADDR], [$LG_VADDR], [ ]) + +LD_PRELOAD_VAR="LD_PRELOAD" +so="so" +importlib="${so}" +o="$ac_objext" +a="a" +exe="$ac_exeext" +libprefix="lib" +link_whole_archive="0" +DSO_LDFLAGS='-shared -Wl,-soname,$(@F)' +RPATH='-Wl,-rpath,$(1)' +SOREV="${so}.${rev}" +PIC_CFLAGS='-fPIC -DPIC' +CTARGET='-o $@' +LDTARGET='-o $@' +TEST_LD_MODE= +EXTRA_LDFLAGS= +ARFLAGS='crus' +AROUT=' $@' +CC_MM=1 + +if test "x$je_cv_cray_prgenv_wrapper" = "xyes" ; then + TEST_LD_MODE='-dynamic' +fi + +if test "x${je_cv_cray}" = "xyes" ; then + CC_MM= +fi + +AN_MAKEVAR([AR], [AC_PROG_AR]) +AN_PROGRAM([ar], [AC_PROG_AR]) +AC_DEFUN([AC_PROG_AR], [AC_CHECK_TOOL(AR, ar, :)]) +AC_PROG_AR + +AN_MAKEVAR([NM], [AC_PROG_NM]) +AN_PROGRAM([nm], [AC_PROG_NM]) +AC_DEFUN([AC_PROG_NM], [AC_CHECK_TOOL(NM, nm, :)]) +AC_PROG_NM + +AC_PROG_AWK + +dnl ============================================================================ +dnl jemalloc version. +dnl + +AC_ARG_WITH([version], + [AS_HELP_STRING([--with-version=..--g], + [Version string])], + [ + echo "${with_version}" | grep ['^[0-9]\+\.[0-9]\+\.[0-9]\+-[0-9]\+-g[0-9a-f]\+$'] 2>&1 1>/dev/null + if test $? -eq 0 ; then + echo "$with_version" > "${objroot}VERSION" + else + echo "${with_version}" | grep ['^VERSION$'] 2>&1 1>/dev/null + if test $? -ne 0 ; then + AC_MSG_ERROR([${with_version} does not match ..--g or VERSION]) + fi + fi + ], [ + dnl Set VERSION if source directory is inside a git repository. + if test "x`test ! \"${srcroot}\" && cd \"${srcroot}\"; git rev-parse --is-inside-work-tree 2>/dev/null`" = "xtrue" ; then + dnl Pattern globs aren't powerful enough to match both single- and + dnl double-digit version numbers, so iterate over patterns to support up + dnl to version 99.99.99 without any accidental matches. + for pattern in ['[0-9].[0-9].[0-9]' '[0-9].[0-9].[0-9][0-9]' \ + '[0-9].[0-9][0-9].[0-9]' '[0-9].[0-9][0-9].[0-9][0-9]' \ + '[0-9][0-9].[0-9].[0-9]' '[0-9][0-9].[0-9].[0-9][0-9]' \ + '[0-9][0-9].[0-9][0-9].[0-9]' \ + '[0-9][0-9].[0-9][0-9].[0-9][0-9]']; do + (test ! "${srcroot}" && cd "${srcroot}"; git describe --long --abbrev=40 --match="${pattern}") > "${objroot}VERSION.tmp" 2>/dev/null + if test $? -eq 0 ; then + mv "${objroot}VERSION.tmp" "${objroot}VERSION" + break + fi + done + fi + rm -f "${objroot}VERSION.tmp" + ]) + +if test ! -e "${objroot}VERSION" ; then + if test ! -e "${srcroot}VERSION" ; then + AC_MSG_RESULT( + [Missing VERSION file, and unable to generate it; creating bogus VERSION]) + echo "0.0.0-0-g000000missing_version_try_git_fetch_tags" > "${objroot}VERSION" + else + cp ${srcroot}VERSION ${objroot}VERSION + fi +fi +jemalloc_version=`cat "${objroot}VERSION"` +jemalloc_version_major=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]1}'` +jemalloc_version_minor=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]2}'` +jemalloc_version_bugfix=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]3}'` +jemalloc_version_nrev=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]4}'` +jemalloc_version_gid=`echo ${jemalloc_version} | tr ".g-" " " | awk '{print [$]5}'` +AC_SUBST([jemalloc_version]) +AC_SUBST([jemalloc_version_major]) +AC_SUBST([jemalloc_version_minor]) +AC_SUBST([jemalloc_version_bugfix]) +AC_SUBST([jemalloc_version_nrev]) +AC_SUBST([jemalloc_version_gid]) + +dnl Platform-specific settings. abi and RPATH can probably be determined +dnl programmatically, but doing so is error-prone, which makes it generally +dnl not worth the trouble. +dnl +dnl Define cpp macros in CPPFLAGS, rather than doing AC_DEFINE(macro), since the +dnl definitions need to be seen before any headers are included, which is a pain +dnl to make happen otherwise. +default_retain="0" +zero_realloc_default_free="0" +maps_coalesce="1" +DUMP_SYMS="${NM} -a" +SYM_PREFIX="" +case "${host}" in + *-*-darwin* | *-*-ios*) + abi="macho" + RPATH="" + LD_PRELOAD_VAR="DYLD_INSERT_LIBRARIES" + so="dylib" + importlib="${so}" + force_tls="0" + DSO_LDFLAGS='-shared -Wl,-install_name,$(LIBDIR)/$(@F)' + SOREV="${rev}.${so}" + sbrk_deprecated="1" + SYM_PREFIX="_" + ;; + *-*-freebsd*) + JE_APPEND_VS(CPPFLAGS, -D_BSD_SOURCE) + abi="elf" + AC_DEFINE([JEMALLOC_SYSCTL_VM_OVERCOMMIT], [ ], [ ]) + force_lazy_lock="1" + ;; + *-*-dragonfly*) + abi="elf" + ;; + *-*-openbsd*) + abi="elf" + force_tls="0" + ;; + *-*-bitrig*) + abi="elf" + ;; + *-*-linux-android*) + dnl syscall(2) and secure_getenv(3) are exposed by _GNU_SOURCE. + JE_APPEND_VS(CPPFLAGS, -D_GNU_SOURCE) + abi="elf" + glibc="0" + AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS], [ ], [ ]) + AC_DEFINE([JEMALLOC_HAS_ALLOCA_H], [ ], [ ]) + AC_DEFINE([JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY], [ ], [ ]) + AC_DEFINE([JEMALLOC_THREADED_INIT], [ ], [ ]) + AC_DEFINE([JEMALLOC_C11_ATOMICS], [ ], [ ]) + force_tls="0" + if test "${LG_SIZEOF_PTR}" = "3"; then + default_retain="1" + fi + zero_realloc_default_free="1" + ;; + *-*-linux*) + dnl syscall(2) and secure_getenv(3) are exposed by _GNU_SOURCE. + JE_APPEND_VS(CPPFLAGS, -D_GNU_SOURCE) + abi="elf" + glibc="1" + AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS], [ ], [ ]) + AC_DEFINE([JEMALLOC_HAS_ALLOCA_H], [ ], [ ]) + AC_DEFINE([JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY], [ ], [ ]) + AC_DEFINE([JEMALLOC_THREADED_INIT], [ ], [ ]) + AC_DEFINE([JEMALLOC_USE_CXX_THROW], [ ], [ ]) + if test "${LG_SIZEOF_PTR}" = "3"; then + default_retain="1" + fi + zero_realloc_default_free="1" + ;; + *-*-kfreebsd*) + dnl syscall(2) and secure_getenv(3) are exposed by _GNU_SOURCE. + JE_APPEND_VS(CPPFLAGS, -D_GNU_SOURCE) + abi="elf" + AC_DEFINE([JEMALLOC_HAS_ALLOCA_H], [ ], [ ]) + AC_DEFINE([JEMALLOC_SYSCTL_VM_OVERCOMMIT], [ ], [ ]) + AC_DEFINE([JEMALLOC_THREADED_INIT], [ ], [ ]) + AC_DEFINE([JEMALLOC_USE_CXX_THROW], [ ], [ ]) + ;; + *-*-netbsd*) + AC_MSG_CHECKING([ABI]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM( +[[#ifdef __ELF__ +/* ELF */ +#else +#error aout +#endif +]])], + [abi="elf"], + [abi="aout"]) + AC_MSG_RESULT([$abi]) + ;; + *-*-solaris2*) + abi="elf" + RPATH='-Wl,-R,$(1)' + dnl Solaris needs this for sigwait(). + JE_APPEND_VS(CPPFLAGS, -D_POSIX_PTHREAD_SEMANTICS) + JE_APPEND_VS(LIBS, -lposix4 -lsocket -lnsl) + ;; + *-ibm-aix*) + if test "${LG_SIZEOF_PTR}" = "3"; then + dnl 64bit AIX + LD_PRELOAD_VAR="LDR_PRELOAD64" + else + dnl 32bit AIX + LD_PRELOAD_VAR="LDR_PRELOAD" + fi + abi="xcoff" + ;; + *-*-mingw* | *-*-cygwin*) + abi="pecoff" + force_tls="0" + maps_coalesce="0" + RPATH="" + so="dll" + if test "x$je_cv_msvc" = "xyes" ; then + importlib="lib" + DSO_LDFLAGS="-LD" + EXTRA_LDFLAGS="-link -DEBUG" + CTARGET='-Fo$@' + LDTARGET='-Fe$@' + AR='lib' + ARFLAGS='-nologo -out:' + AROUT='$@' + CC_MM= + else + importlib="${so}" + DSO_LDFLAGS="-shared" + link_whole_archive="1" + fi + case "${host}" in + *-*-cygwin*) + DUMP_SYMS="dumpbin /SYMBOLS" + ;; + *) + ;; + esac + a="lib" + libprefix="" + SOREV="${so}" + PIC_CFLAGS="" + if test "${LG_SIZEOF_PTR}" = "3"; then + default_retain="1" + fi + zero_realloc_default_free="1" + ;; + *-*-nto-qnx) + abi="elf" + force_tls="0" + AC_DEFINE([JEMALLOC_HAS_ALLOCA_H], [ ], [ ]) + ;; + *) + AC_MSG_RESULT([Unsupported operating system: ${host}]) + abi="elf" + ;; +esac + +JEMALLOC_USABLE_SIZE_CONST=const +AC_CHECK_HEADERS([malloc.h], [ + AC_MSG_CHECKING([whether malloc_usable_size definition can use const argument]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM( + [#include + #include + size_t malloc_usable_size(const void *ptr); + ], + [])],[ + AC_MSG_RESULT([yes]) + ],[ + JEMALLOC_USABLE_SIZE_CONST= + AC_MSG_RESULT([no]) + ]) +]) +AC_DEFINE_UNQUOTED([JEMALLOC_USABLE_SIZE_CONST], [$JEMALLOC_USABLE_SIZE_CONST], [ ]) +AC_SUBST([abi]) +AC_SUBST([RPATH]) +AC_SUBST([LD_PRELOAD_VAR]) +AC_SUBST([so]) +AC_SUBST([importlib]) +AC_SUBST([o]) +AC_SUBST([a]) +AC_SUBST([exe]) +AC_SUBST([libprefix]) +AC_SUBST([link_whole_archive]) +AC_SUBST([DSO_LDFLAGS]) +AC_SUBST([EXTRA_LDFLAGS]) +AC_SUBST([SOREV]) +AC_SUBST([PIC_CFLAGS]) +AC_SUBST([CTARGET]) +AC_SUBST([LDTARGET]) +AC_SUBST([TEST_LD_MODE]) +AC_SUBST([MKLIB]) +AC_SUBST([ARFLAGS]) +AC_SUBST([AROUT]) +AC_SUBST([DUMP_SYMS]) +AC_SUBST([CC_MM]) + +dnl Determine whether libm must be linked to use e.g. log(3). +AC_SEARCH_LIBS([log], [m], , [AC_MSG_ERROR([Missing math functions])]) +if test "x$ac_cv_search_log" != "xnone required" ; then + LM="$ac_cv_search_log" +else + LM= +fi +AC_SUBST(LM) + +JE_COMPILABLE([__attribute__ syntax], + [static __attribute__((unused)) void foo(void){}], + [], + [je_cv_attribute]) +if test "x${je_cv_attribute}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_ATTR], [ ], [ ]) + if test "x${GCC}" = "xyes" -a "x${abi}" = "xelf"; then + JE_CFLAGS_ADD([-fvisibility=hidden]) + JE_CXXFLAGS_ADD([-fvisibility=hidden]) + fi +fi +dnl Check for tls_model attribute support (clang 3.0 still lacks support). +JE_CFLAGS_SAVE() +JE_CFLAGS_ADD([-Werror]) +JE_CFLAGS_ADD([-herror_on_warning]) +JE_COMPILABLE([tls_model attribute], [], + [static __thread int + __attribute__((tls_model("initial-exec"), unused)) foo; + foo = 0;], + [je_cv_tls_model]) +JE_CFLAGS_RESTORE() +dnl (Setting of JEMALLOC_TLS_MODEL is done later, after we've checked for +dnl --disable-initial-exec-tls) + +dnl Check for alloc_size attribute support. +JE_CFLAGS_SAVE() +JE_CFLAGS_ADD([-Werror]) +JE_CFLAGS_ADD([-herror_on_warning]) +JE_COMPILABLE([alloc_size attribute], [#include ], + [void *foo(size_t size) __attribute__((alloc_size(1)));], + [je_cv_alloc_size]) +JE_CFLAGS_RESTORE() +if test "x${je_cv_alloc_size}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_ATTR_ALLOC_SIZE], [ ], [ ]) +fi +dnl Check for format(gnu_printf, ...) attribute support. +JE_CFLAGS_SAVE() +JE_CFLAGS_ADD([-Werror]) +JE_CFLAGS_ADD([-herror_on_warning]) +JE_COMPILABLE([format(gnu_printf, ...) attribute], [#include ], + [void *foo(const char *format, ...) __attribute__((format(gnu_printf, 1, 2)));], + [je_cv_format_gnu_printf]) +JE_CFLAGS_RESTORE() +if test "x${je_cv_format_gnu_printf}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF], [ ], [ ]) +fi +dnl Check for format(printf, ...) attribute support. +JE_CFLAGS_SAVE() +JE_CFLAGS_ADD([-Werror]) +JE_CFLAGS_ADD([-herror_on_warning]) +JE_COMPILABLE([format(printf, ...) attribute], [#include ], + [void *foo(const char *format, ...) __attribute__((format(printf, 1, 2)));], + [je_cv_format_printf]) +JE_CFLAGS_RESTORE() +if test "x${je_cv_format_printf}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_ATTR_FORMAT_PRINTF], [ ], [ ]) +fi + +dnl Check for format_arg(...) attribute support. +JE_CFLAGS_SAVE() +JE_CFLAGS_ADD([-Werror]) +JE_CFLAGS_ADD([-herror_on_warning]) +JE_COMPILABLE([format(printf, ...) attribute], [#include ], + [const char * __attribute__((__format_arg__(1))) foo(const char *format);], + [je_cv_format_arg]) +JE_CFLAGS_RESTORE() +if test "x${je_cv_format_arg}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_ATTR_FORMAT_ARG], [ ], [ ]) +fi + +dnl Check for fallthrough attribute support. +JE_CFLAGS_SAVE() +JE_CFLAGS_ADD([-Wimplicit-fallthrough]) +JE_COMPILABLE([fallthrough attribute], + [#if !__has_attribute(fallthrough) + #error "foo" + #endif], + [int x = 0; + switch (x) { + case 0: __attribute__((__fallthrough__)); + case 1: return 1; + }], + [je_cv_fallthrough]) +JE_CFLAGS_RESTORE() +if test "x${je_cv_fallthrough}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_ATTR_FALLTHROUGH], [ ], [ ]) + JE_CFLAGS_ADD([-Wimplicit-fallthrough]) + JE_CXXFLAGS_ADD([-Wimplicit-fallthrough]) +fi + +dnl Check for cold attribute support. +JE_CFLAGS_SAVE() +JE_CFLAGS_ADD([-Werror]) +JE_CFLAGS_ADD([-herror_on_warning]) +JE_COMPILABLE([cold attribute], [], + [__attribute__((__cold__)) void foo();], + [je_cv_cold]) +JE_CFLAGS_RESTORE() +if test "x${je_cv_cold}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_ATTR_COLD], [ ], [ ]) +fi + +dnl Check for VM_MAKE_TAG for mmap support. +JE_COMPILABLE([vm_make_tag], + [#include + #include ], + [void *p; + p = mmap(0, 16, PROT_READ, MAP_ANON|MAP_PRIVATE, VM_MAKE_TAG(1), 0); + munmap(p, 16);], + [je_cv_vm_make_tag]) +if test "x${je_cv_vm_make_tag}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_VM_MAKE_TAG], [ ], [ ]) +fi + +dnl Support optional additions to rpath. +AC_ARG_WITH([rpath], + [AS_HELP_STRING([--with-rpath=], [Colon-separated rpath (ELF systems only)])], +if test "x$with_rpath" = "xno" ; then + RPATH_EXTRA= +else + RPATH_EXTRA="`echo $with_rpath | tr \":\" \" \"`" +fi, + RPATH_EXTRA= +) +AC_SUBST([RPATH_EXTRA]) + +dnl Disable rules that do automatic regeneration of configure output by default. +AC_ARG_ENABLE([autogen], + [AS_HELP_STRING([--enable-autogen], [Automatically regenerate configure output])], +if test "x$enable_autogen" = "xno" ; then + enable_autogen="0" +else + enable_autogen="1" +fi +, +enable_autogen="0" +) +AC_SUBST([enable_autogen]) + +AC_PROG_INSTALL +AC_PROG_RANLIB +AC_PATH_PROG([LD], [ld], [false], [$PATH]) +AC_PATH_PROG([AUTOCONF], [autoconf], [false], [$PATH]) + +dnl Enable documentation +AC_ARG_ENABLE([doc], + [AS_HELP_STRING([--enable-doc], [Build documentation])], +if test "x$enable_doc" = "xno" ; then + enable_doc="0" +else + enable_doc="1" +fi +, +enable_doc="1" +) +AC_SUBST([enable_doc]) + +dnl Enable shared libs +AC_ARG_ENABLE([shared], + [AS_HELP_STRING([--enable-shared], [Build shared libaries])], +if test "x$enable_shared" = "xno" ; then + enable_shared="0" +else + enable_shared="1" +fi +, +enable_shared="1" +) +AC_SUBST([enable_shared]) + +dnl Enable static libs +AC_ARG_ENABLE([static], + [AS_HELP_STRING([--enable-static], [Build static libaries])], +if test "x$enable_static" = "xno" ; then + enable_static="0" +else + enable_static="1" +fi +, +enable_static="1" +) +AC_SUBST([enable_static]) + +if test "$enable_shared$enable_static" = "00" ; then + AC_MSG_ERROR([Please enable one of shared or static builds]) +fi + +dnl Perform no name mangling by default. +AC_ARG_WITH([mangling], + [AS_HELP_STRING([--with-mangling=], [Mangle symbols in ])], + [mangling_map="$with_mangling"], [mangling_map=""]) + +dnl Do not prefix public APIs by default. +AC_ARG_WITH([jemalloc_prefix], + [AS_HELP_STRING([--with-jemalloc-prefix=], [Prefix to prepend to all public APIs])], + [JEMALLOC_PREFIX="$with_jemalloc_prefix"], + [if test "x$abi" != "xmacho" -a "x$abi" != "xpecoff"; then + JEMALLOC_PREFIX="" +else + JEMALLOC_PREFIX="je_" +fi] +) +if test "x$JEMALLOC_PREFIX" = "x" ; then + AC_DEFINE([JEMALLOC_IS_MALLOC], [ ], [ ]) +else + JEMALLOC_CPREFIX=`echo ${JEMALLOC_PREFIX} | tr "a-z" "A-Z"` + AC_DEFINE_UNQUOTED([JEMALLOC_PREFIX], ["$JEMALLOC_PREFIX"], [ ]) + AC_DEFINE_UNQUOTED([JEMALLOC_CPREFIX], ["$JEMALLOC_CPREFIX"], [ ]) +fi +AC_SUBST([JEMALLOC_PREFIX]) +AC_SUBST([JEMALLOC_CPREFIX]) + +AC_ARG_WITH([export], + [AS_HELP_STRING([--without-export], [disable exporting jemalloc public APIs])], + [if test "x$with_export" = "xno"; then + AC_DEFINE([JEMALLOC_EXPORT],[], [ ]) +fi] +) + +public_syms="aligned_alloc calloc dallocx free mallctl mallctlbymib mallctlnametomib malloc malloc_conf malloc_conf_2_conf_harder malloc_message malloc_stats_print malloc_usable_size mallocx smallocx_${jemalloc_version_gid} nallocx posix_memalign rallocx realloc sallocx sdallocx xallocx" +dnl Check for additional platform-specific public API functions. +AC_CHECK_FUNC([memalign], + [AC_DEFINE([JEMALLOC_OVERRIDE_MEMALIGN], [ ], [ ]) + public_syms="${public_syms} memalign"]) +AC_CHECK_FUNC([valloc], + [AC_DEFINE([JEMALLOC_OVERRIDE_VALLOC], [ ], [ ]) + public_syms="${public_syms} valloc"]) +AC_CHECK_FUNC([malloc_size], + [AC_DEFINE([JEMALLOC_HAVE_MALLOC_SIZE], [ ], [ ]) + public_syms="${public_syms} malloc_size"]) + +dnl Check for allocator-related functions that should be wrapped. +wrap_syms= +if test "x${JEMALLOC_PREFIX}" = "x" ; then + AC_CHECK_FUNC([__libc_calloc], + [AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_CALLOC], [ ], [ ]) + wrap_syms="${wrap_syms} __libc_calloc"]) + AC_CHECK_FUNC([__libc_free], + [AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_FREE], [ ], [ ]) + wrap_syms="${wrap_syms} __libc_free"]) + AC_CHECK_FUNC([__libc_malloc], + [AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_MALLOC], [ ], [ ]) + wrap_syms="${wrap_syms} __libc_malloc"]) + AC_CHECK_FUNC([__libc_memalign], + [AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_MEMALIGN], [ ], [ ]) + wrap_syms="${wrap_syms} __libc_memalign"]) + AC_CHECK_FUNC([__libc_realloc], + [AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_REALLOC], [ ], [ ]) + wrap_syms="${wrap_syms} __libc_realloc"]) + AC_CHECK_FUNC([__libc_valloc], + [AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_VALLOC], [ ], [ ]) + wrap_syms="${wrap_syms} __libc_valloc"]) + AC_CHECK_FUNC([__posix_memalign], + [AC_DEFINE([JEMALLOC_OVERRIDE___POSIX_MEMALIGN], [ ], [ ]) + wrap_syms="${wrap_syms} __posix_memalign"]) +fi + +case "${host}" in + *-*-mingw* | *-*-cygwin*) + wrap_syms="${wrap_syms} tls_callback" + ;; + *) + ;; +esac + +dnl Mangle library-private APIs. +AC_ARG_WITH([private_namespace], + [AS_HELP_STRING([--with-private-namespace=], [Prefix to prepend to all library-private APIs])], + [JEMALLOC_PRIVATE_NAMESPACE="${with_private_namespace}je_"], + [JEMALLOC_PRIVATE_NAMESPACE="je_"] +) +AC_DEFINE_UNQUOTED([JEMALLOC_PRIVATE_NAMESPACE], [$JEMALLOC_PRIVATE_NAMESPACE], [ ]) +private_namespace="$JEMALLOC_PRIVATE_NAMESPACE" +AC_SUBST([private_namespace]) + +dnl Do not add suffix to installed files by default. +AC_ARG_WITH([install_suffix], + [AS_HELP_STRING([--with-install-suffix=], [Suffix to append to all installed files])], + [case "$with_install_suffix" in + *\ * ) AC_MSG_ERROR([Install suffix should not contain spaces]) ;; + * ) INSTALL_SUFFIX="$with_install_suffix" ;; +esac], + [INSTALL_SUFFIX=] +) +install_suffix="$INSTALL_SUFFIX" +AC_SUBST([install_suffix]) + +dnl Specify default malloc_conf. +AC_ARG_WITH([malloc_conf], + [AS_HELP_STRING([--with-malloc-conf=], [config.malloc_conf options string])], + [JEMALLOC_CONFIG_MALLOC_CONF="$with_malloc_conf"], + [JEMALLOC_CONFIG_MALLOC_CONF=""] +) +config_malloc_conf="$JEMALLOC_CONFIG_MALLOC_CONF" +AC_DEFINE_UNQUOTED([JEMALLOC_CONFIG_MALLOC_CONF], ["$config_malloc_conf"], [ ]) + +dnl Substitute @je_@ in jemalloc_protos.h.in, primarily to make generation of +dnl jemalloc_protos_jet.h easy. +je_="je_" +AC_SUBST([je_]) + +cfgoutputs_in="Makefile.in" +cfgoutputs_in="${cfgoutputs_in} jemalloc.pc.in" +cfgoutputs_in="${cfgoutputs_in} doc/html.xsl.in" +cfgoutputs_in="${cfgoutputs_in} doc/manpages.xsl.in" +cfgoutputs_in="${cfgoutputs_in} doc/jemalloc.xml.in" +cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_macros.h.in" +cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_protos.h.in" +cfgoutputs_in="${cfgoutputs_in} include/jemalloc/jemalloc_typedefs.h.in" +cfgoutputs_in="${cfgoutputs_in} include/jemalloc/internal/jemalloc_preamble.h.in" +cfgoutputs_in="${cfgoutputs_in} test/test.sh.in" +cfgoutputs_in="${cfgoutputs_in} test/include/test/jemalloc_test.h.in" + +cfgoutputs_out="Makefile" +cfgoutputs_out="${cfgoutputs_out} jemalloc.pc" +cfgoutputs_out="${cfgoutputs_out} doc/html.xsl" +cfgoutputs_out="${cfgoutputs_out} doc/manpages.xsl" +cfgoutputs_out="${cfgoutputs_out} doc/jemalloc.xml" +cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_macros.h" +cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_protos.h" +cfgoutputs_out="${cfgoutputs_out} include/jemalloc/jemalloc_typedefs.h" +cfgoutputs_out="${cfgoutputs_out} include/jemalloc/internal/jemalloc_preamble.h" +cfgoutputs_out="${cfgoutputs_out} test/test.sh" +cfgoutputs_out="${cfgoutputs_out} test/include/test/jemalloc_test.h" + +cfgoutputs_tup="Makefile" +cfgoutputs_tup="${cfgoutputs_tup} jemalloc.pc:jemalloc.pc.in" +cfgoutputs_tup="${cfgoutputs_tup} doc/html.xsl:doc/html.xsl.in" +cfgoutputs_tup="${cfgoutputs_tup} doc/manpages.xsl:doc/manpages.xsl.in" +cfgoutputs_tup="${cfgoutputs_tup} doc/jemalloc.xml:doc/jemalloc.xml.in" +cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_macros.h:include/jemalloc/jemalloc_macros.h.in" +cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_protos.h:include/jemalloc/jemalloc_protos.h.in" +cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/jemalloc_typedefs.h:include/jemalloc/jemalloc_typedefs.h.in" +cfgoutputs_tup="${cfgoutputs_tup} include/jemalloc/internal/jemalloc_preamble.h" +cfgoutputs_tup="${cfgoutputs_tup} test/test.sh:test/test.sh.in" +cfgoutputs_tup="${cfgoutputs_tup} test/include/test/jemalloc_test.h:test/include/test/jemalloc_test.h.in" + +cfghdrs_in="include/jemalloc/jemalloc_defs.h.in" +cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/jemalloc_internal_defs.h.in" +cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_symbols.sh" +cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/private_namespace.sh" +cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_namespace.sh" +cfghdrs_in="${cfghdrs_in} include/jemalloc/internal/public_unnamespace.sh" +cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_rename.sh" +cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc_mangle.sh" +cfghdrs_in="${cfghdrs_in} include/jemalloc/jemalloc.sh" +cfghdrs_in="${cfghdrs_in} test/include/test/jemalloc_test_defs.h.in" + +cfghdrs_out="include/jemalloc/jemalloc_defs.h" +cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc${install_suffix}.h" +cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/private_symbols.awk" +cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/private_symbols_jet.awk" +cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_symbols.txt" +cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_namespace.h" +cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/public_unnamespace.h" +cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_protos_jet.h" +cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_rename.h" +cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_mangle.h" +cfghdrs_out="${cfghdrs_out} include/jemalloc/jemalloc_mangle_jet.h" +cfghdrs_out="${cfghdrs_out} include/jemalloc/internal/jemalloc_internal_defs.h" +cfghdrs_out="${cfghdrs_out} test/include/test/jemalloc_test_defs.h" + +cfghdrs_tup="include/jemalloc/jemalloc_defs.h:include/jemalloc/jemalloc_defs.h.in" +cfghdrs_tup="${cfghdrs_tup} include/jemalloc/internal/jemalloc_internal_defs.h:include/jemalloc/internal/jemalloc_internal_defs.h.in" +cfghdrs_tup="${cfghdrs_tup} test/include/test/jemalloc_test_defs.h:test/include/test/jemalloc_test_defs.h.in" + +dnl ============================================================================ +dnl jemalloc build options. +dnl + +dnl Do not compile with debugging by default. +AC_ARG_ENABLE([debug], + [AS_HELP_STRING([--enable-debug], + [Build debugging code])], +[if test "x$enable_debug" = "xno" ; then + enable_debug="0" +else + enable_debug="1" +fi +], +[enable_debug="0"] +) +if test "x$enable_debug" = "x1" ; then + AC_DEFINE([JEMALLOC_DEBUG], [ ], [ ]) +fi +AC_SUBST([enable_debug]) + +dnl Only optimize if not debugging. +if test "x$enable_debug" = "x0" ; then + if test "x$GCC" = "xyes" ; then + JE_CFLAGS_ADD([-O3]) + JE_CXXFLAGS_ADD([-O3]) + JE_CFLAGS_ADD([-funroll-loops]) + elif test "x$je_cv_msvc" = "xyes" ; then + JE_CFLAGS_ADD([-O2]) + JE_CXXFLAGS_ADD([-O2]) + else + JE_CFLAGS_ADD([-O]) + JE_CXXFLAGS_ADD([-O]) + fi +fi + +dnl Enable statistics calculation by default. +AC_ARG_ENABLE([stats], + [AS_HELP_STRING([--disable-stats], + [Disable statistics calculation/reporting])], +[if test "x$enable_stats" = "xno" ; then + enable_stats="0" +else + enable_stats="1" +fi +], +[enable_stats="1"] +) +if test "x$enable_stats" = "x1" ; then + AC_DEFINE([JEMALLOC_STATS], [ ], [ ]) +fi +AC_SUBST([enable_stats]) + +dnl Do not enable smallocx by default. +AC_ARG_ENABLE([experimental_smallocx], + [AS_HELP_STRING([--enable-experimental-smallocx], [Enable experimental smallocx API])], +[if test "x$enable_experimental_smallocx" = "xno" ; then +enable_experimental_smallocx="0" +else +enable_experimental_smallocx="1" +fi +], +[enable_experimental_smallocx="0"] +) +if test "x$enable_experimental_smallocx" = "x1" ; then + AC_DEFINE([JEMALLOC_EXPERIMENTAL_SMALLOCX_API], [ ], [ ]) +fi +AC_SUBST([enable_experimental_smallocx]) + +dnl Do not enable profiling by default. +AC_ARG_ENABLE([prof], + [AS_HELP_STRING([--enable-prof], [Enable allocation profiling])], +[if test "x$enable_prof" = "xno" ; then + enable_prof="0" +else + enable_prof="1" +fi +], +[enable_prof="0"] +) +if test "x$enable_prof" = "x1" ; then + backtrace_method="" +else + backtrace_method="N/A" +fi + +AC_ARG_ENABLE([prof-libunwind], + [AS_HELP_STRING([--enable-prof-libunwind], [Use libunwind for backtracing])], +[if test "x$enable_prof_libunwind" = "xno" ; then + enable_prof_libunwind="0" +else + enable_prof_libunwind="1" + if test "x$enable_prof" = "x0" ; then + AC_MSG_ERROR([--enable-prof-libunwind should only be used with --enable-prof]) + fi +fi +], +[enable_prof_libunwind="0"] +) +AC_ARG_WITH([static_libunwind], + [AS_HELP_STRING([--with-static-libunwind=], + [Path to static libunwind library; use rather than dynamically linking])], +if test "x$with_static_libunwind" = "xno" ; then + LUNWIND="-lunwind" +else + if test ! -f "$with_static_libunwind" ; then + AC_MSG_ERROR([Static libunwind not found: $with_static_libunwind]) + fi + LUNWIND="$with_static_libunwind" +fi, + LUNWIND="-lunwind" +) +if test "x$backtrace_method" = "x" -a "x$enable_prof_libunwind" = "x1" ; then + AC_CHECK_HEADERS([libunwind.h], , [enable_prof_libunwind="0"]) + if test "x$LUNWIND" = "x-lunwind" ; then + AC_CHECK_LIB([unwind], [unw_backtrace], [JE_APPEND_VS(LIBS, $LUNWIND)], + [enable_prof_libunwind="0"]) + else + JE_APPEND_VS(LIBS, $LUNWIND) + fi + if test "x${enable_prof_libunwind}" = "x1" ; then + backtrace_method="libunwind" + AC_DEFINE([JEMALLOC_PROF_LIBUNWIND], [ ], [ ]) + fi +fi + +AC_ARG_ENABLE([prof-libgcc], + [AS_HELP_STRING([--disable-prof-libgcc], + [Do not use libgcc for backtracing])], +[if test "x$enable_prof_libgcc" = "xno" ; then + enable_prof_libgcc="0" +else + enable_prof_libgcc="1" +fi +], +[enable_prof_libgcc="1"] +) +if test "x$backtrace_method" = "x" -a "x$enable_prof_libgcc" = "x1" \ + -a "x$GCC" = "xyes" ; then + AC_CHECK_HEADERS([unwind.h], , [enable_prof_libgcc="0"]) + if test "x${enable_prof_libgcc}" = "x1" ; then + AC_CHECK_LIB([gcc], [_Unwind_Backtrace], [JE_APPEND_VS(LIBS, -lgcc)], [enable_prof_libgcc="0"]) + fi + if test "x${enable_prof_libgcc}" = "x1" ; then + backtrace_method="libgcc" + AC_DEFINE([JEMALLOC_PROF_LIBGCC], [ ], [ ]) + fi +else + enable_prof_libgcc="0" +fi + +AC_ARG_ENABLE([prof-gcc], + [AS_HELP_STRING([--disable-prof-gcc], + [Do not use gcc intrinsics for backtracing])], +[if test "x$enable_prof_gcc" = "xno" ; then + enable_prof_gcc="0" +else + enable_prof_gcc="1" +fi +], +[enable_prof_gcc="1"] +) +if test "x$backtrace_method" = "x" -a "x$enable_prof_gcc" = "x1" \ + -a "x$GCC" = "xyes" ; then + JE_CFLAGS_ADD([-fno-omit-frame-pointer]) + backtrace_method="gcc intrinsics" + AC_DEFINE([JEMALLOC_PROF_GCC], [ ], [ ]) +else + enable_prof_gcc="0" +fi + +if test "x$backtrace_method" = "x" ; then + backtrace_method="none (disabling profiling)" + enable_prof="0" +fi +AC_MSG_CHECKING([configured backtracing method]) +AC_MSG_RESULT([$backtrace_method]) +if test "x$enable_prof" = "x1" ; then + dnl Heap profiling uses the log(3) function. + JE_APPEND_VS(LIBS, $LM) + + AC_DEFINE([JEMALLOC_PROF], [ ], [ ]) +fi +AC_SUBST([enable_prof]) + +dnl Indicate whether adjacent virtual memory mappings automatically coalesce +dnl (and fragment on demand). +if test "x${maps_coalesce}" = "x1" ; then + AC_DEFINE([JEMALLOC_MAPS_COALESCE], [ ], [ ]) +fi + +dnl Indicate whether to retain memory (rather than using munmap()) by default. +if test "x$default_retain" = "x1" ; then + AC_DEFINE([JEMALLOC_RETAIN], [ ], [ ]) +fi + +dnl Indicate whether realloc(ptr, 0) defaults to the "alloc" behavior. +if test "x$zero_realloc_default_free" = "x1" ; then + AC_DEFINE([JEMALLOC_ZERO_REALLOC_DEFAULT_FREE], [ ], [ ]) +fi + +dnl Enable allocation from DSS if supported by the OS. +have_dss="1" +dnl Check whether the BSD/SUSv1 sbrk() exists. If not, disable DSS support. +AC_CHECK_FUNC([sbrk], [have_sbrk="1"], [have_sbrk="0"]) +if test "x$have_sbrk" = "x1" ; then + if test "x$sbrk_deprecated" = "x1" ; then + AC_MSG_RESULT([Disabling dss allocation because sbrk is deprecated]) + have_dss="0" + fi +else + have_dss="0" +fi + +if test "x$have_dss" = "x1" ; then + AC_DEFINE([JEMALLOC_DSS], [ ], [ ]) +fi + +dnl Support the junk/zero filling option by default. +AC_ARG_ENABLE([fill], + [AS_HELP_STRING([--disable-fill], [Disable support for junk/zero filling])], +[if test "x$enable_fill" = "xno" ; then + enable_fill="0" +else + enable_fill="1" +fi +], +[enable_fill="1"] +) +if test "x$enable_fill" = "x1" ; then + AC_DEFINE([JEMALLOC_FILL], [ ], [ ]) +fi +AC_SUBST([enable_fill]) + +dnl Disable utrace(2)-based tracing by default. +AC_ARG_ENABLE([utrace], + [AS_HELP_STRING([--enable-utrace], [Enable utrace(2)-based tracing])], +[if test "x$enable_utrace" = "xno" ; then + enable_utrace="0" +else + enable_utrace="1" +fi +], +[enable_utrace="0"] +) +JE_COMPILABLE([utrace(2)], [ +#include +#include +#include +#include +#include +], [ + utrace((void *)0, 0); +], [je_cv_utrace]) +if test "x${je_cv_utrace}" = "xno" ; then + JE_COMPILABLE([utrace(2) with label], [ + #include + #include + #include + #include + #include + ], [ + utrace((void *)0, (void *)0, 0); + ], [je_cv_utrace_label]) + if test "x${je_cv_utrace_label}" = "xno"; then + enable_utrace="0" + fi + if test "x$enable_utrace" = "x1" ; then + AC_DEFINE([JEMALLOC_UTRACE_LABEL], [ ], [ ]) + fi +else + if test "x$enable_utrace" = "x1" ; then + AC_DEFINE([JEMALLOC_UTRACE], [ ], [ ]) + fi +fi +AC_SUBST([enable_utrace]) + +dnl Do not support the xmalloc option by default. +AC_ARG_ENABLE([xmalloc], + [AS_HELP_STRING([--enable-xmalloc], [Support xmalloc option])], +[if test "x$enable_xmalloc" = "xno" ; then + enable_xmalloc="0" +else + enable_xmalloc="1" +fi +], +[enable_xmalloc="0"] +) +if test "x$enable_xmalloc" = "x1" ; then + AC_DEFINE([JEMALLOC_XMALLOC], [ ], [ ]) +fi +AC_SUBST([enable_xmalloc]) + +dnl Support cache-oblivious allocation alignment by default. +AC_ARG_ENABLE([cache-oblivious], + [AS_HELP_STRING([--disable-cache-oblivious], + [Disable support for cache-oblivious allocation alignment])], +[if test "x$enable_cache_oblivious" = "xno" ; then + enable_cache_oblivious="0" +else + enable_cache_oblivious="1" +fi +], +[enable_cache_oblivious="1"] +) +if test "x$enable_cache_oblivious" = "x1" ; then + AC_DEFINE([JEMALLOC_CACHE_OBLIVIOUS], [ ], [ ]) +fi +AC_SUBST([enable_cache_oblivious]) + +dnl Do not log by default. +AC_ARG_ENABLE([log], + [AS_HELP_STRING([--enable-log], [Support debug logging])], +[if test "x$enable_log" = "xno" ; then + enable_log="0" +else + enable_log="1" +fi +], +[enable_log="0"] +) +if test "x$enable_log" = "x1" ; then + AC_DEFINE([JEMALLOC_LOG], [ ], [ ]) +fi +AC_SUBST([enable_log]) + +dnl Do not use readlinkat by default +AC_ARG_ENABLE([readlinkat], + [AS_HELP_STRING([--enable-readlinkat], [Use readlinkat over readlink])], +[if test "x$enable_readlinkat" = "xno" ; then + enable_readlinkat="0" +else + enable_readlinkat="1" +fi +], +[enable_readlinkat="0"] +) +if test "x$enable_readlinkat" = "x1" ; then + AC_DEFINE([JEMALLOC_READLINKAT], [ ], [ ]) +fi +AC_SUBST([enable_readlinkat]) + +dnl Avoid extra safety checks by default +AC_ARG_ENABLE([opt-safety-checks], + [AS_HELP_STRING([--enable-opt-safety-checks], + [Perform certain low-overhead checks, even in opt mode])], +[if test "x$enable_opt_safety_checks" = "xno" ; then + enable_opt_safety_checks="0" +else + enable_opt_safety_checks="1" +fi +], +[enable_opt_safety_checks="0"] +) +if test "x$enable_opt_safety_checks" = "x1" ; then + AC_DEFINE([JEMALLOC_OPT_SAFETY_CHECKS], [ ], [ ]) +fi +AC_SUBST([enable_opt_safety_checks]) + +dnl Look for sized-deallocation bugs while otherwise being in opt mode. +AC_ARG_ENABLE([opt-size-checks], + [AS_HELP_STRING([--enable-opt-size-checks], + [Perform sized-deallocation argument checks, even in opt mode])], +[if test "x$enable_opt_size_checks" = "xno" ; then + enable_opt_size_checks="0" +else + enable_opt_size_checks="1" +fi +], +[enable_opt_size_checks="0"] +) +if test "x$enable_opt_size_checks" = "x1" ; then + AC_DEFINE([JEMALLOC_OPT_SIZE_CHECKS], [ ], [ ]) +fi +AC_SUBST([enable_opt_size_checks]) + +dnl Do not check for use-after-free by default. +AC_ARG_ENABLE([uaf-detection], + [AS_HELP_STRING([--enable-uaf-detection], + [Allow sampled junk-filling on deallocation to detect use-after-free])], +[if test "x$enable_uaf_detection" = "xno" ; then + enable_uaf_detection="0" +else + enable_uaf_detection="1" +fi +], +[enable_uaf_detection="0"] +) +if test "x$enable_uaf_detection" = "x1" ; then + AC_DEFINE([JEMALLOC_UAF_DETECTION], [ ]) +fi +AC_SUBST([enable_uaf_detection]) + +JE_COMPILABLE([a program using __builtin_unreachable], [ +void foo (void) { + __builtin_unreachable(); +} +], [ + { + foo(); + } +], [je_cv_gcc_builtin_unreachable]) +if test "x${je_cv_gcc_builtin_unreachable}" = "xyes" ; then + AC_DEFINE([JEMALLOC_INTERNAL_UNREACHABLE], [__builtin_unreachable], [ ]) +else + AC_DEFINE([JEMALLOC_INTERNAL_UNREACHABLE], [abort], [ ]) +fi + +dnl ============================================================================ +dnl Check for __builtin_ffsl(), then ffsl(3), and fail if neither are found. +dnl One of those two functions should (theoretically) exist on all platforms +dnl that jemalloc currently has a chance of functioning on without modification. +dnl We additionally assume ffs[ll]() or __builtin_ffs[ll]() are defined if +dnl ffsl() or __builtin_ffsl() are defined, respectively. +JE_COMPILABLE([a program using __builtin_ffsl], [ +#include +#include +#include +], [ + { + int rv = __builtin_ffsl(0x08); + printf("%d\n", rv); + } +], [je_cv_gcc_builtin_ffsl]) +if test "x${je_cv_gcc_builtin_ffsl}" = "xyes" ; then + AC_DEFINE([JEMALLOC_INTERNAL_FFSLL], [__builtin_ffsll], [ ]) + AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [__builtin_ffsl], [ ]) + AC_DEFINE([JEMALLOC_INTERNAL_FFS], [__builtin_ffs], [ ]) +else + JE_COMPILABLE([a program using ffsl], [ + #include + #include + #include + ], [ + { + int rv = ffsl(0x08); + printf("%d\n", rv); + } + ], [je_cv_function_ffsl]) + if test "x${je_cv_function_ffsl}" = "xyes" ; then + AC_DEFINE([JEMALLOC_INTERNAL_FFSLL], [ffsll], [ ]) + AC_DEFINE([JEMALLOC_INTERNAL_FFSL], [ffsl], [ ]) + AC_DEFINE([JEMALLOC_INTERNAL_FFS], [ffs], [ ]) + else + AC_MSG_ERROR([Cannot build without ffsl(3) or __builtin_ffsl()]) + fi +fi + +JE_COMPILABLE([a program using __builtin_popcountl], [ +#include +#include +#include +], [ + { + int rv = __builtin_popcountl(0x08); + printf("%d\n", rv); + } +], [je_cv_gcc_builtin_popcountl]) +if test "x${je_cv_gcc_builtin_popcountl}" = "xyes" ; then + AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNT], [__builtin_popcount], [ ]) + AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNTL], [__builtin_popcountl], [ ]) + AC_DEFINE([JEMALLOC_INTERNAL_POPCOUNTLL], [__builtin_popcountll], [ ]) +fi + +AC_ARG_WITH([lg_quantum], + [AS_HELP_STRING([--with-lg-quantum=], + [Base 2 log of minimum allocation alignment])]) +if test "x$with_lg_quantum" != "x" ; then + AC_DEFINE_UNQUOTED([LG_QUANTUM], [$with_lg_quantum], [ ]) +fi + +AC_ARG_WITH([lg_slab_maxregs], + [AS_HELP_STRING([--with-lg-slab-maxregs=], + [Base 2 log of maximum number of regions in a slab (used with malloc_conf slab_sizes)])], + [CONFIG_LG_SLAB_MAXREGS="with_lg_slab_maxregs"], + [CONFIG_LG_SLAB_MAXREGS=""]) +if test "x$with_lg_slab_maxregs" != "x" ; then + AC_DEFINE_UNQUOTED([CONFIG_LG_SLAB_MAXREGS], [$with_lg_slab_maxregs], [ ]) +fi + +AC_ARG_WITH([lg_page], + [AS_HELP_STRING([--with-lg-page=], [Base 2 log of system page size])], + [LG_PAGE="$with_lg_page"], [LG_PAGE="detect"]) +case "${host}" in + aarch64-apple-darwin*) + dnl When cross-compile for Apple M1 and no page size specified, use the + dnl default and skip detecting the page size (which is likely incorrect). + if test "x${host}" != "x${build}" -a "x$LG_PAGE" = "xdetect"; then + LG_PAGE=14 + fi + ;; +esac +if test "x$LG_PAGE" = "xdetect"; then + AC_CACHE_CHECK([LG_PAGE], + [je_cv_lg_page], + AC_RUN_IFELSE([AC_LANG_PROGRAM( +[[ +#include +#ifdef _WIN32 +#include +#else +#include +#endif +#include +]], +[[ + int result; + FILE *f; + +#ifdef _WIN32 + SYSTEM_INFO si; + GetSystemInfo(&si); + result = si.dwPageSize; +#else + result = sysconf(_SC_PAGESIZE); +#endif + if (result == -1) { + return 1; + } + result = JEMALLOC_INTERNAL_FFSL(result) - 1; + + f = fopen("conftest.out", "w"); + if (f == NULL) { + return 1; + } + fprintf(f, "%d", result); + fclose(f); + + return 0; +]])], + [je_cv_lg_page=`cat conftest.out`], + [je_cv_lg_page=undefined], + [je_cv_lg_page=12])) +fi +if test "x${je_cv_lg_page}" != "x" ; then + LG_PAGE="${je_cv_lg_page}" +fi +if test "x${LG_PAGE}" != "xundefined" ; then + AC_DEFINE_UNQUOTED([LG_PAGE], [$LG_PAGE], [ ]) +else + AC_MSG_ERROR([cannot determine value for LG_PAGE]) +fi + +AC_ARG_WITH([lg_hugepage], + [AS_HELP_STRING([--with-lg-hugepage=], + [Base 2 log of system huge page size])], + [je_cv_lg_hugepage="${with_lg_hugepage}"], + [je_cv_lg_hugepage=""]) +if test "x${je_cv_lg_hugepage}" = "x" ; then + dnl Look in /proc/meminfo (Linux-specific) for information on the default huge + dnl page size, if any. The relevant line looks like: + dnl + dnl Hugepagesize: 2048 kB + if test -e "/proc/meminfo" ; then + hpsk=[`cat /proc/meminfo 2>/dev/null | \ + grep -e '^Hugepagesize:[[:space:]]\+[0-9]\+[[:space:]]kB$' | \ + awk '{print $2}'`] + if test "x${hpsk}" != "x" ; then + je_cv_lg_hugepage=10 + while test "${hpsk}" -gt 1 ; do + hpsk="$((hpsk / 2))" + je_cv_lg_hugepage="$((je_cv_lg_hugepage + 1))" + done + fi + fi + + dnl Set default if unable to automatically configure. + if test "x${je_cv_lg_hugepage}" = "x" ; then + je_cv_lg_hugepage=21 + fi +fi +if test "x${LG_PAGE}" != "xundefined" -a \ + "${je_cv_lg_hugepage}" -lt "${LG_PAGE}" ; then + AC_MSG_ERROR([Huge page size (2^${je_cv_lg_hugepage}) must be at least page size (2^${LG_PAGE})]) +fi +AC_DEFINE_UNQUOTED([LG_HUGEPAGE], [${je_cv_lg_hugepage}], [ ]) + +dnl ============================================================================ +dnl Enable libdl by default. +AC_ARG_ENABLE([libdl], + [AS_HELP_STRING([--disable-libdl], + [Do not use libdl])], +[if test "x$enable_libdl" = "xno" ; then + enable_libdl="0" +else + enable_libdl="1" +fi +], +[enable_libdl="1"] +) +AC_SUBST([libdl]) + +dnl ============================================================================ +dnl Configure pthreads. + +if test "x$abi" != "xpecoff" ; then + AC_DEFINE([JEMALLOC_HAVE_PTHREAD], [ ], [ ]) + AC_CHECK_HEADERS([pthread.h], , [AC_MSG_ERROR([pthread.h is missing])]) + dnl Some systems may embed pthreads functionality in libc; check for libpthread + dnl first, but try libc too before failing. + AC_CHECK_LIB([pthread], [pthread_create], [JE_APPEND_VS(LIBS, -pthread)], + [AC_SEARCH_LIBS([pthread_create], , , + AC_MSG_ERROR([libpthread is missing]))]) + wrap_syms="${wrap_syms} pthread_create" + have_pthread="1" + +dnl Check if we have dlsym support. + if test "x$enable_libdl" = "x1" ; then + have_dlsym="1" + AC_CHECK_HEADERS([dlfcn.h], + AC_CHECK_FUNC([dlsym], [], + [AC_CHECK_LIB([dl], [dlsym], [LIBS="$LIBS -ldl"], [have_dlsym="0"])]), + [have_dlsym="0"]) + if test "x$have_dlsym" = "x1" ; then + AC_DEFINE([JEMALLOC_HAVE_DLSYM], [ ], [ ]) + fi + else + have_dlsym="0" + fi + + JE_COMPILABLE([pthread_atfork(3)], [ +#include +], [ + pthread_atfork((void *)0, (void *)0, (void *)0); +], [je_cv_pthread_atfork]) + if test "x${je_cv_pthread_atfork}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_PTHREAD_ATFORK], [ ], [ ]) + fi + dnl Check if pthread_setname_np is available with the expected API. + JE_COMPILABLE([pthread_setname_np(3)], [ +#include +], [ + pthread_setname_np(pthread_self(), "setname_test"); +], [je_cv_pthread_setname_np]) + if test "x${je_cv_pthread_setname_np}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_PTHREAD_SETNAME_NP], [ ], [ ]) + fi + dnl Check if pthread_getname_np is not necessarily present despite + dnl the pthread_setname_np counterpart + JE_COMPILABLE([pthread_getname_np(3)], [ +#include +#include +], [ + { + char *name = malloc(16); + pthread_getname_np(pthread_self(), name, 16); + free(name); + } +], [je_cv_pthread_getname_np]) + if test "x${je_cv_pthread_getname_np}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_PTHREAD_GETNAME_NP], [ ], [ ]) + fi + dnl Check if pthread_get_name_np is not necessarily present despite + dnl the pthread_set_name_np counterpart + JE_COMPILABLE([pthread_get_name_np(3)], [ +#include +#include +#include +], [ + { + char *name = malloc(16); + pthread_get_name_np(pthread_self(), name, 16); + free(name); + } +], [je_cv_pthread_get_name_np]) + if test "x${je_cv_pthread_get_name_np}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_PTHREAD_GET_NAME_NP], [ ], [ ]) + fi +fi + +JE_APPEND_VS(CPPFLAGS, -D_REENTRANT) + +dnl Check whether clock_gettime(2) is in libc or librt. +AC_SEARCH_LIBS([clock_gettime], [rt]) + +dnl Cray wrapper compiler often adds `-lrt` when using `-static`. Check with +dnl `-dynamic` as well in case a user tries to dynamically link in jemalloc +if test "x$je_cv_cray_prgenv_wrapper" = "xyes" ; then + if test "$ac_cv_search_clock_gettime" != "-lrt"; then + JE_CFLAGS_SAVE() + + unset ac_cv_search_clock_gettime + JE_CFLAGS_ADD([-dynamic]) + AC_SEARCH_LIBS([clock_gettime], [rt]) + + JE_CFLAGS_RESTORE() + fi +fi + +dnl check for CLOCK_MONOTONIC_COARSE (Linux-specific). +JE_COMPILABLE([clock_gettime(CLOCK_MONOTONIC_COARSE, ...)], [ +#include +], [ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC_COARSE, &ts); +], [je_cv_clock_monotonic_coarse]) +if test "x${je_cv_clock_monotonic_coarse}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE], [ ], [ ]) +fi + +dnl check for CLOCK_MONOTONIC. +JE_COMPILABLE([clock_gettime(CLOCK_MONOTONIC, ...)], [ +#include +#include +], [ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); +#if !defined(_POSIX_MONOTONIC_CLOCK) || _POSIX_MONOTONIC_CLOCK < 0 +# error _POSIX_MONOTONIC_CLOCK missing/invalid +#endif +], [je_cv_clock_monotonic]) +if test "x${je_cv_clock_monotonic}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_CLOCK_MONOTONIC], [ ], [ ]) +fi + +dnl Check for mach_absolute_time(). +JE_COMPILABLE([mach_absolute_time()], [ +#include +], [ + mach_absolute_time(); +], [je_cv_mach_absolute_time]) +if test "x${je_cv_mach_absolute_time}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_MACH_ABSOLUTE_TIME], [ ], [ ]) +fi + +dnl check for CLOCK_REALTIME (always should be available on Linux) +JE_COMPILABLE([clock_gettime(CLOCK_REALTIME, ...)], [ +#include +], [ + struct timespec ts; + + clock_gettime(CLOCK_REALTIME, &ts); +], [je_cv_clock_realtime]) +if test "x${je_cv_clock_realtime}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_CLOCK_REALTIME], [ ], [ ]) +fi + +dnl Use syscall(2) (if available) by default. +AC_ARG_ENABLE([syscall], + [AS_HELP_STRING([--disable-syscall], [Disable use of syscall(2)])], +[if test "x$enable_syscall" = "xno" ; then + enable_syscall="0" +else + enable_syscall="1" +fi +], +[enable_syscall="1"] +) +if test "x$enable_syscall" = "x1" ; then + dnl Check if syscall(2) is usable. Treat warnings as errors, so that e.g. OS + dnl X 10.12's deprecation warning prevents use. + JE_CFLAGS_SAVE() + JE_CFLAGS_ADD([-Werror]) + JE_COMPILABLE([syscall(2)], [ +#include +#include +], [ + syscall(SYS_write, 2, "hello", 5); +], + [je_cv_syscall]) + JE_CFLAGS_RESTORE() + if test "x$je_cv_syscall" = "xyes" ; then + AC_DEFINE([JEMALLOC_USE_SYSCALL], [ ], [ ]) + fi +fi + +dnl Check if the GNU-specific secure_getenv function exists. +AC_CHECK_FUNC([secure_getenv], + [have_secure_getenv="1"], + [have_secure_getenv="0"] + ) +if test "x$have_secure_getenv" = "x1" ; then + AC_DEFINE([JEMALLOC_HAVE_SECURE_GETENV], [ ], [ ]) +fi + +dnl Check if the GNU-specific sched_getcpu function exists. +AC_CHECK_FUNC([sched_getcpu], + [have_sched_getcpu="1"], + [have_sched_getcpu="0"] + ) +if test "x$have_sched_getcpu" = "x1" ; then + AC_DEFINE([JEMALLOC_HAVE_SCHED_GETCPU], [ ], [ ]) +fi + +dnl Check if the GNU-specific sched_setaffinity function exists. +AC_CHECK_FUNC([sched_setaffinity], + [have_sched_setaffinity="1"], + [have_sched_setaffinity="0"] + ) +if test "x$have_sched_setaffinity" = "x1" ; then + AC_DEFINE([JEMALLOC_HAVE_SCHED_SETAFFINITY], [ ], [ ]) +fi + +dnl Check if the Solaris/BSD issetugid function exists. +AC_CHECK_FUNC([issetugid], + [have_issetugid="1"], + [have_issetugid="0"] + ) +if test "x$have_issetugid" = "x1" ; then + AC_DEFINE([JEMALLOC_HAVE_ISSETUGID], [ ], [ ]) +fi + +dnl Check whether the BSD-specific _malloc_thread_cleanup() exists. If so, use +dnl it rather than pthreads TSD cleanup functions to support cleanup during +dnl thread exit, in order to avoid pthreads library recursion during +dnl bootstrapping. +AC_CHECK_FUNC([_malloc_thread_cleanup], + [have__malloc_thread_cleanup="1"], + [have__malloc_thread_cleanup="0"] + ) +if test "x$have__malloc_thread_cleanup" = "x1" ; then + AC_DEFINE([JEMALLOC_MALLOC_THREAD_CLEANUP], [ ], [ ]) + wrap_syms="${wrap_syms} _malloc_thread_cleanup _malloc_tsd_cleanup_register" + force_tls="1" +fi + +dnl Check whether the BSD-specific _pthread_mutex_init_calloc_cb() exists. If +dnl so, mutex initialization causes allocation, and we need to implement this +dnl callback function in order to prevent recursive allocation. +AC_CHECK_FUNC([_pthread_mutex_init_calloc_cb], + [have__pthread_mutex_init_calloc_cb="1"], + [have__pthread_mutex_init_calloc_cb="0"] + ) +if test "x$have__pthread_mutex_init_calloc_cb" = "x1" ; then + AC_DEFINE([JEMALLOC_MUTEX_INIT_CB], [ ], [ ]) + wrap_syms="${wrap_syms} _malloc_prefork _malloc_postfork" +fi + +AC_CHECK_FUNC([memcntl], + [have_memcntl="1"], + [have_memcntl="0"], + ) +if test "x$have_memcntl" = "x1" ; then + AC_DEFINE([JEMALLOC_HAVE_MEMCNTL], [ ], [ ]) +fi + +dnl Disable lazy locking by default. +AC_ARG_ENABLE([lazy_lock], + [AS_HELP_STRING([--enable-lazy-lock], + [Enable lazy locking (only lock when multi-threaded)])], +[if test "x$enable_lazy_lock" = "xno" ; then + enable_lazy_lock="0" +else + enable_lazy_lock="1" +fi +], +[enable_lazy_lock=""] +) +if test "x${enable_lazy_lock}" = "x" ; then + if test "x${force_lazy_lock}" = "x1" ; then + AC_MSG_RESULT([Forcing lazy-lock to avoid allocator/threading bootstrap issues]) + enable_lazy_lock="1" + else + enable_lazy_lock="0" + fi +fi +if test "x${enable_lazy_lock}" = "x1" -a "x${abi}" = "xpecoff" ; then + AC_MSG_RESULT([Forcing no lazy-lock because thread creation monitoring is unimplemented]) + enable_lazy_lock="0" +fi +if test "x$enable_lazy_lock" = "x1" ; then + if test "x$have_dlsym" = "x1" ; then + AC_DEFINE([JEMALLOC_LAZY_LOCK], [ ], [ ]) + else + AC_MSG_ERROR([Missing dlsym support: lazy-lock cannot be enabled.]) + fi +fi +AC_SUBST([enable_lazy_lock]) + +dnl Automatically configure TLS. +if test "x${force_tls}" = "x1" ; then + enable_tls="1" +elif test "x${force_tls}" = "x0" ; then + enable_tls="0" +else + enable_tls="1" +fi +if test "x${enable_tls}" = "x1" ; then +AC_MSG_CHECKING([for TLS]) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM( +[[ + __thread int x; +]], [[ + x = 42; + + return 0; +]])], + AC_MSG_RESULT([yes]), + AC_MSG_RESULT([no]) + enable_tls="0") +else + enable_tls="0" +fi +AC_SUBST([enable_tls]) +if test "x${enable_tls}" = "x1" ; then + AC_DEFINE_UNQUOTED([JEMALLOC_TLS], [ ], [ ]) +fi + +dnl ============================================================================ +dnl Check for C11 atomics. + +JE_COMPILABLE([C11 atomics], [ +#include +#if (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) +#include +#else +#error Atomics not available +#endif +], [ + uint64_t *p = (uint64_t *)0; + uint64_t x = 1; + volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p; + uint64_t r = atomic_fetch_add(a, x) + x; + return r == 0; +], [je_cv_c11_atomics]) +if test "x${je_cv_c11_atomics}" = "xyes" ; then + AC_DEFINE([JEMALLOC_C11_ATOMICS], [ ], [ ]) +fi + +dnl ============================================================================ +dnl Check for GCC-style __atomic atomics. + +JE_COMPILABLE([GCC __atomic atomics], [ +], [ + int x = 0; + int val = 1; + int y = __atomic_fetch_add(&x, val, __ATOMIC_RELAXED); + int after_add = x; + return after_add == 1; +], [je_cv_gcc_atomic_atomics]) +if test "x${je_cv_gcc_atomic_atomics}" = "xyes" ; then + AC_DEFINE([JEMALLOC_GCC_ATOMIC_ATOMICS], [ ], [ ]) + + dnl check for 8-bit atomic support + JE_COMPILABLE([GCC 8-bit __atomic atomics], [ + ], [ + unsigned char x = 0; + int val = 1; + int y = __atomic_fetch_add(&x, val, __ATOMIC_RELAXED); + int after_add = (int)x; + return after_add == 1; + ], [je_cv_gcc_u8_atomic_atomics]) + if test "x${je_cv_gcc_u8_atomic_atomics}" = "xyes" ; then + AC_DEFINE([JEMALLOC_GCC_U8_ATOMIC_ATOMICS], [ ], [ ]) + fi +fi + +dnl ============================================================================ +dnl Check for GCC-style __sync atomics. + +JE_COMPILABLE([GCC __sync atomics], [ +], [ + int x = 0; + int before_add = __sync_fetch_and_add(&x, 1); + int after_add = x; + return (before_add == 0) && (after_add == 1); +], [je_cv_gcc_sync_atomics]) +if test "x${je_cv_gcc_sync_atomics}" = "xyes" ; then + AC_DEFINE([JEMALLOC_GCC_SYNC_ATOMICS], [ ], [ ]) + + dnl check for 8-bit atomic support + JE_COMPILABLE([GCC 8-bit __sync atomics], [ + ], [ + unsigned char x = 0; + int before_add = __sync_fetch_and_add(&x, 1); + int after_add = (int)x; + return (before_add == 0) && (after_add == 1); + ], [je_cv_gcc_u8_sync_atomics]) + if test "x${je_cv_gcc_u8_sync_atomics}" = "xyes" ; then + AC_DEFINE([JEMALLOC_GCC_U8_SYNC_ATOMICS], [ ], [ ]) + fi +fi + +dnl ============================================================================ +dnl Check for atomic(3) operations as provided on Darwin. +dnl We need this not for the atomic operations (which are provided above), but +dnl rather for the OS_unfair_lock type it exposes. + +JE_COMPILABLE([Darwin OSAtomic*()], [ +#include +#include +], [ + { + int32_t x32 = 0; + volatile int32_t *x32p = &x32; + OSAtomicAdd32(1, x32p); + } + { + int64_t x64 = 0; + volatile int64_t *x64p = &x64; + OSAtomicAdd64(1, x64p); + } +], [je_cv_osatomic]) +if test "x${je_cv_osatomic}" = "xyes" ; then + AC_DEFINE([JEMALLOC_OSATOMIC], [ ], [ ]) +fi + +dnl ============================================================================ +dnl Check for madvise(2). + +JE_COMPILABLE([madvise(2)], [ +#include +], [ + madvise((void *)0, 0, 0); +], [je_cv_madvise]) +if test "x${je_cv_madvise}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_MADVISE], [ ], [ ]) + + dnl Check for madvise(..., MADV_FREE). + JE_COMPILABLE([madvise(..., MADV_FREE)], [ +#include +], [ + madvise((void *)0, 0, MADV_FREE); +], [je_cv_madv_free]) + if test "x${je_cv_madv_free}" = "xyes" ; then + AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ], [ ]) + elif test "x${je_cv_madvise}" = "xyes" ; then + case "${host_cpu}" in i686|x86_64) + case "${host}" in *-*-linux*) + AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ], [ ]) + AC_DEFINE([JEMALLOC_DEFINE_MADVISE_FREE], [ ], [ ]) + ;; + esac + ;; + esac + fi + + dnl Check for madvise(..., MADV_DONTNEED). + JE_COMPILABLE([madvise(..., MADV_DONTNEED)], [ +#include +], [ + madvise((void *)0, 0, MADV_DONTNEED); +], [je_cv_madv_dontneed]) + if test "x${je_cv_madv_dontneed}" = "xyes" ; then + AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED], [ ], [ ]) + fi + + dnl Check for madvise(..., MADV_DO[NT]DUMP). + JE_COMPILABLE([madvise(..., MADV_DO[[NT]]DUMP)], [ +#include +], [ + madvise((void *)0, 0, MADV_DONTDUMP); + madvise((void *)0, 0, MADV_DODUMP); +], [je_cv_madv_dontdump]) + if test "x${je_cv_madv_dontdump}" = "xyes" ; then + AC_DEFINE([JEMALLOC_MADVISE_DONTDUMP], [ ], [ ]) + fi + + dnl Check for madvise(..., MADV_[NO]HUGEPAGE). + JE_COMPILABLE([madvise(..., MADV_[[NO]]HUGEPAGE)], [ +#include +], [ + madvise((void *)0, 0, MADV_HUGEPAGE); + madvise((void *)0, 0, MADV_NOHUGEPAGE); +], [je_cv_thp]) + dnl Check for madvise(..., MADV_[NO]CORE). + JE_COMPILABLE([madvise(..., MADV_[[NO]]CORE)], [ +#include +], [ + madvise((void *)0, 0, MADV_NOCORE); + madvise((void *)0, 0, MADV_CORE); +], [je_cv_madv_nocore]) + if test "x${je_cv_madv_nocore}" = "xyes" ; then + AC_DEFINE([JEMALLOC_MADVISE_NOCORE], [ ], [ ]) + fi +case "${host_cpu}" in + arm*) + ;; + *) + if test "x${je_cv_thp}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_MADVISE_HUGE], [ ], [ ]) + fi + ;; +esac +else + dnl Check for posix_madvise. + JE_COMPILABLE([posix_madvise], [ + #include + ], [ + posix_madvise((void *)0, 0, 0); + ], [je_cv_posix_madvise]) + if test "x${je_cv_posix_madvise}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_POSIX_MADVISE], [ ], [ ]) + + dnl Check for posix_madvise(..., POSIX_MADV_DONTNEED). + JE_COMPILABLE([posix_madvise(..., POSIX_MADV_DONTNEED)], [ + #include + ], [ + posix_madvise((void *)0, 0, POSIX_MADV_DONTNEED); + ], [je_cv_posix_madv_dontneed]) + if test "x${je_cv_posix_madv_dontneed}" = "xyes" ; then + AC_DEFINE([JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED], [ ], [ ]) + fi + fi +fi + +dnl ============================================================================ +dnl Check for mprotect(2). + +JE_COMPILABLE([mprotect(2)], [ +#include +], [ + mprotect((void *)0, 0, PROT_NONE); +], [je_cv_mprotect]) +if test "x${je_cv_mprotect}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_MPROTECT], [ ], [ ]) +fi + +dnl ============================================================================ +dnl Check for __builtin_clz(), __builtin_clzl(), and __builtin_clzll(). + +AC_CACHE_CHECK([for __builtin_clz], + [je_cv_builtin_clz], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([], + [ + { + unsigned x = 0; + int y = __builtin_clz(x); + } + { + unsigned long x = 0; + int y = __builtin_clzl(x); + } + { + unsigned long long x = 0; + int y = __builtin_clzll(x); + } + ])], + [je_cv_builtin_clz=yes], + [je_cv_builtin_clz=no])]) + +if test "x${je_cv_builtin_clz}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_BUILTIN_CLZ], [ ], [ ]) +fi + +dnl ============================================================================ +dnl Check for os_unfair_lock operations as provided on Darwin. + +JE_COMPILABLE([Darwin os_unfair_lock_*()], [ +#include +#include +], [ + #if MAC_OS_X_VERSION_MIN_REQUIRED < 101200 + #error "os_unfair_lock is not supported" + #else + os_unfair_lock lock = OS_UNFAIR_LOCK_INIT; + os_unfair_lock_lock(&lock); + os_unfair_lock_unlock(&lock); + #endif +], [je_cv_os_unfair_lock]) +if test "x${je_cv_os_unfair_lock}" = "xyes" ; then + AC_DEFINE([JEMALLOC_OS_UNFAIR_LOCK], [ ], [ ]) +fi + +dnl ============================================================================ +dnl Darwin-related configuration. + +AC_ARG_ENABLE([zone-allocator], + [AS_HELP_STRING([--disable-zone-allocator], + [Disable zone allocator for Darwin])], +[if test "x$enable_zone_allocator" = "xno" ; then + enable_zone_allocator="0" +else + enable_zone_allocator="1" +fi +], +[if test "x${abi}" = "xmacho"; then + enable_zone_allocator="1" +fi +] +) +AC_SUBST([enable_zone_allocator]) + +if test "x${enable_zone_allocator}" = "x1" ; then + if test "x${abi}" != "xmacho"; then + AC_MSG_ERROR([--enable-zone-allocator is only supported on Darwin]) + fi + AC_DEFINE([JEMALLOC_ZONE], [ ], [ ]) +fi + +dnl ============================================================================ +dnl Use initial-exec TLS by default. +AC_ARG_ENABLE([initial-exec-tls], + [AS_HELP_STRING([--disable-initial-exec-tls], + [Disable the initial-exec tls model])], +[if test "x$enable_initial_exec_tls" = "xno" ; then + enable_initial_exec_tls="0" +else + enable_initial_exec_tls="1" +fi +], +[enable_initial_exec_tls="1"] +) +AC_SUBST([enable_initial_exec_tls]) + +if test "x${je_cv_tls_model}" = "xyes" -a \ + "x${enable_initial_exec_tls}" = "x1" ; then + AC_DEFINE([JEMALLOC_TLS_MODEL], + [__attribute__((tls_model("initial-exec")))], + [ ]) +else + AC_DEFINE([JEMALLOC_TLS_MODEL], [ ], [ ]) +fi + +dnl ============================================================================ +dnl Enable background threads if possible. + +if test "x${have_pthread}" = "x1" -a "x${je_cv_os_unfair_lock}" != "xyes" -a \ + "x${abi}" != "xmacho" ; then + AC_DEFINE([JEMALLOC_BACKGROUND_THREAD], [ ], [ ]) +fi + +dnl ============================================================================ +dnl Check for glibc malloc hooks + +if test "x$glibc" = "x1" ; then + JE_COMPILABLE([glibc malloc hook], [ + #include + + extern void (* __free_hook)(void *ptr); + extern void *(* __malloc_hook)(size_t size); + extern void *(* __realloc_hook)(void *ptr, size_t size); +], [ + void *ptr = 0L; + if (__malloc_hook) ptr = __malloc_hook(1); + if (__realloc_hook) ptr = __realloc_hook(ptr, 2); + if (__free_hook && ptr) __free_hook(ptr); +], [je_cv_glibc_malloc_hook]) + if test "x${je_cv_glibc_malloc_hook}" = "xyes" ; then + if test "x${JEMALLOC_PREFIX}" = "x" ; then + AC_DEFINE([JEMALLOC_GLIBC_MALLOC_HOOK], [ ], [ ]) + wrap_syms="${wrap_syms} __free_hook __malloc_hook __realloc_hook" + fi + fi + + JE_COMPILABLE([glibc memalign hook], [ + #include + + extern void *(* __memalign_hook)(size_t alignment, size_t size); +], [ + void *ptr = 0L; + if (__memalign_hook) ptr = __memalign_hook(16, 7); +], [je_cv_glibc_memalign_hook]) + if test "x${je_cv_glibc_memalign_hook}" = "xyes" ; then + if test "x${JEMALLOC_PREFIX}" = "x" ; then + AC_DEFINE([JEMALLOC_GLIBC_MEMALIGN_HOOK], [ ], [ ]) + wrap_syms="${wrap_syms} __memalign_hook" + fi + fi +fi + +JE_COMPILABLE([pthreads adaptive mutexes], [ +#include +], [ + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP); + pthread_mutexattr_destroy(&attr); +], [je_cv_pthread_mutex_adaptive_np]) +if test "x${je_cv_pthread_mutex_adaptive_np}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP], [ ], [ ]) +fi + +JE_CFLAGS_SAVE() +JE_CFLAGS_ADD([-D_GNU_SOURCE]) +JE_CFLAGS_ADD([-Werror]) +JE_CFLAGS_ADD([-herror_on_warning]) +JE_COMPILABLE([strerror_r returns char with gnu source], [ +#include +#include +#include +#include +], [ + char *buffer = (char *) malloc(100); + char *error = strerror_r(EINVAL, buffer, 100); + printf("%s\n", error); +], [je_cv_strerror_r_returns_char_with_gnu_source]) +JE_CFLAGS_RESTORE() +if test "x${je_cv_strerror_r_returns_char_with_gnu_source}" = "xyes" ; then + AC_DEFINE([JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE], [ ], [ ]) +fi + +dnl ============================================================================ +dnl Check for typedefs, structures, and compiler characteristics. +AC_HEADER_STDBOOL + +dnl ============================================================================ +dnl Define commands that generate output files. + +AC_CONFIG_COMMANDS([include/jemalloc/internal/public_symbols.txt], [ + f="${objroot}include/jemalloc/internal/public_symbols.txt" + mkdir -p "${objroot}include/jemalloc/internal" + cp /dev/null "${f}" + for nm in `echo ${mangling_map} |tr ',' ' '` ; do + n=`echo ${nm} |tr ':' ' ' |awk '{print $[]1}'` + m=`echo ${nm} |tr ':' ' ' |awk '{print $[]2}'` + echo "${n}:${m}" >> "${f}" + dnl Remove name from public_syms so that it isn't redefined later. + public_syms=`for sym in ${public_syms}; do echo "${sym}"; done |grep -v "^${n}\$" |tr '\n' ' '` + done + for sym in ${public_syms} ; do + n="${sym}" + m="${JEMALLOC_PREFIX}${sym}" + echo "${n}:${m}" >> "${f}" + done +], [ + srcdir="${srcdir}" + objroot="${objroot}" + mangling_map="${mangling_map}" + public_syms="${public_syms}" + JEMALLOC_PREFIX="${JEMALLOC_PREFIX}" +]) +AC_CONFIG_COMMANDS([include/jemalloc/internal/private_symbols.awk], [ + f="${objroot}include/jemalloc/internal/private_symbols.awk" + mkdir -p "${objroot}include/jemalloc/internal" + export_syms=`for sym in ${public_syms}; do echo "${JEMALLOC_PREFIX}${sym}"; done; for sym in ${wrap_syms}; do echo "${sym}"; done;` + "${srcdir}/include/jemalloc/internal/private_symbols.sh" "${SYM_PREFIX}" ${export_syms} > "${objroot}include/jemalloc/internal/private_symbols.awk" +], [ + srcdir="${srcdir}" + objroot="${objroot}" + public_syms="${public_syms}" + wrap_syms="${wrap_syms}" + SYM_PREFIX="${SYM_PREFIX}" + JEMALLOC_PREFIX="${JEMALLOC_PREFIX}" +]) +AC_CONFIG_COMMANDS([include/jemalloc/internal/private_symbols_jet.awk], [ + f="${objroot}include/jemalloc/internal/private_symbols_jet.awk" + mkdir -p "${objroot}include/jemalloc/internal" + export_syms=`for sym in ${public_syms}; do echo "jet_${sym}"; done; for sym in ${wrap_syms}; do echo "${sym}"; done;` + "${srcdir}/include/jemalloc/internal/private_symbols.sh" "${SYM_PREFIX}" ${export_syms} > "${objroot}include/jemalloc/internal/private_symbols_jet.awk" +], [ + srcdir="${srcdir}" + objroot="${objroot}" + public_syms="${public_syms}" + wrap_syms="${wrap_syms}" + SYM_PREFIX="${SYM_PREFIX}" +]) +AC_CONFIG_COMMANDS([include/jemalloc/internal/public_namespace.h], [ + mkdir -p "${objroot}include/jemalloc/internal" + "${srcdir}/include/jemalloc/internal/public_namespace.sh" "${objroot}include/jemalloc/internal/public_symbols.txt" > "${objroot}include/jemalloc/internal/public_namespace.h" +], [ + srcdir="${srcdir}" + objroot="${objroot}" +]) +AC_CONFIG_COMMANDS([include/jemalloc/internal/public_unnamespace.h], [ + mkdir -p "${objroot}include/jemalloc/internal" + "${srcdir}/include/jemalloc/internal/public_unnamespace.sh" "${objroot}include/jemalloc/internal/public_symbols.txt" > "${objroot}include/jemalloc/internal/public_unnamespace.h" +], [ + srcdir="${srcdir}" + objroot="${objroot}" +]) +AC_CONFIG_COMMANDS([include/jemalloc/jemalloc_protos_jet.h], [ + mkdir -p "${objroot}include/jemalloc" + cat "${srcdir}/include/jemalloc/jemalloc_protos.h.in" | sed -e 's/@je_@/jet_/g' > "${objroot}include/jemalloc/jemalloc_protos_jet.h" +], [ + srcdir="${srcdir}" + objroot="${objroot}" +]) +AC_CONFIG_COMMANDS([include/jemalloc/jemalloc_rename.h], [ + mkdir -p "${objroot}include/jemalloc" + "${srcdir}/include/jemalloc/jemalloc_rename.sh" "${objroot}include/jemalloc/internal/public_symbols.txt" > "${objroot}include/jemalloc/jemalloc_rename.h" +], [ + srcdir="${srcdir}" + objroot="${objroot}" +]) +AC_CONFIG_COMMANDS([include/jemalloc/jemalloc_mangle.h], [ + mkdir -p "${objroot}include/jemalloc" + "${srcdir}/include/jemalloc/jemalloc_mangle.sh" "${objroot}include/jemalloc/internal/public_symbols.txt" je_ > "${objroot}include/jemalloc/jemalloc_mangle.h" +], [ + srcdir="${srcdir}" + objroot="${objroot}" +]) +AC_CONFIG_COMMANDS([include/jemalloc/jemalloc_mangle_jet.h], [ + mkdir -p "${objroot}include/jemalloc" + "${srcdir}/include/jemalloc/jemalloc_mangle.sh" "${objroot}include/jemalloc/internal/public_symbols.txt" jet_ > "${objroot}include/jemalloc/jemalloc_mangle_jet.h" +], [ + srcdir="${srcdir}" + objroot="${objroot}" +]) +AC_CONFIG_COMMANDS([include/jemalloc/jemalloc.h], [ + mkdir -p "${objroot}include/jemalloc" + "${srcdir}/include/jemalloc/jemalloc.sh" "${objroot}" > "${objroot}include/jemalloc/jemalloc${install_suffix}.h" +], [ + srcdir="${srcdir}" + objroot="${objroot}" + install_suffix="${install_suffix}" +]) + +dnl Process .in files. +AC_SUBST([cfghdrs_in]) +AC_SUBST([cfghdrs_out]) +AC_CONFIG_HEADERS([$cfghdrs_tup]) + +dnl ============================================================================ +dnl Generate outputs. + +AC_CONFIG_FILES([$cfgoutputs_tup config.stamp bin/jemalloc-config bin/jemalloc.sh bin/jeprof]) +AC_SUBST([cfgoutputs_in]) +AC_SUBST([cfgoutputs_out]) +AC_OUTPUT + +dnl ============================================================================ +dnl Print out the results of configuration. +AC_MSG_RESULT([===============================================================================]) +AC_MSG_RESULT([jemalloc version : ${jemalloc_version}]) +AC_MSG_RESULT([library revision : ${rev}]) +AC_MSG_RESULT([]) +AC_MSG_RESULT([CONFIG : ${CONFIG}]) +AC_MSG_RESULT([CC : ${CC}]) +AC_MSG_RESULT([CONFIGURE_CFLAGS : ${CONFIGURE_CFLAGS}]) +AC_MSG_RESULT([SPECIFIED_CFLAGS : ${SPECIFIED_CFLAGS}]) +AC_MSG_RESULT([EXTRA_CFLAGS : ${EXTRA_CFLAGS}]) +AC_MSG_RESULT([CPPFLAGS : ${CPPFLAGS}]) +AC_MSG_RESULT([CXX : ${CXX}]) +AC_MSG_RESULT([CONFIGURE_CXXFLAGS : ${CONFIGURE_CXXFLAGS}]) +AC_MSG_RESULT([SPECIFIED_CXXFLAGS : ${SPECIFIED_CXXFLAGS}]) +AC_MSG_RESULT([EXTRA_CXXFLAGS : ${EXTRA_CXXFLAGS}]) +AC_MSG_RESULT([LDFLAGS : ${LDFLAGS}]) +AC_MSG_RESULT([EXTRA_LDFLAGS : ${EXTRA_LDFLAGS}]) +AC_MSG_RESULT([DSO_LDFLAGS : ${DSO_LDFLAGS}]) +AC_MSG_RESULT([LIBS : ${LIBS}]) +AC_MSG_RESULT([RPATH_EXTRA : ${RPATH_EXTRA}]) +AC_MSG_RESULT([]) +AC_MSG_RESULT([XSLTPROC : ${XSLTPROC}]) +AC_MSG_RESULT([XSLROOT : ${XSLROOT}]) +AC_MSG_RESULT([]) +AC_MSG_RESULT([PREFIX : ${PREFIX}]) +AC_MSG_RESULT([BINDIR : ${BINDIR}]) +AC_MSG_RESULT([DATADIR : ${DATADIR}]) +AC_MSG_RESULT([INCLUDEDIR : ${INCLUDEDIR}]) +AC_MSG_RESULT([LIBDIR : ${LIBDIR}]) +AC_MSG_RESULT([MANDIR : ${MANDIR}]) +AC_MSG_RESULT([]) +AC_MSG_RESULT([srcroot : ${srcroot}]) +AC_MSG_RESULT([abs_srcroot : ${abs_srcroot}]) +AC_MSG_RESULT([objroot : ${objroot}]) +AC_MSG_RESULT([abs_objroot : ${abs_objroot}]) +AC_MSG_RESULT([]) +AC_MSG_RESULT([JEMALLOC_PREFIX : ${JEMALLOC_PREFIX}]) +AC_MSG_RESULT([JEMALLOC_PRIVATE_NAMESPACE]) +AC_MSG_RESULT([ : ${JEMALLOC_PRIVATE_NAMESPACE}]) +AC_MSG_RESULT([install_suffix : ${install_suffix}]) +AC_MSG_RESULT([malloc_conf : ${config_malloc_conf}]) +AC_MSG_RESULT([documentation : ${enable_doc}]) +AC_MSG_RESULT([shared libs : ${enable_shared}]) +AC_MSG_RESULT([static libs : ${enable_static}]) +AC_MSG_RESULT([autogen : ${enable_autogen}]) +AC_MSG_RESULT([debug : ${enable_debug}]) +AC_MSG_RESULT([stats : ${enable_stats}]) +AC_MSG_RESULT([experimental_smallocx : ${enable_experimental_smallocx}]) +AC_MSG_RESULT([prof : ${enable_prof}]) +AC_MSG_RESULT([prof-libunwind : ${enable_prof_libunwind}]) +AC_MSG_RESULT([prof-libgcc : ${enable_prof_libgcc}]) +AC_MSG_RESULT([prof-gcc : ${enable_prof_gcc}]) +AC_MSG_RESULT([fill : ${enable_fill}]) +AC_MSG_RESULT([utrace : ${enable_utrace}]) +AC_MSG_RESULT([xmalloc : ${enable_xmalloc}]) +AC_MSG_RESULT([log : ${enable_log}]) +AC_MSG_RESULT([lazy_lock : ${enable_lazy_lock}]) +AC_MSG_RESULT([cache-oblivious : ${enable_cache_oblivious}]) +AC_MSG_RESULT([cxx : ${enable_cxx}]) +AC_MSG_RESULT([===============================================================================]) diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/activity_callback.h b/BeefRT/JEMalloc/include/jemalloc/internal/activity_callback.h new file mode 100644 index 00000000..6c2e84e3 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/activity_callback.h @@ -0,0 +1,23 @@ +#ifndef JEMALLOC_INTERNAL_ACTIVITY_CALLBACK_H +#define JEMALLOC_INTERNAL_ACTIVITY_CALLBACK_H + +/* + * The callback to be executed "periodically", in response to some amount of + * allocator activity. + * + * This callback need not be computing any sort of peak (although that's the + * intended first use case), but we drive it from the peak counter, so it's + * keeps things tidy to keep it here. + * + * The calls to this thunk get driven by the peak_event module. + */ +#define ACTIVITY_CALLBACK_THUNK_INITIALIZER {NULL, NULL} +typedef void (*activity_callback_t)(void *uctx, uint64_t allocated, + uint64_t deallocated); +typedef struct activity_callback_thunk_s activity_callback_thunk_t; +struct activity_callback_thunk_s { + activity_callback_t callback; + void *uctx; +}; + +#endif /* JEMALLOC_INTERNAL_ACTIVITY_CALLBACK_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/arena_externs.h b/BeefRT/JEMalloc/include/jemalloc/internal/arena_externs.h new file mode 100644 index 00000000..e6fceaaf --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/arena_externs.h @@ -0,0 +1,121 @@ +#ifndef JEMALLOC_INTERNAL_ARENA_EXTERNS_H +#define JEMALLOC_INTERNAL_ARENA_EXTERNS_H + +#include "jemalloc/internal/bin.h" +#include "jemalloc/internal/div.h" +#include "jemalloc/internal/extent_dss.h" +#include "jemalloc/internal/hook.h" +#include "jemalloc/internal/pages.h" +#include "jemalloc/internal/stats.h" + +/* + * When the amount of pages to be purged exceeds this amount, deferred purge + * should happen. + */ +#define ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD UINT64_C(1024) + +extern ssize_t opt_dirty_decay_ms; +extern ssize_t opt_muzzy_decay_ms; + +extern percpu_arena_mode_t opt_percpu_arena; +extern const char *percpu_arena_mode_names[]; + +extern div_info_t arena_binind_div_info[SC_NBINS]; + +extern malloc_mutex_t arenas_lock; +extern emap_t arena_emap_global; + +extern size_t opt_oversize_threshold; +extern size_t oversize_threshold; + +/* + * arena_bin_offsets[binind] is the offset of the first bin shard for size class + * binind. + */ +extern uint32_t arena_bin_offsets[SC_NBINS]; + +void arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena, + unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms, + ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy); +void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, + const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms, + size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats, + bin_stats_data_t *bstats, arena_stats_large_t *lstats, + pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats); +void arena_handle_deferred_work(tsdn_t *tsdn, arena_t *arena); +edata_t *arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena, + size_t usize, size_t alignment, bool zero); +void arena_extent_dalloc_large_prep(tsdn_t *tsdn, arena_t *arena, + edata_t *edata); +void arena_extent_ralloc_large_shrink(tsdn_t *tsdn, arena_t *arena, + edata_t *edata, size_t oldsize); +void arena_extent_ralloc_large_expand(tsdn_t *tsdn, arena_t *arena, + edata_t *edata, size_t oldsize); +bool arena_decay_ms_set(tsdn_t *tsdn, arena_t *arena, extent_state_t state, + ssize_t decay_ms); +ssize_t arena_decay_ms_get(arena_t *arena, extent_state_t state); +void arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread, + bool all); +uint64_t arena_time_until_deferred(tsdn_t *tsdn, arena_t *arena); +void arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena); +void arena_reset(tsd_t *tsd, arena_t *arena); +void arena_destroy(tsd_t *tsd, arena_t *arena); +void arena_cache_bin_fill_small(tsdn_t *tsdn, arena_t *arena, + cache_bin_t *cache_bin, cache_bin_info_t *cache_bin_info, szind_t binind, + const unsigned nfill); + +void *arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size, + szind_t ind, bool zero); +void *arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, + size_t alignment, bool zero, tcache_t *tcache); +void arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize); +void arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache, + bool slow_path); +void arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, edata_t *slab); + +void arena_dalloc_bin_locked_handle_newly_empty(tsdn_t *tsdn, arena_t *arena, + edata_t *slab, bin_t *bin); +void arena_dalloc_bin_locked_handle_newly_nonempty(tsdn_t *tsdn, arena_t *arena, + edata_t *slab, bin_t *bin); +void arena_dalloc_small(tsdn_t *tsdn, void *ptr); +bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, + size_t extra, bool zero, size_t *newsize); +void *arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize, + size_t size, size_t alignment, bool zero, tcache_t *tcache, + hook_ralloc_args_t *hook_args); +dss_prec_t arena_dss_prec_get(arena_t *arena); +ehooks_t *arena_get_ehooks(arena_t *arena); +extent_hooks_t *arena_set_extent_hooks(tsd_t *tsd, arena_t *arena, + extent_hooks_t *extent_hooks); +bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec); +ssize_t arena_dirty_decay_ms_default_get(void); +bool arena_dirty_decay_ms_default_set(ssize_t decay_ms); +ssize_t arena_muzzy_decay_ms_default_get(void); +bool arena_muzzy_decay_ms_default_set(ssize_t decay_ms); +bool arena_retain_grow_limit_get_set(tsd_t *tsd, arena_t *arena, + size_t *old_limit, size_t *new_limit); +unsigned arena_nthreads_get(arena_t *arena, bool internal); +void arena_nthreads_inc(arena_t *arena, bool internal); +void arena_nthreads_dec(arena_t *arena, bool internal); +arena_t *arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config); +bool arena_init_huge(void); +bool arena_is_huge(unsigned arena_ind); +arena_t *arena_choose_huge(tsd_t *tsd); +bin_t *arena_bin_choose(tsdn_t *tsdn, arena_t *arena, szind_t binind, + unsigned *binshard); +size_t arena_fill_small_fresh(tsdn_t *tsdn, arena_t *arena, szind_t binind, + void **ptrs, size_t nfill, bool zero); +bool arena_boot(sc_data_t *sc_data, base_t *base, bool hpa); +void arena_prefork0(tsdn_t *tsdn, arena_t *arena); +void arena_prefork1(tsdn_t *tsdn, arena_t *arena); +void arena_prefork2(tsdn_t *tsdn, arena_t *arena); +void arena_prefork3(tsdn_t *tsdn, arena_t *arena); +void arena_prefork4(tsdn_t *tsdn, arena_t *arena); +void arena_prefork5(tsdn_t *tsdn, arena_t *arena); +void arena_prefork6(tsdn_t *tsdn, arena_t *arena); +void arena_prefork7(tsdn_t *tsdn, arena_t *arena); +void arena_prefork8(tsdn_t *tsdn, arena_t *arena); +void arena_postfork_parent(tsdn_t *tsdn, arena_t *arena); +void arena_postfork_child(tsdn_t *tsdn, arena_t *arena); + +#endif /* JEMALLOC_INTERNAL_ARENA_EXTERNS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/arena_inlines_a.h b/BeefRT/JEMalloc/include/jemalloc/internal/arena_inlines_a.h new file mode 100644 index 00000000..8568358c --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/arena_inlines_a.h @@ -0,0 +1,24 @@ +#ifndef JEMALLOC_INTERNAL_ARENA_INLINES_A_H +#define JEMALLOC_INTERNAL_ARENA_INLINES_A_H + +static inline unsigned +arena_ind_get(const arena_t *arena) { + return arena->ind; +} + +static inline void +arena_internal_add(arena_t *arena, size_t size) { + atomic_fetch_add_zu(&arena->stats.internal, size, ATOMIC_RELAXED); +} + +static inline void +arena_internal_sub(arena_t *arena, size_t size) { + atomic_fetch_sub_zu(&arena->stats.internal, size, ATOMIC_RELAXED); +} + +static inline size_t +arena_internal_get(arena_t *arena) { + return atomic_load_zu(&arena->stats.internal, ATOMIC_RELAXED); +} + +#endif /* JEMALLOC_INTERNAL_ARENA_INLINES_A_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/arena_inlines_b.h b/BeefRT/JEMalloc/include/jemalloc/internal/arena_inlines_b.h new file mode 100644 index 00000000..fa81537c --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/arena_inlines_b.h @@ -0,0 +1,550 @@ +#ifndef JEMALLOC_INTERNAL_ARENA_INLINES_B_H +#define JEMALLOC_INTERNAL_ARENA_INLINES_B_H + +#include "jemalloc/internal/div.h" +#include "jemalloc/internal/emap.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/safety_check.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/sz.h" +#include "jemalloc/internal/ticker.h" + +static inline arena_t * +arena_get_from_edata(edata_t *edata) { + return (arena_t *)atomic_load_p(&arenas[edata_arena_ind_get(edata)], + ATOMIC_RELAXED); +} + +JEMALLOC_ALWAYS_INLINE arena_t * +arena_choose_maybe_huge(tsd_t *tsd, arena_t *arena, size_t size) { + if (arena != NULL) { + return arena; + } + + /* + * For huge allocations, use the dedicated huge arena if both are true: + * 1) is using auto arena selection (i.e. arena == NULL), and 2) the + * thread is not assigned to a manual arena. + */ + if (unlikely(size >= oversize_threshold)) { + arena_t *tsd_arena = tsd_arena_get(tsd); + if (tsd_arena == NULL || arena_is_auto(tsd_arena)) { + return arena_choose_huge(tsd); + } + } + + return arena_choose(tsd, NULL); +} + +JEMALLOC_ALWAYS_INLINE void +arena_prof_info_get(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx, + prof_info_t *prof_info, bool reset_recent) { + cassert(config_prof); + assert(ptr != NULL); + assert(prof_info != NULL); + + edata_t *edata = NULL; + bool is_slab; + + /* Static check. */ + if (alloc_ctx == NULL) { + edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global, + ptr); + is_slab = edata_slab_get(edata); + } else if (unlikely(!(is_slab = alloc_ctx->slab))) { + edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global, + ptr); + } + + if (unlikely(!is_slab)) { + /* edata must have been initialized at this point. */ + assert(edata != NULL); + large_prof_info_get(tsd, edata, prof_info, reset_recent); + } else { + prof_info->alloc_tctx = (prof_tctx_t *)(uintptr_t)1U; + /* + * No need to set other fields in prof_info; they will never be + * accessed if (uintptr_t)alloc_tctx == (uintptr_t)1U. + */ + } +} + +JEMALLOC_ALWAYS_INLINE void +arena_prof_tctx_reset(tsd_t *tsd, const void *ptr, + emap_alloc_ctx_t *alloc_ctx) { + cassert(config_prof); + assert(ptr != NULL); + + /* Static check. */ + if (alloc_ctx == NULL) { + edata_t *edata = emap_edata_lookup(tsd_tsdn(tsd), + &arena_emap_global, ptr); + if (unlikely(!edata_slab_get(edata))) { + large_prof_tctx_reset(edata); + } + } else { + if (unlikely(!alloc_ctx->slab)) { + edata_t *edata = emap_edata_lookup(tsd_tsdn(tsd), + &arena_emap_global, ptr); + large_prof_tctx_reset(edata); + } + } +} + +JEMALLOC_ALWAYS_INLINE void +arena_prof_tctx_reset_sampled(tsd_t *tsd, const void *ptr) { + cassert(config_prof); + assert(ptr != NULL); + + edata_t *edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global, + ptr); + assert(!edata_slab_get(edata)); + + large_prof_tctx_reset(edata); +} + +JEMALLOC_ALWAYS_INLINE void +arena_prof_info_set(tsd_t *tsd, edata_t *edata, prof_tctx_t *tctx, + size_t size) { + cassert(config_prof); + + assert(!edata_slab_get(edata)); + large_prof_info_set(edata, tctx, size); +} + +JEMALLOC_ALWAYS_INLINE void +arena_decay_ticks(tsdn_t *tsdn, arena_t *arena, unsigned nticks) { + if (unlikely(tsdn_null(tsdn))) { + return; + } + tsd_t *tsd = tsdn_tsd(tsdn); + /* + * We use the ticker_geom_t to avoid having per-arena state in the tsd. + * Instead of having a countdown-until-decay timer running for every + * arena in every thread, we flip a coin once per tick, whose + * probability of coming up heads is 1/nticks; this is effectively the + * operation of the ticker_geom_t. Each arena has the same chance of a + * coinflip coming up heads (1/ARENA_DECAY_NTICKS_PER_UPDATE), so we can + * use a single ticker for all of them. + */ + ticker_geom_t *decay_ticker = tsd_arena_decay_tickerp_get(tsd); + uint64_t *prng_state = tsd_prng_statep_get(tsd); + if (unlikely(ticker_geom_ticks(decay_ticker, prng_state, nticks))) { + arena_decay(tsdn, arena, false, false); + } +} + +JEMALLOC_ALWAYS_INLINE void +arena_decay_tick(tsdn_t *tsdn, arena_t *arena) { + arena_decay_ticks(tsdn, arena, 1); +} + +JEMALLOC_ALWAYS_INLINE void * +arena_malloc(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, bool zero, + tcache_t *tcache, bool slow_path) { + assert(!tsdn_null(tsdn) || tcache == NULL); + + if (likely(tcache != NULL)) { + if (likely(size <= SC_SMALL_MAXCLASS)) { + return tcache_alloc_small(tsdn_tsd(tsdn), arena, + tcache, size, ind, zero, slow_path); + } + if (likely(size <= tcache_maxclass)) { + return tcache_alloc_large(tsdn_tsd(tsdn), arena, + tcache, size, ind, zero, slow_path); + } + /* (size > tcache_maxclass) case falls through. */ + assert(size > tcache_maxclass); + } + + return arena_malloc_hard(tsdn, arena, size, ind, zero); +} + +JEMALLOC_ALWAYS_INLINE arena_t * +arena_aalloc(tsdn_t *tsdn, const void *ptr) { + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + unsigned arena_ind = edata_arena_ind_get(edata); + return (arena_t *)atomic_load_p(&arenas[arena_ind], ATOMIC_RELAXED); +} + +JEMALLOC_ALWAYS_INLINE size_t +arena_salloc(tsdn_t *tsdn, const void *ptr) { + assert(ptr != NULL); + emap_alloc_ctx_t alloc_ctx; + emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx); + assert(alloc_ctx.szind != SC_NSIZES); + + return sz_index2size(alloc_ctx.szind); +} + +JEMALLOC_ALWAYS_INLINE size_t +arena_vsalloc(tsdn_t *tsdn, const void *ptr) { + /* + * Return 0 if ptr is not within an extent managed by jemalloc. This + * function has two extra costs relative to isalloc(): + * - The rtree calls cannot claim to be dependent lookups, which induces + * rtree lookup load dependencies. + * - The lookup may fail, so there is an extra branch to check for + * failure. + */ + + emap_full_alloc_ctx_t full_alloc_ctx; + bool missing = emap_full_alloc_ctx_try_lookup(tsdn, &arena_emap_global, + ptr, &full_alloc_ctx); + if (missing) { + return 0; + } + + if (full_alloc_ctx.edata == NULL) { + return 0; + } + assert(edata_state_get(full_alloc_ctx.edata) == extent_state_active); + /* Only slab members should be looked up via interior pointers. */ + assert(edata_addr_get(full_alloc_ctx.edata) == ptr + || edata_slab_get(full_alloc_ctx.edata)); + + assert(full_alloc_ctx.szind != SC_NSIZES); + + return sz_index2size(full_alloc_ctx.szind); +} + +JEMALLOC_ALWAYS_INLINE bool +large_dalloc_safety_checks(edata_t *edata, void *ptr, szind_t szind) { + if (!config_opt_safety_checks) { + return false; + } + + /* + * Eagerly detect double free and sized dealloc bugs for large sizes. + * The cost is low enough (as edata will be accessed anyway) to be + * enabled all the time. + */ + if (unlikely(edata == NULL || + edata_state_get(edata) != extent_state_active)) { + safety_check_fail("Invalid deallocation detected: " + "pages being freed (%p) not currently active, " + "possibly caused by double free bugs.", + (uintptr_t)edata_addr_get(edata)); + return true; + } + size_t input_size = sz_index2size(szind); + if (unlikely(input_size != edata_usize_get(edata))) { + safety_check_fail_sized_dealloc(/* current_dealloc */ true, ptr, + /* true_size */ edata_usize_get(edata), input_size); + return true; + } + + return false; +} + +static inline void +arena_dalloc_large_no_tcache(tsdn_t *tsdn, void *ptr, szind_t szind) { + if (config_prof && unlikely(szind < SC_NBINS)) { + arena_dalloc_promoted(tsdn, ptr, NULL, true); + } else { + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, + ptr); + if (large_dalloc_safety_checks(edata, ptr, szind)) { + /* See the comment in isfree. */ + return; + } + large_dalloc(tsdn, edata); + } +} + +static inline void +arena_dalloc_no_tcache(tsdn_t *tsdn, void *ptr) { + assert(ptr != NULL); + + emap_alloc_ctx_t alloc_ctx; + emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx); + + if (config_debug) { + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, + ptr); + assert(alloc_ctx.szind == edata_szind_get(edata)); + assert(alloc_ctx.szind < SC_NSIZES); + assert(alloc_ctx.slab == edata_slab_get(edata)); + } + + if (likely(alloc_ctx.slab)) { + /* Small allocation. */ + arena_dalloc_small(tsdn, ptr); + } else { + arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind); + } +} + +JEMALLOC_ALWAYS_INLINE void +arena_dalloc_large(tsdn_t *tsdn, void *ptr, tcache_t *tcache, szind_t szind, + bool slow_path) { + if (szind < nhbins) { + if (config_prof && unlikely(szind < SC_NBINS)) { + arena_dalloc_promoted(tsdn, ptr, tcache, slow_path); + } else { + tcache_dalloc_large(tsdn_tsd(tsdn), tcache, ptr, szind, + slow_path); + } + } else { + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, + ptr); + if (large_dalloc_safety_checks(edata, ptr, szind)) { + /* See the comment in isfree. */ + return; + } + large_dalloc(tsdn, edata); + } +} + +JEMALLOC_ALWAYS_INLINE void +arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache, + emap_alloc_ctx_t *caller_alloc_ctx, bool slow_path) { + assert(!tsdn_null(tsdn) || tcache == NULL); + assert(ptr != NULL); + + if (unlikely(tcache == NULL)) { + arena_dalloc_no_tcache(tsdn, ptr); + return; + } + + emap_alloc_ctx_t alloc_ctx; + if (caller_alloc_ctx != NULL) { + alloc_ctx = *caller_alloc_ctx; + } else { + util_assume(!tsdn_null(tsdn)); + emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, + &alloc_ctx); + } + + if (config_debug) { + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, + ptr); + assert(alloc_ctx.szind == edata_szind_get(edata)); + assert(alloc_ctx.szind < SC_NSIZES); + assert(alloc_ctx.slab == edata_slab_get(edata)); + } + + if (likely(alloc_ctx.slab)) { + /* Small allocation. */ + tcache_dalloc_small(tsdn_tsd(tsdn), tcache, ptr, + alloc_ctx.szind, slow_path); + } else { + arena_dalloc_large(tsdn, ptr, tcache, alloc_ctx.szind, + slow_path); + } +} + +static inline void +arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) { + assert(ptr != NULL); + assert(size <= SC_LARGE_MAXCLASS); + + emap_alloc_ctx_t alloc_ctx; + if (!config_prof || !opt_prof) { + /* + * There is no risk of being confused by a promoted sampled + * object, so base szind and slab on the given size. + */ + alloc_ctx.szind = sz_size2index(size); + alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS); + } + + if ((config_prof && opt_prof) || config_debug) { + emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, + &alloc_ctx); + + assert(alloc_ctx.szind == sz_size2index(size)); + assert((config_prof && opt_prof) + || alloc_ctx.slab == (alloc_ctx.szind < SC_NBINS)); + + if (config_debug) { + edata_t *edata = emap_edata_lookup(tsdn, + &arena_emap_global, ptr); + assert(alloc_ctx.szind == edata_szind_get(edata)); + assert(alloc_ctx.slab == edata_slab_get(edata)); + } + } + + if (likely(alloc_ctx.slab)) { + /* Small allocation. */ + arena_dalloc_small(tsdn, ptr); + } else { + arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind); + } +} + +JEMALLOC_ALWAYS_INLINE void +arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache, + emap_alloc_ctx_t *caller_alloc_ctx, bool slow_path) { + assert(!tsdn_null(tsdn) || tcache == NULL); + assert(ptr != NULL); + assert(size <= SC_LARGE_MAXCLASS); + + if (unlikely(tcache == NULL)) { + arena_sdalloc_no_tcache(tsdn, ptr, size); + return; + } + + emap_alloc_ctx_t alloc_ctx; + if (config_prof && opt_prof) { + if (caller_alloc_ctx == NULL) { + /* Uncommon case and should be a static check. */ + emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, + &alloc_ctx); + assert(alloc_ctx.szind == sz_size2index(size)); + } else { + alloc_ctx = *caller_alloc_ctx; + } + } else { + /* + * There is no risk of being confused by a promoted sampled + * object, so base szind and slab on the given size. + */ + alloc_ctx.szind = sz_size2index(size); + alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS); + } + + if (config_debug) { + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, + ptr); + assert(alloc_ctx.szind == edata_szind_get(edata)); + assert(alloc_ctx.slab == edata_slab_get(edata)); + } + + if (likely(alloc_ctx.slab)) { + /* Small allocation. */ + tcache_dalloc_small(tsdn_tsd(tsdn), tcache, ptr, + alloc_ctx.szind, slow_path); + } else { + arena_dalloc_large(tsdn, ptr, tcache, alloc_ctx.szind, + slow_path); + } +} + +static inline void +arena_cache_oblivious_randomize(tsdn_t *tsdn, arena_t *arena, edata_t *edata, + size_t alignment) { + assert(edata_base_get(edata) == edata_addr_get(edata)); + + if (alignment < PAGE) { + unsigned lg_range = LG_PAGE - + lg_floor(CACHELINE_CEILING(alignment)); + size_t r; + if (!tsdn_null(tsdn)) { + tsd_t *tsd = tsdn_tsd(tsdn); + r = (size_t)prng_lg_range_u64( + tsd_prng_statep_get(tsd), lg_range); + } else { + uint64_t stack_value = (uint64_t)(uintptr_t)&r; + r = (size_t)prng_lg_range_u64(&stack_value, lg_range); + } + uintptr_t random_offset = ((uintptr_t)r) << (LG_PAGE - + lg_range); + edata->e_addr = (void *)((uintptr_t)edata->e_addr + + random_offset); + assert(ALIGNMENT_ADDR2BASE(edata->e_addr, alignment) == + edata->e_addr); + } +} + +/* + * The dalloc bin info contains just the information that the common paths need + * during tcache flushes. By force-inlining these paths, and using local copies + * of data (so that the compiler knows it's constant), we avoid a whole bunch of + * redundant loads and stores by leaving this information in registers. + */ +typedef struct arena_dalloc_bin_locked_info_s arena_dalloc_bin_locked_info_t; +struct arena_dalloc_bin_locked_info_s { + div_info_t div_info; + uint32_t nregs; + uint64_t ndalloc; +}; + +JEMALLOC_ALWAYS_INLINE size_t +arena_slab_regind(arena_dalloc_bin_locked_info_t *info, szind_t binind, + edata_t *slab, const void *ptr) { + size_t diff, regind; + + /* Freeing a pointer outside the slab can cause assertion failure. */ + assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab)); + assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab)); + /* Freeing an interior pointer can cause assertion failure. */ + assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab)) % + (uintptr_t)bin_infos[binind].reg_size == 0); + + diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab)); + + /* Avoid doing division with a variable divisor. */ + regind = div_compute(&info->div_info, diff); + + assert(regind < bin_infos[binind].nregs); + + return regind; +} + +JEMALLOC_ALWAYS_INLINE void +arena_dalloc_bin_locked_begin(arena_dalloc_bin_locked_info_t *info, + szind_t binind) { + info->div_info = arena_binind_div_info[binind]; + info->nregs = bin_infos[binind].nregs; + info->ndalloc = 0; +} + +/* + * Does the deallocation work associated with freeing a single pointer (a + * "step") in between a arena_dalloc_bin_locked begin and end call. + * + * Returns true if arena_slab_dalloc must be called on slab. Doesn't do + * stats updates, which happen during finish (this lets running counts get left + * in a register). + */ +JEMALLOC_ALWAYS_INLINE bool +arena_dalloc_bin_locked_step(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + arena_dalloc_bin_locked_info_t *info, szind_t binind, edata_t *slab, + void *ptr) { + const bin_info_t *bin_info = &bin_infos[binind]; + size_t regind = arena_slab_regind(info, binind, slab, ptr); + slab_data_t *slab_data = edata_slab_data_get(slab); + + assert(edata_nfree_get(slab) < bin_info->nregs); + /* Freeing an unallocated pointer can cause assertion failure. */ + assert(bitmap_get(slab_data->bitmap, &bin_info->bitmap_info, regind)); + + bitmap_unset(slab_data->bitmap, &bin_info->bitmap_info, regind); + edata_nfree_inc(slab); + + if (config_stats) { + info->ndalloc++; + } + + unsigned nfree = edata_nfree_get(slab); + if (nfree == bin_info->nregs) { + arena_dalloc_bin_locked_handle_newly_empty(tsdn, arena, slab, + bin); + return true; + } else if (nfree == 1 && slab != bin->slabcur) { + arena_dalloc_bin_locked_handle_newly_nonempty(tsdn, arena, slab, + bin); + } + return false; +} + +JEMALLOC_ALWAYS_INLINE void +arena_dalloc_bin_locked_finish(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + arena_dalloc_bin_locked_info_t *info) { + if (config_stats) { + bin->stats.ndalloc += info->ndalloc; + assert(bin->stats.curregs >= (size_t)info->ndalloc); + bin->stats.curregs -= (size_t)info->ndalloc; + } +} + +static inline bin_t * +arena_get_bin(arena_t *arena, szind_t binind, unsigned binshard) { + bin_t *shard0 = (bin_t *)((uintptr_t)arena + arena_bin_offsets[binind]); + return shard0 + binshard; +} + +#endif /* JEMALLOC_INTERNAL_ARENA_INLINES_B_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/arena_stats.h b/BeefRT/JEMalloc/include/jemalloc/internal/arena_stats.h new file mode 100644 index 00000000..15f1d345 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/arena_stats.h @@ -0,0 +1,114 @@ +#ifndef JEMALLOC_INTERNAL_ARENA_STATS_H +#define JEMALLOC_INTERNAL_ARENA_STATS_H + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/lockedint.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/mutex_prof.h" +#include "jemalloc/internal/pa.h" +#include "jemalloc/internal/sc.h" + +JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS + +typedef struct arena_stats_large_s arena_stats_large_t; +struct arena_stats_large_s { + /* + * Total number of allocation/deallocation requests served directly by + * the arena. + */ + locked_u64_t nmalloc; + locked_u64_t ndalloc; + + /* + * Number of allocation requests that correspond to this size class. + * This includes requests served by tcache, though tcache only + * periodically merges into this counter. + */ + locked_u64_t nrequests; /* Partially derived. */ + /* + * Number of tcache fills / flushes for large (similarly, periodically + * merged). Note that there is no large tcache batch-fill currently + * (i.e. only fill 1 at a time); however flush may be batched. + */ + locked_u64_t nfills; /* Partially derived. */ + locked_u64_t nflushes; /* Partially derived. */ + + /* Current number of allocations of this size class. */ + size_t curlextents; /* Derived. */ +}; + +/* + * Arena stats. Note that fields marked "derived" are not directly maintained + * within the arena code; rather their values are derived during stats merge + * requests. + */ +typedef struct arena_stats_s arena_stats_t; +struct arena_stats_s { + LOCKEDINT_MTX_DECLARE(mtx) + + /* + * resident includes the base stats -- that's why it lives here and not + * in pa_shard_stats_t. + */ + size_t base; /* Derived. */ + size_t resident; /* Derived. */ + size_t metadata_thp; /* Derived. */ + size_t mapped; /* Derived. */ + + atomic_zu_t internal; + + size_t allocated_large; /* Derived. */ + uint64_t nmalloc_large; /* Derived. */ + uint64_t ndalloc_large; /* Derived. */ + uint64_t nfills_large; /* Derived. */ + uint64_t nflushes_large; /* Derived. */ + uint64_t nrequests_large; /* Derived. */ + + /* + * The stats logically owned by the pa_shard in the same arena. This + * lives here only because it's convenient for the purposes of the ctl + * module -- it only knows about the single arena_stats. + */ + pa_shard_stats_t pa_shard_stats; + + /* Number of bytes cached in tcache associated with this arena. */ + size_t tcache_bytes; /* Derived. */ + size_t tcache_stashed_bytes; /* Derived. */ + + mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes]; + + /* One element for each large size class. */ + arena_stats_large_t lstats[SC_NSIZES - SC_NBINS]; + + /* Arena uptime. */ + nstime_t uptime; +}; + +static inline bool +arena_stats_init(tsdn_t *tsdn, arena_stats_t *arena_stats) { + if (config_debug) { + for (size_t i = 0; i < sizeof(arena_stats_t); i++) { + assert(((char *)arena_stats)[i] == 0); + } + } + if (LOCKEDINT_MTX_INIT(arena_stats->mtx, "arena_stats", + WITNESS_RANK_ARENA_STATS, malloc_mutex_rank_exclusive)) { + return true; + } + /* Memory is zeroed, so there is no need to clear stats. */ + return false; +} + +static inline void +arena_stats_large_flush_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats, + szind_t szind, uint64_t nrequests) { + LOCKEDINT_MTX_LOCK(tsdn, arena_stats->mtx); + arena_stats_large_t *lstats = &arena_stats->lstats[szind - SC_NBINS]; + locked_inc_u64(tsdn, LOCKEDINT_MTX(arena_stats->mtx), + &lstats->nrequests, nrequests); + locked_inc_u64(tsdn, LOCKEDINT_MTX(arena_stats->mtx), + &lstats->nflushes, 1); + LOCKEDINT_MTX_UNLOCK(tsdn, arena_stats->mtx); +} + +#endif /* JEMALLOC_INTERNAL_ARENA_STATS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/arena_structs.h b/BeefRT/JEMalloc/include/jemalloc/internal/arena_structs.h new file mode 100644 index 00000000..e2a5a408 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/arena_structs.h @@ -0,0 +1,101 @@ +#ifndef JEMALLOC_INTERNAL_ARENA_STRUCTS_H +#define JEMALLOC_INTERNAL_ARENA_STRUCTS_H + +#include "jemalloc/internal/arena_stats.h" +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/bin.h" +#include "jemalloc/internal/bitmap.h" +#include "jemalloc/internal/counter.h" +#include "jemalloc/internal/ecache.h" +#include "jemalloc/internal/edata_cache.h" +#include "jemalloc/internal/extent_dss.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/nstime.h" +#include "jemalloc/internal/pa.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/ticker.h" + +struct arena_s { + /* + * Number of threads currently assigned to this arena. Each thread has + * two distinct assignments, one for application-serving allocation, and + * the other for internal metadata allocation. Internal metadata must + * not be allocated from arenas explicitly created via the arenas.create + * mallctl, because the arena..reset mallctl indiscriminately + * discards all allocations for the affected arena. + * + * 0: Application allocation. + * 1: Internal metadata allocation. + * + * Synchronization: atomic. + */ + atomic_u_t nthreads[2]; + + /* Next bin shard for binding new threads. Synchronization: atomic. */ + atomic_u_t binshard_next; + + /* + * When percpu_arena is enabled, to amortize the cost of reading / + * updating the current CPU id, track the most recent thread accessing + * this arena, and only read CPU if there is a mismatch. + */ + tsdn_t *last_thd; + + /* Synchronization: internal. */ + arena_stats_t stats; + + /* + * Lists of tcaches and cache_bin_array_descriptors for extant threads + * associated with this arena. Stats from these are merged + * incrementally, and at exit if opt_stats_print is enabled. + * + * Synchronization: tcache_ql_mtx. + */ + ql_head(tcache_slow_t) tcache_ql; + ql_head(cache_bin_array_descriptor_t) cache_bin_array_descriptor_ql; + malloc_mutex_t tcache_ql_mtx; + + /* + * Represents a dss_prec_t, but atomically. + * + * Synchronization: atomic. + */ + atomic_u_t dss_prec; + + /* + * Extant large allocations. + * + * Synchronization: large_mtx. + */ + edata_list_active_t large; + /* Synchronizes all large allocation/update/deallocation. */ + malloc_mutex_t large_mtx; + + /* The page-level allocator shard this arena uses. */ + pa_shard_t pa_shard; + + /* + * A cached copy of base->ind. This can get accessed on hot paths; + * looking it up in base requires an extra pointer hop / cache miss. + */ + unsigned ind; + + /* + * Base allocator, from which arena metadata are allocated. + * + * Synchronization: internal. + */ + base_t *base; + /* Used to determine uptime. Read-only after initialization. */ + nstime_t create_time; + + /* + * The arena is allocated alongside its bins; really this is a + * dynamically sized array determined by the binshard settings. + */ + bin_t bins[0]; +}; + +#endif /* JEMALLOC_INTERNAL_ARENA_STRUCTS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/arena_types.h b/BeefRT/JEMalloc/include/jemalloc/internal/arena_types.h new file mode 100644 index 00000000..d0e12917 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/arena_types.h @@ -0,0 +1,58 @@ +#ifndef JEMALLOC_INTERNAL_ARENA_TYPES_H +#define JEMALLOC_INTERNAL_ARENA_TYPES_H + +#include "jemalloc/internal/sc.h" + +/* Default decay times in milliseconds. */ +#define DIRTY_DECAY_MS_DEFAULT ZD(10 * 1000) +#define MUZZY_DECAY_MS_DEFAULT (0) +/* Number of event ticks between time checks. */ +#define ARENA_DECAY_NTICKS_PER_UPDATE 1000 + +typedef struct arena_decay_s arena_decay_t; +typedef struct arena_s arena_t; + +typedef enum { + percpu_arena_mode_names_base = 0, /* Used for options processing. */ + + /* + * *_uninit are used only during bootstrapping, and must correspond + * to initialized variant plus percpu_arena_mode_enabled_base. + */ + percpu_arena_uninit = 0, + per_phycpu_arena_uninit = 1, + + /* All non-disabled modes must come after percpu_arena_disabled. */ + percpu_arena_disabled = 2, + + percpu_arena_mode_names_limit = 3, /* Used for options processing. */ + percpu_arena_mode_enabled_base = 3, + + percpu_arena = 3, + per_phycpu_arena = 4 /* Hyper threads share arena. */ +} percpu_arena_mode_t; + +#define PERCPU_ARENA_ENABLED(m) ((m) >= percpu_arena_mode_enabled_base) +#define PERCPU_ARENA_DEFAULT percpu_arena_disabled + +/* + * When allocation_size >= oversize_threshold, use the dedicated huge arena + * (unless have explicitly spicified arena index). 0 disables the feature. + */ +#define OVERSIZE_THRESHOLD_DEFAULT (8 << 20) + +struct arena_config_s { + /* extent hooks to be used for the arena */ + extent_hooks_t *extent_hooks; + + /* + * Use extent hooks for metadata (base) allocations when true. + */ + bool metadata_use_hooks; +}; + +typedef struct arena_config_s arena_config_t; + +extern const arena_config_t arena_config_default; + +#endif /* JEMALLOC_INTERNAL_ARENA_TYPES_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/assert.h b/BeefRT/JEMalloc/include/jemalloc/internal/assert.h new file mode 100644 index 00000000..be4d45b3 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/assert.h @@ -0,0 +1,56 @@ +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/util.h" + +/* + * Define a custom assert() in order to reduce the chances of deadlock during + * assertion failure. + */ +#ifndef assert +#define assert(e) do { \ + if (unlikely(config_debug && !(e))) { \ + malloc_printf( \ + ": %s:%d: Failed assertion: \"%s\"\n", \ + __FILE__, __LINE__, #e); \ + abort(); \ + } \ +} while (0) +#endif + +#ifndef not_reached +#define not_reached() do { \ + if (config_debug) { \ + malloc_printf( \ + ": %s:%d: Unreachable code reached\n", \ + __FILE__, __LINE__); \ + abort(); \ + } \ + unreachable(); \ +} while (0) +#endif + +#ifndef not_implemented +#define not_implemented() do { \ + if (config_debug) { \ + malloc_printf(": %s:%d: Not implemented\n", \ + __FILE__, __LINE__); \ + abort(); \ + } \ +} while (0) +#endif + +#ifndef assert_not_implemented +#define assert_not_implemented(e) do { \ + if (unlikely(config_debug && !(e))) { \ + not_implemented(); \ + } \ +} while (0) +#endif + +/* Use to assert a particular configuration, e.g., cassert(config_debug). */ +#ifndef cassert +#define cassert(c) do { \ + if (unlikely(!(c))) { \ + not_reached(); \ + } \ +} while (0) +#endif diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/atomic.h b/BeefRT/JEMalloc/include/jemalloc/internal/atomic.h new file mode 100644 index 00000000..c0f73122 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/atomic.h @@ -0,0 +1,107 @@ +#ifndef JEMALLOC_INTERNAL_ATOMIC_H +#define JEMALLOC_INTERNAL_ATOMIC_H + +#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE + +#define JEMALLOC_U8_ATOMICS +#if defined(JEMALLOC_GCC_ATOMIC_ATOMICS) +# include "jemalloc/internal/atomic_gcc_atomic.h" +# if !defined(JEMALLOC_GCC_U8_ATOMIC_ATOMICS) +# undef JEMALLOC_U8_ATOMICS +# endif +#elif defined(JEMALLOC_GCC_SYNC_ATOMICS) +# include "jemalloc/internal/atomic_gcc_sync.h" +# if !defined(JEMALLOC_GCC_U8_SYNC_ATOMICS) +# undef JEMALLOC_U8_ATOMICS +# endif +#elif defined(_MSC_VER) +# include "jemalloc/internal/atomic_msvc.h" +#elif defined(JEMALLOC_C11_ATOMICS) +# include "jemalloc/internal/atomic_c11.h" +#else +# error "Don't have atomics implemented on this platform." +#endif + +/* + * This header gives more or less a backport of C11 atomics. The user can write + * JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_sizeof_type); to generate + * counterparts of the C11 atomic functions for type, as so: + * JEMALLOC_GENERATE_ATOMICS(int *, pi, 3); + * and then write things like: + * int *some_ptr; + * atomic_pi_t atomic_ptr_to_int; + * atomic_store_pi(&atomic_ptr_to_int, some_ptr, ATOMIC_RELAXED); + * int *prev_value = atomic_exchange_pi(&ptr_to_int, NULL, ATOMIC_ACQ_REL); + * assert(some_ptr == prev_value); + * and expect things to work in the obvious way. + * + * Also included (with naming differences to avoid conflicts with the standard + * library): + * atomic_fence(atomic_memory_order_t) (mimics C11's atomic_thread_fence). + * ATOMIC_INIT (mimics C11's ATOMIC_VAR_INIT). + */ + +/* + * Pure convenience, so that we don't have to type "atomic_memory_order_" + * quite so often. + */ +#define ATOMIC_RELAXED atomic_memory_order_relaxed +#define ATOMIC_ACQUIRE atomic_memory_order_acquire +#define ATOMIC_RELEASE atomic_memory_order_release +#define ATOMIC_ACQ_REL atomic_memory_order_acq_rel +#define ATOMIC_SEQ_CST atomic_memory_order_seq_cst + +/* + * Another convenience -- simple atomic helper functions. + */ +#define JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(type, short_type, \ + lg_size) \ + JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size) \ + ATOMIC_INLINE void \ + atomic_load_add_store_##short_type(atomic_##short_type##_t *a, \ + type inc) { \ + type oldval = atomic_load_##short_type(a, ATOMIC_RELAXED); \ + type newval = oldval + inc; \ + atomic_store_##short_type(a, newval, ATOMIC_RELAXED); \ + } \ + ATOMIC_INLINE void \ + atomic_load_sub_store_##short_type(atomic_##short_type##_t *a, \ + type inc) { \ + type oldval = atomic_load_##short_type(a, ATOMIC_RELAXED); \ + type newval = oldval - inc; \ + atomic_store_##short_type(a, newval, ATOMIC_RELAXED); \ + } + +/* + * Not all platforms have 64-bit atomics. If we do, this #define exposes that + * fact. + */ +#if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3) +# define JEMALLOC_ATOMIC_U64 +#endif + +JEMALLOC_GENERATE_ATOMICS(void *, p, LG_SIZEOF_PTR) + +/* + * There's no actual guarantee that sizeof(bool) == 1, but it's true on the only + * platform that actually needs to know the size, MSVC. + */ +JEMALLOC_GENERATE_ATOMICS(bool, b, 0) + +JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(unsigned, u, LG_SIZEOF_INT) + +JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(size_t, zu, LG_SIZEOF_PTR) + +JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(ssize_t, zd, LG_SIZEOF_PTR) + +JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(uint8_t, u8, 0) + +JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(uint32_t, u32, 2) + +#ifdef JEMALLOC_ATOMIC_U64 +JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(uint64_t, u64, 3) +#endif + +#undef ATOMIC_INLINE + +#endif /* JEMALLOC_INTERNAL_ATOMIC_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/atomic_c11.h b/BeefRT/JEMalloc/include/jemalloc/internal/atomic_c11.h new file mode 100644 index 00000000..a5f9313a --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/atomic_c11.h @@ -0,0 +1,97 @@ +#ifndef JEMALLOC_INTERNAL_ATOMIC_C11_H +#define JEMALLOC_INTERNAL_ATOMIC_C11_H + +#include + +#define ATOMIC_INIT(...) ATOMIC_VAR_INIT(__VA_ARGS__) + +#define atomic_memory_order_t memory_order +#define atomic_memory_order_relaxed memory_order_relaxed +#define atomic_memory_order_acquire memory_order_acquire +#define atomic_memory_order_release memory_order_release +#define atomic_memory_order_acq_rel memory_order_acq_rel +#define atomic_memory_order_seq_cst memory_order_seq_cst + +#define atomic_fence atomic_thread_fence + +#define JEMALLOC_GENERATE_ATOMICS(type, short_type, \ + /* unused */ lg_size) \ +typedef _Atomic(type) atomic_##short_type##_t; \ + \ +ATOMIC_INLINE type \ +atomic_load_##short_type(const atomic_##short_type##_t *a, \ + atomic_memory_order_t mo) { \ + /* \ + * A strict interpretation of the C standard prevents \ + * atomic_load from taking a const argument, but it's \ + * convenient for our purposes. This cast is a workaround. \ + */ \ + atomic_##short_type##_t* a_nonconst = \ + (atomic_##short_type##_t*)a; \ + return atomic_load_explicit(a_nonconst, mo); \ +} \ + \ +ATOMIC_INLINE void \ +atomic_store_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + atomic_store_explicit(a, val, mo); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return atomic_exchange_explicit(a, val, mo); \ +} \ + \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \ + type *expected, type desired, atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + return atomic_compare_exchange_weak_explicit(a, expected, \ + desired, success_mo, failure_mo); \ +} \ + \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \ + type *expected, type desired, atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + return atomic_compare_exchange_strong_explicit(a, expected, \ + desired, success_mo, failure_mo); \ +} + +/* + * Integral types have some special operations available that non-integral ones + * lack. + */ +#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, \ + /* unused */ lg_size) \ +JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size) \ + \ +ATOMIC_INLINE type \ +atomic_fetch_add_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return atomic_fetch_add_explicit(a, val, mo); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return atomic_fetch_sub_explicit(a, val, mo); \ +} \ +ATOMIC_INLINE type \ +atomic_fetch_and_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return atomic_fetch_and_explicit(a, val, mo); \ +} \ +ATOMIC_INLINE type \ +atomic_fetch_or_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return atomic_fetch_or_explicit(a, val, mo); \ +} \ +ATOMIC_INLINE type \ +atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return atomic_fetch_xor_explicit(a, val, mo); \ +} + +#endif /* JEMALLOC_INTERNAL_ATOMIC_C11_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/atomic_gcc_atomic.h b/BeefRT/JEMalloc/include/jemalloc/internal/atomic_gcc_atomic.h new file mode 100644 index 00000000..471515e8 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/atomic_gcc_atomic.h @@ -0,0 +1,129 @@ +#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H +#define JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H + +#include "jemalloc/internal/assert.h" + +#define ATOMIC_INIT(...) {__VA_ARGS__} + +typedef enum { + atomic_memory_order_relaxed, + atomic_memory_order_acquire, + atomic_memory_order_release, + atomic_memory_order_acq_rel, + atomic_memory_order_seq_cst +} atomic_memory_order_t; + +ATOMIC_INLINE int +atomic_enum_to_builtin(atomic_memory_order_t mo) { + switch (mo) { + case atomic_memory_order_relaxed: + return __ATOMIC_RELAXED; + case atomic_memory_order_acquire: + return __ATOMIC_ACQUIRE; + case atomic_memory_order_release: + return __ATOMIC_RELEASE; + case atomic_memory_order_acq_rel: + return __ATOMIC_ACQ_REL; + case atomic_memory_order_seq_cst: + return __ATOMIC_SEQ_CST; + } + /* Can't happen; the switch is exhaustive. */ + not_reached(); +} + +ATOMIC_INLINE void +atomic_fence(atomic_memory_order_t mo) { + __atomic_thread_fence(atomic_enum_to_builtin(mo)); +} + +#define JEMALLOC_GENERATE_ATOMICS(type, short_type, \ + /* unused */ lg_size) \ +typedef struct { \ + type repr; \ +} atomic_##short_type##_t; \ + \ +ATOMIC_INLINE type \ +atomic_load_##short_type(const atomic_##short_type##_t *a, \ + atomic_memory_order_t mo) { \ + type result; \ + __atomic_load(&a->repr, &result, atomic_enum_to_builtin(mo)); \ + return result; \ +} \ + \ +ATOMIC_INLINE void \ +atomic_store_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + __atomic_store(&a->repr, &val, atomic_enum_to_builtin(mo)); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + type result; \ + __atomic_exchange(&a->repr, &val, &result, \ + atomic_enum_to_builtin(mo)); \ + return result; \ +} \ + \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \ + UNUSED type *expected, type desired, \ + atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + return __atomic_compare_exchange(&a->repr, expected, &desired, \ + true, atomic_enum_to_builtin(success_mo), \ + atomic_enum_to_builtin(failure_mo)); \ +} \ + \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \ + UNUSED type *expected, type desired, \ + atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + return __atomic_compare_exchange(&a->repr, expected, &desired, \ + false, \ + atomic_enum_to_builtin(success_mo), \ + atomic_enum_to_builtin(failure_mo)); \ +} + + +#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, \ + /* unused */ lg_size) \ +JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size) \ + \ +ATOMIC_INLINE type \ +atomic_fetch_add_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __atomic_fetch_add(&a->repr, val, \ + atomic_enum_to_builtin(mo)); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __atomic_fetch_sub(&a->repr, val, \ + atomic_enum_to_builtin(mo)); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_and_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __atomic_fetch_and(&a->repr, val, \ + atomic_enum_to_builtin(mo)); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_or_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __atomic_fetch_or(&a->repr, val, \ + atomic_enum_to_builtin(mo)); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __atomic_fetch_xor(&a->repr, val, \ + atomic_enum_to_builtin(mo)); \ +} + +#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/atomic_gcc_sync.h b/BeefRT/JEMalloc/include/jemalloc/internal/atomic_gcc_sync.h new file mode 100644 index 00000000..e02b7cbe --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/atomic_gcc_sync.h @@ -0,0 +1,195 @@ +#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H +#define JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H + +#define ATOMIC_INIT(...) {__VA_ARGS__} + +typedef enum { + atomic_memory_order_relaxed, + atomic_memory_order_acquire, + atomic_memory_order_release, + atomic_memory_order_acq_rel, + atomic_memory_order_seq_cst +} atomic_memory_order_t; + +ATOMIC_INLINE void +atomic_fence(atomic_memory_order_t mo) { + /* Easy cases first: no barrier, and full barrier. */ + if (mo == atomic_memory_order_relaxed) { + asm volatile("" ::: "memory"); + return; + } + if (mo == atomic_memory_order_seq_cst) { + asm volatile("" ::: "memory"); + __sync_synchronize(); + asm volatile("" ::: "memory"); + return; + } + asm volatile("" ::: "memory"); +# if defined(__i386__) || defined(__x86_64__) + /* This is implicit on x86. */ +# elif defined(__ppc64__) + asm volatile("lwsync"); +# elif defined(__ppc__) + asm volatile("sync"); +# elif defined(__sparc__) && defined(__arch64__) + if (mo == atomic_memory_order_acquire) { + asm volatile("membar #LoadLoad | #LoadStore"); + } else if (mo == atomic_memory_order_release) { + asm volatile("membar #LoadStore | #StoreStore"); + } else { + asm volatile("membar #LoadLoad | #LoadStore | #StoreStore"); + } +# else + __sync_synchronize(); +# endif + asm volatile("" ::: "memory"); +} + +/* + * A correct implementation of seq_cst loads and stores on weakly ordered + * architectures could do either of the following: + * 1. store() is weak-fence -> store -> strong fence, load() is load -> + * strong-fence. + * 2. store() is strong-fence -> store, load() is strong-fence -> load -> + * weak-fence. + * The tricky thing is, load() and store() above can be the load or store + * portions of a gcc __sync builtin, so we have to follow GCC's lead, which + * means going with strategy 2. + * On strongly ordered architectures, the natural strategy is to stick a strong + * fence after seq_cst stores, and have naked loads. So we want the strong + * fences in different places on different architectures. + * atomic_pre_sc_load_fence and atomic_post_sc_store_fence allow us to + * accomplish this. + */ + +ATOMIC_INLINE void +atomic_pre_sc_load_fence() { +# if defined(__i386__) || defined(__x86_64__) || \ + (defined(__sparc__) && defined(__arch64__)) + atomic_fence(atomic_memory_order_relaxed); +# else + atomic_fence(atomic_memory_order_seq_cst); +# endif +} + +ATOMIC_INLINE void +atomic_post_sc_store_fence() { +# if defined(__i386__) || defined(__x86_64__) || \ + (defined(__sparc__) && defined(__arch64__)) + atomic_fence(atomic_memory_order_seq_cst); +# else + atomic_fence(atomic_memory_order_relaxed); +# endif + +} + +#define JEMALLOC_GENERATE_ATOMICS(type, short_type, \ + /* unused */ lg_size) \ +typedef struct { \ + type volatile repr; \ +} atomic_##short_type##_t; \ + \ +ATOMIC_INLINE type \ +atomic_load_##short_type(const atomic_##short_type##_t *a, \ + atomic_memory_order_t mo) { \ + if (mo == atomic_memory_order_seq_cst) { \ + atomic_pre_sc_load_fence(); \ + } \ + type result = a->repr; \ + if (mo != atomic_memory_order_relaxed) { \ + atomic_fence(atomic_memory_order_acquire); \ + } \ + return result; \ +} \ + \ +ATOMIC_INLINE void \ +atomic_store_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + if (mo != atomic_memory_order_relaxed) { \ + atomic_fence(atomic_memory_order_release); \ + } \ + a->repr = val; \ + if (mo == atomic_memory_order_seq_cst) { \ + atomic_post_sc_store_fence(); \ + } \ +} \ + \ +ATOMIC_INLINE type \ +atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + /* \ + * Because of FreeBSD, we care about gcc 4.2, which doesn't have\ + * an atomic exchange builtin. We fake it with a CAS loop. \ + */ \ + while (true) { \ + type old = a->repr; \ + if (__sync_bool_compare_and_swap(&a->repr, old, val)) { \ + return old; \ + } \ + } \ +} \ + \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \ + type *expected, type desired, \ + atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + type prev = __sync_val_compare_and_swap(&a->repr, *expected, \ + desired); \ + if (prev == *expected) { \ + return true; \ + } else { \ + *expected = prev; \ + return false; \ + } \ +} \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \ + type *expected, type desired, \ + atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + type prev = __sync_val_compare_and_swap(&a->repr, *expected, \ + desired); \ + if (prev == *expected) { \ + return true; \ + } else { \ + *expected = prev; \ + return false; \ + } \ +} + +#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, \ + /* unused */ lg_size) \ +JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size) \ + \ +ATOMIC_INLINE type \ +atomic_fetch_add_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __sync_fetch_and_add(&a->repr, val); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __sync_fetch_and_sub(&a->repr, val); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_and_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __sync_fetch_and_and(&a->repr, val); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_or_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __sync_fetch_and_or(&a->repr, val); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return __sync_fetch_and_xor(&a->repr, val); \ +} + +#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/atomic_msvc.h b/BeefRT/JEMalloc/include/jemalloc/internal/atomic_msvc.h new file mode 100644 index 00000000..67057ce5 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/atomic_msvc.h @@ -0,0 +1,158 @@ +#ifndef JEMALLOC_INTERNAL_ATOMIC_MSVC_H +#define JEMALLOC_INTERNAL_ATOMIC_MSVC_H + +#define ATOMIC_INIT(...) {__VA_ARGS__} + +typedef enum { + atomic_memory_order_relaxed, + atomic_memory_order_acquire, + atomic_memory_order_release, + atomic_memory_order_acq_rel, + atomic_memory_order_seq_cst +} atomic_memory_order_t; + +typedef char atomic_repr_0_t; +typedef short atomic_repr_1_t; +typedef long atomic_repr_2_t; +typedef __int64 atomic_repr_3_t; + +ATOMIC_INLINE void +atomic_fence(atomic_memory_order_t mo) { + _ReadWriteBarrier(); +# if defined(_M_ARM) || defined(_M_ARM64) + /* ARM needs a barrier for everything but relaxed. */ + if (mo != atomic_memory_order_relaxed) { + MemoryBarrier(); + } +# elif defined(_M_IX86) || defined (_M_X64) + /* x86 needs a barrier only for seq_cst. */ + if (mo == atomic_memory_order_seq_cst) { + MemoryBarrier(); + } +# else +# error "Don't know how to create atomics for this platform for MSVC." +# endif + _ReadWriteBarrier(); +} + +#define ATOMIC_INTERLOCKED_REPR(lg_size) atomic_repr_ ## lg_size ## _t + +#define ATOMIC_CONCAT(a, b) ATOMIC_RAW_CONCAT(a, b) +#define ATOMIC_RAW_CONCAT(a, b) a ## b + +#define ATOMIC_INTERLOCKED_NAME(base_name, lg_size) ATOMIC_CONCAT( \ + base_name, ATOMIC_INTERLOCKED_SUFFIX(lg_size)) + +#define ATOMIC_INTERLOCKED_SUFFIX(lg_size) \ + ATOMIC_CONCAT(ATOMIC_INTERLOCKED_SUFFIX_, lg_size) + +#define ATOMIC_INTERLOCKED_SUFFIX_0 8 +#define ATOMIC_INTERLOCKED_SUFFIX_1 16 +#define ATOMIC_INTERLOCKED_SUFFIX_2 +#define ATOMIC_INTERLOCKED_SUFFIX_3 64 + +#define JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size) \ +typedef struct { \ + ATOMIC_INTERLOCKED_REPR(lg_size) repr; \ +} atomic_##short_type##_t; \ + \ +ATOMIC_INLINE type \ +atomic_load_##short_type(const atomic_##short_type##_t *a, \ + atomic_memory_order_t mo) { \ + ATOMIC_INTERLOCKED_REPR(lg_size) ret = a->repr; \ + if (mo != atomic_memory_order_relaxed) { \ + atomic_fence(atomic_memory_order_acquire); \ + } \ + return (type) ret; \ +} \ + \ +ATOMIC_INLINE void \ +atomic_store_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + if (mo != atomic_memory_order_relaxed) { \ + atomic_fence(atomic_memory_order_release); \ + } \ + a->repr = (ATOMIC_INTERLOCKED_REPR(lg_size)) val; \ + if (mo == atomic_memory_order_seq_cst) { \ + atomic_fence(atomic_memory_order_seq_cst); \ + } \ +} \ + \ +ATOMIC_INLINE type \ +atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \ + atomic_memory_order_t mo) { \ + return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedExchange, \ + lg_size)(&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \ +} \ + \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \ + type *expected, type desired, atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + ATOMIC_INTERLOCKED_REPR(lg_size) e = \ + (ATOMIC_INTERLOCKED_REPR(lg_size))*expected; \ + ATOMIC_INTERLOCKED_REPR(lg_size) d = \ + (ATOMIC_INTERLOCKED_REPR(lg_size))desired; \ + ATOMIC_INTERLOCKED_REPR(lg_size) old = \ + ATOMIC_INTERLOCKED_NAME(_InterlockedCompareExchange, \ + lg_size)(&a->repr, d, e); \ + if (old == e) { \ + return true; \ + } else { \ + *expected = (type)old; \ + return false; \ + } \ +} \ + \ +ATOMIC_INLINE bool \ +atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \ + type *expected, type desired, atomic_memory_order_t success_mo, \ + atomic_memory_order_t failure_mo) { \ + /* We implement the weak version with strong semantics. */ \ + return atomic_compare_exchange_weak_##short_type(a, expected, \ + desired, success_mo, failure_mo); \ +} + + +#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size) \ +JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size) \ + \ +ATOMIC_INLINE type \ +atomic_fetch_add_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedExchangeAdd, \ + lg_size)(&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \ +} \ + \ +ATOMIC_INLINE type \ +atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + /* \ + * MSVC warns on negation of unsigned operands, but for us it \ + * gives exactly the right semantics (MAX_TYPE + 1 - operand). \ + */ \ + __pragma(warning(push)) \ + __pragma(warning(disable: 4146)) \ + return atomic_fetch_add_##short_type(a, -val, mo); \ + __pragma(warning(pop)) \ +} \ +ATOMIC_INLINE type \ +atomic_fetch_and_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedAnd, lg_size)( \ + &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \ +} \ +ATOMIC_INLINE type \ +atomic_fetch_or_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedOr, lg_size)( \ + &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \ +} \ +ATOMIC_INLINE type \ +atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, \ + type val, atomic_memory_order_t mo) { \ + return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedXor, lg_size)( \ + &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \ +} + +#endif /* JEMALLOC_INTERNAL_ATOMIC_MSVC_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/background_thread_externs.h b/BeefRT/JEMalloc/include/jemalloc/internal/background_thread_externs.h new file mode 100644 index 00000000..6ae3c8d8 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/background_thread_externs.h @@ -0,0 +1,33 @@ +#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H +#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H + +extern bool opt_background_thread; +extern size_t opt_max_background_threads; +extern malloc_mutex_t background_thread_lock; +extern atomic_b_t background_thread_enabled_state; +extern size_t n_background_threads; +extern size_t max_background_threads; +extern background_thread_info_t *background_thread_info; + +bool background_thread_create(tsd_t *tsd, unsigned arena_ind); +bool background_threads_enable(tsd_t *tsd); +bool background_threads_disable(tsd_t *tsd); +bool background_thread_is_started(background_thread_info_t* info); +void background_thread_wakeup_early(background_thread_info_t *info, + nstime_t *remaining_sleep); +void background_thread_prefork0(tsdn_t *tsdn); +void background_thread_prefork1(tsdn_t *tsdn); +void background_thread_postfork_parent(tsdn_t *tsdn); +void background_thread_postfork_child(tsdn_t *tsdn); +bool background_thread_stats_read(tsdn_t *tsdn, + background_thread_stats_t *stats); +void background_thread_ctl_init(tsdn_t *tsdn); + +#ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER +extern int pthread_create_wrapper(pthread_t *__restrict, const pthread_attr_t *, + void *(*)(void *), void *__restrict); +#endif +bool background_thread_boot0(void); +bool background_thread_boot1(tsdn_t *tsdn, base_t *base); + +#endif /* JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/background_thread_inlines.h b/BeefRT/JEMalloc/include/jemalloc/internal/background_thread_inlines.h new file mode 100644 index 00000000..92c5febe --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/background_thread_inlines.h @@ -0,0 +1,48 @@ +#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H +#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H + +JEMALLOC_ALWAYS_INLINE bool +background_thread_enabled(void) { + return atomic_load_b(&background_thread_enabled_state, ATOMIC_RELAXED); +} + +JEMALLOC_ALWAYS_INLINE void +background_thread_enabled_set(tsdn_t *tsdn, bool state) { + malloc_mutex_assert_owner(tsdn, &background_thread_lock); + atomic_store_b(&background_thread_enabled_state, state, ATOMIC_RELAXED); +} + +JEMALLOC_ALWAYS_INLINE background_thread_info_t * +arena_background_thread_info_get(arena_t *arena) { + unsigned arena_ind = arena_ind_get(arena); + return &background_thread_info[arena_ind % max_background_threads]; +} + +JEMALLOC_ALWAYS_INLINE background_thread_info_t * +background_thread_info_get(size_t ind) { + return &background_thread_info[ind % max_background_threads]; +} + +JEMALLOC_ALWAYS_INLINE uint64_t +background_thread_wakeup_time_get(background_thread_info_t *info) { + uint64_t next_wakeup = nstime_ns(&info->next_wakeup); + assert(atomic_load_b(&info->indefinite_sleep, ATOMIC_ACQUIRE) == + (next_wakeup == BACKGROUND_THREAD_INDEFINITE_SLEEP)); + return next_wakeup; +} + +JEMALLOC_ALWAYS_INLINE void +background_thread_wakeup_time_set(tsdn_t *tsdn, background_thread_info_t *info, + uint64_t wakeup_time) { + malloc_mutex_assert_owner(tsdn, &info->mtx); + atomic_store_b(&info->indefinite_sleep, + wakeup_time == BACKGROUND_THREAD_INDEFINITE_SLEEP, ATOMIC_RELEASE); + nstime_init(&info->next_wakeup, wakeup_time); +} + +JEMALLOC_ALWAYS_INLINE bool +background_thread_indefinite_sleep(background_thread_info_t *info) { + return atomic_load_b(&info->indefinite_sleep, ATOMIC_ACQUIRE); +} + +#endif /* JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/background_thread_structs.h b/BeefRT/JEMalloc/include/jemalloc/internal/background_thread_structs.h new file mode 100644 index 00000000..83a91984 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/background_thread_structs.h @@ -0,0 +1,66 @@ +#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H +#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H + +/* This file really combines "structs" and "types", but only transitionally. */ + +#if defined(JEMALLOC_BACKGROUND_THREAD) || defined(JEMALLOC_LAZY_LOCK) +# define JEMALLOC_PTHREAD_CREATE_WRAPPER +#endif + +#define BACKGROUND_THREAD_INDEFINITE_SLEEP UINT64_MAX +#define MAX_BACKGROUND_THREAD_LIMIT MALLOCX_ARENA_LIMIT +#define DEFAULT_NUM_BACKGROUND_THREAD 4 + +/* + * These exist only as a transitional state. Eventually, deferral should be + * part of the PAI, and each implementation can indicate wait times with more + * specificity. + */ +#define BACKGROUND_THREAD_HPA_INTERVAL_MAX_UNINITIALIZED (-2) +#define BACKGROUND_THREAD_HPA_INTERVAL_MAX_DEFAULT_WHEN_ENABLED 5000 + +#define BACKGROUND_THREAD_DEFERRED_MIN UINT64_C(0) +#define BACKGROUND_THREAD_DEFERRED_MAX UINT64_MAX + +typedef enum { + background_thread_stopped, + background_thread_started, + /* Thread waits on the global lock when paused (for arena_reset). */ + background_thread_paused, +} background_thread_state_t; + +struct background_thread_info_s { +#ifdef JEMALLOC_BACKGROUND_THREAD + /* Background thread is pthread specific. */ + pthread_t thread; + pthread_cond_t cond; +#endif + malloc_mutex_t mtx; + background_thread_state_t state; + /* When true, it means no wakeup scheduled. */ + atomic_b_t indefinite_sleep; + /* Next scheduled wakeup time (absolute time in ns). */ + nstime_t next_wakeup; + /* + * Since the last background thread run, newly added number of pages + * that need to be purged by the next wakeup. This is adjusted on + * epoch advance, and is used to determine whether we should signal the + * background thread to wake up earlier. + */ + size_t npages_to_purge_new; + /* Stats: total number of runs since started. */ + uint64_t tot_n_runs; + /* Stats: total sleep time since started. */ + nstime_t tot_sleep_time; +}; +typedef struct background_thread_info_s background_thread_info_t; + +struct background_thread_stats_s { + size_t num_threads; + uint64_t num_runs; + nstime_t run_interval; + mutex_prof_data_t max_counter_per_bg_thd; +}; +typedef struct background_thread_stats_s background_thread_stats_t; + +#endif /* JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/base.h b/BeefRT/JEMalloc/include/jemalloc/internal/base.h new file mode 100644 index 00000000..9b2c9fb1 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/base.h @@ -0,0 +1,110 @@ +#ifndef JEMALLOC_INTERNAL_BASE_H +#define JEMALLOC_INTERNAL_BASE_H + +#include "jemalloc/internal/edata.h" +#include "jemalloc/internal/ehooks.h" +#include "jemalloc/internal/mutex.h" + +enum metadata_thp_mode_e { + metadata_thp_disabled = 0, + /* + * Lazily enable hugepage for metadata. To avoid high RSS caused by THP + * + low usage arena (i.e. THP becomes a significant percentage), the + * "auto" option only starts using THP after a base allocator used up + * the first THP region. Starting from the second hugepage (in a single + * arena), "auto" behaves the same as "always", i.e. madvise hugepage + * right away. + */ + metadata_thp_auto = 1, + metadata_thp_always = 2, + metadata_thp_mode_limit = 3 +}; +typedef enum metadata_thp_mode_e metadata_thp_mode_t; + +#define METADATA_THP_DEFAULT metadata_thp_disabled +extern metadata_thp_mode_t opt_metadata_thp; +extern const char *metadata_thp_mode_names[]; + + +/* Embedded at the beginning of every block of base-managed virtual memory. */ +typedef struct base_block_s base_block_t; +struct base_block_s { + /* Total size of block's virtual memory mapping. */ + size_t size; + + /* Next block in list of base's blocks. */ + base_block_t *next; + + /* Tracks unused trailing space. */ + edata_t edata; +}; + +typedef struct base_s base_t; +struct base_s { + /* + * User-configurable extent hook functions. + */ + ehooks_t ehooks; + + /* + * User-configurable extent hook functions for metadata allocations. + */ + ehooks_t ehooks_base; + + /* Protects base_alloc() and base_stats_get() operations. */ + malloc_mutex_t mtx; + + /* Using THP when true (metadata_thp auto mode). */ + bool auto_thp_switched; + /* + * Most recent size class in the series of increasingly large base + * extents. Logarithmic spacing between subsequent allocations ensures + * that the total number of distinct mappings remains small. + */ + pszind_t pind_last; + + /* Serial number generation state. */ + size_t extent_sn_next; + + /* Chain of all blocks associated with base. */ + base_block_t *blocks; + + /* Heap of extents that track unused trailing space within blocks. */ + edata_heap_t avail[SC_NSIZES]; + + /* Stats, only maintained if config_stats. */ + size_t allocated; + size_t resident; + size_t mapped; + /* Number of THP regions touched. */ + size_t n_thp; +}; + +static inline unsigned +base_ind_get(const base_t *base) { + return ehooks_ind_get(&base->ehooks); +} + +static inline bool +metadata_thp_enabled(void) { + return (opt_metadata_thp != metadata_thp_disabled); +} + +base_t *b0get(void); +base_t *base_new(tsdn_t *tsdn, unsigned ind, + const extent_hooks_t *extent_hooks, bool metadata_use_hooks); +void base_delete(tsdn_t *tsdn, base_t *base); +ehooks_t *base_ehooks_get(base_t *base); +ehooks_t *base_ehooks_get_for_metadata(base_t *base); +extent_hooks_t *base_extent_hooks_set(base_t *base, + extent_hooks_t *extent_hooks); +void *base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment); +edata_t *base_alloc_edata(tsdn_t *tsdn, base_t *base); +void base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated, + size_t *resident, size_t *mapped, size_t *n_thp); +void base_prefork(tsdn_t *tsdn, base_t *base); +void base_postfork_parent(tsdn_t *tsdn, base_t *base); +void base_postfork_child(tsdn_t *tsdn, base_t *base); +bool base_boot(tsdn_t *tsdn); + +#endif /* JEMALLOC_INTERNAL_BASE_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/bin.h b/BeefRT/JEMalloc/include/jemalloc/internal/bin.h new file mode 100644 index 00000000..63f97395 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/bin.h @@ -0,0 +1,82 @@ +#ifndef JEMALLOC_INTERNAL_BIN_H +#define JEMALLOC_INTERNAL_BIN_H + +#include "jemalloc/internal/bin_stats.h" +#include "jemalloc/internal/bin_types.h" +#include "jemalloc/internal/edata.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/sc.h" + +/* + * A bin contains a set of extents that are currently being used for slab + * allocations. + */ +typedef struct bin_s bin_t; +struct bin_s { + /* All operations on bin_t fields require lock ownership. */ + malloc_mutex_t lock; + + /* + * Bin statistics. These get touched every time the lock is acquired, + * so put them close by in the hopes of getting some cache locality. + */ + bin_stats_t stats; + + /* + * Current slab being used to service allocations of this bin's size + * class. slabcur is independent of slabs_{nonfull,full}; whenever + * slabcur is reassigned, the previous slab must be deallocated or + * inserted into slabs_{nonfull,full}. + */ + edata_t *slabcur; + + /* + * Heap of non-full slabs. This heap is used to assure that new + * allocations come from the non-full slab that is oldest/lowest in + * memory. + */ + edata_heap_t slabs_nonfull; + + /* List used to track full slabs. */ + edata_list_active_t slabs_full; +}; + +/* A set of sharded bins of the same size class. */ +typedef struct bins_s bins_t; +struct bins_s { + /* Sharded bins. Dynamically sized. */ + bin_t *bin_shards; +}; + +void bin_shard_sizes_boot(unsigned bin_shards[SC_NBINS]); +bool bin_update_shard_size(unsigned bin_shards[SC_NBINS], size_t start_size, + size_t end_size, size_t nshards); + +/* Initializes a bin to empty. Returns true on error. */ +bool bin_init(bin_t *bin); + +/* Forking. */ +void bin_prefork(tsdn_t *tsdn, bin_t *bin); +void bin_postfork_parent(tsdn_t *tsdn, bin_t *bin); +void bin_postfork_child(tsdn_t *tsdn, bin_t *bin); + +/* Stats. */ +static inline void +bin_stats_merge(tsdn_t *tsdn, bin_stats_data_t *dst_bin_stats, bin_t *bin) { + malloc_mutex_lock(tsdn, &bin->lock); + malloc_mutex_prof_accum(tsdn, &dst_bin_stats->mutex_data, &bin->lock); + bin_stats_t *stats = &dst_bin_stats->stats_data; + stats->nmalloc += bin->stats.nmalloc; + stats->ndalloc += bin->stats.ndalloc; + stats->nrequests += bin->stats.nrequests; + stats->curregs += bin->stats.curregs; + stats->nfills += bin->stats.nfills; + stats->nflushes += bin->stats.nflushes; + stats->nslabs += bin->stats.nslabs; + stats->reslabs += bin->stats.reslabs; + stats->curslabs += bin->stats.curslabs; + stats->nonfull_slabs += bin->stats.nonfull_slabs; + malloc_mutex_unlock(tsdn, &bin->lock); +} + +#endif /* JEMALLOC_INTERNAL_BIN_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/bin_info.h b/BeefRT/JEMalloc/include/jemalloc/internal/bin_info.h new file mode 100644 index 00000000..7fe65c86 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/bin_info.h @@ -0,0 +1,50 @@ +#ifndef JEMALLOC_INTERNAL_BIN_INFO_H +#define JEMALLOC_INTERNAL_BIN_INFO_H + +#include "jemalloc/internal/bitmap.h" + +/* + * Read-only information associated with each element of arena_t's bins array + * is stored separately, partly to reduce memory usage (only one copy, rather + * than one per arena), but mainly to avoid false cacheline sharing. + * + * Each slab has the following layout: + * + * /--------------------\ + * | region 0 | + * |--------------------| + * | region 1 | + * |--------------------| + * | ... | + * | ... | + * | ... | + * |--------------------| + * | region nregs-1 | + * \--------------------/ + */ +typedef struct bin_info_s bin_info_t; +struct bin_info_s { + /* Size of regions in a slab for this bin's size class. */ + size_t reg_size; + + /* Total size of a slab for this bin's size class. */ + size_t slab_size; + + /* Total number of regions in a slab for this bin's size class. */ + uint32_t nregs; + + /* Number of sharded bins in each arena for this size class. */ + uint32_t n_shards; + + /* + * Metadata used to manipulate bitmaps for slabs associated with this + * bin. + */ + bitmap_info_t bitmap_info; +}; + +extern bin_info_t bin_infos[SC_NBINS]; + +void bin_info_boot(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]); + +#endif /* JEMALLOC_INTERNAL_BIN_INFO_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/bin_stats.h b/BeefRT/JEMalloc/include/jemalloc/internal/bin_stats.h new file mode 100644 index 00000000..0b99297c --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/bin_stats.h @@ -0,0 +1,57 @@ +#ifndef JEMALLOC_INTERNAL_BIN_STATS_H +#define JEMALLOC_INTERNAL_BIN_STATS_H + +#include "jemalloc/internal/mutex_prof.h" + +typedef struct bin_stats_s bin_stats_t; +struct bin_stats_s { + /* + * Total number of allocation/deallocation requests served directly by + * the bin. Note that tcache may allocate an object, then recycle it + * many times, resulting many increments to nrequests, but only one + * each to nmalloc and ndalloc. + */ + uint64_t nmalloc; + uint64_t ndalloc; + + /* + * Number of allocation requests that correspond to the size of this + * bin. This includes requests served by tcache, though tcache only + * periodically merges into this counter. + */ + uint64_t nrequests; + + /* + * Current number of regions of this size class, including regions + * currently cached by tcache. + */ + size_t curregs; + + /* Number of tcache fills from this bin. */ + uint64_t nfills; + + /* Number of tcache flushes to this bin. */ + uint64_t nflushes; + + /* Total number of slabs created for this bin's size class. */ + uint64_t nslabs; + + /* + * Total number of slabs reused by extracting them from the slabs heap + * for this bin's size class. + */ + uint64_t reslabs; + + /* Current number of slabs in this bin. */ + size_t curslabs; + + /* Current size of nonfull slabs heap in this bin. */ + size_t nonfull_slabs; +}; + +typedef struct bin_stats_data_s bin_stats_data_t; +struct bin_stats_data_s { + bin_stats_t stats_data; + mutex_prof_data_t mutex_data; +}; +#endif /* JEMALLOC_INTERNAL_BIN_STATS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/bin_types.h b/BeefRT/JEMalloc/include/jemalloc/internal/bin_types.h new file mode 100644 index 00000000..945e8326 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/bin_types.h @@ -0,0 +1,17 @@ +#ifndef JEMALLOC_INTERNAL_BIN_TYPES_H +#define JEMALLOC_INTERNAL_BIN_TYPES_H + +#include "jemalloc/internal/sc.h" + +#define BIN_SHARDS_MAX (1 << EDATA_BITS_BINSHARD_WIDTH) +#define N_BIN_SHARDS_DEFAULT 1 + +/* Used in TSD static initializer only. Real init in arena_bind(). */ +#define TSD_BINSHARDS_ZERO_INITIALIZER {{UINT8_MAX}} + +typedef struct tsd_binshards_s tsd_binshards_t; +struct tsd_binshards_s { + uint8_t binshard[SC_NBINS]; +}; + +#endif /* JEMALLOC_INTERNAL_BIN_TYPES_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/bit_util.h b/BeefRT/JEMalloc/include/jemalloc/internal/bit_util.h new file mode 100644 index 00000000..bac59140 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/bit_util.h @@ -0,0 +1,422 @@ +#ifndef JEMALLOC_INTERNAL_BIT_UTIL_H +#define JEMALLOC_INTERNAL_BIT_UTIL_H + +#include "jemalloc/internal/assert.h" + +/* Sanity check. */ +#if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \ + || !defined(JEMALLOC_INTERNAL_FFS) +# error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure +#endif + +/* + * Unlike the builtins and posix ffs functions, our ffs requires a non-zero + * input, and returns the position of the lowest bit set (as opposed to the + * posix versions, which return 1 larger than that position and use a return + * value of zero as a sentinel. This tends to simplify logic in callers, and + * allows for consistency with the builtins we build fls on top of. + */ +static inline unsigned +ffs_llu(unsigned long long x) { + util_assume(x != 0); + return JEMALLOC_INTERNAL_FFSLL(x) - 1; +} + +static inline unsigned +ffs_lu(unsigned long x) { + util_assume(x != 0); + return JEMALLOC_INTERNAL_FFSL(x) - 1; +} + +static inline unsigned +ffs_u(unsigned x) { + util_assume(x != 0); + return JEMALLOC_INTERNAL_FFS(x) - 1; +} + +#define DO_FLS_SLOW(x, suffix) do { \ + util_assume(x != 0); \ + x |= (x >> 1); \ + x |= (x >> 2); \ + x |= (x >> 4); \ + x |= (x >> 8); \ + x |= (x >> 16); \ + if (sizeof(x) > 4) { \ + /* \ + * If sizeof(x) is 4, then the expression "x >> 32" \ + * will generate compiler warnings even if the code \ + * never executes. This circumvents the warning, and \ + * gets compiled out in optimized builds. \ + */ \ + int constant_32 = sizeof(x) * 4; \ + x |= (x >> constant_32); \ + } \ + x++; \ + if (x == 0) { \ + return 8 * sizeof(x) - 1; \ + } \ + return ffs_##suffix(x) - 1; \ +} while(0) + +static inline unsigned +fls_llu_slow(unsigned long long x) { + DO_FLS_SLOW(x, llu); +} + +static inline unsigned +fls_lu_slow(unsigned long x) { + DO_FLS_SLOW(x, lu); +} + +static inline unsigned +fls_u_slow(unsigned x) { + DO_FLS_SLOW(x, u); +} + +#undef DO_FLS_SLOW + +#ifdef JEMALLOC_HAVE_BUILTIN_CLZ +static inline unsigned +fls_llu(unsigned long long x) { + util_assume(x != 0); + /* + * Note that the xor here is more naturally written as subtraction; the + * last bit set is the number of bits in the type minus the number of + * leading zero bits. But GCC implements that as: + * bsr edi, edi + * mov eax, 31 + * xor edi, 31 + * sub eax, edi + * If we write it as xor instead, then we get + * bsr eax, edi + * as desired. + */ + return (8 * sizeof(x) - 1) ^ __builtin_clzll(x); +} + +static inline unsigned +fls_lu(unsigned long x) { + util_assume(x != 0); + return (8 * sizeof(x) - 1) ^ __builtin_clzl(x); +} + +static inline unsigned +fls_u(unsigned x) { + util_assume(x != 0); + return (8 * sizeof(x) - 1) ^ __builtin_clz(x); +} +#elif defined(_MSC_VER) + +#if LG_SIZEOF_PTR == 3 +#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x) +#else +/* + * This never actually runs; we're just dodging a compiler error for the + * never-taken branch where sizeof(void *) == 8. + */ +#define DO_BSR64(bit, x) bit = 0; unreachable() +#endif + +#define DO_FLS(x) do { \ + if (x == 0) { \ + return 8 * sizeof(x); \ + } \ + unsigned long bit; \ + if (sizeof(x) == 4) { \ + _BitScanReverse(&bit, (unsigned)x); \ + return (unsigned)bit; \ + } \ + if (sizeof(x) == 8 && sizeof(void *) == 8) { \ + DO_BSR64(bit, x); \ + return (unsigned)bit; \ + } \ + if (sizeof(x) == 8 && sizeof(void *) == 4) { \ + /* Dodge a compiler warning, as above. */ \ + int constant_32 = sizeof(x) * 4; \ + if (_BitScanReverse(&bit, \ + (unsigned)(x >> constant_32))) { \ + return 32 + (unsigned)bit; \ + } else { \ + _BitScanReverse(&bit, (unsigned)x); \ + return (unsigned)bit; \ + } \ + } \ + unreachable(); \ +} while (0) + +static inline unsigned +fls_llu(unsigned long long x) { + DO_FLS(x); +} + +static inline unsigned +fls_lu(unsigned long x) { + DO_FLS(x); +} + +static inline unsigned +fls_u(unsigned x) { + DO_FLS(x); +} + +#undef DO_FLS +#undef DO_BSR64 +#else + +static inline unsigned +fls_llu(unsigned long long x) { + return fls_llu_slow(x); +} + +static inline unsigned +fls_lu(unsigned long x) { + return fls_lu_slow(x); +} + +static inline unsigned +fls_u(unsigned x) { + return fls_u_slow(x); +} +#endif + +#if LG_SIZEOF_LONG_LONG > 3 +# error "Haven't implemented popcount for 16-byte ints." +#endif + +#define DO_POPCOUNT(x, type) do { \ + /* \ + * Algorithm from an old AMD optimization reference manual. \ + * We're putting a little bit more work than you might expect \ + * into the no-instrinsic case, since we only support the \ + * GCC intrinsics spelling of popcount (for now). Detecting \ + * whether or not the popcount builtin is actually useable in \ + * MSVC is nontrivial. \ + */ \ + \ + type bmul = (type)0x0101010101010101ULL; \ + \ + /* \ + * Replace each 2 bits with the sideways sum of the original \ + * values. 0x5 = 0b0101. \ + * \ + * You might expect this to be: \ + * x = (x & 0x55...) + ((x >> 1) & 0x55...). \ + * That costs an extra mask relative to this, though. \ + */ \ + x = x - ((x >> 1) & (0x55U * bmul)); \ + /* Replace each 4 bits with their sideays sum. 0x3 = 0b0011. */\ + x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U)); \ + /* \ + * Replace each 8 bits with their sideways sum. Note that we \ + * can't overflow within each 4-bit sum here, so we can skip \ + * the initial mask. \ + */ \ + x = (x + (x >> 4)) & (bmul * 0x0FU); \ + /* \ + * None of the partial sums in this multiplication (viewed in \ + * base-256) can overflow into the next digit. So the least \ + * significant byte of the product will be the least \ + * significant byte of the original value, the second least \ + * significant byte will be the sum of the two least \ + * significant bytes of the original value, and so on. \ + * Importantly, the high byte will be the byte-wise sum of all \ + * the bytes of the original value. \ + */ \ + x = x * bmul; \ + x >>= ((sizeof(x) - 1) * 8); \ + return (unsigned)x; \ +} while(0) + +static inline unsigned +popcount_u_slow(unsigned bitmap) { + DO_POPCOUNT(bitmap, unsigned); +} + +static inline unsigned +popcount_lu_slow(unsigned long bitmap) { + DO_POPCOUNT(bitmap, unsigned long); +} + +static inline unsigned +popcount_llu_slow(unsigned long long bitmap) { + DO_POPCOUNT(bitmap, unsigned long long); +} + +#undef DO_POPCOUNT + +static inline unsigned +popcount_u(unsigned bitmap) { +#ifdef JEMALLOC_INTERNAL_POPCOUNT + return JEMALLOC_INTERNAL_POPCOUNT(bitmap); +#else + return popcount_u_slow(bitmap); +#endif +} + +static inline unsigned +popcount_lu(unsigned long bitmap) { +#ifdef JEMALLOC_INTERNAL_POPCOUNTL + return JEMALLOC_INTERNAL_POPCOUNTL(bitmap); +#else + return popcount_lu_slow(bitmap); +#endif +} + +static inline unsigned +popcount_llu(unsigned long long bitmap) { +#ifdef JEMALLOC_INTERNAL_POPCOUNTLL + return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap); +#else + return popcount_llu_slow(bitmap); +#endif +} + +/* + * Clears first unset bit in bitmap, and returns + * place of bit. bitmap *must not* be 0. + */ + +static inline size_t +cfs_lu(unsigned long* bitmap) { + util_assume(*bitmap != 0); + size_t bit = ffs_lu(*bitmap); + *bitmap ^= ZU(1) << bit; + return bit; +} + +static inline unsigned +ffs_zu(size_t x) { +#if LG_SIZEOF_PTR == LG_SIZEOF_INT + return ffs_u(x); +#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG + return ffs_lu(x); +#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG + return ffs_llu(x); +#else +#error No implementation for size_t ffs() +#endif +} + +static inline unsigned +fls_zu(size_t x) { +#if LG_SIZEOF_PTR == LG_SIZEOF_INT + return fls_u(x); +#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG + return fls_lu(x); +#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG + return fls_llu(x); +#else +#error No implementation for size_t fls() +#endif +} + + +static inline unsigned +ffs_u64(uint64_t x) { +#if LG_SIZEOF_LONG == 3 + return ffs_lu(x); +#elif LG_SIZEOF_LONG_LONG == 3 + return ffs_llu(x); +#else +#error No implementation for 64-bit ffs() +#endif +} + +static inline unsigned +fls_u64(uint64_t x) { +#if LG_SIZEOF_LONG == 3 + return fls_lu(x); +#elif LG_SIZEOF_LONG_LONG == 3 + return fls_llu(x); +#else +#error No implementation for 64-bit fls() +#endif +} + +static inline unsigned +ffs_u32(uint32_t x) { +#if LG_SIZEOF_INT == 2 + return ffs_u(x); +#else +#error No implementation for 32-bit ffs() +#endif + return ffs_u(x); +} + +static inline unsigned +fls_u32(uint32_t x) { +#if LG_SIZEOF_INT == 2 + return fls_u(x); +#else +#error No implementation for 32-bit fls() +#endif + return fls_u(x); +} + +static inline uint64_t +pow2_ceil_u64(uint64_t x) { + if (unlikely(x <= 1)) { + return x; + } + size_t msb_on_index = fls_u64(x - 1); + /* + * Range-check; it's on the callers to ensure that the result of this + * call won't overflow. + */ + assert(msb_on_index < 63); + return 1ULL << (msb_on_index + 1); +} + +static inline uint32_t +pow2_ceil_u32(uint32_t x) { + if (unlikely(x <= 1)) { + return x; + } + size_t msb_on_index = fls_u32(x - 1); + /* As above. */ + assert(msb_on_index < 31); + return 1U << (msb_on_index + 1); +} + +/* Compute the smallest power of 2 that is >= x. */ +static inline size_t +pow2_ceil_zu(size_t x) { +#if (LG_SIZEOF_PTR == 3) + return pow2_ceil_u64(x); +#else + return pow2_ceil_u32(x); +#endif +} + +static inline unsigned +lg_floor(size_t x) { + util_assume(x != 0); +#if (LG_SIZEOF_PTR == 3) + return fls_u64(x); +#else + return fls_u32(x); +#endif +} + +static inline unsigned +lg_ceil(size_t x) { + return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1); +} + +/* A compile-time version of lg_floor and lg_ceil. */ +#define LG_FLOOR_1(x) 0 +#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1)) +#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2)) +#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4)) +#define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8)) +#define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16)) +#define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32)) +#if LG_SIZEOF_PTR == 2 +# define LG_FLOOR(x) LG_FLOOR_32((x)) +#else +# define LG_FLOOR(x) LG_FLOOR_64((x)) +#endif + +#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1)) + +#endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/bitmap.h b/BeefRT/JEMalloc/include/jemalloc/internal/bitmap.h new file mode 100644 index 00000000..dc19454d --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/bitmap.h @@ -0,0 +1,368 @@ +#ifndef JEMALLOC_INTERNAL_BITMAP_H +#define JEMALLOC_INTERNAL_BITMAP_H + +#include "jemalloc/internal/bit_util.h" +#include "jemalloc/internal/sc.h" + +typedef unsigned long bitmap_t; +#define LG_SIZEOF_BITMAP LG_SIZEOF_LONG + +/* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */ +#if SC_LG_SLAB_MAXREGS > LG_CEIL(SC_NSIZES) +/* Maximum bitmap bit count is determined by maximum regions per slab. */ +# define LG_BITMAP_MAXBITS SC_LG_SLAB_MAXREGS +#else +/* Maximum bitmap bit count is determined by number of extent size classes. */ +# define LG_BITMAP_MAXBITS LG_CEIL(SC_NSIZES) +#endif +#define BITMAP_MAXBITS (ZU(1) << LG_BITMAP_MAXBITS) + +/* Number of bits per group. */ +#define LG_BITMAP_GROUP_NBITS (LG_SIZEOF_BITMAP + 3) +#define BITMAP_GROUP_NBITS (1U << LG_BITMAP_GROUP_NBITS) +#define BITMAP_GROUP_NBITS_MASK (BITMAP_GROUP_NBITS-1) + +/* + * Do some analysis on how big the bitmap is before we use a tree. For a brute + * force linear search, if we would have to call ffs_lu() more than 2^3 times, + * use a tree instead. + */ +#if LG_BITMAP_MAXBITS - LG_BITMAP_GROUP_NBITS > 3 +# define BITMAP_USE_TREE +#endif + +/* Number of groups required to store a given number of bits. */ +#define BITMAP_BITS2GROUPS(nbits) \ + (((nbits) + BITMAP_GROUP_NBITS_MASK) >> LG_BITMAP_GROUP_NBITS) + +/* + * Number of groups required at a particular level for a given number of bits. + */ +#define BITMAP_GROUPS_L0(nbits) \ + BITMAP_BITS2GROUPS(nbits) +#define BITMAP_GROUPS_L1(nbits) \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(nbits)) +#define BITMAP_GROUPS_L2(nbits) \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits)))) +#define BITMAP_GROUPS_L3(nbits) \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS( \ + BITMAP_BITS2GROUPS((nbits))))) +#define BITMAP_GROUPS_L4(nbits) \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS( \ + BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits)))))) + +/* + * Assuming the number of levels, number of groups required for a given number + * of bits. + */ +#define BITMAP_GROUPS_1_LEVEL(nbits) \ + BITMAP_GROUPS_L0(nbits) +#define BITMAP_GROUPS_2_LEVEL(nbits) \ + (BITMAP_GROUPS_1_LEVEL(nbits) + BITMAP_GROUPS_L1(nbits)) +#define BITMAP_GROUPS_3_LEVEL(nbits) \ + (BITMAP_GROUPS_2_LEVEL(nbits) + BITMAP_GROUPS_L2(nbits)) +#define BITMAP_GROUPS_4_LEVEL(nbits) \ + (BITMAP_GROUPS_3_LEVEL(nbits) + BITMAP_GROUPS_L3(nbits)) +#define BITMAP_GROUPS_5_LEVEL(nbits) \ + (BITMAP_GROUPS_4_LEVEL(nbits) + BITMAP_GROUPS_L4(nbits)) + +/* + * Maximum number of groups required to support LG_BITMAP_MAXBITS. + */ +#ifdef BITMAP_USE_TREE + +#if LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS +# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_1_LEVEL(nbits) +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_1_LEVEL(BITMAP_MAXBITS) +#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 2 +# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_2_LEVEL(nbits) +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_2_LEVEL(BITMAP_MAXBITS) +#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 3 +# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_3_LEVEL(nbits) +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_3_LEVEL(BITMAP_MAXBITS) +#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 4 +# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_4_LEVEL(nbits) +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_4_LEVEL(BITMAP_MAXBITS) +#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 5 +# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_5_LEVEL(nbits) +# define BITMAP_GROUPS_MAX BITMAP_GROUPS_5_LEVEL(BITMAP_MAXBITS) +#else +# error "Unsupported bitmap size" +#endif + +/* + * Maximum number of levels possible. This could be statically computed based + * on LG_BITMAP_MAXBITS: + * + * #define BITMAP_MAX_LEVELS \ + * (LG_BITMAP_MAXBITS / LG_SIZEOF_BITMAP) \ + * + !!(LG_BITMAP_MAXBITS % LG_SIZEOF_BITMAP) + * + * However, that would not allow the generic BITMAP_INFO_INITIALIZER() macro, so + * instead hardcode BITMAP_MAX_LEVELS to the largest number supported by the + * various cascading macros. The only additional cost this incurs is some + * unused trailing entries in bitmap_info_t structures; the bitmaps themselves + * are not impacted. + */ +#define BITMAP_MAX_LEVELS 5 + +#define BITMAP_INFO_INITIALIZER(nbits) { \ + /* nbits. */ \ + nbits, \ + /* nlevels. */ \ + (BITMAP_GROUPS_L0(nbits) > BITMAP_GROUPS_L1(nbits)) + \ + (BITMAP_GROUPS_L1(nbits) > BITMAP_GROUPS_L2(nbits)) + \ + (BITMAP_GROUPS_L2(nbits) > BITMAP_GROUPS_L3(nbits)) + \ + (BITMAP_GROUPS_L3(nbits) > BITMAP_GROUPS_L4(nbits)) + 1, \ + /* levels. */ \ + { \ + {0}, \ + {BITMAP_GROUPS_L0(nbits)}, \ + {BITMAP_GROUPS_L1(nbits) + BITMAP_GROUPS_L0(nbits)}, \ + {BITMAP_GROUPS_L2(nbits) + BITMAP_GROUPS_L1(nbits) + \ + BITMAP_GROUPS_L0(nbits)}, \ + {BITMAP_GROUPS_L3(nbits) + BITMAP_GROUPS_L2(nbits) + \ + BITMAP_GROUPS_L1(nbits) + BITMAP_GROUPS_L0(nbits)}, \ + {BITMAP_GROUPS_L4(nbits) + BITMAP_GROUPS_L3(nbits) + \ + BITMAP_GROUPS_L2(nbits) + BITMAP_GROUPS_L1(nbits) \ + + BITMAP_GROUPS_L0(nbits)} \ + } \ +} + +#else /* BITMAP_USE_TREE */ + +#define BITMAP_GROUPS(nbits) BITMAP_BITS2GROUPS(nbits) +#define BITMAP_GROUPS_MAX BITMAP_BITS2GROUPS(BITMAP_MAXBITS) + +#define BITMAP_INFO_INITIALIZER(nbits) { \ + /* nbits. */ \ + nbits, \ + /* ngroups. */ \ + BITMAP_BITS2GROUPS(nbits) \ +} + +#endif /* BITMAP_USE_TREE */ + +typedef struct bitmap_level_s { + /* Offset of this level's groups within the array of groups. */ + size_t group_offset; +} bitmap_level_t; + +typedef struct bitmap_info_s { + /* Logical number of bits in bitmap (stored at bottom level). */ + size_t nbits; + +#ifdef BITMAP_USE_TREE + /* Number of levels necessary for nbits. */ + unsigned nlevels; + + /* + * Only the first (nlevels+1) elements are used, and levels are ordered + * bottom to top (e.g. the bottom level is stored in levels[0]). + */ + bitmap_level_t levels[BITMAP_MAX_LEVELS+1]; +#else /* BITMAP_USE_TREE */ + /* Number of groups necessary for nbits. */ + size_t ngroups; +#endif /* BITMAP_USE_TREE */ +} bitmap_info_t; + +void bitmap_info_init(bitmap_info_t *binfo, size_t nbits); +void bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo, bool fill); +size_t bitmap_size(const bitmap_info_t *binfo); + +static inline bool +bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo) { +#ifdef BITMAP_USE_TREE + size_t rgoff = binfo->levels[binfo->nlevels].group_offset - 1; + bitmap_t rg = bitmap[rgoff]; + /* The bitmap is full iff the root group is 0. */ + return (rg == 0); +#else + size_t i; + + for (i = 0; i < binfo->ngroups; i++) { + if (bitmap[i] != 0) { + return false; + } + } + return true; +#endif +} + +static inline bool +bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) { + size_t goff; + bitmap_t g; + + assert(bit < binfo->nbits); + goff = bit >> LG_BITMAP_GROUP_NBITS; + g = bitmap[goff]; + return !(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK))); +} + +static inline void +bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) { + size_t goff; + bitmap_t *gp; + bitmap_t g; + + assert(bit < binfo->nbits); + assert(!bitmap_get(bitmap, binfo, bit)); + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[goff]; + g = *gp; + assert(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK))); + g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + assert(bitmap_get(bitmap, binfo, bit)); +#ifdef BITMAP_USE_TREE + /* Propagate group state transitions up the tree. */ + if (g == 0) { + unsigned i; + for (i = 1; i < binfo->nlevels; i++) { + bit = goff; + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[binfo->levels[i].group_offset + goff]; + g = *gp; + assert(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK))); + g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + if (g != 0) { + break; + } + } + } +#endif +} + +/* ffu: find first unset >= bit. */ +static inline size_t +bitmap_ffu(const bitmap_t *bitmap, const bitmap_info_t *binfo, size_t min_bit) { + assert(min_bit < binfo->nbits); + +#ifdef BITMAP_USE_TREE + size_t bit = 0; + for (unsigned level = binfo->nlevels; level--;) { + size_t lg_bits_per_group = (LG_BITMAP_GROUP_NBITS * (level + + 1)); + bitmap_t group = bitmap[binfo->levels[level].group_offset + (bit + >> lg_bits_per_group)]; + unsigned group_nmask = (unsigned)(((min_bit > bit) ? (min_bit - + bit) : 0) >> (lg_bits_per_group - LG_BITMAP_GROUP_NBITS)); + assert(group_nmask <= BITMAP_GROUP_NBITS); + bitmap_t group_mask = ~((1LU << group_nmask) - 1); + bitmap_t group_masked = group & group_mask; + if (group_masked == 0LU) { + if (group == 0LU) { + return binfo->nbits; + } + /* + * min_bit was preceded by one or more unset bits in + * this group, but there are no other unset bits in this + * group. Try again starting at the first bit of the + * next sibling. This will recurse at most once per + * non-root level. + */ + size_t sib_base = bit + (ZU(1) << lg_bits_per_group); + assert(sib_base > min_bit); + assert(sib_base > bit); + if (sib_base >= binfo->nbits) { + return binfo->nbits; + } + return bitmap_ffu(bitmap, binfo, sib_base); + } + bit += ((size_t)ffs_lu(group_masked)) << + (lg_bits_per_group - LG_BITMAP_GROUP_NBITS); + } + assert(bit >= min_bit); + assert(bit < binfo->nbits); + return bit; +#else + size_t i = min_bit >> LG_BITMAP_GROUP_NBITS; + bitmap_t g = bitmap[i] & ~((1LU << (min_bit & BITMAP_GROUP_NBITS_MASK)) + - 1); + size_t bit; + do { + if (g != 0) { + bit = ffs_lu(g); + return (i << LG_BITMAP_GROUP_NBITS) + bit; + } + i++; + g = bitmap[i]; + } while (i < binfo->ngroups); + return binfo->nbits; +#endif +} + +/* sfu: set first unset. */ +static inline size_t +bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo) { + size_t bit; + bitmap_t g; + unsigned i; + + assert(!bitmap_full(bitmap, binfo)); + +#ifdef BITMAP_USE_TREE + i = binfo->nlevels - 1; + g = bitmap[binfo->levels[i].group_offset]; + bit = ffs_lu(g); + while (i > 0) { + i--; + g = bitmap[binfo->levels[i].group_offset + bit]; + bit = (bit << LG_BITMAP_GROUP_NBITS) + ffs_lu(g); + } +#else + i = 0; + g = bitmap[0]; + while (g == 0) { + i++; + g = bitmap[i]; + } + bit = (i << LG_BITMAP_GROUP_NBITS) + ffs_lu(g); +#endif + bitmap_set(bitmap, binfo, bit); + return bit; +} + +static inline void +bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) { + size_t goff; + bitmap_t *gp; + bitmap_t g; + UNUSED bool propagate; + + assert(bit < binfo->nbits); + assert(bitmap_get(bitmap, binfo, bit)); + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[goff]; + g = *gp; + propagate = (g == 0); + assert((g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK))) == 0); + g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + assert(!bitmap_get(bitmap, binfo, bit)); +#ifdef BITMAP_USE_TREE + /* Propagate group state transitions up the tree. */ + if (propagate) { + unsigned i; + for (i = 1; i < binfo->nlevels; i++) { + bit = goff; + goff = bit >> LG_BITMAP_GROUP_NBITS; + gp = &bitmap[binfo->levels[i].group_offset + goff]; + g = *gp; + propagate = (g == 0); + assert((g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK))) + == 0); + g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK); + *gp = g; + if (!propagate) { + break; + } + } + } +#endif /* BITMAP_USE_TREE */ +} + +#endif /* JEMALLOC_INTERNAL_BITMAP_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/buf_writer.h b/BeefRT/JEMalloc/include/jemalloc/internal/buf_writer.h new file mode 100644 index 00000000..37aa6de5 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/buf_writer.h @@ -0,0 +1,32 @@ +#ifndef JEMALLOC_INTERNAL_BUF_WRITER_H +#define JEMALLOC_INTERNAL_BUF_WRITER_H + +/* + * Note: when using the buffered writer, cbopaque is passed to write_cb only + * when the buffer is flushed. It would make a difference if cbopaque points + * to something that's changing for each write_cb call, or something that + * affects write_cb in a way dependent on the content of the output string. + * However, the most typical usage case in practice is that cbopaque points to + * some "option like" content for the write_cb, so it doesn't matter. + */ + +typedef struct { + write_cb_t *write_cb; + void *cbopaque; + char *buf; + size_t buf_size; + size_t buf_end; + bool internal_buf; +} buf_writer_t; + +bool buf_writer_init(tsdn_t *tsdn, buf_writer_t *buf_writer, + write_cb_t *write_cb, void *cbopaque, char *buf, size_t buf_len); +void buf_writer_flush(buf_writer_t *buf_writer); +write_cb_t buf_writer_cb; +void buf_writer_terminate(tsdn_t *tsdn, buf_writer_t *buf_writer); + +typedef ssize_t (read_cb_t)(void *read_cbopaque, void *buf, size_t limit); +void buf_writer_pipe(buf_writer_t *buf_writer, read_cb_t *read_cb, + void *read_cbopaque); + +#endif /* JEMALLOC_INTERNAL_BUF_WRITER_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/cache_bin.h b/BeefRT/JEMalloc/include/jemalloc/internal/cache_bin.h new file mode 100644 index 00000000..caf5be33 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/cache_bin.h @@ -0,0 +1,670 @@ +#ifndef JEMALLOC_INTERNAL_CACHE_BIN_H +#define JEMALLOC_INTERNAL_CACHE_BIN_H + +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/sz.h" + +/* + * The cache_bins are the mechanism that the tcache and the arena use to + * communicate. The tcache fills from and flushes to the arena by passing a + * cache_bin_t to fill/flush. When the arena needs to pull stats from the + * tcaches associated with it, it does so by iterating over its + * cache_bin_array_descriptor_t objects and reading out per-bin stats it + * contains. This makes it so that the arena need not know about the existence + * of the tcache at all. + */ + +/* + * The size in bytes of each cache bin stack. We also use this to indicate + * *counts* of individual objects. + */ +typedef uint16_t cache_bin_sz_t; + +/* + * Leave a noticeable mark pattern on the cache bin stack boundaries, in case a + * bug starts leaking those. Make it look like the junk pattern but be distinct + * from it. + */ +static const uintptr_t cache_bin_preceding_junk = + (uintptr_t)0x7a7a7a7a7a7a7a7aULL; +/* Note: a7 vs. 7a above -- this tells you which pointer leaked. */ +static const uintptr_t cache_bin_trailing_junk = + (uintptr_t)0xa7a7a7a7a7a7a7a7ULL; + +/* + * That implies the following value, for the maximum number of items in any + * individual bin. The cache bins track their bounds looking just at the low + * bits of a pointer, compared against a cache_bin_sz_t. So that's + * 1 << (sizeof(cache_bin_sz_t) * 8) + * bytes spread across pointer sized objects to get the maximum. + */ +#define CACHE_BIN_NCACHED_MAX (((size_t)1 << sizeof(cache_bin_sz_t) * 8) \ + / sizeof(void *) - 1) + +/* + * This lives inside the cache_bin (for locality reasons), and is initialized + * alongside it, but is otherwise not modified by any cache bin operations. + * It's logically public and maintained by its callers. + */ +typedef struct cache_bin_stats_s cache_bin_stats_t; +struct cache_bin_stats_s { + /* + * Number of allocation requests that corresponded to the size of this + * bin. + */ + uint64_t nrequests; +}; + +/* + * Read-only information associated with each element of tcache_t's tbins array + * is stored separately, mainly to reduce memory usage. + */ +typedef struct cache_bin_info_s cache_bin_info_t; +struct cache_bin_info_s { + cache_bin_sz_t ncached_max; +}; + +/* + * Responsible for caching allocations associated with a single size. + * + * Several pointers are used to track the stack. To save on metadata bytes, + * only the stack_head is a full sized pointer (which is dereferenced on the + * fastpath), while the others store only the low 16 bits -- this is correct + * because a single stack never takes more space than 2^16 bytes, and at the + * same time only equality checks are performed on the low bits. + * + * (low addr) (high addr) + * |------stashed------|------available------|------cached-----| + * ^ ^ ^ ^ + * low_bound(derived) low_bits_full stack_head low_bits_empty + */ +typedef struct cache_bin_s cache_bin_t; +struct cache_bin_s { + /* + * The stack grows down. Whenever the bin is nonempty, the head points + * to an array entry containing a valid allocation. When it is empty, + * the head points to one element past the owned array. + */ + void **stack_head; + /* + * cur_ptr and stats are both modified frequently. Let's keep them + * close so that they have a higher chance of being on the same + * cacheline, thus less write-backs. + */ + cache_bin_stats_t tstats; + + /* + * The low bits of the address of the first item in the stack that + * hasn't been used since the last GC, to track the low water mark (min + * # of cached items). + * + * Since the stack grows down, this is a higher address than + * low_bits_full. + */ + uint16_t low_bits_low_water; + + /* + * The low bits of the value that stack_head will take on when the array + * is full (of cached & stashed items). But remember that stack_head + * always points to a valid item when the array is nonempty -- this is + * in the array. + * + * Recall that since the stack grows down, this is the lowest available + * address in the array for caching. Only adjusted when stashing items. + */ + uint16_t low_bits_full; + + /* + * The low bits of the value that stack_head will take on when the array + * is empty. + * + * The stack grows down -- this is one past the highest address in the + * array. Immutable after initialization. + */ + uint16_t low_bits_empty; +}; + +/* + * The cache_bins live inside the tcache, but the arena (by design) isn't + * supposed to know much about tcache internals. To let the arena iterate over + * associated bins, we keep (with the tcache) a linked list of + * cache_bin_array_descriptor_ts that tell the arena how to find the bins. + */ +typedef struct cache_bin_array_descriptor_s cache_bin_array_descriptor_t; +struct cache_bin_array_descriptor_s { + /* + * The arena keeps a list of the cache bins associated with it, for + * stats collection. + */ + ql_elm(cache_bin_array_descriptor_t) link; + /* Pointers to the tcache bins. */ + cache_bin_t *bins; +}; + +static inline void +cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor, + cache_bin_t *bins) { + ql_elm_new(descriptor, link); + descriptor->bins = bins; +} + +JEMALLOC_ALWAYS_INLINE bool +cache_bin_nonfast_aligned(const void *ptr) { + if (!config_uaf_detection) { + return false; + } + /* + * Currently we use alignment to decide which pointer to junk & stash on + * dealloc (for catching use-after-free). In some common cases a + * page-aligned check is needed already (sdalloc w/ config_prof), so we + * are getting it more or less for free -- no added instructions on + * free_fastpath. + * + * Another way of deciding which pointer to sample, is adding another + * thread_event to pick one every N bytes. That also adds no cost on + * the fastpath, however it will tend to pick large allocations which is + * not the desired behavior. + */ + return ((uintptr_t)ptr & san_cache_bin_nonfast_mask) == 0; +} + +/* Returns ncached_max: Upper limit on ncached. */ +static inline cache_bin_sz_t +cache_bin_info_ncached_max(cache_bin_info_t *info) { + return info->ncached_max; +} + +/* + * Internal. + * + * Asserts that the pointer associated with earlier is <= the one associated + * with later. + */ +static inline void +cache_bin_assert_earlier(cache_bin_t *bin, uint16_t earlier, uint16_t later) { + if (earlier > later) { + assert(bin->low_bits_full > bin->low_bits_empty); + } +} + +/* + * Internal. + * + * Does difference calculations that handle wraparound correctly. Earlier must + * be associated with the position earlier in memory. + */ +static inline uint16_t +cache_bin_diff(cache_bin_t *bin, uint16_t earlier, uint16_t later, bool racy) { + /* + * When it's racy, bin->low_bits_full can be modified concurrently. It + * can cross the uint16_t max value and become less than + * bin->low_bits_empty at the time of the check. + */ + if (!racy) { + cache_bin_assert_earlier(bin, earlier, later); + } + return later - earlier; +} + +/* + * Number of items currently cached in the bin, without checking ncached_max. + * We require specifying whether or not the request is racy or not (i.e. whether + * or not concurrent modifications are possible). + */ +static inline cache_bin_sz_t +cache_bin_ncached_get_internal(cache_bin_t *bin, bool racy) { + cache_bin_sz_t diff = cache_bin_diff(bin, + (uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty, racy); + cache_bin_sz_t n = diff / sizeof(void *); + /* + * We have undefined behavior here; if this function is called from the + * arena stats updating code, then stack_head could change from the + * first line to the next one. Morally, these loads should be atomic, + * but compilers won't currently generate comparisons with in-memory + * operands against atomics, and these variables get accessed on the + * fast paths. This should still be "safe" in the sense of generating + * the correct assembly for the foreseeable future, though. + */ + assert(n == 0 || *(bin->stack_head) != NULL || racy); + return n; +} + +/* + * Number of items currently cached in the bin, with checking ncached_max. The + * caller must know that no concurrent modification of the cache_bin is + * possible. + */ +static inline cache_bin_sz_t +cache_bin_ncached_get_local(cache_bin_t *bin, cache_bin_info_t *info) { + cache_bin_sz_t n = cache_bin_ncached_get_internal(bin, + /* racy */ false); + assert(n <= cache_bin_info_ncached_max(info)); + return n; +} + +/* + * Internal. + * + * A pointer to the position one past the end of the backing array. + * + * Do not call if racy, because both 'bin->stack_head' and 'bin->low_bits_full' + * are subject to concurrent modifications. + */ +static inline void ** +cache_bin_empty_position_get(cache_bin_t *bin) { + cache_bin_sz_t diff = cache_bin_diff(bin, + (uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty, + /* racy */ false); + uintptr_t empty_bits = (uintptr_t)bin->stack_head + diff; + void **ret = (void **)empty_bits; + + assert(ret >= bin->stack_head); + + return ret; +} + +/* + * Internal. + * + * Calculates low bits of the lower bound of the usable cache bin's range (see + * cache_bin_t visual representation above). + * + * No values are concurrently modified, so should be safe to read in a + * multithreaded environment. Currently concurrent access happens only during + * arena statistics collection. + */ +static inline uint16_t +cache_bin_low_bits_low_bound_get(cache_bin_t *bin, cache_bin_info_t *info) { + return (uint16_t)bin->low_bits_empty - + info->ncached_max * sizeof(void *); +} + +/* + * Internal. + * + * A pointer to the position with the lowest address of the backing array. + */ +static inline void ** +cache_bin_low_bound_get(cache_bin_t *bin, cache_bin_info_t *info) { + cache_bin_sz_t ncached_max = cache_bin_info_ncached_max(info); + void **ret = cache_bin_empty_position_get(bin) - ncached_max; + assert(ret <= bin->stack_head); + + return ret; +} + +/* + * As the name implies. This is important since it's not correct to try to + * batch fill a nonempty cache bin. + */ +static inline void +cache_bin_assert_empty(cache_bin_t *bin, cache_bin_info_t *info) { + assert(cache_bin_ncached_get_local(bin, info) == 0); + assert(cache_bin_empty_position_get(bin) == bin->stack_head); +} + +/* + * Get low water, but without any of the correctness checking we do for the + * caller-usable version, if we are temporarily breaking invariants (like + * ncached >= low_water during flush). + */ +static inline cache_bin_sz_t +cache_bin_low_water_get_internal(cache_bin_t *bin) { + return cache_bin_diff(bin, bin->low_bits_low_water, + bin->low_bits_empty, /* racy */ false) / sizeof(void *); +} + +/* Returns the numeric value of low water in [0, ncached]. */ +static inline cache_bin_sz_t +cache_bin_low_water_get(cache_bin_t *bin, cache_bin_info_t *info) { + cache_bin_sz_t low_water = cache_bin_low_water_get_internal(bin); + assert(low_water <= cache_bin_info_ncached_max(info)); + assert(low_water <= cache_bin_ncached_get_local(bin, info)); + + cache_bin_assert_earlier(bin, (uint16_t)(uintptr_t)bin->stack_head, + bin->low_bits_low_water); + + return low_water; +} + +/* + * Indicates that the current cache bin position should be the low water mark + * going forward. + */ +static inline void +cache_bin_low_water_set(cache_bin_t *bin) { + bin->low_bits_low_water = (uint16_t)(uintptr_t)bin->stack_head; +} + +static inline void +cache_bin_low_water_adjust(cache_bin_t *bin) { + if (cache_bin_ncached_get_internal(bin, /* racy */ false) + < cache_bin_low_water_get_internal(bin)) { + cache_bin_low_water_set(bin); + } +} + +JEMALLOC_ALWAYS_INLINE void * +cache_bin_alloc_impl(cache_bin_t *bin, bool *success, bool adjust_low_water) { + /* + * success (instead of ret) should be checked upon the return of this + * function. We avoid checking (ret == NULL) because there is never a + * null stored on the avail stack (which is unknown to the compiler), + * and eagerly checking ret would cause pipeline stall (waiting for the + * cacheline). + */ + + /* + * This may read from the empty position; however the loaded value won't + * be used. It's safe because the stack has one more slot reserved. + */ + void *ret = *bin->stack_head; + uint16_t low_bits = (uint16_t)(uintptr_t)bin->stack_head; + void **new_head = bin->stack_head + 1; + + /* + * Note that the low water mark is at most empty; if we pass this check, + * we know we're non-empty. + */ + if (likely(low_bits != bin->low_bits_low_water)) { + bin->stack_head = new_head; + *success = true; + return ret; + } + if (!adjust_low_water) { + *success = false; + return NULL; + } + /* + * In the fast-path case where we call alloc_easy and then alloc, the + * previous checking and computation is optimized away -- we didn't + * actually commit any of our operations. + */ + if (likely(low_bits != bin->low_bits_empty)) { + bin->stack_head = new_head; + bin->low_bits_low_water = (uint16_t)(uintptr_t)new_head; + *success = true; + return ret; + } + *success = false; + return NULL; +} + +/* + * Allocate an item out of the bin, failing if we're at the low-water mark. + */ +JEMALLOC_ALWAYS_INLINE void * +cache_bin_alloc_easy(cache_bin_t *bin, bool *success) { + /* We don't look at info if we're not adjusting low-water. */ + return cache_bin_alloc_impl(bin, success, false); +} + +/* + * Allocate an item out of the bin, even if we're currently at the low-water + * mark (and failing only if the bin is empty). + */ +JEMALLOC_ALWAYS_INLINE void * +cache_bin_alloc(cache_bin_t *bin, bool *success) { + return cache_bin_alloc_impl(bin, success, true); +} + +JEMALLOC_ALWAYS_INLINE cache_bin_sz_t +cache_bin_alloc_batch(cache_bin_t *bin, size_t num, void **out) { + cache_bin_sz_t n = cache_bin_ncached_get_internal(bin, + /* racy */ false); + if (n > num) { + n = (cache_bin_sz_t)num; + } + memcpy(out, bin->stack_head, n * sizeof(void *)); + bin->stack_head += n; + cache_bin_low_water_adjust(bin); + + return n; +} + +JEMALLOC_ALWAYS_INLINE bool +cache_bin_full(cache_bin_t *bin) { + return ((uint16_t)(uintptr_t)bin->stack_head == bin->low_bits_full); +} + +/* + * Free an object into the given bin. Fails only if the bin is full. + */ +JEMALLOC_ALWAYS_INLINE bool +cache_bin_dalloc_easy(cache_bin_t *bin, void *ptr) { + if (unlikely(cache_bin_full(bin))) { + return false; + } + + bin->stack_head--; + *bin->stack_head = ptr; + cache_bin_assert_earlier(bin, bin->low_bits_full, + (uint16_t)(uintptr_t)bin->stack_head); + + return true; +} + +/* Returns false if failed to stash (i.e. bin is full). */ +JEMALLOC_ALWAYS_INLINE bool +cache_bin_stash(cache_bin_t *bin, void *ptr) { + if (cache_bin_full(bin)) { + return false; + } + + /* Stash at the full position, in the [full, head) range. */ + uint16_t low_bits_head = (uint16_t)(uintptr_t)bin->stack_head; + /* Wraparound handled as well. */ + uint16_t diff = cache_bin_diff(bin, bin->low_bits_full, low_bits_head, + /* racy */ false); + *(void **)((uintptr_t)bin->stack_head - diff) = ptr; + + assert(!cache_bin_full(bin)); + bin->low_bits_full += sizeof(void *); + cache_bin_assert_earlier(bin, bin->low_bits_full, low_bits_head); + + return true; +} + +/* + * Get the number of stashed pointers. + * + * When called from a thread not owning the TLS (i.e. racy = true), it's + * important to keep in mind that 'bin->stack_head' and 'bin->low_bits_full' can + * be modified concurrently and almost none assertions about their values can be + * made. + */ +JEMALLOC_ALWAYS_INLINE cache_bin_sz_t +cache_bin_nstashed_get_internal(cache_bin_t *bin, cache_bin_info_t *info, + bool racy) { + cache_bin_sz_t ncached_max = cache_bin_info_ncached_max(info); + uint16_t low_bits_low_bound = cache_bin_low_bits_low_bound_get(bin, + info); + + cache_bin_sz_t n = cache_bin_diff(bin, low_bits_low_bound, + bin->low_bits_full, racy) / sizeof(void *); + assert(n <= ncached_max); + + if (!racy) { + /* Below are for assertions only. */ + void **low_bound = cache_bin_low_bound_get(bin, info); + + assert((uint16_t)(uintptr_t)low_bound == low_bits_low_bound); + void *stashed = *(low_bound + n - 1); + bool aligned = cache_bin_nonfast_aligned(stashed); +#ifdef JEMALLOC_JET + /* Allow arbitrary pointers to be stashed in tests. */ + aligned = true; +#endif + assert(n == 0 || (stashed != NULL && aligned)); + } + + return n; +} + +JEMALLOC_ALWAYS_INLINE cache_bin_sz_t +cache_bin_nstashed_get_local(cache_bin_t *bin, cache_bin_info_t *info) { + cache_bin_sz_t n = cache_bin_nstashed_get_internal(bin, info, + /* racy */ false); + assert(n <= cache_bin_info_ncached_max(info)); + return n; +} + +/* + * Obtain a racy view of the number of items currently in the cache bin, in the + * presence of possible concurrent modifications. + */ +static inline void +cache_bin_nitems_get_remote(cache_bin_t *bin, cache_bin_info_t *info, + cache_bin_sz_t *ncached, cache_bin_sz_t *nstashed) { + cache_bin_sz_t n = cache_bin_ncached_get_internal(bin, /* racy */ true); + assert(n <= cache_bin_info_ncached_max(info)); + *ncached = n; + + n = cache_bin_nstashed_get_internal(bin, info, /* racy */ true); + assert(n <= cache_bin_info_ncached_max(info)); + *nstashed = n; + /* Note that cannot assert ncached + nstashed <= ncached_max (racy). */ +} + +/* + * Filling and flushing are done in batch, on arrays of void *s. For filling, + * the arrays go forward, and can be accessed with ordinary array arithmetic. + * For flushing, we work from the end backwards, and so need to use special + * accessors that invert the usual ordering. + * + * This is important for maintaining first-fit; the arena code fills with + * earliest objects first, and so those are the ones we should return first for + * cache_bin_alloc calls. When flushing, we should flush the objects that we + * wish to return later; those at the end of the array. This is better for the + * first-fit heuristic as well as for cache locality; the most recently freed + * objects are the ones most likely to still be in cache. + * + * This all sounds very hand-wavey and theoretical, but reverting the ordering + * on one or the other pathway leads to measurable slowdowns. + */ + +typedef struct cache_bin_ptr_array_s cache_bin_ptr_array_t; +struct cache_bin_ptr_array_s { + cache_bin_sz_t n; + void **ptr; +}; + +/* + * Declare a cache_bin_ptr_array_t sufficient for nval items. + * + * In the current implementation, this could be just part of a + * cache_bin_ptr_array_init_... call, since we reuse the cache bin stack memory. + * Indirecting behind a macro, though, means experimenting with linked-list + * representations is easy (since they'll require an alloca in the calling + * frame). + */ +#define CACHE_BIN_PTR_ARRAY_DECLARE(name, nval) \ + cache_bin_ptr_array_t name; \ + name.n = (nval) + +/* + * Start a fill. The bin must be empty, and This must be followed by a + * finish_fill call before doing any alloc/dalloc operations on the bin. + */ +static inline void +cache_bin_init_ptr_array_for_fill(cache_bin_t *bin, cache_bin_info_t *info, + cache_bin_ptr_array_t *arr, cache_bin_sz_t nfill) { + cache_bin_assert_empty(bin, info); + arr->ptr = cache_bin_empty_position_get(bin) - nfill; +} + +/* + * While nfill in cache_bin_init_ptr_array_for_fill is the number we *intend* to + * fill, nfilled here is the number we actually filled (which may be less, in + * case of OOM. + */ +static inline void +cache_bin_finish_fill(cache_bin_t *bin, cache_bin_info_t *info, + cache_bin_ptr_array_t *arr, cache_bin_sz_t nfilled) { + cache_bin_assert_empty(bin, info); + void **empty_position = cache_bin_empty_position_get(bin); + if (nfilled < arr->n) { + memmove(empty_position - nfilled, empty_position - arr->n, + nfilled * sizeof(void *)); + } + bin->stack_head = empty_position - nfilled; +} + +/* + * Same deal, but with flush. Unlike fill (which can fail), the user must flush + * everything we give them. + */ +static inline void +cache_bin_init_ptr_array_for_flush(cache_bin_t *bin, cache_bin_info_t *info, + cache_bin_ptr_array_t *arr, cache_bin_sz_t nflush) { + arr->ptr = cache_bin_empty_position_get(bin) - nflush; + assert(cache_bin_ncached_get_local(bin, info) == 0 + || *arr->ptr != NULL); +} + +static inline void +cache_bin_finish_flush(cache_bin_t *bin, cache_bin_info_t *info, + cache_bin_ptr_array_t *arr, cache_bin_sz_t nflushed) { + unsigned rem = cache_bin_ncached_get_local(bin, info) - nflushed; + memmove(bin->stack_head + nflushed, bin->stack_head, + rem * sizeof(void *)); + bin->stack_head = bin->stack_head + nflushed; + cache_bin_low_water_adjust(bin); +} + +static inline void +cache_bin_init_ptr_array_for_stashed(cache_bin_t *bin, szind_t binind, + cache_bin_info_t *info, cache_bin_ptr_array_t *arr, + cache_bin_sz_t nstashed) { + assert(nstashed > 0); + assert(cache_bin_nstashed_get_local(bin, info) == nstashed); + + void **low_bound = cache_bin_low_bound_get(bin, info); + arr->ptr = low_bound; + assert(*arr->ptr != NULL); +} + +static inline void +cache_bin_finish_flush_stashed(cache_bin_t *bin, cache_bin_info_t *info) { + void **low_bound = cache_bin_low_bound_get(bin, info); + + /* Reset the bin local full position. */ + bin->low_bits_full = (uint16_t)(uintptr_t)low_bound; + assert(cache_bin_nstashed_get_local(bin, info) == 0); +} + +/* + * Initialize a cache_bin_info to represent up to the given number of items in + * the cache_bins it is associated with. + */ +void cache_bin_info_init(cache_bin_info_t *bin_info, + cache_bin_sz_t ncached_max); +/* + * Given an array of initialized cache_bin_info_ts, determine how big an + * allocation is required to initialize a full set of cache_bin_ts. + */ +void cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos, + size_t *size, size_t *alignment); + +/* + * Actually initialize some cache bins. Callers should allocate the backing + * memory indicated by a call to cache_bin_compute_alloc. They should then + * preincrement, call init once for each bin and info, and then call + * cache_bin_postincrement. *alloc_cur will then point immediately past the end + * of the allocation. + */ +void cache_bin_preincrement(cache_bin_info_t *infos, szind_t ninfos, + void *alloc, size_t *cur_offset); +void cache_bin_postincrement(cache_bin_info_t *infos, szind_t ninfos, + void *alloc, size_t *cur_offset); +void cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc, + size_t *cur_offset); + +/* + * If a cache bin was zero initialized (either because it lives in static or + * thread-local storage, or was memset to 0), this function indicates whether or + * not cache_bin_init was called on it. + */ +bool cache_bin_still_zero_initialized(cache_bin_t *bin); + +#endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/ckh.h b/BeefRT/JEMalloc/include/jemalloc/internal/ckh.h new file mode 100644 index 00000000..7b3850bc --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/ckh.h @@ -0,0 +1,101 @@ +#ifndef JEMALLOC_INTERNAL_CKH_H +#define JEMALLOC_INTERNAL_CKH_H + +#include "jemalloc/internal/tsd.h" + +/* Cuckoo hashing implementation. Skip to the end for the interface. */ + +/******************************************************************************/ +/* INTERNAL DEFINITIONS -- IGNORE */ +/******************************************************************************/ + +/* Maintain counters used to get an idea of performance. */ +/* #define CKH_COUNT */ +/* Print counter values in ckh_delete() (requires CKH_COUNT). */ +/* #define CKH_VERBOSE */ + +/* + * There are 2^LG_CKH_BUCKET_CELLS cells in each hash table bucket. Try to fit + * one bucket per L1 cache line. + */ +#define LG_CKH_BUCKET_CELLS (LG_CACHELINE - LG_SIZEOF_PTR - 1) + +/* Typedefs to allow easy function pointer passing. */ +typedef void ckh_hash_t (const void *, size_t[2]); +typedef bool ckh_keycomp_t (const void *, const void *); + +/* Hash table cell. */ +typedef struct { + const void *key; + const void *data; +} ckhc_t; + +/* The hash table itself. */ +typedef struct { +#ifdef CKH_COUNT + /* Counters used to get an idea of performance. */ + uint64_t ngrows; + uint64_t nshrinks; + uint64_t nshrinkfails; + uint64_t ninserts; + uint64_t nrelocs; +#endif + + /* Used for pseudo-random number generation. */ + uint64_t prng_state; + + /* Total number of items. */ + size_t count; + + /* + * Minimum and current number of hash table buckets. There are + * 2^LG_CKH_BUCKET_CELLS cells per bucket. + */ + unsigned lg_minbuckets; + unsigned lg_curbuckets; + + /* Hash and comparison functions. */ + ckh_hash_t *hash; + ckh_keycomp_t *keycomp; + + /* Hash table with 2^lg_curbuckets buckets. */ + ckhc_t *tab; +} ckh_t; + +/******************************************************************************/ +/* BEGIN PUBLIC API */ +/******************************************************************************/ + +/* Lifetime management. Minitems is the initial capacity. */ +bool ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, + ckh_keycomp_t *keycomp); +void ckh_delete(tsd_t *tsd, ckh_t *ckh); + +/* Get the number of elements in the set. */ +size_t ckh_count(ckh_t *ckh); + +/* + * To iterate over the elements in the table, initialize *tabind to 0 and call + * this function until it returns true. Each call that returns false will + * update *key and *data to the next element in the table, assuming the pointers + * are non-NULL. + */ +bool ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data); + +/* + * Basic hash table operations -- insert, removal, lookup. For ckh_remove and + * ckh_search, key or data can be NULL. The hash-table only stores pointers to + * the key and value, and doesn't do any lifetime management. + */ +bool ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data); +bool ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key, + void **data); +bool ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data); + +/* Some useful hash and comparison functions for strings and pointers. */ +void ckh_string_hash(const void *key, size_t r_hash[2]); +bool ckh_string_keycomp(const void *k1, const void *k2); +void ckh_pointer_hash(const void *key, size_t r_hash[2]); +bool ckh_pointer_keycomp(const void *k1, const void *k2); + +#endif /* JEMALLOC_INTERNAL_CKH_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/counter.h b/BeefRT/JEMalloc/include/jemalloc/internal/counter.h new file mode 100644 index 00000000..79abf064 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/counter.h @@ -0,0 +1,34 @@ +#ifndef JEMALLOC_INTERNAL_COUNTER_H +#define JEMALLOC_INTERNAL_COUNTER_H + +#include "jemalloc/internal/mutex.h" + +typedef struct counter_accum_s { + LOCKEDINT_MTX_DECLARE(mtx) + locked_u64_t accumbytes; + uint64_t interval; +} counter_accum_t; + +JEMALLOC_ALWAYS_INLINE bool +counter_accum(tsdn_t *tsdn, counter_accum_t *counter, uint64_t bytes) { + uint64_t interval = counter->interval; + assert(interval > 0); + LOCKEDINT_MTX_LOCK(tsdn, counter->mtx); + /* + * If the event moves fast enough (and/or if the event handling is slow + * enough), extreme overflow can cause counter trigger coalescing. + * This is an intentional mechanism that avoids rate-limiting + * allocation. + */ + bool overflow = locked_inc_mod_u64(tsdn, LOCKEDINT_MTX(counter->mtx), + &counter->accumbytes, bytes, interval); + LOCKEDINT_MTX_UNLOCK(tsdn, counter->mtx); + return overflow; +} + +bool counter_accum_init(counter_accum_t *counter, uint64_t interval); +void counter_prefork(tsdn_t *tsdn, counter_accum_t *counter); +void counter_postfork_parent(tsdn_t *tsdn, counter_accum_t *counter); +void counter_postfork_child(tsdn_t *tsdn, counter_accum_t *counter); + +#endif /* JEMALLOC_INTERNAL_COUNTER_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/ctl.h b/BeefRT/JEMalloc/include/jemalloc/internal/ctl.h new file mode 100644 index 00000000..63d27f8a --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/ctl.h @@ -0,0 +1,159 @@ +#ifndef JEMALLOC_INTERNAL_CTL_H +#define JEMALLOC_INTERNAL_CTL_H + +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/mutex_prof.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/stats.h" + +/* Maximum ctl tree depth. */ +#define CTL_MAX_DEPTH 7 + +typedef struct ctl_node_s { + bool named; +} ctl_node_t; + +typedef struct ctl_named_node_s { + ctl_node_t node; + const char *name; + /* If (nchildren == 0), this is a terminal node. */ + size_t nchildren; + const ctl_node_t *children; + int (*ctl)(tsd_t *, const size_t *, size_t, void *, size_t *, void *, + size_t); +} ctl_named_node_t; + +typedef struct ctl_indexed_node_s { + struct ctl_node_s node; + const ctl_named_node_t *(*index)(tsdn_t *, const size_t *, size_t, + size_t); +} ctl_indexed_node_t; + +typedef struct ctl_arena_stats_s { + arena_stats_t astats; + + /* Aggregate stats for small size classes, based on bin stats. */ + size_t allocated_small; + uint64_t nmalloc_small; + uint64_t ndalloc_small; + uint64_t nrequests_small; + uint64_t nfills_small; + uint64_t nflushes_small; + + bin_stats_data_t bstats[SC_NBINS]; + arena_stats_large_t lstats[SC_NSIZES - SC_NBINS]; + pac_estats_t estats[SC_NPSIZES]; + hpa_shard_stats_t hpastats; + sec_stats_t secstats; +} ctl_arena_stats_t; + +typedef struct ctl_stats_s { + size_t allocated; + size_t active; + size_t metadata; + size_t metadata_thp; + size_t resident; + size_t mapped; + size_t retained; + + background_thread_stats_t background_thread; + mutex_prof_data_t mutex_prof_data[mutex_prof_num_global_mutexes]; +} ctl_stats_t; + +typedef struct ctl_arena_s ctl_arena_t; +struct ctl_arena_s { + unsigned arena_ind; + bool initialized; + ql_elm(ctl_arena_t) destroyed_link; + + /* Basic stats, supported even if !config_stats. */ + unsigned nthreads; + const char *dss; + ssize_t dirty_decay_ms; + ssize_t muzzy_decay_ms; + size_t pactive; + size_t pdirty; + size_t pmuzzy; + + /* NULL if !config_stats. */ + ctl_arena_stats_t *astats; +}; + +typedef struct ctl_arenas_s { + uint64_t epoch; + unsigned narenas; + ql_head(ctl_arena_t) destroyed; + + /* + * Element 0 corresponds to merged stats for extant arenas (accessed via + * MALLCTL_ARENAS_ALL), element 1 corresponds to merged stats for + * destroyed arenas (accessed via MALLCTL_ARENAS_DESTROYED), and the + * remaining MALLOCX_ARENA_LIMIT elements correspond to arenas. + */ + ctl_arena_t *arenas[2 + MALLOCX_ARENA_LIMIT]; +} ctl_arenas_t; + +int ctl_byname(tsd_t *tsd, const char *name, void *oldp, size_t *oldlenp, + void *newp, size_t newlen); +int ctl_nametomib(tsd_t *tsd, const char *name, size_t *mibp, size_t *miblenp); +int ctl_bymib(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen); +int ctl_mibnametomib(tsd_t *tsd, size_t *mib, size_t miblen, const char *name, + size_t *miblenp); +int ctl_bymibname(tsd_t *tsd, size_t *mib, size_t miblen, const char *name, + size_t *miblenp, void *oldp, size_t *oldlenp, void *newp, size_t newlen); +bool ctl_boot(void); +void ctl_prefork(tsdn_t *tsdn); +void ctl_postfork_parent(tsdn_t *tsdn); +void ctl_postfork_child(tsdn_t *tsdn); +void ctl_mtx_assert_held(tsdn_t *tsdn); + +#define xmallctl(name, oldp, oldlenp, newp, newlen) do { \ + if (je_mallctl(name, oldp, oldlenp, newp, newlen) \ + != 0) { \ + malloc_printf( \ + ": Failure in xmallctl(\"%s\", ...)\n", \ + name); \ + abort(); \ + } \ +} while (0) + +#define xmallctlnametomib(name, mibp, miblenp) do { \ + if (je_mallctlnametomib(name, mibp, miblenp) != 0) { \ + malloc_printf(": Failure in " \ + "xmallctlnametomib(\"%s\", ...)\n", name); \ + abort(); \ + } \ +} while (0) + +#define xmallctlbymib(mib, miblen, oldp, oldlenp, newp, newlen) do { \ + if (je_mallctlbymib(mib, miblen, oldp, oldlenp, newp, \ + newlen) != 0) { \ + malloc_write( \ + ": Failure in xmallctlbymib()\n"); \ + abort(); \ + } \ +} while (0) + +#define xmallctlmibnametomib(mib, miblen, name, miblenp) do { \ + if (ctl_mibnametomib(tsd_fetch(), mib, miblen, name, miblenp) \ + != 0) { \ + malloc_write( \ + ": Failure in ctl_mibnametomib()\n"); \ + abort(); \ + } \ +} while (0) + +#define xmallctlbymibname(mib, miblen, name, miblenp, oldp, oldlenp, \ + newp, newlen) do { \ + if (ctl_bymibname(tsd_fetch(), mib, miblen, name, miblenp, \ + oldp, oldlenp, newp, newlen) != 0) { \ + malloc_write( \ + ": Failure in ctl_bymibname()\n"); \ + abort(); \ + } \ +} while (0) + +#endif /* JEMALLOC_INTERNAL_CTL_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/decay.h b/BeefRT/JEMalloc/include/jemalloc/internal/decay.h new file mode 100644 index 00000000..cf6a9d22 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/decay.h @@ -0,0 +1,186 @@ +#ifndef JEMALLOC_INTERNAL_DECAY_H +#define JEMALLOC_INTERNAL_DECAY_H + +#include "jemalloc/internal/smoothstep.h" + +#define DECAY_UNBOUNDED_TIME_TO_PURGE ((uint64_t)-1) + +/* + * The decay_t computes the number of pages we should purge at any given time. + * Page allocators inform a decay object when pages enter a decay-able state + * (i.e. dirty or muzzy), and query it to determine how many pages should be + * purged at any given time. + * + * This is mostly a single-threaded data structure and doesn't care about + * synchronization at all; it's the caller's responsibility to manage their + * synchronization on their own. There are two exceptions: + * 1) It's OK to racily call decay_ms_read (i.e. just the simplest state query). + * 2) The mtx and purging fields live (and are initialized) here, but are + * logically owned by the page allocator. This is just a convenience (since + * those fields would be duplicated for both the dirty and muzzy states + * otherwise). + */ +typedef struct decay_s decay_t; +struct decay_s { + /* Synchronizes all non-atomic fields. */ + malloc_mutex_t mtx; + /* + * True if a thread is currently purging the extents associated with + * this decay structure. + */ + bool purging; + /* + * Approximate time in milliseconds from the creation of a set of unused + * dirty pages until an equivalent set of unused dirty pages is purged + * and/or reused. + */ + atomic_zd_t time_ms; + /* time / SMOOTHSTEP_NSTEPS. */ + nstime_t interval; + /* + * Time at which the current decay interval logically started. We do + * not actually advance to a new epoch until sometime after it starts + * because of scheduling and computation delays, and it is even possible + * to completely skip epochs. In all cases, during epoch advancement we + * merge all relevant activity into the most recently recorded epoch. + */ + nstime_t epoch; + /* Deadline randomness generator. */ + uint64_t jitter_state; + /* + * Deadline for current epoch. This is the sum of interval and per + * epoch jitter which is a uniform random variable in [0..interval). + * Epochs always advance by precise multiples of interval, but we + * randomize the deadline to reduce the likelihood of arenas purging in + * lockstep. + */ + nstime_t deadline; + /* + * The number of pages we cap ourselves at in the current epoch, per + * decay policies. Updated on an epoch change. After an epoch change, + * the caller should take steps to try to purge down to this amount. + */ + size_t npages_limit; + /* + * Number of unpurged pages at beginning of current epoch. During epoch + * advancement we use the delta between arena->decay_*.nunpurged and + * ecache_npages_get(&arena->ecache_*) to determine how many dirty pages, + * if any, were generated. + */ + size_t nunpurged; + /* + * Trailing log of how many unused dirty pages were generated during + * each of the past SMOOTHSTEP_NSTEPS decay epochs, where the last + * element is the most recent epoch. Corresponding epoch times are + * relative to epoch. + * + * Updated only on epoch advance, triggered by + * decay_maybe_advance_epoch, below. + */ + size_t backlog[SMOOTHSTEP_NSTEPS]; + + /* Peak number of pages in associated extents. Used for debug only. */ + uint64_t ceil_npages; +}; + +/* + * The current decay time setting. This is the only public access to a decay_t + * that's allowed without holding mtx. + */ +static inline ssize_t +decay_ms_read(const decay_t *decay) { + return atomic_load_zd(&decay->time_ms, ATOMIC_RELAXED); +} + +/* + * See the comment on the struct field -- the limit on pages we should allow in + * this decay state this epoch. + */ +static inline size_t +decay_npages_limit_get(const decay_t *decay) { + return decay->npages_limit; +} + +/* How many unused dirty pages were generated during the last epoch. */ +static inline size_t +decay_epoch_npages_delta(const decay_t *decay) { + return decay->backlog[SMOOTHSTEP_NSTEPS - 1]; +} + +/* + * Current epoch duration, in nanoseconds. Given that new epochs are started + * somewhat haphazardly, this is not necessarily exactly the time between any + * two calls to decay_maybe_advance_epoch; see the comments on fields in the + * decay_t. + */ +static inline uint64_t +decay_epoch_duration_ns(const decay_t *decay) { + return nstime_ns(&decay->interval); +} + +static inline bool +decay_immediately(const decay_t *decay) { + ssize_t decay_ms = decay_ms_read(decay); + return decay_ms == 0; +} + +static inline bool +decay_disabled(const decay_t *decay) { + ssize_t decay_ms = decay_ms_read(decay); + return decay_ms < 0; +} + +/* Returns true if decay is enabled and done gradually. */ +static inline bool +decay_gradually(const decay_t *decay) { + ssize_t decay_ms = decay_ms_read(decay); + return decay_ms > 0; +} + +/* + * Returns true if the passed in decay time setting is valid. + * < -1 : invalid + * -1 : never decay + * 0 : decay immediately + * > 0 : some positive decay time, up to a maximum allowed value of + * NSTIME_SEC_MAX * 1000, which corresponds to decaying somewhere in the early + * 27th century. By that time, we expect to have implemented alternate purging + * strategies. + */ +bool decay_ms_valid(ssize_t decay_ms); + +/* + * As a precondition, the decay_t must be zeroed out (as if with memset). + * + * Returns true on error. + */ +bool decay_init(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms); + +/* + * Given an already-initialized decay_t, reinitialize it with the given decay + * time. The decay_t must have previously been initialized (and should not then + * be zeroed). + */ +void decay_reinit(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms); + +/* + * Compute how many of 'npages_new' pages we would need to purge in 'time'. + */ +uint64_t decay_npages_purge_in(decay_t *decay, nstime_t *time, + size_t npages_new); + +/* Returns true if the epoch advanced and there are pages to purge. */ +bool decay_maybe_advance_epoch(decay_t *decay, nstime_t *new_time, + size_t current_npages); + +/* + * Calculates wait time until a number of pages in the interval + * [0.5 * npages_threshold .. 1.5 * npages_threshold] should be purged. + * + * Returns number of nanoseconds or DECAY_UNBOUNDED_TIME_TO_PURGE in case of + * indefinite wait. + */ +uint64_t decay_ns_until_purge(decay_t *decay, size_t npages_current, + uint64_t npages_threshold); + +#endif /* JEMALLOC_INTERNAL_DECAY_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/div.h b/BeefRT/JEMalloc/include/jemalloc/internal/div.h new file mode 100644 index 00000000..aebae939 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/div.h @@ -0,0 +1,41 @@ +#ifndef JEMALLOC_INTERNAL_DIV_H +#define JEMALLOC_INTERNAL_DIV_H + +#include "jemalloc/internal/assert.h" + +/* + * This module does the division that computes the index of a region in a slab, + * given its offset relative to the base. + * That is, given a divisor d, an n = i * d (all integers), we'll return i. + * We do some pre-computation to do this more quickly than a CPU division + * instruction. + * We bound n < 2^32, and don't support dividing by one. + */ + +typedef struct div_info_s div_info_t; +struct div_info_s { + uint32_t magic; +#ifdef JEMALLOC_DEBUG + size_t d; +#endif +}; + +void div_init(div_info_t *div_info, size_t divisor); + +static inline size_t +div_compute(div_info_t *div_info, size_t n) { + assert(n <= (uint32_t)-1); + /* + * This generates, e.g. mov; imul; shr on x86-64. On a 32-bit machine, + * the compilers I tried were all smart enough to turn this into the + * appropriate "get the high 32 bits of the result of a multiply" (e.g. + * mul; mov edx eax; on x86, umull on arm, etc.). + */ + size_t i = ((uint64_t)n * (uint64_t)div_info->magic) >> 32; +#ifdef JEMALLOC_DEBUG + assert(i * div_info->d == n); +#endif + return i; +} + +#endif /* JEMALLOC_INTERNAL_DIV_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/ecache.h b/BeefRT/JEMalloc/include/jemalloc/internal/ecache.h new file mode 100644 index 00000000..71cae3e3 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/ecache.h @@ -0,0 +1,55 @@ +#ifndef JEMALLOC_INTERNAL_ECACHE_H +#define JEMALLOC_INTERNAL_ECACHE_H + +#include "jemalloc/internal/eset.h" +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/mutex.h" + +typedef struct ecache_s ecache_t; +struct ecache_s { + malloc_mutex_t mtx; + eset_t eset; + eset_t guarded_eset; + /* All stored extents must be in the same state. */ + extent_state_t state; + /* The index of the ehooks the ecache is associated with. */ + unsigned ind; + /* + * If true, delay coalescing until eviction; otherwise coalesce during + * deallocation. + */ + bool delay_coalesce; +}; + +static inline size_t +ecache_npages_get(ecache_t *ecache) { + return eset_npages_get(&ecache->eset) + + eset_npages_get(&ecache->guarded_eset); +} + +/* Get the number of extents in the given page size index. */ +static inline size_t +ecache_nextents_get(ecache_t *ecache, pszind_t ind) { + return eset_nextents_get(&ecache->eset, ind) + + eset_nextents_get(&ecache->guarded_eset, ind); +} + +/* Get the sum total bytes of the extents in the given page size index. */ +static inline size_t +ecache_nbytes_get(ecache_t *ecache, pszind_t ind) { + return eset_nbytes_get(&ecache->eset, ind) + + eset_nbytes_get(&ecache->guarded_eset, ind); +} + +static inline unsigned +ecache_ind_get(ecache_t *ecache) { + return ecache->ind; +} + +bool ecache_init(tsdn_t *tsdn, ecache_t *ecache, extent_state_t state, + unsigned ind, bool delay_coalesce); +void ecache_prefork(tsdn_t *tsdn, ecache_t *ecache); +void ecache_postfork_parent(tsdn_t *tsdn, ecache_t *ecache); +void ecache_postfork_child(tsdn_t *tsdn, ecache_t *ecache); + +#endif /* JEMALLOC_INTERNAL_ECACHE_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/edata.h b/BeefRT/JEMalloc/include/jemalloc/internal/edata.h new file mode 100644 index 00000000..af039ea7 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/edata.h @@ -0,0 +1,698 @@ +#ifndef JEMALLOC_INTERNAL_EDATA_H +#define JEMALLOC_INTERNAL_EDATA_H + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/bin_info.h" +#include "jemalloc/internal/bit_util.h" +#include "jemalloc/internal/hpdata.h" +#include "jemalloc/internal/nstime.h" +#include "jemalloc/internal/ph.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/slab_data.h" +#include "jemalloc/internal/sz.h" +#include "jemalloc/internal/typed_list.h" + +/* + * sizeof(edata_t) is 128 bytes on 64-bit architectures. Ensure the alignment + * to free up the low bits in the rtree leaf. + */ +#define EDATA_ALIGNMENT 128 + +enum extent_state_e { + extent_state_active = 0, + extent_state_dirty = 1, + extent_state_muzzy = 2, + extent_state_retained = 3, + extent_state_transition = 4, /* States below are intermediate. */ + extent_state_merging = 5, + extent_state_max = 5 /* Sanity checking only. */ +}; +typedef enum extent_state_e extent_state_t; + +enum extent_head_state_e { + EXTENT_NOT_HEAD, + EXTENT_IS_HEAD /* See comments in ehooks_default_merge_impl(). */ +}; +typedef enum extent_head_state_e extent_head_state_t; + +/* + * Which implementation of the page allocator interface, (PAI, defined in + * pai.h) owns the given extent? + */ +enum extent_pai_e { + EXTENT_PAI_PAC = 0, + EXTENT_PAI_HPA = 1 +}; +typedef enum extent_pai_e extent_pai_t; + +struct e_prof_info_s { + /* Time when this was allocated. */ + nstime_t e_prof_alloc_time; + /* Allocation request size. */ + size_t e_prof_alloc_size; + /* Points to a prof_tctx_t. */ + atomic_p_t e_prof_tctx; + /* + * Points to a prof_recent_t for the allocation; NULL + * means the recent allocation record no longer exists. + * Protected by prof_recent_alloc_mtx. + */ + atomic_p_t e_prof_recent_alloc; +}; +typedef struct e_prof_info_s e_prof_info_t; + +/* + * The information about a particular edata that lives in an emap. Space is + * more precious there (the information, plus the edata pointer, has to live in + * a 64-bit word if we want to enable a packed representation. + * + * There are two things that are special about the information here: + * - It's quicker to access. You have one fewer pointer hop, since finding the + * edata_t associated with an item always requires accessing the rtree leaf in + * which this data is stored. + * - It can be read unsynchronized, and without worrying about lifetime issues. + */ +typedef struct edata_map_info_s edata_map_info_t; +struct edata_map_info_s { + bool slab; + szind_t szind; +}; + +typedef struct edata_cmp_summary_s edata_cmp_summary_t; +struct edata_cmp_summary_s { + uint64_t sn; + uintptr_t addr; +}; + +/* Extent (span of pages). Use accessor functions for e_* fields. */ +typedef struct edata_s edata_t; +ph_structs(edata_avail, edata_t); +ph_structs(edata_heap, edata_t); +struct edata_s { + /* + * Bitfield containing several fields: + * + * a: arena_ind + * b: slab + * c: committed + * p: pai + * z: zeroed + * g: guarded + * t: state + * i: szind + * f: nfree + * s: bin_shard + * + * 00000000 ... 0000ssss ssffffff ffffiiii iiiitttg zpcbaaaa aaaaaaaa + * + * arena_ind: Arena from which this extent came, or all 1 bits if + * unassociated. + * + * slab: The slab flag indicates whether the extent is used for a slab + * of small regions. This helps differentiate small size classes, + * and it indicates whether interior pointers can be looked up via + * iealloc(). + * + * committed: The committed flag indicates whether physical memory is + * committed to the extent, whether explicitly or implicitly + * as on a system that overcommits and satisfies physical + * memory needs on demand via soft page faults. + * + * pai: The pai flag is an extent_pai_t. + * + * zeroed: The zeroed flag is used by extent recycling code to track + * whether memory is zero-filled. + * + * guarded: The guarded flag is use by the sanitizer to track whether + * the extent has page guards around it. + * + * state: The state flag is an extent_state_t. + * + * szind: The szind flag indicates usable size class index for + * allocations residing in this extent, regardless of whether the + * extent is a slab. Extent size and usable size often differ + * even for non-slabs, either due to sz_large_pad or promotion of + * sampled small regions. + * + * nfree: Number of free regions in slab. + * + * bin_shard: the shard of the bin from which this extent came. + */ + uint64_t e_bits; +#define MASK(CURRENT_FIELD_WIDTH, CURRENT_FIELD_SHIFT) ((((((uint64_t)0x1U) << (CURRENT_FIELD_WIDTH)) - 1)) << (CURRENT_FIELD_SHIFT)) + +#define EDATA_BITS_ARENA_WIDTH MALLOCX_ARENA_BITS +#define EDATA_BITS_ARENA_SHIFT 0 +#define EDATA_BITS_ARENA_MASK MASK(EDATA_BITS_ARENA_WIDTH, EDATA_BITS_ARENA_SHIFT) + +#define EDATA_BITS_SLAB_WIDTH 1 +#define EDATA_BITS_SLAB_SHIFT (EDATA_BITS_ARENA_WIDTH + EDATA_BITS_ARENA_SHIFT) +#define EDATA_BITS_SLAB_MASK MASK(EDATA_BITS_SLAB_WIDTH, EDATA_BITS_SLAB_SHIFT) + +#define EDATA_BITS_COMMITTED_WIDTH 1 +#define EDATA_BITS_COMMITTED_SHIFT (EDATA_BITS_SLAB_WIDTH + EDATA_BITS_SLAB_SHIFT) +#define EDATA_BITS_COMMITTED_MASK MASK(EDATA_BITS_COMMITTED_WIDTH, EDATA_BITS_COMMITTED_SHIFT) + +#define EDATA_BITS_PAI_WIDTH 1 +#define EDATA_BITS_PAI_SHIFT (EDATA_BITS_COMMITTED_WIDTH + EDATA_BITS_COMMITTED_SHIFT) +#define EDATA_BITS_PAI_MASK MASK(EDATA_BITS_PAI_WIDTH, EDATA_BITS_PAI_SHIFT) + +#define EDATA_BITS_ZEROED_WIDTH 1 +#define EDATA_BITS_ZEROED_SHIFT (EDATA_BITS_PAI_WIDTH + EDATA_BITS_PAI_SHIFT) +#define EDATA_BITS_ZEROED_MASK MASK(EDATA_BITS_ZEROED_WIDTH, EDATA_BITS_ZEROED_SHIFT) + +#define EDATA_BITS_GUARDED_WIDTH 1 +#define EDATA_BITS_GUARDED_SHIFT (EDATA_BITS_ZEROED_WIDTH + EDATA_BITS_ZEROED_SHIFT) +#define EDATA_BITS_GUARDED_MASK MASK(EDATA_BITS_GUARDED_WIDTH, EDATA_BITS_GUARDED_SHIFT) + +#define EDATA_BITS_STATE_WIDTH 3 +#define EDATA_BITS_STATE_SHIFT (EDATA_BITS_GUARDED_WIDTH + EDATA_BITS_GUARDED_SHIFT) +#define EDATA_BITS_STATE_MASK MASK(EDATA_BITS_STATE_WIDTH, EDATA_BITS_STATE_SHIFT) + +#define EDATA_BITS_SZIND_WIDTH LG_CEIL(SC_NSIZES) +#define EDATA_BITS_SZIND_SHIFT (EDATA_BITS_STATE_WIDTH + EDATA_BITS_STATE_SHIFT) +#define EDATA_BITS_SZIND_MASK MASK(EDATA_BITS_SZIND_WIDTH, EDATA_BITS_SZIND_SHIFT) + +#define EDATA_BITS_NFREE_WIDTH (SC_LG_SLAB_MAXREGS + 1) +#define EDATA_BITS_NFREE_SHIFT (EDATA_BITS_SZIND_WIDTH + EDATA_BITS_SZIND_SHIFT) +#define EDATA_BITS_NFREE_MASK MASK(EDATA_BITS_NFREE_WIDTH, EDATA_BITS_NFREE_SHIFT) + +#define EDATA_BITS_BINSHARD_WIDTH 6 +#define EDATA_BITS_BINSHARD_SHIFT (EDATA_BITS_NFREE_WIDTH + EDATA_BITS_NFREE_SHIFT) +#define EDATA_BITS_BINSHARD_MASK MASK(EDATA_BITS_BINSHARD_WIDTH, EDATA_BITS_BINSHARD_SHIFT) + +#define EDATA_BITS_IS_HEAD_WIDTH 1 +#define EDATA_BITS_IS_HEAD_SHIFT (EDATA_BITS_BINSHARD_WIDTH + EDATA_BITS_BINSHARD_SHIFT) +#define EDATA_BITS_IS_HEAD_MASK MASK(EDATA_BITS_IS_HEAD_WIDTH, EDATA_BITS_IS_HEAD_SHIFT) + + /* Pointer to the extent that this structure is responsible for. */ + void *e_addr; + + union { + /* + * Extent size and serial number associated with the extent + * structure (different than the serial number for the extent at + * e_addr). + * + * ssssssss [...] ssssssss ssssnnnn nnnnnnnn + */ + size_t e_size_esn; + #define EDATA_SIZE_MASK ((size_t)~(PAGE-1)) + #define EDATA_ESN_MASK ((size_t)PAGE-1) + /* Base extent size, which may not be a multiple of PAGE. */ + size_t e_bsize; + }; + + /* + * If this edata is a user allocation from an HPA, it comes out of some + * pageslab (we don't yet support huegpage allocations that don't fit + * into pageslabs). This tracks it. + */ + hpdata_t *e_ps; + + /* + * Serial number. These are not necessarily unique; splitting an extent + * results in two extents with the same serial number. + */ + uint64_t e_sn; + + union { + /* + * List linkage used when the edata_t is active; either in + * arena's large allocations or bin_t's slabs_full. + */ + ql_elm(edata_t) ql_link_active; + /* + * Pairing heap linkage. Used whenever the extent is inactive + * (in the page allocators), or when it is active and in + * slabs_nonfull, or when the edata_t is unassociated with an + * extent and sitting in an edata_cache. + */ + union { + edata_heap_link_t heap_link; + edata_avail_link_t avail_link; + }; + }; + + union { + /* + * List linkage used when the extent is inactive: + * - Stashed dirty extents + * - Ecache LRU functionality. + */ + ql_elm(edata_t) ql_link_inactive; + /* Small region slab metadata. */ + slab_data_t e_slab_data; + + /* Profiling data, used for large objects. */ + e_prof_info_t e_prof_info; + }; +}; + +TYPED_LIST(edata_list_active, edata_t, ql_link_active) +TYPED_LIST(edata_list_inactive, edata_t, ql_link_inactive) + +static inline unsigned +edata_arena_ind_get(const edata_t *edata) { + unsigned arena_ind = (unsigned)((edata->e_bits & + EDATA_BITS_ARENA_MASK) >> EDATA_BITS_ARENA_SHIFT); + assert(arena_ind < MALLOCX_ARENA_LIMIT); + + return arena_ind; +} + +static inline szind_t +edata_szind_get_maybe_invalid(const edata_t *edata) { + szind_t szind = (szind_t)((edata->e_bits & EDATA_BITS_SZIND_MASK) >> + EDATA_BITS_SZIND_SHIFT); + assert(szind <= SC_NSIZES); + return szind; +} + +static inline szind_t +edata_szind_get(const edata_t *edata) { + szind_t szind = edata_szind_get_maybe_invalid(edata); + assert(szind < SC_NSIZES); /* Never call when "invalid". */ + return szind; +} + +static inline size_t +edata_usize_get(const edata_t *edata) { + return sz_index2size(edata_szind_get(edata)); +} + +static inline unsigned +edata_binshard_get(const edata_t *edata) { + unsigned binshard = (unsigned)((edata->e_bits & + EDATA_BITS_BINSHARD_MASK) >> EDATA_BITS_BINSHARD_SHIFT); + assert(binshard < bin_infos[edata_szind_get(edata)].n_shards); + return binshard; +} + +static inline uint64_t +edata_sn_get(const edata_t *edata) { + return edata->e_sn; +} + +static inline extent_state_t +edata_state_get(const edata_t *edata) { + return (extent_state_t)((edata->e_bits & EDATA_BITS_STATE_MASK) >> + EDATA_BITS_STATE_SHIFT); +} + +static inline bool +edata_guarded_get(const edata_t *edata) { + return (bool)((edata->e_bits & EDATA_BITS_GUARDED_MASK) >> + EDATA_BITS_GUARDED_SHIFT); +} + +static inline bool +edata_zeroed_get(const edata_t *edata) { + return (bool)((edata->e_bits & EDATA_BITS_ZEROED_MASK) >> + EDATA_BITS_ZEROED_SHIFT); +} + +static inline bool +edata_committed_get(const edata_t *edata) { + return (bool)((edata->e_bits & EDATA_BITS_COMMITTED_MASK) >> + EDATA_BITS_COMMITTED_SHIFT); +} + +static inline extent_pai_t +edata_pai_get(const edata_t *edata) { + return (extent_pai_t)((edata->e_bits & EDATA_BITS_PAI_MASK) >> + EDATA_BITS_PAI_SHIFT); +} + +static inline bool +edata_slab_get(const edata_t *edata) { + return (bool)((edata->e_bits & EDATA_BITS_SLAB_MASK) >> + EDATA_BITS_SLAB_SHIFT); +} + +static inline unsigned +edata_nfree_get(const edata_t *edata) { + assert(edata_slab_get(edata)); + return (unsigned)((edata->e_bits & EDATA_BITS_NFREE_MASK) >> + EDATA_BITS_NFREE_SHIFT); +} + +static inline void * +edata_base_get(const edata_t *edata) { + assert(edata->e_addr == PAGE_ADDR2BASE(edata->e_addr) || + !edata_slab_get(edata)); + return PAGE_ADDR2BASE(edata->e_addr); +} + +static inline void * +edata_addr_get(const edata_t *edata) { + assert(edata->e_addr == PAGE_ADDR2BASE(edata->e_addr) || + !edata_slab_get(edata)); + return edata->e_addr; +} + +static inline size_t +edata_size_get(const edata_t *edata) { + return (edata->e_size_esn & EDATA_SIZE_MASK); +} + +static inline size_t +edata_esn_get(const edata_t *edata) { + return (edata->e_size_esn & EDATA_ESN_MASK); +} + +static inline size_t +edata_bsize_get(const edata_t *edata) { + return edata->e_bsize; +} + +static inline hpdata_t * +edata_ps_get(const edata_t *edata) { + assert(edata_pai_get(edata) == EXTENT_PAI_HPA); + return edata->e_ps; +} + +static inline void * +edata_before_get(const edata_t *edata) { + return (void *)((uintptr_t)edata_base_get(edata) - PAGE); +} + +static inline void * +edata_last_get(const edata_t *edata) { + return (void *)((uintptr_t)edata_base_get(edata) + + edata_size_get(edata) - PAGE); +} + +static inline void * +edata_past_get(const edata_t *edata) { + return (void *)((uintptr_t)edata_base_get(edata) + + edata_size_get(edata)); +} + +static inline slab_data_t * +edata_slab_data_get(edata_t *edata) { + assert(edata_slab_get(edata)); + return &edata->e_slab_data; +} + +static inline const slab_data_t * +edata_slab_data_get_const(const edata_t *edata) { + assert(edata_slab_get(edata)); + return &edata->e_slab_data; +} + +static inline prof_tctx_t * +edata_prof_tctx_get(const edata_t *edata) { + return (prof_tctx_t *)atomic_load_p(&edata->e_prof_info.e_prof_tctx, + ATOMIC_ACQUIRE); +} + +static inline const nstime_t * +edata_prof_alloc_time_get(const edata_t *edata) { + return &edata->e_prof_info.e_prof_alloc_time; +} + +static inline size_t +edata_prof_alloc_size_get(const edata_t *edata) { + return edata->e_prof_info.e_prof_alloc_size; +} + +static inline prof_recent_t * +edata_prof_recent_alloc_get_dont_call_directly(const edata_t *edata) { + return (prof_recent_t *)atomic_load_p( + &edata->e_prof_info.e_prof_recent_alloc, ATOMIC_RELAXED); +} + +static inline void +edata_arena_ind_set(edata_t *edata, unsigned arena_ind) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_ARENA_MASK) | + ((uint64_t)arena_ind << EDATA_BITS_ARENA_SHIFT); +} + +static inline void +edata_binshard_set(edata_t *edata, unsigned binshard) { + /* The assertion assumes szind is set already. */ + assert(binshard < bin_infos[edata_szind_get(edata)].n_shards); + edata->e_bits = (edata->e_bits & ~EDATA_BITS_BINSHARD_MASK) | + ((uint64_t)binshard << EDATA_BITS_BINSHARD_SHIFT); +} + +static inline void +edata_addr_set(edata_t *edata, void *addr) { + edata->e_addr = addr; +} + +static inline void +edata_size_set(edata_t *edata, size_t size) { + assert((size & ~EDATA_SIZE_MASK) == 0); + edata->e_size_esn = size | (edata->e_size_esn & ~EDATA_SIZE_MASK); +} + +static inline void +edata_esn_set(edata_t *edata, size_t esn) { + edata->e_size_esn = (edata->e_size_esn & ~EDATA_ESN_MASK) | (esn & + EDATA_ESN_MASK); +} + +static inline void +edata_bsize_set(edata_t *edata, size_t bsize) { + edata->e_bsize = bsize; +} + +static inline void +edata_ps_set(edata_t *edata, hpdata_t *ps) { + assert(edata_pai_get(edata) == EXTENT_PAI_HPA); + edata->e_ps = ps; +} + +static inline void +edata_szind_set(edata_t *edata, szind_t szind) { + assert(szind <= SC_NSIZES); /* SC_NSIZES means "invalid". */ + edata->e_bits = (edata->e_bits & ~EDATA_BITS_SZIND_MASK) | + ((uint64_t)szind << EDATA_BITS_SZIND_SHIFT); +} + +static inline void +edata_nfree_set(edata_t *edata, unsigned nfree) { + assert(edata_slab_get(edata)); + edata->e_bits = (edata->e_bits & ~EDATA_BITS_NFREE_MASK) | + ((uint64_t)nfree << EDATA_BITS_NFREE_SHIFT); +} + +static inline void +edata_nfree_binshard_set(edata_t *edata, unsigned nfree, unsigned binshard) { + /* The assertion assumes szind is set already. */ + assert(binshard < bin_infos[edata_szind_get(edata)].n_shards); + edata->e_bits = (edata->e_bits & + (~EDATA_BITS_NFREE_MASK & ~EDATA_BITS_BINSHARD_MASK)) | + ((uint64_t)binshard << EDATA_BITS_BINSHARD_SHIFT) | + ((uint64_t)nfree << EDATA_BITS_NFREE_SHIFT); +} + +static inline void +edata_nfree_inc(edata_t *edata) { + assert(edata_slab_get(edata)); + edata->e_bits += ((uint64_t)1U << EDATA_BITS_NFREE_SHIFT); +} + +static inline void +edata_nfree_dec(edata_t *edata) { + assert(edata_slab_get(edata)); + edata->e_bits -= ((uint64_t)1U << EDATA_BITS_NFREE_SHIFT); +} + +static inline void +edata_nfree_sub(edata_t *edata, uint64_t n) { + assert(edata_slab_get(edata)); + edata->e_bits -= (n << EDATA_BITS_NFREE_SHIFT); +} + +static inline void +edata_sn_set(edata_t *edata, uint64_t sn) { + edata->e_sn = sn; +} + +static inline void +edata_state_set(edata_t *edata, extent_state_t state) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_STATE_MASK) | + ((uint64_t)state << EDATA_BITS_STATE_SHIFT); +} + +static inline void +edata_guarded_set(edata_t *edata, bool guarded) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_GUARDED_MASK) | + ((uint64_t)guarded << EDATA_BITS_GUARDED_SHIFT); +} + +static inline void +edata_zeroed_set(edata_t *edata, bool zeroed) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_ZEROED_MASK) | + ((uint64_t)zeroed << EDATA_BITS_ZEROED_SHIFT); +} + +static inline void +edata_committed_set(edata_t *edata, bool committed) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_COMMITTED_MASK) | + ((uint64_t)committed << EDATA_BITS_COMMITTED_SHIFT); +} + +static inline void +edata_pai_set(edata_t *edata, extent_pai_t pai) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_PAI_MASK) | + ((uint64_t)pai << EDATA_BITS_PAI_SHIFT); +} + +static inline void +edata_slab_set(edata_t *edata, bool slab) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_SLAB_MASK) | + ((uint64_t)slab << EDATA_BITS_SLAB_SHIFT); +} + +static inline void +edata_prof_tctx_set(edata_t *edata, prof_tctx_t *tctx) { + atomic_store_p(&edata->e_prof_info.e_prof_tctx, tctx, ATOMIC_RELEASE); +} + +static inline void +edata_prof_alloc_time_set(edata_t *edata, nstime_t *t) { + nstime_copy(&edata->e_prof_info.e_prof_alloc_time, t); +} + +static inline void +edata_prof_alloc_size_set(edata_t *edata, size_t size) { + edata->e_prof_info.e_prof_alloc_size = size; +} + +static inline void +edata_prof_recent_alloc_set_dont_call_directly(edata_t *edata, + prof_recent_t *recent_alloc) { + atomic_store_p(&edata->e_prof_info.e_prof_recent_alloc, recent_alloc, + ATOMIC_RELAXED); +} + +static inline bool +edata_is_head_get(edata_t *edata) { + return (bool)((edata->e_bits & EDATA_BITS_IS_HEAD_MASK) >> + EDATA_BITS_IS_HEAD_SHIFT); +} + +static inline void +edata_is_head_set(edata_t *edata, bool is_head) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_IS_HEAD_MASK) | + ((uint64_t)is_head << EDATA_BITS_IS_HEAD_SHIFT); +} + +static inline bool +edata_state_in_transition(extent_state_t state) { + return state >= extent_state_transition; +} + +/* + * Because this function is implemented as a sequence of bitfield modifications, + * even though each individual bit is properly initialized, we technically read + * uninitialized data within it. This is mostly fine, since most callers get + * their edatas from zeroing sources, but callers who make stack edata_ts need + * to manually zero them. + */ +static inline void +edata_init(edata_t *edata, unsigned arena_ind, void *addr, size_t size, + bool slab, szind_t szind, uint64_t sn, extent_state_t state, bool zeroed, + bool committed, extent_pai_t pai, extent_head_state_t is_head) { + assert(addr == PAGE_ADDR2BASE(addr) || !slab); + + edata_arena_ind_set(edata, arena_ind); + edata_addr_set(edata, addr); + edata_size_set(edata, size); + edata_slab_set(edata, slab); + edata_szind_set(edata, szind); + edata_sn_set(edata, sn); + edata_state_set(edata, state); + edata_guarded_set(edata, false); + edata_zeroed_set(edata, zeroed); + edata_committed_set(edata, committed); + edata_pai_set(edata, pai); + edata_is_head_set(edata, is_head == EXTENT_IS_HEAD); + if (config_prof) { + edata_prof_tctx_set(edata, NULL); + } +} + +static inline void +edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn) { + edata_arena_ind_set(edata, (1U << MALLOCX_ARENA_BITS) - 1); + edata_addr_set(edata, addr); + edata_bsize_set(edata, bsize); + edata_slab_set(edata, false); + edata_szind_set(edata, SC_NSIZES); + edata_sn_set(edata, sn); + edata_state_set(edata, extent_state_active); + edata_guarded_set(edata, false); + edata_zeroed_set(edata, true); + edata_committed_set(edata, true); + /* + * This isn't strictly true, but base allocated extents never get + * deallocated and can't be looked up in the emap, but no sense in + * wasting a state bit to encode this fact. + */ + edata_pai_set(edata, EXTENT_PAI_PAC); +} + +static inline int +edata_esn_comp(const edata_t *a, const edata_t *b) { + size_t a_esn = edata_esn_get(a); + size_t b_esn = edata_esn_get(b); + + return (a_esn > b_esn) - (a_esn < b_esn); +} + +static inline int +edata_ead_comp(const edata_t *a, const edata_t *b) { + uintptr_t a_eaddr = (uintptr_t)a; + uintptr_t b_eaddr = (uintptr_t)b; + + return (a_eaddr > b_eaddr) - (a_eaddr < b_eaddr); +} + +static inline edata_cmp_summary_t +edata_cmp_summary_get(const edata_t *edata) { + return (edata_cmp_summary_t){edata_sn_get(edata), + (uintptr_t)edata_addr_get(edata)}; +} + +static inline int +edata_cmp_summary_comp(edata_cmp_summary_t a, edata_cmp_summary_t b) { + int ret; + ret = (a.sn > b.sn) - (a.sn < b.sn); + if (ret != 0) { + return ret; + } + ret = (a.addr > b.addr) - (a.addr < b.addr); + return ret; +} + +static inline int +edata_snad_comp(const edata_t *a, const edata_t *b) { + edata_cmp_summary_t a_cmp = edata_cmp_summary_get(a); + edata_cmp_summary_t b_cmp = edata_cmp_summary_get(b); + + return edata_cmp_summary_comp(a_cmp, b_cmp); +} + +static inline int +edata_esnead_comp(const edata_t *a, const edata_t *b) { + int ret; + + ret = edata_esn_comp(a, b); + if (ret != 0) { + return ret; + } + + ret = edata_ead_comp(a, b); + return ret; +} + +ph_proto(, edata_avail, edata_t) +ph_proto(, edata_heap, edata_t) + +#endif /* JEMALLOC_INTERNAL_EDATA_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/edata_cache.h b/BeefRT/JEMalloc/include/jemalloc/internal/edata_cache.h new file mode 100644 index 00000000..8b6c0ef7 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/edata_cache.h @@ -0,0 +1,49 @@ +#ifndef JEMALLOC_INTERNAL_EDATA_CACHE_H +#define JEMALLOC_INTERNAL_EDATA_CACHE_H + +#include "jemalloc/internal/base.h" + +/* For tests only. */ +#define EDATA_CACHE_FAST_FILL 4 + +/* + * A cache of edata_t structures allocated via base_alloc_edata (as opposed to + * the underlying extents they describe). The contents of returned edata_t + * objects are garbage and cannot be relied upon. + */ + +typedef struct edata_cache_s edata_cache_t; +struct edata_cache_s { + edata_avail_t avail; + atomic_zu_t count; + malloc_mutex_t mtx; + base_t *base; +}; + +bool edata_cache_init(edata_cache_t *edata_cache, base_t *base); +edata_t *edata_cache_get(tsdn_t *tsdn, edata_cache_t *edata_cache); +void edata_cache_put(tsdn_t *tsdn, edata_cache_t *edata_cache, edata_t *edata); + +void edata_cache_prefork(tsdn_t *tsdn, edata_cache_t *edata_cache); +void edata_cache_postfork_parent(tsdn_t *tsdn, edata_cache_t *edata_cache); +void edata_cache_postfork_child(tsdn_t *tsdn, edata_cache_t *edata_cache); + +/* + * An edata_cache_small is like an edata_cache, but it relies on external + * synchronization and avoids first-fit strategies. + */ + +typedef struct edata_cache_fast_s edata_cache_fast_t; +struct edata_cache_fast_s { + edata_list_inactive_t list; + edata_cache_t *fallback; + bool disabled; +}; + +void edata_cache_fast_init(edata_cache_fast_t *ecs, edata_cache_t *fallback); +edata_t *edata_cache_fast_get(tsdn_t *tsdn, edata_cache_fast_t *ecs); +void edata_cache_fast_put(tsdn_t *tsdn, edata_cache_fast_t *ecs, + edata_t *edata); +void edata_cache_fast_disable(tsdn_t *tsdn, edata_cache_fast_t *ecs); + +#endif /* JEMALLOC_INTERNAL_EDATA_CACHE_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/ehooks.h b/BeefRT/JEMalloc/include/jemalloc/internal/ehooks.h new file mode 100644 index 00000000..8d9513e2 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/ehooks.h @@ -0,0 +1,412 @@ +#ifndef JEMALLOC_INTERNAL_EHOOKS_H +#define JEMALLOC_INTERNAL_EHOOKS_H + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/extent_mmap.h" + +/* + * This module is the internal interface to the extent hooks (both + * user-specified and external). Eventually, this will give us the flexibility + * to use multiple different versions of user-visible extent-hook APIs under a + * single user interface. + * + * Current API expansions (not available to anyone but the default hooks yet): + * - Head state tracking. Hooks can decide whether or not to merge two + * extents based on whether or not one of them is the head (i.e. was + * allocated on its own). The later extent loses its "head" status. + */ + +extern const extent_hooks_t ehooks_default_extent_hooks; + +typedef struct ehooks_s ehooks_t; +struct ehooks_s { + /* + * The user-visible id that goes with the ehooks (i.e. that of the base + * they're a part of, the associated arena's index within the arenas + * array). + */ + unsigned ind; + /* Logically an extent_hooks_t *. */ + atomic_p_t ptr; +}; + +extern const extent_hooks_t ehooks_default_extent_hooks; + +/* + * These are not really part of the public API. Each hook has a fast-path for + * the default-hooks case that can avoid various small inefficiencies: + * - Forgetting tsd and then calling tsd_get within the hook. + * - Getting more state than necessary out of the extent_t. + * - Doing arena_ind -> arena -> arena_ind lookups. + * By making the calls to these functions visible to the compiler, it can move + * those extra bits of computation down below the fast-paths where they get ignored. + */ +void *ehooks_default_alloc_impl(tsdn_t *tsdn, void *new_addr, size_t size, + size_t alignment, bool *zero, bool *commit, unsigned arena_ind); +bool ehooks_default_dalloc_impl(void *addr, size_t size); +void ehooks_default_destroy_impl(void *addr, size_t size); +bool ehooks_default_commit_impl(void *addr, size_t offset, size_t length); +bool ehooks_default_decommit_impl(void *addr, size_t offset, size_t length); +#ifdef PAGES_CAN_PURGE_LAZY +bool ehooks_default_purge_lazy_impl(void *addr, size_t offset, size_t length); +#endif +#ifdef PAGES_CAN_PURGE_FORCED +bool ehooks_default_purge_forced_impl(void *addr, size_t offset, size_t length); +#endif +bool ehooks_default_split_impl(); +/* + * Merge is the only default extent hook we declare -- see the comment in + * ehooks_merge. + */ +bool ehooks_default_merge(extent_hooks_t *extent_hooks, void *addr_a, + size_t size_a, void *addr_b, size_t size_b, bool committed, + unsigned arena_ind); +bool ehooks_default_merge_impl(tsdn_t *tsdn, void *addr_a, void *addr_b); +void ehooks_default_zero_impl(void *addr, size_t size); +void ehooks_default_guard_impl(void *guard1, void *guard2); +void ehooks_default_unguard_impl(void *guard1, void *guard2); + +/* + * We don't officially support reentrancy from wtihin the extent hooks. But + * various people who sit within throwing distance of the jemalloc team want + * that functionality in certain limited cases. The default reentrancy guards + * assert that we're not reentrant from a0 (since it's the bootstrap arena, + * where reentrant allocations would be redirected), which we would incorrectly + * trigger in cases where a0 has extent hooks (those hooks themselves can't be + * reentrant, then, but there are reasonable uses for such functionality, like + * putting internal metadata on hugepages). Therefore, we use the raw + * reentrancy guards. + * + * Eventually, we need to think more carefully about whether and where we + * support allocating from within extent hooks (and what that means for things + * like profiling, stats collection, etc.), and document what the guarantee is. + */ +static inline void +ehooks_pre_reentrancy(tsdn_t *tsdn) { + tsd_t *tsd = tsdn_null(tsdn) ? tsd_fetch() : tsdn_tsd(tsdn); + tsd_pre_reentrancy_raw(tsd); +} + +static inline void +ehooks_post_reentrancy(tsdn_t *tsdn) { + tsd_t *tsd = tsdn_null(tsdn) ? tsd_fetch() : tsdn_tsd(tsdn); + tsd_post_reentrancy_raw(tsd); +} + +/* Beginning of the public API. */ +void ehooks_init(ehooks_t *ehooks, extent_hooks_t *extent_hooks, unsigned ind); + +static inline unsigned +ehooks_ind_get(const ehooks_t *ehooks) { + return ehooks->ind; +} + +static inline void +ehooks_set_extent_hooks_ptr(ehooks_t *ehooks, extent_hooks_t *extent_hooks) { + atomic_store_p(&ehooks->ptr, extent_hooks, ATOMIC_RELEASE); +} + +static inline extent_hooks_t * +ehooks_get_extent_hooks_ptr(ehooks_t *ehooks) { + return (extent_hooks_t *)atomic_load_p(&ehooks->ptr, ATOMIC_ACQUIRE); +} + +static inline bool +ehooks_are_default(ehooks_t *ehooks) { + return ehooks_get_extent_hooks_ptr(ehooks) == + &ehooks_default_extent_hooks; +} + +/* + * In some cases, a caller needs to allocate resources before attempting to call + * a hook. If that hook is doomed to fail, this is wasteful. We therefore + * include some checks for such cases. + */ +static inline bool +ehooks_dalloc_will_fail(ehooks_t *ehooks) { + if (ehooks_are_default(ehooks)) { + return opt_retain; + } else { + return ehooks_get_extent_hooks_ptr(ehooks)->dalloc == NULL; + } +} + +static inline bool +ehooks_split_will_fail(ehooks_t *ehooks) { + return ehooks_get_extent_hooks_ptr(ehooks)->split == NULL; +} + +static inline bool +ehooks_merge_will_fail(ehooks_t *ehooks) { + return ehooks_get_extent_hooks_ptr(ehooks)->merge == NULL; +} + +static inline bool +ehooks_guard_will_fail(ehooks_t *ehooks) { + /* + * Before the guard hooks are officially introduced, limit the use to + * the default hooks only. + */ + return !ehooks_are_default(ehooks); +} + +/* + * Some hooks are required to return zeroed memory in certain situations. In + * debug mode, we do some heuristic checks that they did what they were supposed + * to. + * + * This isn't really ehooks-specific (i.e. anyone can check for zeroed memory). + * But incorrect zero information indicates an ehook bug. + */ +static inline void +ehooks_debug_zero_check(void *addr, size_t size) { + assert(((uintptr_t)addr & PAGE_MASK) == 0); + assert((size & PAGE_MASK) == 0); + assert(size > 0); + if (config_debug) { + /* Check the whole first page. */ + size_t *p = (size_t *)addr; + for (size_t i = 0; i < PAGE / sizeof(size_t); i++) { + assert(p[i] == 0); + } + /* + * And 4 spots within. There's a tradeoff here; the larger + * this number, the more likely it is that we'll catch a bug + * where ehooks return a sparsely non-zero range. But + * increasing the number of checks also increases the number of + * page faults in debug mode. FreeBSD does much of their + * day-to-day development work in debug mode, so we don't want + * even the debug builds to be too slow. + */ + const size_t nchecks = 4; + assert(PAGE >= sizeof(size_t) * nchecks); + for (size_t i = 0; i < nchecks; ++i) { + assert(p[i * (size / sizeof(size_t) / nchecks)] == 0); + } + } +} + + +static inline void * +ehooks_alloc(tsdn_t *tsdn, ehooks_t *ehooks, void *new_addr, size_t size, + size_t alignment, bool *zero, bool *commit) { + bool orig_zero = *zero; + void *ret; + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + if (extent_hooks == &ehooks_default_extent_hooks) { + ret = ehooks_default_alloc_impl(tsdn, new_addr, size, + alignment, zero, commit, ehooks_ind_get(ehooks)); + } else { + ehooks_pre_reentrancy(tsdn); + ret = extent_hooks->alloc(extent_hooks, new_addr, size, + alignment, zero, commit, ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + } + assert(new_addr == NULL || ret == NULL || new_addr == ret); + assert(!orig_zero || *zero); + if (*zero && ret != NULL) { + ehooks_debug_zero_check(ret, size); + } + return ret; +} + +static inline bool +ehooks_dalloc(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, + bool committed) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + if (extent_hooks == &ehooks_default_extent_hooks) { + return ehooks_default_dalloc_impl(addr, size); + } else if (extent_hooks->dalloc == NULL) { + return true; + } else { + ehooks_pre_reentrancy(tsdn); + bool err = extent_hooks->dalloc(extent_hooks, addr, size, + committed, ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + return err; + } +} + +static inline void +ehooks_destroy(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, + bool committed) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + if (extent_hooks == &ehooks_default_extent_hooks) { + ehooks_default_destroy_impl(addr, size); + } else if (extent_hooks->destroy == NULL) { + /* Do nothing. */ + } else { + ehooks_pre_reentrancy(tsdn); + extent_hooks->destroy(extent_hooks, addr, size, committed, + ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + } +} + +static inline bool +ehooks_commit(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, + size_t offset, size_t length) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + bool err; + if (extent_hooks == &ehooks_default_extent_hooks) { + err = ehooks_default_commit_impl(addr, offset, length); + } else if (extent_hooks->commit == NULL) { + err = true; + } else { + ehooks_pre_reentrancy(tsdn); + err = extent_hooks->commit(extent_hooks, addr, size, + offset, length, ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + } + if (!err) { + ehooks_debug_zero_check(addr, size); + } + return err; +} + +static inline bool +ehooks_decommit(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, + size_t offset, size_t length) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + if (extent_hooks == &ehooks_default_extent_hooks) { + return ehooks_default_decommit_impl(addr, offset, length); + } else if (extent_hooks->decommit == NULL) { + return true; + } else { + ehooks_pre_reentrancy(tsdn); + bool err = extent_hooks->decommit(extent_hooks, addr, size, + offset, length, ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + return err; + } +} + +static inline bool +ehooks_purge_lazy(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, + size_t offset, size_t length) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); +#ifdef PAGES_CAN_PURGE_LAZY + if (extent_hooks == &ehooks_default_extent_hooks) { + return ehooks_default_purge_lazy_impl(addr, offset, length); + } +#endif + if (extent_hooks->purge_lazy == NULL) { + return true; + } else { + ehooks_pre_reentrancy(tsdn); + bool err = extent_hooks->purge_lazy(extent_hooks, addr, size, + offset, length, ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + return err; + } +} + +static inline bool +ehooks_purge_forced(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, + size_t offset, size_t length) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + /* + * It would be correct to have a ehooks_debug_zero_check call at the end + * of this function; purge_forced is required to zero. But checking + * would touch the page in question, which may have performance + * consequences (imagine the hooks are using hugepages, with a global + * zero page off). Even in debug mode, it's usually a good idea to + * avoid cases that can dramatically increase memory consumption. + */ +#ifdef PAGES_CAN_PURGE_FORCED + if (extent_hooks == &ehooks_default_extent_hooks) { + return ehooks_default_purge_forced_impl(addr, offset, length); + } +#endif + if (extent_hooks->purge_forced == NULL) { + return true; + } else { + ehooks_pre_reentrancy(tsdn); + bool err = extent_hooks->purge_forced(extent_hooks, addr, size, + offset, length, ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + return err; + } +} + +static inline bool +ehooks_split(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, + size_t size_a, size_t size_b, bool committed) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + if (ehooks_are_default(ehooks)) { + return ehooks_default_split_impl(); + } else if (extent_hooks->split == NULL) { + return true; + } else { + ehooks_pre_reentrancy(tsdn); + bool err = extent_hooks->split(extent_hooks, addr, size, size_a, + size_b, committed, ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + return err; + } +} + +static inline bool +ehooks_merge(tsdn_t *tsdn, ehooks_t *ehooks, void *addr_a, size_t size_a, + void *addr_b, size_t size_b, bool committed) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + if (extent_hooks == &ehooks_default_extent_hooks) { + return ehooks_default_merge_impl(tsdn, addr_a, addr_b); + } else if (extent_hooks->merge == NULL) { + return true; + } else { + ehooks_pre_reentrancy(tsdn); + bool err = extent_hooks->merge(extent_hooks, addr_a, size_a, + addr_b, size_b, committed, ehooks_ind_get(ehooks)); + ehooks_post_reentrancy(tsdn); + return err; + } +} + +static inline void +ehooks_zero(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size) { + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + if (extent_hooks == &ehooks_default_extent_hooks) { + ehooks_default_zero_impl(addr, size); + } else { + /* + * It would be correct to try using the user-provided purge + * hooks (since they are required to have zeroed the extent if + * they indicate success), but we don't necessarily know their + * cost. We'll be conservative and use memset. + */ + memset(addr, 0, size); + } +} + +static inline bool +ehooks_guard(tsdn_t *tsdn, ehooks_t *ehooks, void *guard1, void *guard2) { + bool err; + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + + if (extent_hooks == &ehooks_default_extent_hooks) { + ehooks_default_guard_impl(guard1, guard2); + err = false; + } else { + err = true; + } + + return err; +} + +static inline bool +ehooks_unguard(tsdn_t *tsdn, ehooks_t *ehooks, void *guard1, void *guard2) { + bool err; + extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); + + if (extent_hooks == &ehooks_default_extent_hooks) { + ehooks_default_unguard_impl(guard1, guard2); + err = false; + } else { + err = true; + } + + return err; +} + +#endif /* JEMALLOC_INTERNAL_EHOOKS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/emap.h b/BeefRT/JEMalloc/include/jemalloc/internal/emap.h new file mode 100644 index 00000000..847af327 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/emap.h @@ -0,0 +1,357 @@ +#ifndef JEMALLOC_INTERNAL_EMAP_H +#define JEMALLOC_INTERNAL_EMAP_H + +#include "jemalloc/internal/base.h" +#include "jemalloc/internal/rtree.h" + +/* + * Note: Ends without at semicolon, so that + * EMAP_DECLARE_RTREE_CTX; + * in uses will avoid empty-statement warnings. + */ +#define EMAP_DECLARE_RTREE_CTX \ + rtree_ctx_t rtree_ctx_fallback; \ + rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback) + +typedef struct emap_s emap_t; +struct emap_s { + rtree_t rtree; +}; + +/* Used to pass rtree lookup context down the path. */ +typedef struct emap_alloc_ctx_t emap_alloc_ctx_t; +struct emap_alloc_ctx_t { + szind_t szind; + bool slab; +}; + +typedef struct emap_full_alloc_ctx_s emap_full_alloc_ctx_t; +struct emap_full_alloc_ctx_s { + szind_t szind; + bool slab; + edata_t *edata; +}; + +bool emap_init(emap_t *emap, base_t *base, bool zeroed); + +void emap_remap(tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind, + bool slab); + +void emap_update_edata_state(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_state_t state); + +/* + * The two acquire functions below allow accessing neighbor edatas, if it's safe + * and valid to do so (i.e. from the same arena, of the same state, etc.). This + * is necessary because the ecache locks are state based, and only protect + * edatas with the same state. Therefore the neighbor edata's state needs to be + * verified first, before chasing the edata pointer. The returned edata will be + * in an acquired state, meaning other threads will be prevented from accessing + * it, even if technically the edata can still be discovered from the rtree. + * + * This means, at any moment when holding pointers to edata, either one of the + * state based locks is held (and the edatas are all of the protected state), or + * the edatas are in an acquired state (e.g. in active or merging state). The + * acquire operation itself (changing the edata to an acquired state) is done + * under the state locks. + */ +edata_t *emap_try_acquire_edata_neighbor(tsdn_t *tsdn, emap_t *emap, + edata_t *edata, extent_pai_t pai, extent_state_t expected_state, + bool forward); +edata_t *emap_try_acquire_edata_neighbor_expand(tsdn_t *tsdn, emap_t *emap, + edata_t *edata, extent_pai_t pai, extent_state_t expected_state); +void emap_release_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_state_t new_state); + +/* + * Associate the given edata with its beginning and end address, setting the + * szind and slab info appropriately. + * Returns true on error (i.e. resource exhaustion). + */ +bool emap_register_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + szind_t szind, bool slab); + +/* + * Does the same thing, but with the interior of the range, for slab + * allocations. + * + * You might wonder why we don't just have a single emap_register function that + * does both depending on the value of 'slab'. The answer is twofold: + * - As a practical matter, in places like the extract->split->commit pathway, + * we defer the interior operation until we're sure that the commit won't fail + * (but we have to register the split boundaries there). + * - In general, we're trying to move to a world where the page-specific + * allocator doesn't know as much about how the pages it allocates will be + * used, and passing a 'slab' parameter everywhere makes that more + * complicated. + * + * Unlike the boundary version, this function can't fail; this is because slabs + * can't get big enough to touch a new page that neither of the boundaries + * touched, so no allocation is necessary to fill the interior once the boundary + * has been touched. + */ +void emap_register_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + szind_t szind); + +void emap_deregister_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata); +void emap_deregister_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata); + +typedef struct emap_prepare_s emap_prepare_t; +struct emap_prepare_s { + rtree_leaf_elm_t *lead_elm_a; + rtree_leaf_elm_t *lead_elm_b; + rtree_leaf_elm_t *trail_elm_a; + rtree_leaf_elm_t *trail_elm_b; +}; + +/** + * These functions the emap metadata management for merging, splitting, and + * reusing extents. In particular, they set the boundary mappings from + * addresses to edatas. If the result is going to be used as a slab, you + * still need to call emap_register_interior on it, though. + * + * Remap simply changes the szind and slab status of an extent's boundary + * mappings. If the extent is not a slab, it doesn't bother with updating the + * end mapping (since lookups only occur in the interior of an extent for + * slabs). Since the szind and slab status only make sense for active extents, + * this should only be called while activating or deactivating an extent. + * + * Split and merge have a "prepare" and a "commit" portion. The prepare portion + * does the operations that can be done without exclusive access to the extent + * in question, while the commit variant requires exclusive access to maintain + * the emap invariants. The only function that can fail is emap_split_prepare, + * and it returns true on failure (at which point the caller shouldn't commit). + * + * In all cases, "lead" refers to the lower-addressed extent, and trail to the + * higher-addressed one. It's the caller's responsibility to set the edata + * state appropriately. + */ +bool emap_split_prepare(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare, + edata_t *edata, size_t size_a, edata_t *trail, size_t size_b); +void emap_split_commit(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare, + edata_t *lead, size_t size_a, edata_t *trail, size_t size_b); +void emap_merge_prepare(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare, + edata_t *lead, edata_t *trail); +void emap_merge_commit(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare, + edata_t *lead, edata_t *trail); + +/* Assert that the emap's view of the given edata matches the edata's view. */ +void emap_do_assert_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata); +static inline void +emap_assert_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + if (config_debug) { + emap_do_assert_mapped(tsdn, emap, edata); + } +} + +/* Assert that the given edata isn't in the map. */ +void emap_do_assert_not_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata); +static inline void +emap_assert_not_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + if (config_debug) { + emap_do_assert_not_mapped(tsdn, emap, edata); + } +} + +JEMALLOC_ALWAYS_INLINE bool +emap_edata_in_transition(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + assert(config_debug); + emap_assert_mapped(tsdn, emap, edata); + + EMAP_DECLARE_RTREE_CTX; + rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_base_get(edata)); + + return edata_state_in_transition(contents.metadata.state); +} + +JEMALLOC_ALWAYS_INLINE bool +emap_edata_is_acquired(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + if (!config_debug) { + /* For assertions only. */ + return false; + } + + /* + * The edata is considered acquired if no other threads will attempt to + * read / write any fields from it. This includes a few cases: + * + * 1) edata not hooked into emap yet -- This implies the edata just got + * allocated or initialized. + * + * 2) in an active or transition state -- In both cases, the edata can + * be discovered from the emap, however the state tracked in the rtree + * will prevent other threads from accessing the actual edata. + */ + EMAP_DECLARE_RTREE_CTX; + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &emap->rtree, + rtree_ctx, (uintptr_t)edata_base_get(edata), /* dependent */ true, + /* init_missing */ false); + if (elm == NULL) { + return true; + } + rtree_contents_t contents = rtree_leaf_elm_read(tsdn, &emap->rtree, elm, + /* dependent */ true); + if (contents.edata == NULL || + contents.metadata.state == extent_state_active || + edata_state_in_transition(contents.metadata.state)) { + return true; + } + + return false; +} + +JEMALLOC_ALWAYS_INLINE void +extent_assert_can_coalesce(const edata_t *inner, const edata_t *outer) { + assert(edata_arena_ind_get(inner) == edata_arena_ind_get(outer)); + assert(edata_pai_get(inner) == edata_pai_get(outer)); + assert(edata_committed_get(inner) == edata_committed_get(outer)); + assert(edata_state_get(inner) == extent_state_active); + assert(edata_state_get(outer) == extent_state_merging); + assert(!edata_guarded_get(inner) && !edata_guarded_get(outer)); + assert(edata_base_get(inner) == edata_past_get(outer) || + edata_base_get(outer) == edata_past_get(inner)); +} + +JEMALLOC_ALWAYS_INLINE void +extent_assert_can_expand(const edata_t *original, const edata_t *expand) { + assert(edata_arena_ind_get(original) == edata_arena_ind_get(expand)); + assert(edata_pai_get(original) == edata_pai_get(expand)); + assert(edata_state_get(original) == extent_state_active); + assert(edata_state_get(expand) == extent_state_merging); + assert(edata_past_get(original) == edata_base_get(expand)); +} + +JEMALLOC_ALWAYS_INLINE edata_t * +emap_edata_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr) { + EMAP_DECLARE_RTREE_CTX; + + return rtree_read(tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr).edata; +} + +/* Fills in alloc_ctx with the info in the map. */ +JEMALLOC_ALWAYS_INLINE void +emap_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr, + emap_alloc_ctx_t *alloc_ctx) { + EMAP_DECLARE_RTREE_CTX; + + rtree_metadata_t metadata = rtree_metadata_read(tsdn, &emap->rtree, + rtree_ctx, (uintptr_t)ptr); + alloc_ctx->szind = metadata.szind; + alloc_ctx->slab = metadata.slab; +} + +/* The pointer must be mapped. */ +JEMALLOC_ALWAYS_INLINE void +emap_full_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr, + emap_full_alloc_ctx_t *full_alloc_ctx) { + EMAP_DECLARE_RTREE_CTX; + + rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)ptr); + full_alloc_ctx->edata = contents.edata; + full_alloc_ctx->szind = contents.metadata.szind; + full_alloc_ctx->slab = contents.metadata.slab; +} + +/* + * The pointer is allowed to not be mapped. + * + * Returns true when the pointer is not present. + */ +JEMALLOC_ALWAYS_INLINE bool +emap_full_alloc_ctx_try_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr, + emap_full_alloc_ctx_t *full_alloc_ctx) { + EMAP_DECLARE_RTREE_CTX; + + rtree_contents_t contents; + bool err = rtree_read_independent(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)ptr, &contents); + if (err) { + return true; + } + full_alloc_ctx->edata = contents.edata; + full_alloc_ctx->szind = contents.metadata.szind; + full_alloc_ctx->slab = contents.metadata.slab; + return false; +} + +/* + * Only used on the fastpath of free. Returns true when cannot be fulfilled by + * fast path, e.g. when the metadata key is not cached. + */ +JEMALLOC_ALWAYS_INLINE bool +emap_alloc_ctx_try_lookup_fast(tsd_t *tsd, emap_t *emap, const void *ptr, + emap_alloc_ctx_t *alloc_ctx) { + /* Use the unsafe getter since this may gets called during exit. */ + rtree_ctx_t *rtree_ctx = tsd_rtree_ctxp_get_unsafe(tsd); + + rtree_metadata_t metadata; + bool err = rtree_metadata_try_read_fast(tsd_tsdn(tsd), &emap->rtree, + rtree_ctx, (uintptr_t)ptr, &metadata); + if (err) { + return true; + } + alloc_ctx->szind = metadata.szind; + alloc_ctx->slab = metadata.slab; + return false; +} + +/* + * We want to do batch lookups out of the cache bins, which use + * cache_bin_ptr_array_get to access the i'th element of the bin (since they + * invert usual ordering in deciding what to flush). This lets the emap avoid + * caring about its caller's ordering. + */ +typedef const void *(*emap_ptr_getter)(void *ctx, size_t ind); +/* + * This allows size-checking assertions, which we can only do while we're in the + * process of edata lookups. + */ +typedef void (*emap_metadata_visitor)(void *ctx, emap_full_alloc_ctx_t *alloc_ctx); + +typedef union emap_batch_lookup_result_u emap_batch_lookup_result_t; +union emap_batch_lookup_result_u { + edata_t *edata; + rtree_leaf_elm_t *rtree_leaf; +}; + +JEMALLOC_ALWAYS_INLINE void +emap_edata_lookup_batch(tsd_t *tsd, emap_t *emap, size_t nptrs, + emap_ptr_getter ptr_getter, void *ptr_getter_ctx, + emap_metadata_visitor metadata_visitor, void *metadata_visitor_ctx, + emap_batch_lookup_result_t *result) { + /* Avoids null-checking tsdn in the loop below. */ + util_assume(tsd != NULL); + rtree_ctx_t *rtree_ctx = tsd_rtree_ctxp_get(tsd); + + for (size_t i = 0; i < nptrs; i++) { + const void *ptr = ptr_getter(ptr_getter_ctx, i); + /* + * Reuse the edatas array as a temp buffer, lying a little about + * the types. + */ + result[i].rtree_leaf = rtree_leaf_elm_lookup(tsd_tsdn(tsd), + &emap->rtree, rtree_ctx, (uintptr_t)ptr, + /* dependent */ true, /* init_missing */ false); + } + + for (size_t i = 0; i < nptrs; i++) { + rtree_leaf_elm_t *elm = result[i].rtree_leaf; + rtree_contents_t contents = rtree_leaf_elm_read(tsd_tsdn(tsd), + &emap->rtree, elm, /* dependent */ true); + result[i].edata = contents.edata; + emap_full_alloc_ctx_t alloc_ctx; + /* + * Not all these fields are read in practice by the metadata + * visitor. But the compiler can easily optimize away the ones + * that aren't, so no sense in being incomplete. + */ + alloc_ctx.szind = contents.metadata.szind; + alloc_ctx.slab = contents.metadata.slab; + alloc_ctx.edata = contents.edata; + metadata_visitor(metadata_visitor_ctx, &alloc_ctx); + } +} + +#endif /* JEMALLOC_INTERNAL_EMAP_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/emitter.h b/BeefRT/JEMalloc/include/jemalloc/internal/emitter.h new file mode 100644 index 00000000..9482f68b --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/emitter.h @@ -0,0 +1,510 @@ +#ifndef JEMALLOC_INTERNAL_EMITTER_H +#define JEMALLOC_INTERNAL_EMITTER_H + +#include "jemalloc/internal/ql.h" + +typedef enum emitter_output_e emitter_output_t; +enum emitter_output_e { + emitter_output_json, + emitter_output_json_compact, + emitter_output_table +}; + +typedef enum emitter_justify_e emitter_justify_t; +enum emitter_justify_e { + emitter_justify_left, + emitter_justify_right, + /* Not for users; just to pass to internal functions. */ + emitter_justify_none +}; + +typedef enum emitter_type_e emitter_type_t; +enum emitter_type_e { + emitter_type_bool, + emitter_type_int, + emitter_type_int64, + emitter_type_unsigned, + emitter_type_uint32, + emitter_type_uint64, + emitter_type_size, + emitter_type_ssize, + emitter_type_string, + /* + * A title is a column title in a table; it's just a string, but it's + * not quoted. + */ + emitter_type_title, +}; + +typedef struct emitter_col_s emitter_col_t; +struct emitter_col_s { + /* Filled in by the user. */ + emitter_justify_t justify; + int width; + emitter_type_t type; + union { + bool bool_val; + int int_val; + unsigned unsigned_val; + uint32_t uint32_val; + uint32_t uint32_t_val; + uint64_t uint64_val; + uint64_t uint64_t_val; + size_t size_val; + ssize_t ssize_val; + const char *str_val; + }; + + /* Filled in by initialization. */ + ql_elm(emitter_col_t) link; +}; + +typedef struct emitter_row_s emitter_row_t; +struct emitter_row_s { + ql_head(emitter_col_t) cols; +}; + +typedef struct emitter_s emitter_t; +struct emitter_s { + emitter_output_t output; + /* The output information. */ + write_cb_t *write_cb; + void *cbopaque; + int nesting_depth; + /* True if we've already emitted a value at the given depth. */ + bool item_at_depth; + /* True if we emitted a key and will emit corresponding value next. */ + bool emitted_key; +}; + +static inline bool +emitter_outputs_json(emitter_t *emitter) { + return emitter->output == emitter_output_json || + emitter->output == emitter_output_json_compact; +} + +/* Internal convenience function. Write to the emitter the given string. */ +JEMALLOC_FORMAT_PRINTF(2, 3) +static inline void +emitter_printf(emitter_t *emitter, const char *format, ...) { + va_list ap; + + va_start(ap, format); + malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap); + va_end(ap); +} + +static inline const char * JEMALLOC_FORMAT_ARG(3) +emitter_gen_fmt(char *out_fmt, size_t out_size, const char *fmt_specifier, + emitter_justify_t justify, int width) { + size_t written; + fmt_specifier++; + if (justify == emitter_justify_none) { + written = malloc_snprintf(out_fmt, out_size, + "%%%s", fmt_specifier); + } else if (justify == emitter_justify_left) { + written = malloc_snprintf(out_fmt, out_size, + "%%-%d%s", width, fmt_specifier); + } else { + written = malloc_snprintf(out_fmt, out_size, + "%%%d%s", width, fmt_specifier); + } + /* Only happens in case of bad format string, which *we* choose. */ + assert(written < out_size); + return out_fmt; +} + +/* + * Internal. Emit the given value type in the relevant encoding (so that the + * bool true gets mapped to json "true", but the string "true" gets mapped to + * json "\"true\"", for instance. + * + * Width is ignored if justify is emitter_justify_none. + */ +static inline void +emitter_print_value(emitter_t *emitter, emitter_justify_t justify, int width, + emitter_type_t value_type, const void *value) { + size_t str_written; +#define BUF_SIZE 256 +#define FMT_SIZE 10 + /* + * We dynamically generate a format string to emit, to let us use the + * snprintf machinery. This is kinda hacky, but gets the job done + * quickly without having to think about the various snprintf edge + * cases. + */ + char fmt[FMT_SIZE]; + char buf[BUF_SIZE]; + +#define EMIT_SIMPLE(type, format) \ + emitter_printf(emitter, \ + emitter_gen_fmt(fmt, FMT_SIZE, format, justify, width), \ + *(const type *)value); + + switch (value_type) { + case emitter_type_bool: + emitter_printf(emitter, + emitter_gen_fmt(fmt, FMT_SIZE, "%s", justify, width), + *(const bool *)value ? "true" : "false"); + break; + case emitter_type_int: + EMIT_SIMPLE(int, "%d") + break; + case emitter_type_int64: + EMIT_SIMPLE(int64_t, "%" FMTd64) + break; + case emitter_type_unsigned: + EMIT_SIMPLE(unsigned, "%u") + break; + case emitter_type_ssize: + EMIT_SIMPLE(ssize_t, "%zd") + break; + case emitter_type_size: + EMIT_SIMPLE(size_t, "%zu") + break; + case emitter_type_string: + str_written = malloc_snprintf(buf, BUF_SIZE, "\"%s\"", + *(const char *const *)value); + /* + * We control the strings we output; we shouldn't get anything + * anywhere near the fmt size. + */ + assert(str_written < BUF_SIZE); + emitter_printf(emitter, + emitter_gen_fmt(fmt, FMT_SIZE, "%s", justify, width), buf); + break; + case emitter_type_uint32: + EMIT_SIMPLE(uint32_t, "%" FMTu32) + break; + case emitter_type_uint64: + EMIT_SIMPLE(uint64_t, "%" FMTu64) + break; + case emitter_type_title: + EMIT_SIMPLE(char *const, "%s"); + break; + default: + unreachable(); + } +#undef BUF_SIZE +#undef FMT_SIZE +} + + +/* Internal functions. In json mode, tracks nesting state. */ +static inline void +emitter_nest_inc(emitter_t *emitter) { + emitter->nesting_depth++; + emitter->item_at_depth = false; +} + +static inline void +emitter_nest_dec(emitter_t *emitter) { + emitter->nesting_depth--; + emitter->item_at_depth = true; +} + +static inline void +emitter_indent(emitter_t *emitter) { + int amount = emitter->nesting_depth; + const char *indent_str; + assert(emitter->output != emitter_output_json_compact); + if (emitter->output == emitter_output_json) { + indent_str = "\t"; + } else { + amount *= 2; + indent_str = " "; + } + for (int i = 0; i < amount; i++) { + emitter_printf(emitter, "%s", indent_str); + } +} + +static inline void +emitter_json_key_prefix(emitter_t *emitter) { + assert(emitter_outputs_json(emitter)); + if (emitter->emitted_key) { + emitter->emitted_key = false; + return; + } + if (emitter->item_at_depth) { + emitter_printf(emitter, ","); + } + if (emitter->output != emitter_output_json_compact) { + emitter_printf(emitter, "\n"); + emitter_indent(emitter); + } +} + +/******************************************************************************/ +/* Public functions for emitter_t. */ + +static inline void +emitter_init(emitter_t *emitter, emitter_output_t emitter_output, + write_cb_t *write_cb, void *cbopaque) { + emitter->output = emitter_output; + emitter->write_cb = write_cb; + emitter->cbopaque = cbopaque; + emitter->item_at_depth = false; + emitter->emitted_key = false; + emitter->nesting_depth = 0; +} + +/******************************************************************************/ +/* JSON public API. */ + +/* + * Emits a key (e.g. as appears in an object). The next json entity emitted will + * be the corresponding value. + */ +static inline void +emitter_json_key(emitter_t *emitter, const char *json_key) { + if (emitter_outputs_json(emitter)) { + emitter_json_key_prefix(emitter); + emitter_printf(emitter, "\"%s\":%s", json_key, + emitter->output == emitter_output_json_compact ? "" : " "); + emitter->emitted_key = true; + } +} + +static inline void +emitter_json_value(emitter_t *emitter, emitter_type_t value_type, + const void *value) { + if (emitter_outputs_json(emitter)) { + emitter_json_key_prefix(emitter); + emitter_print_value(emitter, emitter_justify_none, -1, + value_type, value); + emitter->item_at_depth = true; + } +} + +/* Shorthand for calling emitter_json_key and then emitter_json_value. */ +static inline void +emitter_json_kv(emitter_t *emitter, const char *json_key, + emitter_type_t value_type, const void *value) { + emitter_json_key(emitter, json_key); + emitter_json_value(emitter, value_type, value); +} + +static inline void +emitter_json_array_begin(emitter_t *emitter) { + if (emitter_outputs_json(emitter)) { + emitter_json_key_prefix(emitter); + emitter_printf(emitter, "["); + emitter_nest_inc(emitter); + } +} + +/* Shorthand for calling emitter_json_key and then emitter_json_array_begin. */ +static inline void +emitter_json_array_kv_begin(emitter_t *emitter, const char *json_key) { + emitter_json_key(emitter, json_key); + emitter_json_array_begin(emitter); +} + +static inline void +emitter_json_array_end(emitter_t *emitter) { + if (emitter_outputs_json(emitter)) { + assert(emitter->nesting_depth > 0); + emitter_nest_dec(emitter); + if (emitter->output != emitter_output_json_compact) { + emitter_printf(emitter, "\n"); + emitter_indent(emitter); + } + emitter_printf(emitter, "]"); + } +} + +static inline void +emitter_json_object_begin(emitter_t *emitter) { + if (emitter_outputs_json(emitter)) { + emitter_json_key_prefix(emitter); + emitter_printf(emitter, "{"); + emitter_nest_inc(emitter); + } +} + +/* Shorthand for calling emitter_json_key and then emitter_json_object_begin. */ +static inline void +emitter_json_object_kv_begin(emitter_t *emitter, const char *json_key) { + emitter_json_key(emitter, json_key); + emitter_json_object_begin(emitter); +} + +static inline void +emitter_json_object_end(emitter_t *emitter) { + if (emitter_outputs_json(emitter)) { + assert(emitter->nesting_depth > 0); + emitter_nest_dec(emitter); + if (emitter->output != emitter_output_json_compact) { + emitter_printf(emitter, "\n"); + emitter_indent(emitter); + } + emitter_printf(emitter, "}"); + } +} + + +/******************************************************************************/ +/* Table public API. */ + +static inline void +emitter_table_dict_begin(emitter_t *emitter, const char *table_key) { + if (emitter->output == emitter_output_table) { + emitter_indent(emitter); + emitter_printf(emitter, "%s\n", table_key); + emitter_nest_inc(emitter); + } +} + +static inline void +emitter_table_dict_end(emitter_t *emitter) { + if (emitter->output == emitter_output_table) { + emitter_nest_dec(emitter); + } +} + +static inline void +emitter_table_kv_note(emitter_t *emitter, const char *table_key, + emitter_type_t value_type, const void *value, + const char *table_note_key, emitter_type_t table_note_value_type, + const void *table_note_value) { + if (emitter->output == emitter_output_table) { + emitter_indent(emitter); + emitter_printf(emitter, "%s: ", table_key); + emitter_print_value(emitter, emitter_justify_none, -1, + value_type, value); + if (table_note_key != NULL) { + emitter_printf(emitter, " (%s: ", table_note_key); + emitter_print_value(emitter, emitter_justify_none, -1, + table_note_value_type, table_note_value); + emitter_printf(emitter, ")"); + } + emitter_printf(emitter, "\n"); + } + emitter->item_at_depth = true; +} + +static inline void +emitter_table_kv(emitter_t *emitter, const char *table_key, + emitter_type_t value_type, const void *value) { + emitter_table_kv_note(emitter, table_key, value_type, value, NULL, + emitter_type_bool, NULL); +} + + +/* Write to the emitter the given string, but only in table mode. */ +JEMALLOC_FORMAT_PRINTF(2, 3) +static inline void +emitter_table_printf(emitter_t *emitter, const char *format, ...) { + if (emitter->output == emitter_output_table) { + va_list ap; + va_start(ap, format); + malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap); + va_end(ap); + } +} + +static inline void +emitter_table_row(emitter_t *emitter, emitter_row_t *row) { + if (emitter->output != emitter_output_table) { + return; + } + emitter_col_t *col; + ql_foreach(col, &row->cols, link) { + emitter_print_value(emitter, col->justify, col->width, + col->type, (const void *)&col->bool_val); + } + emitter_table_printf(emitter, "\n"); +} + +static inline void +emitter_row_init(emitter_row_t *row) { + ql_new(&row->cols); +} + +static inline void +emitter_col_init(emitter_col_t *col, emitter_row_t *row) { + ql_elm_new(col, link); + ql_tail_insert(&row->cols, col, link); +} + + +/******************************************************************************/ +/* + * Generalized public API. Emits using either JSON or table, according to + * settings in the emitter_t. */ + +/* + * Note emits a different kv pair as well, but only in table mode. Omits the + * note if table_note_key is NULL. + */ +static inline void +emitter_kv_note(emitter_t *emitter, const char *json_key, const char *table_key, + emitter_type_t value_type, const void *value, + const char *table_note_key, emitter_type_t table_note_value_type, + const void *table_note_value) { + if (emitter_outputs_json(emitter)) { + emitter_json_key(emitter, json_key); + emitter_json_value(emitter, value_type, value); + } else { + emitter_table_kv_note(emitter, table_key, value_type, value, + table_note_key, table_note_value_type, table_note_value); + } + emitter->item_at_depth = true; +} + +static inline void +emitter_kv(emitter_t *emitter, const char *json_key, const char *table_key, + emitter_type_t value_type, const void *value) { + emitter_kv_note(emitter, json_key, table_key, value_type, value, NULL, + emitter_type_bool, NULL); +} + +static inline void +emitter_dict_begin(emitter_t *emitter, const char *json_key, + const char *table_header) { + if (emitter_outputs_json(emitter)) { + emitter_json_key(emitter, json_key); + emitter_json_object_begin(emitter); + } else { + emitter_table_dict_begin(emitter, table_header); + } +} + +static inline void +emitter_dict_end(emitter_t *emitter) { + if (emitter_outputs_json(emitter)) { + emitter_json_object_end(emitter); + } else { + emitter_table_dict_end(emitter); + } +} + +static inline void +emitter_begin(emitter_t *emitter) { + if (emitter_outputs_json(emitter)) { + assert(emitter->nesting_depth == 0); + emitter_printf(emitter, "{"); + emitter_nest_inc(emitter); + } else { + /* + * This guarantees that we always call write_cb at least once. + * This is useful if some invariant is established by each call + * to write_cb, but doesn't hold initially: e.g., some buffer + * holds a null-terminated string. + */ + emitter_printf(emitter, "%s", ""); + } +} + +static inline void +emitter_end(emitter_t *emitter) { + if (emitter_outputs_json(emitter)) { + assert(emitter->nesting_depth == 1); + emitter_nest_dec(emitter); + emitter_printf(emitter, "%s", emitter->output == + emitter_output_json_compact ? "}" : "\n}\n"); + } +} + +#endif /* JEMALLOC_INTERNAL_EMITTER_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/eset.h b/BeefRT/JEMalloc/include/jemalloc/internal/eset.h new file mode 100644 index 00000000..4f689b47 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/eset.h @@ -0,0 +1,77 @@ +#ifndef JEMALLOC_INTERNAL_ESET_H +#define JEMALLOC_INTERNAL_ESET_H + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/fb.h" +#include "jemalloc/internal/edata.h" +#include "jemalloc/internal/mutex.h" + +/* + * An eset ("extent set") is a quantized collection of extents, with built-in + * LRU queue. + * + * This class is not thread-safe; synchronization must be done externally if + * there are mutating operations. One exception is the stats counters, which + * may be read without any locking. + */ + +typedef struct eset_bin_s eset_bin_t; +struct eset_bin_s { + edata_heap_t heap; + /* + * We do first-fit across multiple size classes. If we compared against + * the min element in each heap directly, we'd take a cache miss per + * extent we looked at. If we co-locate the edata summaries, we only + * take a miss on the edata we're actually going to return (which is + * inevitable anyways). + */ + edata_cmp_summary_t heap_min; +}; + +typedef struct eset_bin_stats_s eset_bin_stats_t; +struct eset_bin_stats_s { + atomic_zu_t nextents; + atomic_zu_t nbytes; +}; + +typedef struct eset_s eset_t; +struct eset_s { + /* Bitmap for which set bits correspond to non-empty heaps. */ + fb_group_t bitmap[FB_NGROUPS(SC_NPSIZES + 1)]; + + /* Quantized per size class heaps of extents. */ + eset_bin_t bins[SC_NPSIZES + 1]; + + eset_bin_stats_t bin_stats[SC_NPSIZES + 1]; + + /* LRU of all extents in heaps. */ + edata_list_inactive_t lru; + + /* Page sum for all extents in heaps. */ + atomic_zu_t npages; + + /* + * A duplication of the data in the containing ecache. We use this only + * for assertions on the states of the passed-in extents. + */ + extent_state_t state; +}; + +void eset_init(eset_t *eset, extent_state_t state); + +size_t eset_npages_get(eset_t *eset); +/* Get the number of extents in the given page size index. */ +size_t eset_nextents_get(eset_t *eset, pszind_t ind); +/* Get the sum total bytes of the extents in the given page size index. */ +size_t eset_nbytes_get(eset_t *eset, pszind_t ind); + +void eset_insert(eset_t *eset, edata_t *edata); +void eset_remove(eset_t *eset, edata_t *edata); +/* + * Select an extent from this eset of the given size and alignment. Returns + * null if no such item could be found. + */ +edata_t *eset_fit(eset_t *eset, size_t esize, size_t alignment, bool exact_only, + unsigned lg_max_fit); + +#endif /* JEMALLOC_INTERNAL_ESET_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/exp_grow.h b/BeefRT/JEMalloc/include/jemalloc/internal/exp_grow.h new file mode 100644 index 00000000..8566b8a4 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/exp_grow.h @@ -0,0 +1,50 @@ +#ifndef JEMALLOC_INTERNAL_EXP_GROW_H +#define JEMALLOC_INTERNAL_EXP_GROW_H + +typedef struct exp_grow_s exp_grow_t; +struct exp_grow_s { + /* + * Next extent size class in a growing series to use when satisfying a + * request via the extent hooks (only if opt_retain). This limits the + * number of disjoint virtual memory ranges so that extent merging can + * be effective even if multiple arenas' extent allocation requests are + * highly interleaved. + * + * retain_grow_limit is the max allowed size ind to expand (unless the + * required size is greater). Default is no limit, and controlled + * through mallctl only. + */ + pszind_t next; + pszind_t limit; +}; + +static inline bool +exp_grow_size_prepare(exp_grow_t *exp_grow, size_t alloc_size_min, + size_t *r_alloc_size, pszind_t *r_skip) { + *r_skip = 0; + *r_alloc_size = sz_pind2sz(exp_grow->next + *r_skip); + while (*r_alloc_size < alloc_size_min) { + (*r_skip)++; + if (exp_grow->next + *r_skip >= + sz_psz2ind(SC_LARGE_MAXCLASS)) { + /* Outside legal range. */ + return true; + } + *r_alloc_size = sz_pind2sz(exp_grow->next + *r_skip); + } + return false; +} + +static inline void +exp_grow_size_commit(exp_grow_t *exp_grow, pszind_t skip) { + if (exp_grow->next + skip + 1 <= exp_grow->limit) { + exp_grow->next += skip + 1; + } else { + exp_grow->next = exp_grow->limit; + } + +} + +void exp_grow_init(exp_grow_t *exp_grow); + +#endif /* JEMALLOC_INTERNAL_EXP_GROW_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/extent.h b/BeefRT/JEMalloc/include/jemalloc/internal/extent.h new file mode 100644 index 00000000..1d51d410 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/extent.h @@ -0,0 +1,137 @@ +#ifndef JEMALLOC_INTERNAL_EXTENT_H +#define JEMALLOC_INTERNAL_EXTENT_H + +#include "jemalloc/internal/ecache.h" +#include "jemalloc/internal/ehooks.h" +#include "jemalloc/internal/ph.h" +#include "jemalloc/internal/rtree.h" + +/* + * This module contains the page-level allocator. It chooses the addresses that + * allocations requested by other modules will inhabit, and updates the global + * metadata to reflect allocation/deallocation/purging decisions. + */ + +/* + * When reuse (and split) an active extent, (1U << opt_lg_extent_max_active_fit) + * is the max ratio between the size of the active extent and the new extent. + */ +#define LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT 6 +extern size_t opt_lg_extent_max_active_fit; + +edata_t *ecache_alloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment, + bool zero, bool guarded); +edata_t *ecache_alloc_grow(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment, + bool zero, bool guarded); +void ecache_dalloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *edata); +edata_t *ecache_evict(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, size_t npages_min); + +void extent_gdump_add(tsdn_t *tsdn, const edata_t *edata); +void extent_record(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *edata); +void extent_dalloc_gap(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata); +edata_t *extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + void *new_addr, size_t size, size_t alignment, bool zero, bool *commit, + bool growing_retained); +void extent_dalloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata); +void extent_destroy_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata); +bool extent_commit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length); +bool extent_decommit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length); +bool extent_purge_lazy_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length); +bool extent_purge_forced_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length); +edata_t *extent_split_wrapper(tsdn_t *tsdn, pac_t *pac, + ehooks_t *ehooks, edata_t *edata, size_t size_a, size_t size_b, + bool holding_core_locks); +bool extent_merge_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *a, edata_t *b); +bool extent_commit_zero(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + bool commit, bool zero, bool growing_retained); +size_t extent_sn_next(pac_t *pac); +bool extent_boot(void); + +JEMALLOC_ALWAYS_INLINE bool +extent_neighbor_head_state_mergeable(bool edata_is_head, + bool neighbor_is_head, bool forward) { + /* + * Head states checking: disallow merging if the higher addr extent is a + * head extent. This helps preserve first-fit, and more importantly + * makes sure no merge across arenas. + */ + if (forward) { + if (neighbor_is_head) { + return false; + } + } else { + if (edata_is_head) { + return false; + } + } + return true; +} + +JEMALLOC_ALWAYS_INLINE bool +extent_can_acquire_neighbor(edata_t *edata, rtree_contents_t contents, + extent_pai_t pai, extent_state_t expected_state, bool forward, + bool expanding) { + edata_t *neighbor = contents.edata; + if (neighbor == NULL) { + return false; + } + /* It's not safe to access *neighbor yet; must verify states first. */ + bool neighbor_is_head = contents.metadata.is_head; + if (!extent_neighbor_head_state_mergeable(edata_is_head_get(edata), + neighbor_is_head, forward)) { + return false; + } + extent_state_t neighbor_state = contents.metadata.state; + if (pai == EXTENT_PAI_PAC) { + if (neighbor_state != expected_state) { + return false; + } + /* From this point, it's safe to access *neighbor. */ + if (!expanding && (edata_committed_get(edata) != + edata_committed_get(neighbor))) { + /* + * Some platforms (e.g. Windows) require an explicit + * commit step (and writing to uncommitted memory is not + * allowed). + */ + return false; + } + } else { + if (neighbor_state == extent_state_active) { + return false; + } + /* From this point, it's safe to access *neighbor. */ + } + + assert(edata_pai_get(edata) == pai); + if (edata_pai_get(neighbor) != pai) { + return false; + } + if (opt_retain) { + assert(edata_arena_ind_get(edata) == + edata_arena_ind_get(neighbor)); + } else { + if (edata_arena_ind_get(edata) != + edata_arena_ind_get(neighbor)) { + return false; + } + } + assert(!edata_guarded_get(edata) && !edata_guarded_get(neighbor)); + + return true; +} + +#endif /* JEMALLOC_INTERNAL_EXTENT_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/extent_dss.h b/BeefRT/JEMalloc/include/jemalloc/internal/extent_dss.h new file mode 100644 index 00000000..e8f02ce2 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/extent_dss.h @@ -0,0 +1,26 @@ +#ifndef JEMALLOC_INTERNAL_EXTENT_DSS_H +#define JEMALLOC_INTERNAL_EXTENT_DSS_H + +typedef enum { + dss_prec_disabled = 0, + dss_prec_primary = 1, + dss_prec_secondary = 2, + + dss_prec_limit = 3 +} dss_prec_t; +#define DSS_PREC_DEFAULT dss_prec_secondary +#define DSS_DEFAULT "secondary" + +extern const char *dss_prec_names[]; + +extern const char *opt_dss; + +dss_prec_t extent_dss_prec_get(void); +bool extent_dss_prec_set(dss_prec_t dss_prec); +void *extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, + size_t size, size_t alignment, bool *zero, bool *commit); +bool extent_in_dss(void *addr); +bool extent_dss_mergeable(void *addr_a, void *addr_b); +void extent_dss_boot(void); + +#endif /* JEMALLOC_INTERNAL_EXTENT_DSS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/extent_mmap.h b/BeefRT/JEMalloc/include/jemalloc/internal/extent_mmap.h new file mode 100644 index 00000000..55f17ee4 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/extent_mmap.h @@ -0,0 +1,10 @@ +#ifndef JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H +#define JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H + +extern bool opt_retain; + +void *extent_alloc_mmap(void *new_addr, size_t size, size_t alignment, + bool *zero, bool *commit); +bool extent_dalloc_mmap(void *addr, size_t size); + +#endif /* JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/fb.h b/BeefRT/JEMalloc/include/jemalloc/internal/fb.h new file mode 100644 index 00000000..90c4091f --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/fb.h @@ -0,0 +1,373 @@ +#ifndef JEMALLOC_INTERNAL_FB_H +#define JEMALLOC_INTERNAL_FB_H + +/* + * The flat bitmap module. This has a larger API relative to the bitmap module + * (supporting things like backwards searches, and searching for both set and + * unset bits), at the cost of slower operations for very large bitmaps. + * + * Initialized flat bitmaps start at all-zeros (all bits unset). + */ + +typedef unsigned long fb_group_t; +#define FB_GROUP_BITS (ZU(1) << (LG_SIZEOF_LONG + 3)) +#define FB_NGROUPS(nbits) ((nbits) / FB_GROUP_BITS \ + + ((nbits) % FB_GROUP_BITS == 0 ? 0 : 1)) + +static inline void +fb_init(fb_group_t *fb, size_t nbits) { + size_t ngroups = FB_NGROUPS(nbits); + memset(fb, 0, ngroups * sizeof(fb_group_t)); +} + +static inline bool +fb_empty(fb_group_t *fb, size_t nbits) { + size_t ngroups = FB_NGROUPS(nbits); + for (size_t i = 0; i < ngroups; i++) { + if (fb[i] != 0) { + return false; + } + } + return true; +} + +static inline bool +fb_full(fb_group_t *fb, size_t nbits) { + size_t ngroups = FB_NGROUPS(nbits); + size_t trailing_bits = nbits % FB_GROUP_BITS; + size_t limit = (trailing_bits == 0 ? ngroups : ngroups - 1); + for (size_t i = 0; i < limit; i++) { + if (fb[i] != ~(fb_group_t)0) { + return false; + } + } + if (trailing_bits == 0) { + return true; + } + return fb[ngroups - 1] == ((fb_group_t)1 << trailing_bits) - 1; +} + +static inline bool +fb_get(fb_group_t *fb, size_t nbits, size_t bit) { + assert(bit < nbits); + size_t group_ind = bit / FB_GROUP_BITS; + size_t bit_ind = bit % FB_GROUP_BITS; + return (bool)(fb[group_ind] & ((fb_group_t)1 << bit_ind)); +} + +static inline void +fb_set(fb_group_t *fb, size_t nbits, size_t bit) { + assert(bit < nbits); + size_t group_ind = bit / FB_GROUP_BITS; + size_t bit_ind = bit % FB_GROUP_BITS; + fb[group_ind] |= ((fb_group_t)1 << bit_ind); +} + +static inline void +fb_unset(fb_group_t *fb, size_t nbits, size_t bit) { + assert(bit < nbits); + size_t group_ind = bit / FB_GROUP_BITS; + size_t bit_ind = bit % FB_GROUP_BITS; + fb[group_ind] &= ~((fb_group_t)1 << bit_ind); +} + + +/* + * Some implementation details. This visitation function lets us apply a group + * visitor to each group in the bitmap (potentially modifying it). The mask + * indicates which bits are logically part of the visitation. + */ +typedef void (*fb_group_visitor_t)(void *ctx, fb_group_t *fb, fb_group_t mask); +JEMALLOC_ALWAYS_INLINE void +fb_visit_impl(fb_group_t *fb, size_t nbits, fb_group_visitor_t visit, void *ctx, + size_t start, size_t cnt) { + assert(cnt > 0); + assert(start + cnt <= nbits); + size_t group_ind = start / FB_GROUP_BITS; + size_t start_bit_ind = start % FB_GROUP_BITS; + /* + * The first group is special; it's the only one we don't start writing + * to from bit 0. + */ + size_t first_group_cnt = (start_bit_ind + cnt > FB_GROUP_BITS + ? FB_GROUP_BITS - start_bit_ind : cnt); + /* + * We can basically split affected words into: + * - The first group, where we touch only the high bits + * - The last group, where we touch only the low bits + * - The middle, where we set all the bits to the same thing. + * We treat each case individually. The last two could be merged, but + * this can lead to bad codegen for those middle words. + */ + /* First group */ + fb_group_t mask = ((~(fb_group_t)0) + >> (FB_GROUP_BITS - first_group_cnt)) + << start_bit_ind; + visit(ctx, &fb[group_ind], mask); + + cnt -= first_group_cnt; + group_ind++; + /* Middle groups */ + while (cnt > FB_GROUP_BITS) { + visit(ctx, &fb[group_ind], ~(fb_group_t)0); + cnt -= FB_GROUP_BITS; + group_ind++; + } + /* Last group */ + if (cnt != 0) { + mask = (~(fb_group_t)0) >> (FB_GROUP_BITS - cnt); + visit(ctx, &fb[group_ind], mask); + } +} + +JEMALLOC_ALWAYS_INLINE void +fb_assign_visitor(void *ctx, fb_group_t *fb, fb_group_t mask) { + bool val = *(bool *)ctx; + if (val) { + *fb |= mask; + } else { + *fb &= ~mask; + } +} + +/* Sets the cnt bits starting at position start. Must not have a 0 count. */ +static inline void +fb_set_range(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) { + bool val = true; + fb_visit_impl(fb, nbits, &fb_assign_visitor, &val, start, cnt); +} + +/* Unsets the cnt bits starting at position start. Must not have a 0 count. */ +static inline void +fb_unset_range(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) { + bool val = false; + fb_visit_impl(fb, nbits, &fb_assign_visitor, &val, start, cnt); +} + +JEMALLOC_ALWAYS_INLINE void +fb_scount_visitor(void *ctx, fb_group_t *fb, fb_group_t mask) { + size_t *scount = (size_t *)ctx; + *scount += popcount_lu(*fb & mask); +} + +/* Finds the number of set bit in the of length cnt starting at start. */ +JEMALLOC_ALWAYS_INLINE size_t +fb_scount(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) { + size_t scount = 0; + fb_visit_impl(fb, nbits, &fb_scount_visitor, &scount, start, cnt); + return scount; +} + +/* Finds the number of unset bit in the of length cnt starting at start. */ +JEMALLOC_ALWAYS_INLINE size_t +fb_ucount(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) { + size_t scount = fb_scount(fb, nbits, start, cnt); + return cnt - scount; +} + +/* + * An implementation detail; find the first bit at position >= min_bit with the + * value val. + * + * Returns the number of bits in the bitmap if no such bit exists. + */ +JEMALLOC_ALWAYS_INLINE ssize_t +fb_find_impl(fb_group_t *fb, size_t nbits, size_t start, bool val, + bool forward) { + assert(start < nbits); + size_t ngroups = FB_NGROUPS(nbits); + ssize_t group_ind = start / FB_GROUP_BITS; + size_t bit_ind = start % FB_GROUP_BITS; + + fb_group_t maybe_invert = (val ? 0 : (fb_group_t)-1); + + fb_group_t group = fb[group_ind]; + group ^= maybe_invert; + if (forward) { + /* Only keep ones in bits bit_ind and above. */ + group &= ~((1LU << bit_ind) - 1); + } else { + /* + * Only keep ones in bits bit_ind and below. You might more + * naturally express this as (1 << (bit_ind + 1)) - 1, but + * that shifts by an invalid amount if bit_ind is one less than + * FB_GROUP_BITS. + */ + group &= ((2LU << bit_ind) - 1); + } + ssize_t group_ind_bound = forward ? (ssize_t)ngroups : -1; + while (group == 0) { + group_ind += forward ? 1 : -1; + if (group_ind == group_ind_bound) { + return forward ? (ssize_t)nbits : (ssize_t)-1; + } + group = fb[group_ind]; + group ^= maybe_invert; + } + assert(group != 0); + size_t bit = forward ? ffs_lu(group) : fls_lu(group); + size_t pos = group_ind * FB_GROUP_BITS + bit; + /* + * The high bits of a partially filled last group are zeros, so if we're + * looking for zeros we don't want to report an invalid result. + */ + if (forward && !val && pos > nbits) { + return nbits; + } + return pos; +} + +/* + * Find the first set bit in the bitmap with an index >= min_bit. Returns the + * number of bits in the bitmap if no such bit exists. + */ +static inline size_t +fb_ffu(fb_group_t *fb, size_t nbits, size_t min_bit) { + return (size_t)fb_find_impl(fb, nbits, min_bit, /* val */ false, + /* forward */ true); +} + +/* The same, but looks for an unset bit. */ +static inline size_t +fb_ffs(fb_group_t *fb, size_t nbits, size_t min_bit) { + return (size_t)fb_find_impl(fb, nbits, min_bit, /* val */ true, + /* forward */ true); +} + +/* + * Find the last set bit in the bitmap with an index <= max_bit. Returns -1 if + * no such bit exists. + */ +static inline ssize_t +fb_flu(fb_group_t *fb, size_t nbits, size_t max_bit) { + return fb_find_impl(fb, nbits, max_bit, /* val */ false, + /* forward */ false); +} + +static inline ssize_t +fb_fls(fb_group_t *fb, size_t nbits, size_t max_bit) { + return fb_find_impl(fb, nbits, max_bit, /* val */ true, + /* forward */ false); +} + +/* Returns whether or not we found a range. */ +JEMALLOC_ALWAYS_INLINE bool +fb_iter_range_impl(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin, + size_t *r_len, bool val, bool forward) { + assert(start < nbits); + ssize_t next_range_begin = fb_find_impl(fb, nbits, start, val, forward); + if ((forward && next_range_begin == (ssize_t)nbits) + || (!forward && next_range_begin == (ssize_t)-1)) { + return false; + } + /* Half open range; the set bits are [begin, end). */ + ssize_t next_range_end = fb_find_impl(fb, nbits, next_range_begin, !val, + forward); + if (forward) { + *r_begin = next_range_begin; + *r_len = next_range_end - next_range_begin; + } else { + *r_begin = next_range_end + 1; + *r_len = next_range_begin - next_range_end; + } + return true; +} + +/* + * Used to iterate through ranges of set bits. + * + * Tries to find the next contiguous sequence of set bits with a first index >= + * start. If one exists, puts the earliest bit of the range in *r_begin, its + * length in *r_len, and returns true. Otherwise, returns false (without + * touching *r_begin or *r_end). + */ +static inline bool +fb_srange_iter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin, + size_t *r_len) { + return fb_iter_range_impl(fb, nbits, start, r_begin, r_len, + /* val */ true, /* forward */ true); +} + +/* + * The same as fb_srange_iter, but searches backwards from start rather than + * forwards. (The position returned is still the earliest bit in the range). + */ +static inline bool +fb_srange_riter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin, + size_t *r_len) { + return fb_iter_range_impl(fb, nbits, start, r_begin, r_len, + /* val */ true, /* forward */ false); +} + +/* Similar to fb_srange_iter, but searches for unset bits. */ +static inline bool +fb_urange_iter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin, + size_t *r_len) { + return fb_iter_range_impl(fb, nbits, start, r_begin, r_len, + /* val */ false, /* forward */ true); +} + +/* Similar to fb_srange_riter, but searches for unset bits. */ +static inline bool +fb_urange_riter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin, + size_t *r_len) { + return fb_iter_range_impl(fb, nbits, start, r_begin, r_len, + /* val */ false, /* forward */ false); +} + +JEMALLOC_ALWAYS_INLINE size_t +fb_range_longest_impl(fb_group_t *fb, size_t nbits, bool val) { + size_t begin = 0; + size_t longest_len = 0; + size_t len = 0; + while (begin < nbits && fb_iter_range_impl(fb, nbits, begin, &begin, + &len, val, /* forward */ true)) { + if (len > longest_len) { + longest_len = len; + } + begin += len; + } + return longest_len; +} + +static inline size_t +fb_srange_longest(fb_group_t *fb, size_t nbits) { + return fb_range_longest_impl(fb, nbits, /* val */ true); +} + +static inline size_t +fb_urange_longest(fb_group_t *fb, size_t nbits) { + return fb_range_longest_impl(fb, nbits, /* val */ false); +} + +/* + * Initializes each bit of dst with the bitwise-AND of the corresponding bits of + * src1 and src2. All bitmaps must be the same size. + */ +static inline void +fb_bit_and(fb_group_t *dst, fb_group_t *src1, fb_group_t *src2, size_t nbits) { + size_t ngroups = FB_NGROUPS(nbits); + for (size_t i = 0; i < ngroups; i++) { + dst[i] = src1[i] & src2[i]; + } +} + +/* Like fb_bit_and, but with bitwise-OR. */ +static inline void +fb_bit_or(fb_group_t *dst, fb_group_t *src1, fb_group_t *src2, size_t nbits) { + size_t ngroups = FB_NGROUPS(nbits); + for (size_t i = 0; i < ngroups; i++) { + dst[i] = src1[i] | src2[i]; + } +} + +/* Initializes dst bit i to the negation of source bit i. */ +static inline void +fb_bit_not(fb_group_t *dst, fb_group_t *src, size_t nbits) { + size_t ngroups = FB_NGROUPS(nbits); + for (size_t i = 0; i < ngroups; i++) { + dst[i] = ~src[i]; + } +} + +#endif /* JEMALLOC_INTERNAL_FB_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/fxp.h b/BeefRT/JEMalloc/include/jemalloc/internal/fxp.h new file mode 100644 index 00000000..415a9828 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/fxp.h @@ -0,0 +1,126 @@ +#ifndef JEMALLOC_INTERNAL_FXP_H +#define JEMALLOC_INTERNAL_FXP_H + +/* + * A simple fixed-point math implementation, supporting only unsigned values + * (with overflow being an error). + * + * It's not in general safe to use floating point in core code, because various + * libc implementations we get linked against can assume that malloc won't touch + * floating point state and call it with an unusual calling convention. + */ + +/* + * High 16 bits are the integer part, low 16 are the fractional part. Or + * equivalently, repr == 2**16 * val, where we use "val" to refer to the + * (imaginary) fractional representation of the true value. + * + * We pick a uint32_t here since it's convenient in some places to + * double the representation size (i.e. multiplication and division use + * 64-bit integer types), and a uint64_t is the largest type we're + * certain is available. + */ +typedef uint32_t fxp_t; +#define FXP_INIT_INT(x) ((x) << 16) +#define FXP_INIT_PERCENT(pct) (((pct) << 16) / 100) + +/* + * Amount of precision used in parsing and printing numbers. The integer bound + * is simply because the integer part of the number gets 16 bits, and so is + * bounded by 65536. + * + * We use a lot of precision for the fractional part, even though most of it + * gets rounded off; this lets us get exact values for the important special + * case where the denominator is a small power of 2 (for instance, + * 1/512 == 0.001953125 is exactly representable even with only 16 bits of + * fractional precision). We need to left-shift by 16 before dividing by + * 10**precision, so we pick precision to be floor(log(2**48)) = 14. + */ +#define FXP_INTEGER_PART_DIGITS 5 +#define FXP_FRACTIONAL_PART_DIGITS 14 + +/* + * In addition to the integer and fractional parts of the number, we need to + * include a null character and (possibly) a decimal point. + */ +#define FXP_BUF_SIZE (FXP_INTEGER_PART_DIGITS + FXP_FRACTIONAL_PART_DIGITS + 2) + +static inline fxp_t +fxp_add(fxp_t a, fxp_t b) { + return a + b; +} + +static inline fxp_t +fxp_sub(fxp_t a, fxp_t b) { + assert(a >= b); + return a - b; +} + +static inline fxp_t +fxp_mul(fxp_t a, fxp_t b) { + uint64_t unshifted = (uint64_t)a * (uint64_t)b; + /* + * Unshifted is (a.val * 2**16) * (b.val * 2**16) + * == (a.val * b.val) * 2**32, but we want + * (a.val * b.val) * 2 ** 16. + */ + return (uint32_t)(unshifted >> 16); +} + +static inline fxp_t +fxp_div(fxp_t a, fxp_t b) { + assert(b != 0); + uint64_t unshifted = ((uint64_t)a << 32) / (uint64_t)b; + /* + * Unshifted is (a.val * 2**16) * (2**32) / (b.val * 2**16) + * == (a.val / b.val) * (2 ** 32), which again corresponds to a right + * shift of 16. + */ + return (uint32_t)(unshifted >> 16); +} + +static inline uint32_t +fxp_round_down(fxp_t a) { + return a >> 16; +} + +static inline uint32_t +fxp_round_nearest(fxp_t a) { + uint32_t fractional_part = (a & ((1U << 16) - 1)); + uint32_t increment = (uint32_t)(fractional_part >= (1U << 15)); + return (a >> 16) + increment; +} + +/* + * Approximately computes x * frac, without the size limitations that would be + * imposed by converting u to an fxp_t. + */ +static inline size_t +fxp_mul_frac(size_t x_orig, fxp_t frac) { + assert(frac <= (1U << 16)); + /* + * Work around an over-enthusiastic warning about type limits below (on + * 32-bit platforms, a size_t is always less than 1ULL << 48). + */ + uint64_t x = (uint64_t)x_orig; + /* + * If we can guarantee no overflow, multiply first before shifting, to + * preserve some precision. Otherwise, shift first and then multiply. + * In the latter case, we only lose the low 16 bits of a 48-bit number, + * so we're still accurate to within 1/2**32. + */ + if (x < (1ULL << 48)) { + return (size_t)((x * frac) >> 16); + } else { + return (size_t)((x >> 16) * (uint64_t)frac); + } +} + +/* + * Returns true on error. Otherwise, returns false and updates *ptr to point to + * the first character not parsed (because it wasn't a digit). + */ +bool fxp_parse(fxp_t *a, const char *ptr, char **end); +void fxp_print(fxp_t a, char buf[FXP_BUF_SIZE]); + +#endif /* JEMALLOC_INTERNAL_FXP_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/hash.h b/BeefRT/JEMalloc/include/jemalloc/internal/hash.h new file mode 100644 index 00000000..7f945679 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/hash.h @@ -0,0 +1,320 @@ +#ifndef JEMALLOC_INTERNAL_HASH_H +#define JEMALLOC_INTERNAL_HASH_H + +#include "jemalloc/internal/assert.h" + +/* + * The following hash function is based on MurmurHash3, placed into the public + * domain by Austin Appleby. See https://github.com/aappleby/smhasher for + * details. + */ + +/******************************************************************************/ +/* Internal implementation. */ +static inline uint32_t +hash_rotl_32(uint32_t x, int8_t r) { + return ((x << r) | (x >> (32 - r))); +} + +static inline uint64_t +hash_rotl_64(uint64_t x, int8_t r) { + return ((x << r) | (x >> (64 - r))); +} + +static inline uint32_t +hash_get_block_32(const uint32_t *p, int i) { + /* Handle unaligned read. */ + if (unlikely((uintptr_t)p & (sizeof(uint32_t)-1)) != 0) { + uint32_t ret; + + memcpy(&ret, (uint8_t *)(p + i), sizeof(uint32_t)); + return ret; + } + + return p[i]; +} + +static inline uint64_t +hash_get_block_64(const uint64_t *p, int i) { + /* Handle unaligned read. */ + if (unlikely((uintptr_t)p & (sizeof(uint64_t)-1)) != 0) { + uint64_t ret; + + memcpy(&ret, (uint8_t *)(p + i), sizeof(uint64_t)); + return ret; + } + + return p[i]; +} + +static inline uint32_t +hash_fmix_32(uint32_t h) { + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +static inline uint64_t +hash_fmix_64(uint64_t k) { + k ^= k >> 33; + k *= KQU(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= KQU(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +static inline uint32_t +hash_x86_32(const void *key, int len, uint32_t seed) { + const uint8_t *data = (const uint8_t *) key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + + /* body */ + { + const uint32_t *blocks = (const uint32_t *) (data + nblocks*4); + int i; + + for (i = -nblocks; i; i++) { + uint32_t k1 = hash_get_block_32(blocks, i); + + k1 *= c1; + k1 = hash_rotl_32(k1, 15); + k1 *= c2; + + h1 ^= k1; + h1 = hash_rotl_32(h1, 13); + h1 = h1*5 + 0xe6546b64; + } + } + + /* tail */ + { + const uint8_t *tail = (const uint8_t *) (data + nblocks*4); + + uint32_t k1 = 0; + + switch (len & 3) { + case 3: k1 ^= tail[2] << 16; JEMALLOC_FALLTHROUGH; + case 2: k1 ^= tail[1] << 8; JEMALLOC_FALLTHROUGH; + case 1: k1 ^= tail[0]; k1 *= c1; k1 = hash_rotl_32(k1, 15); + k1 *= c2; h1 ^= k1; + } + } + + /* finalization */ + h1 ^= len; + + h1 = hash_fmix_32(h1); + + return h1; +} + +static inline void +hash_x86_128(const void *key, const int len, uint32_t seed, + uint64_t r_out[2]) { + const uint8_t * data = (const uint8_t *) key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + const uint32_t c1 = 0x239b961b; + const uint32_t c2 = 0xab0e9789; + const uint32_t c3 = 0x38b34ae5; + const uint32_t c4 = 0xa1e38b93; + + /* body */ + { + const uint32_t *blocks = (const uint32_t *) (data + nblocks*16); + int i; + + for (i = -nblocks; i; i++) { + uint32_t k1 = hash_get_block_32(blocks, i*4 + 0); + uint32_t k2 = hash_get_block_32(blocks, i*4 + 1); + uint32_t k3 = hash_get_block_32(blocks, i*4 + 2); + uint32_t k4 = hash_get_block_32(blocks, i*4 + 3); + + k1 *= c1; k1 = hash_rotl_32(k1, 15); k1 *= c2; h1 ^= k1; + + h1 = hash_rotl_32(h1, 19); h1 += h2; + h1 = h1*5 + 0x561ccd1b; + + k2 *= c2; k2 = hash_rotl_32(k2, 16); k2 *= c3; h2 ^= k2; + + h2 = hash_rotl_32(h2, 17); h2 += h3; + h2 = h2*5 + 0x0bcaa747; + + k3 *= c3; k3 = hash_rotl_32(k3, 17); k3 *= c4; h3 ^= k3; + + h3 = hash_rotl_32(h3, 15); h3 += h4; + h3 = h3*5 + 0x96cd1c35; + + k4 *= c4; k4 = hash_rotl_32(k4, 18); k4 *= c1; h4 ^= k4; + + h4 = hash_rotl_32(h4, 13); h4 += h1; + h4 = h4*5 + 0x32ac3b17; + } + } + + /* tail */ + { + const uint8_t *tail = (const uint8_t *) (data + nblocks*16); + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch (len & 15) { + case 15: k4 ^= tail[14] << 16; JEMALLOC_FALLTHROUGH; + case 14: k4 ^= tail[13] << 8; JEMALLOC_FALLTHROUGH; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = hash_rotl_32(k4, 18); k4 *= c1; h4 ^= k4; + JEMALLOC_FALLTHROUGH; + case 12: k3 ^= (uint32_t) tail[11] << 24; JEMALLOC_FALLTHROUGH; + case 11: k3 ^= tail[10] << 16; JEMALLOC_FALLTHROUGH; + case 10: k3 ^= tail[ 9] << 8; JEMALLOC_FALLTHROUGH; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = hash_rotl_32(k3, 17); k3 *= c4; h3 ^= k3; + JEMALLOC_FALLTHROUGH; + case 8: k2 ^= (uint32_t) tail[ 7] << 24; JEMALLOC_FALLTHROUGH; + case 7: k2 ^= tail[ 6] << 16; JEMALLOC_FALLTHROUGH; + case 6: k2 ^= tail[ 5] << 8; JEMALLOC_FALLTHROUGH; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = hash_rotl_32(k2, 16); k2 *= c3; h2 ^= k2; + JEMALLOC_FALLTHROUGH; + case 4: k1 ^= (uint32_t) tail[ 3] << 24; JEMALLOC_FALLTHROUGH; + case 3: k1 ^= tail[ 2] << 16; JEMALLOC_FALLTHROUGH; + case 2: k1 ^= tail[ 1] << 8; JEMALLOC_FALLTHROUGH; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = hash_rotl_32(k1, 15); k1 *= c2; h1 ^= k1; + break; + } + } + + /* finalization */ + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = hash_fmix_32(h1); + h2 = hash_fmix_32(h2); + h3 = hash_fmix_32(h3); + h4 = hash_fmix_32(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + r_out[0] = (((uint64_t) h2) << 32) | h1; + r_out[1] = (((uint64_t) h4) << 32) | h3; +} + +static inline void +hash_x64_128(const void *key, const int len, const uint32_t seed, + uint64_t r_out[2]) { + const uint8_t *data = (const uint8_t *) key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + const uint64_t c1 = KQU(0x87c37b91114253d5); + const uint64_t c2 = KQU(0x4cf5ad432745937f); + + /* body */ + { + const uint64_t *blocks = (const uint64_t *) (data); + int i; + + for (i = 0; i < nblocks; i++) { + uint64_t k1 = hash_get_block_64(blocks, i*2 + 0); + uint64_t k2 = hash_get_block_64(blocks, i*2 + 1); + + k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1; + + h1 = hash_rotl_64(h1, 27); h1 += h2; + h1 = h1*5 + 0x52dce729; + + k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2; + + h2 = hash_rotl_64(h2, 31); h2 += h1; + h2 = h2*5 + 0x38495ab5; + } + } + + /* tail */ + { + const uint8_t *tail = (const uint8_t*)(data + nblocks*16); + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch (len & 15) { + case 15: k2 ^= ((uint64_t)(tail[14])) << 48; JEMALLOC_FALLTHROUGH; + case 14: k2 ^= ((uint64_t)(tail[13])) << 40; JEMALLOC_FALLTHROUGH; + case 13: k2 ^= ((uint64_t)(tail[12])) << 32; JEMALLOC_FALLTHROUGH; + case 12: k2 ^= ((uint64_t)(tail[11])) << 24; JEMALLOC_FALLTHROUGH; + case 11: k2 ^= ((uint64_t)(tail[10])) << 16; JEMALLOC_FALLTHROUGH; + case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8; JEMALLOC_FALLTHROUGH; + case 9: k2 ^= ((uint64_t)(tail[ 8])) << 0; + k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2; + JEMALLOC_FALLTHROUGH; + case 8: k1 ^= ((uint64_t)(tail[ 7])) << 56; JEMALLOC_FALLTHROUGH; + case 7: k1 ^= ((uint64_t)(tail[ 6])) << 48; JEMALLOC_FALLTHROUGH; + case 6: k1 ^= ((uint64_t)(tail[ 5])) << 40; JEMALLOC_FALLTHROUGH; + case 5: k1 ^= ((uint64_t)(tail[ 4])) << 32; JEMALLOC_FALLTHROUGH; + case 4: k1 ^= ((uint64_t)(tail[ 3])) << 24; JEMALLOC_FALLTHROUGH; + case 3: k1 ^= ((uint64_t)(tail[ 2])) << 16; JEMALLOC_FALLTHROUGH; + case 2: k1 ^= ((uint64_t)(tail[ 1])) << 8; JEMALLOC_FALLTHROUGH; + case 1: k1 ^= ((uint64_t)(tail[ 0])) << 0; + k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1; + break; + } + } + + /* finalization */ + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = hash_fmix_64(h1); + h2 = hash_fmix_64(h2); + + h1 += h2; + h2 += h1; + + r_out[0] = h1; + r_out[1] = h2; +} + +/******************************************************************************/ +/* API. */ +static inline void +hash(const void *key, size_t len, const uint32_t seed, size_t r_hash[2]) { + assert(len <= INT_MAX); /* Unfortunate implementation limitation. */ + +#if (LG_SIZEOF_PTR == 3 && !defined(JEMALLOC_BIG_ENDIAN)) + hash_x64_128(key, (int)len, seed, (uint64_t *)r_hash); +#else + { + uint64_t hashes[2]; + hash_x86_128(key, (int)len, seed, hashes); + r_hash[0] = (size_t)hashes[0]; + r_hash[1] = (size_t)hashes[1]; + } +#endif +} + +#endif /* JEMALLOC_INTERNAL_HASH_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/hook.h b/BeefRT/JEMalloc/include/jemalloc/internal/hook.h new file mode 100644 index 00000000..ee246b1e --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/hook.h @@ -0,0 +1,163 @@ +#ifndef JEMALLOC_INTERNAL_HOOK_H +#define JEMALLOC_INTERNAL_HOOK_H + +#include "jemalloc/internal/tsd.h" + +/* + * This API is *extremely* experimental, and may get ripped out, changed in API- + * and ABI-incompatible ways, be insufficiently or incorrectly documented, etc. + * + * It allows hooking the stateful parts of the API to see changes as they + * happen. + * + * Allocation hooks are called after the allocation is done, free hooks are + * called before the free is done, and expand hooks are called after the + * allocation is expanded. + * + * For realloc and rallocx, if the expansion happens in place, the expansion + * hook is called. If it is moved, then the alloc hook is called on the new + * location, and then the free hook is called on the old location (i.e. both + * hooks are invoked in between the alloc and the dalloc). + * + * If we return NULL from OOM, then usize might not be trustworthy. Calling + * realloc(NULL, size) only calls the alloc hook, and calling realloc(ptr, 0) + * only calls the free hook. (Calling realloc(NULL, 0) is treated as malloc(0), + * and only calls the alloc hook). + * + * Reentrancy: + * Reentrancy is guarded against from within the hook implementation. If you + * call allocator functions from within a hook, the hooks will not be invoked + * again. + * Threading: + * The installation of a hook synchronizes with all its uses. If you can + * prove the installation of a hook happens-before a jemalloc entry point, + * then the hook will get invoked (unless there's a racing removal). + * + * Hook insertion appears to be atomic at a per-thread level (i.e. if a thread + * allocates and has the alloc hook invoked, then a subsequent free on the + * same thread will also have the free hook invoked). + * + * The *removal* of a hook does *not* block until all threads are done with + * the hook. Hook authors have to be resilient to this, and need some + * out-of-band mechanism for cleaning up any dynamically allocated memory + * associated with their hook. + * Ordering: + * Order of hook execution is unspecified, and may be different than insertion + * order. + */ + +#define HOOK_MAX 4 + +enum hook_alloc_e { + hook_alloc_malloc, + hook_alloc_posix_memalign, + hook_alloc_aligned_alloc, + hook_alloc_calloc, + hook_alloc_memalign, + hook_alloc_valloc, + hook_alloc_mallocx, + + /* The reallocating functions have both alloc and dalloc variants */ + hook_alloc_realloc, + hook_alloc_rallocx, +}; +/* + * We put the enum typedef after the enum, since this file may get included by + * jemalloc_cpp.cpp, and C++ disallows enum forward declarations. + */ +typedef enum hook_alloc_e hook_alloc_t; + +enum hook_dalloc_e { + hook_dalloc_free, + hook_dalloc_dallocx, + hook_dalloc_sdallocx, + + /* + * The dalloc halves of reallocation (not called if in-place expansion + * happens). + */ + hook_dalloc_realloc, + hook_dalloc_rallocx, +}; +typedef enum hook_dalloc_e hook_dalloc_t; + + +enum hook_expand_e { + hook_expand_realloc, + hook_expand_rallocx, + hook_expand_xallocx, +}; +typedef enum hook_expand_e hook_expand_t; + +typedef void (*hook_alloc)( + void *extra, hook_alloc_t type, void *result, uintptr_t result_raw, + uintptr_t args_raw[3]); + +typedef void (*hook_dalloc)( + void *extra, hook_dalloc_t type, void *address, uintptr_t args_raw[3]); + +typedef void (*hook_expand)( + void *extra, hook_expand_t type, void *address, size_t old_usize, + size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]); + +typedef struct hooks_s hooks_t; +struct hooks_s { + hook_alloc alloc_hook; + hook_dalloc dalloc_hook; + hook_expand expand_hook; + void *extra; +}; + +/* + * Begin implementation details; everything above this point might one day live + * in a public API. Everything below this point never will. + */ + +/* + * The realloc pathways haven't gotten any refactoring love in a while, and it's + * fairly difficult to pass information from the entry point to the hooks. We + * put the informaiton the hooks will need into a struct to encapsulate + * everything. + * + * Much of these pathways are force-inlined, so that the compiler can avoid + * materializing this struct until we hit an extern arena function. For fairly + * goofy reasons, *many* of the realloc paths hit an extern arena function. + * These paths are cold enough that it doesn't matter; eventually, we should + * rewrite the realloc code to make the expand-in-place and the + * free-then-realloc paths more orthogonal, at which point we don't need to + * spread the hook logic all over the place. + */ +typedef struct hook_ralloc_args_s hook_ralloc_args_t; +struct hook_ralloc_args_s { + /* I.e. as opposed to rallocx. */ + bool is_realloc; + /* + * The expand hook takes 4 arguments, even if only 3 are actually used; + * we add an extra one in case the user decides to memcpy without + * looking too closely at the hooked function. + */ + uintptr_t args[4]; +}; + +/* + * Returns an opaque handle to be used when removing the hook. NULL means that + * we couldn't install the hook. + */ +bool hook_boot(); + +void *hook_install(tsdn_t *tsdn, hooks_t *hooks); +/* Uninstalls the hook with the handle previously returned from hook_install. */ +void hook_remove(tsdn_t *tsdn, void *opaque); + +/* Hooks */ + +void hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw, + uintptr_t args_raw[3]); + +void hook_invoke_dalloc(hook_dalloc_t type, void *address, + uintptr_t args_raw[3]); + +void hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize, + size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]); + +#endif /* JEMALLOC_INTERNAL_HOOK_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/hpa.h b/BeefRT/JEMalloc/include/jemalloc/internal/hpa.h new file mode 100644 index 00000000..f3562853 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/hpa.h @@ -0,0 +1,182 @@ +#ifndef JEMALLOC_INTERNAL_HPA_H +#define JEMALLOC_INTERNAL_HPA_H + +#include "jemalloc/internal/exp_grow.h" +#include "jemalloc/internal/hpa_hooks.h" +#include "jemalloc/internal/hpa_opts.h" +#include "jemalloc/internal/pai.h" +#include "jemalloc/internal/psset.h" + +typedef struct hpa_central_s hpa_central_t; +struct hpa_central_s { + /* + * The mutex guarding most of the operations on the central data + * structure. + */ + malloc_mutex_t mtx; + /* + * Guards expansion of eden. We separate this from the regular mutex so + * that cheaper operations can still continue while we're doing the OS + * call. + */ + malloc_mutex_t grow_mtx; + /* + * Either NULL (if empty), or some integer multiple of a + * hugepage-aligned number of hugepages. We carve them off one at a + * time to satisfy new pageslab requests. + * + * Guarded by grow_mtx. + */ + void *eden; + size_t eden_len; + /* Source for metadata. */ + base_t *base; + /* Number of grow operations done on this hpa_central_t. */ + uint64_t age_counter; + + /* The HPA hooks. */ + hpa_hooks_t hooks; +}; + +typedef struct hpa_shard_nonderived_stats_s hpa_shard_nonderived_stats_t; +struct hpa_shard_nonderived_stats_s { + /* + * The number of times we've purged within a hugepage. + * + * Guarded by mtx. + */ + uint64_t npurge_passes; + /* + * The number of individual purge calls we perform (which should always + * be bigger than npurge_passes, since each pass purges at least one + * extent within a hugepage. + * + * Guarded by mtx. + */ + uint64_t npurges; + + /* + * The number of times we've hugified a pageslab. + * + * Guarded by mtx. + */ + uint64_t nhugifies; + /* + * The number of times we've dehugified a pageslab. + * + * Guarded by mtx. + */ + uint64_t ndehugifies; +}; + +/* Completely derived; only used by CTL. */ +typedef struct hpa_shard_stats_s hpa_shard_stats_t; +struct hpa_shard_stats_s { + psset_stats_t psset_stats; + hpa_shard_nonderived_stats_t nonderived_stats; +}; + +typedef struct hpa_shard_s hpa_shard_t; +struct hpa_shard_s { + /* + * pai must be the first member; we cast from a pointer to it to a + * pointer to the hpa_shard_t. + */ + pai_t pai; + + /* The central allocator we get our hugepages from. */ + hpa_central_t *central; + /* Protects most of this shard's state. */ + malloc_mutex_t mtx; + /* + * Guards the shard's access to the central allocator (preventing + * multiple threads operating on this shard from accessing the central + * allocator). + */ + malloc_mutex_t grow_mtx; + /* The base metadata allocator. */ + base_t *base; + + /* + * This edata cache is the one we use when allocating a small extent + * from a pageslab. The pageslab itself comes from the centralized + * allocator, and so will use its edata_cache. + */ + edata_cache_fast_t ecf; + + psset_t psset; + + /* + * How many grow operations have occurred. + * + * Guarded by grow_mtx. + */ + uint64_t age_counter; + + /* The arena ind we're associated with. */ + unsigned ind; + + /* + * Our emap. This is just a cache of the emap pointer in the associated + * hpa_central. + */ + emap_t *emap; + + /* The configuration choices for this hpa shard. */ + hpa_shard_opts_t opts; + + /* + * How many pages have we started but not yet finished purging in this + * hpa shard. + */ + size_t npending_purge; + + /* + * Those stats which are copied directly into the CTL-centric hpa shard + * stats. + */ + hpa_shard_nonderived_stats_t stats; + + /* + * Last time we performed purge on this shard. + */ + nstime_t last_purge; +}; + +/* + * Whether or not the HPA can be used given the current configuration. This is + * is not necessarily a guarantee that it backs its allocations by hugepages, + * just that it can function properly given the system it's running on. + */ +bool hpa_supported(); +bool hpa_central_init(hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks); +bool hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap, + base_t *base, edata_cache_t *edata_cache, unsigned ind, + const hpa_shard_opts_t *opts); + +void hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src); +void hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard, + hpa_shard_stats_t *dst); + +/* + * Notify the shard that we won't use it for allocations much longer. Due to + * the possibility of races, we don't actually prevent allocations; just flush + * and disable the embedded edata_cache_small. + */ +void hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard); +void hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard); + +void hpa_shard_set_deferral_allowed(tsdn_t *tsdn, hpa_shard_t *shard, + bool deferral_allowed); +void hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard); + +/* + * We share the fork ordering with the PA and arena prefork handling; that's why + * these are 3 and 4 rather than 0 and 1. + */ +void hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard); +void hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard); +void hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard); +void hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard); + +#endif /* JEMALLOC_INTERNAL_HPA_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/hpa_hooks.h b/BeefRT/JEMalloc/include/jemalloc/internal/hpa_hooks.h new file mode 100644 index 00000000..4ea221cb --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/hpa_hooks.h @@ -0,0 +1,17 @@ +#ifndef JEMALLOC_INTERNAL_HPA_HOOKS_H +#define JEMALLOC_INTERNAL_HPA_HOOKS_H + +typedef struct hpa_hooks_s hpa_hooks_t; +struct hpa_hooks_s { + void *(*map)(size_t size); + void (*unmap)(void *ptr, size_t size); + void (*purge)(void *ptr, size_t size); + void (*hugify)(void *ptr, size_t size); + void (*dehugify)(void *ptr, size_t size); + void (*curtime)(nstime_t *r_time, bool first_reading); + uint64_t (*ms_since)(nstime_t *r_time); +}; + +extern hpa_hooks_t hpa_hooks_default; + +#endif /* JEMALLOC_INTERNAL_HPA_HOOKS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/hpa_opts.h b/BeefRT/JEMalloc/include/jemalloc/internal/hpa_opts.h new file mode 100644 index 00000000..ee84fea1 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/hpa_opts.h @@ -0,0 +1,74 @@ +#ifndef JEMALLOC_INTERNAL_HPA_OPTS_H +#define JEMALLOC_INTERNAL_HPA_OPTS_H + +#include "jemalloc/internal/fxp.h" + +/* + * This file is morally part of hpa.h, but is split out for header-ordering + * reasons. + */ + +typedef struct hpa_shard_opts_s hpa_shard_opts_t; +struct hpa_shard_opts_s { + /* + * The largest size we'll allocate out of the shard. For those + * allocations refused, the caller (in practice, the PA module) will + * fall back to the more general (for now) PAC, which can always handle + * any allocation request. + */ + size_t slab_max_alloc; + + /* + * When the number of active bytes in a hugepage is >= + * hugification_threshold, we force hugify it. + */ + size_t hugification_threshold; + + /* + * The HPA purges whenever the number of pages exceeds dirty_mult * + * active_pages. This may be set to (fxp_t)-1 to disable purging. + */ + fxp_t dirty_mult; + + /* + * Whether or not the PAI methods are allowed to defer work to a + * subsequent hpa_shard_do_deferred_work() call. Practically, this + * corresponds to background threads being enabled. We track this + * ourselves for encapsulation purposes. + */ + bool deferral_allowed; + + /* + * How long a hugepage has to be a hugification candidate before it will + * actually get hugified. + */ + uint64_t hugify_delay_ms; + + /* + * Minimum amount of time between purges. + */ + uint64_t min_purge_interval_ms; +}; + +#define HPA_SHARD_OPTS_DEFAULT { \ + /* slab_max_alloc */ \ + 64 * 1024, \ + /* hugification_threshold */ \ + HUGEPAGE * 95 / 100, \ + /* dirty_mult */ \ + FXP_INIT_PERCENT(25), \ + /* \ + * deferral_allowed \ + * \ + * Really, this is always set by the arena during creation \ + * or by an hpa_shard_set_deferral_allowed call, so the value \ + * we put here doesn't matter. \ + */ \ + false, \ + /* hugify_delay_ms */ \ + 10 * 1000, \ + /* min_purge_interval_ms */ \ + 5 * 1000 \ +} + +#endif /* JEMALLOC_INTERNAL_HPA_OPTS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/hpdata.h b/BeefRT/JEMalloc/include/jemalloc/internal/hpdata.h new file mode 100644 index 00000000..1fb534db --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/hpdata.h @@ -0,0 +1,413 @@ +#ifndef JEMALLOC_INTERNAL_HPDATA_H +#define JEMALLOC_INTERNAL_HPDATA_H + +#include "jemalloc/internal/fb.h" +#include "jemalloc/internal/ph.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/typed_list.h" + +/* + * The metadata representation we use for extents in hugepages. While the PAC + * uses the edata_t to represent both active and inactive extents, the HP only + * uses the edata_t for active ones; instead, inactive extent state is tracked + * within hpdata associated with the enclosing hugepage-sized, hugepage-aligned + * region of virtual address space. + * + * An hpdata need not be "truly" backed by a hugepage (which is not necessarily + * an observable property of any given region of address space). It's just + * hugepage-sized and hugepage-aligned; it's *potentially* huge. + */ +typedef struct hpdata_s hpdata_t; +ph_structs(hpdata_age_heap, hpdata_t); +struct hpdata_s { + /* + * We likewise follow the edata convention of mangling names and forcing + * the use of accessors -- this lets us add some consistency checks on + * access. + */ + + /* + * The address of the hugepage in question. This can't be named h_addr, + * since that conflicts with a macro defined in Windows headers. + */ + void *h_address; + /* Its age (measured in psset operations). */ + uint64_t h_age; + /* Whether or not we think the hugepage is mapped that way by the OS. */ + bool h_huge; + + /* + * For some properties, we keep parallel sets of bools; h_foo_allowed + * and h_in_psset_foo_container. This is a decoupling mechanism to + * avoid bothering the hpa (which manages policies) from the psset + * (which is the mechanism used to enforce those policies). This allows + * all the container management logic to live in one place, without the + * HPA needing to know or care how that happens. + */ + + /* + * Whether or not the hpdata is allowed to be used to serve allocations, + * and whether or not the psset is currently tracking it as such. + */ + bool h_alloc_allowed; + bool h_in_psset_alloc_container; + + /* + * The same, but with purging. There's no corresponding + * h_in_psset_purge_container, because the psset (currently) always + * removes hpdatas from their containers during updates (to implement + * LRU for purging). + */ + bool h_purge_allowed; + + /* And with hugifying. */ + bool h_hugify_allowed; + /* When we became a hugification candidate. */ + nstime_t h_time_hugify_allowed; + bool h_in_psset_hugify_container; + + /* Whether or not a purge or hugify is currently happening. */ + bool h_mid_purge; + bool h_mid_hugify; + + /* + * Whether or not the hpdata is being updated in the psset (i.e. if + * there has been a psset_update_begin call issued without a matching + * psset_update_end call). Eventually this will expand to other types + * of updates. + */ + bool h_updating; + + /* Whether or not the hpdata is in a psset. */ + bool h_in_psset; + + union { + /* When nonempty (and also nonfull), used by the psset bins. */ + hpdata_age_heap_link_t age_link; + /* + * When empty (or not corresponding to any hugepage), list + * linkage. + */ + ql_elm(hpdata_t) ql_link_empty; + }; + + /* + * Linkage for the psset to track candidates for purging and hugifying. + */ + ql_elm(hpdata_t) ql_link_purge; + ql_elm(hpdata_t) ql_link_hugify; + + /* The length of the largest contiguous sequence of inactive pages. */ + size_t h_longest_free_range; + + /* Number of active pages. */ + size_t h_nactive; + + /* A bitmap with bits set in the active pages. */ + fb_group_t active_pages[FB_NGROUPS(HUGEPAGE_PAGES)]; + + /* + * Number of dirty or active pages, and a bitmap tracking them. One + * way to think of this is as which pages are dirty from the OS's + * perspective. + */ + size_t h_ntouched; + + /* The touched pages (using the same definition as above). */ + fb_group_t touched_pages[FB_NGROUPS(HUGEPAGE_PAGES)]; +}; + +TYPED_LIST(hpdata_empty_list, hpdata_t, ql_link_empty) +TYPED_LIST(hpdata_purge_list, hpdata_t, ql_link_purge) +TYPED_LIST(hpdata_hugify_list, hpdata_t, ql_link_hugify) + +ph_proto(, hpdata_age_heap, hpdata_t); + +static inline void * +hpdata_addr_get(const hpdata_t *hpdata) { + return hpdata->h_address; +} + +static inline void +hpdata_addr_set(hpdata_t *hpdata, void *addr) { + assert(HUGEPAGE_ADDR2BASE(addr) == addr); + hpdata->h_address = addr; +} + +static inline uint64_t +hpdata_age_get(const hpdata_t *hpdata) { + return hpdata->h_age; +} + +static inline void +hpdata_age_set(hpdata_t *hpdata, uint64_t age) { + hpdata->h_age = age; +} + +static inline bool +hpdata_huge_get(const hpdata_t *hpdata) { + return hpdata->h_huge; +} + +static inline bool +hpdata_alloc_allowed_get(const hpdata_t *hpdata) { + return hpdata->h_alloc_allowed; +} + +static inline void +hpdata_alloc_allowed_set(hpdata_t *hpdata, bool alloc_allowed) { + hpdata->h_alloc_allowed = alloc_allowed; +} + +static inline bool +hpdata_in_psset_alloc_container_get(const hpdata_t *hpdata) { + return hpdata->h_in_psset_alloc_container; +} + +static inline void +hpdata_in_psset_alloc_container_set(hpdata_t *hpdata, bool in_container) { + assert(in_container != hpdata->h_in_psset_alloc_container); + hpdata->h_in_psset_alloc_container = in_container; +} + +static inline bool +hpdata_purge_allowed_get(const hpdata_t *hpdata) { + return hpdata->h_purge_allowed; +} + +static inline void +hpdata_purge_allowed_set(hpdata_t *hpdata, bool purge_allowed) { + assert(purge_allowed == false || !hpdata->h_mid_purge); + hpdata->h_purge_allowed = purge_allowed; +} + +static inline bool +hpdata_hugify_allowed_get(const hpdata_t *hpdata) { + return hpdata->h_hugify_allowed; +} + +static inline void +hpdata_allow_hugify(hpdata_t *hpdata, nstime_t now) { + assert(!hpdata->h_mid_hugify); + hpdata->h_hugify_allowed = true; + hpdata->h_time_hugify_allowed = now; +} + +static inline nstime_t +hpdata_time_hugify_allowed(hpdata_t *hpdata) { + return hpdata->h_time_hugify_allowed; +} + +static inline void +hpdata_disallow_hugify(hpdata_t *hpdata) { + hpdata->h_hugify_allowed = false; +} + +static inline bool +hpdata_in_psset_hugify_container_get(const hpdata_t *hpdata) { + return hpdata->h_in_psset_hugify_container; +} + +static inline void +hpdata_in_psset_hugify_container_set(hpdata_t *hpdata, bool in_container) { + assert(in_container != hpdata->h_in_psset_hugify_container); + hpdata->h_in_psset_hugify_container = in_container; +} + +static inline bool +hpdata_mid_purge_get(const hpdata_t *hpdata) { + return hpdata->h_mid_purge; +} + +static inline void +hpdata_mid_purge_set(hpdata_t *hpdata, bool mid_purge) { + assert(mid_purge != hpdata->h_mid_purge); + hpdata->h_mid_purge = mid_purge; +} + +static inline bool +hpdata_mid_hugify_get(const hpdata_t *hpdata) { + return hpdata->h_mid_hugify; +} + +static inline void +hpdata_mid_hugify_set(hpdata_t *hpdata, bool mid_hugify) { + assert(mid_hugify != hpdata->h_mid_hugify); + hpdata->h_mid_hugify = mid_hugify; +} + +static inline bool +hpdata_changing_state_get(const hpdata_t *hpdata) { + return hpdata->h_mid_purge || hpdata->h_mid_hugify; +} + + +static inline bool +hpdata_updating_get(const hpdata_t *hpdata) { + return hpdata->h_updating; +} + +static inline void +hpdata_updating_set(hpdata_t *hpdata, bool updating) { + assert(updating != hpdata->h_updating); + hpdata->h_updating = updating; +} + +static inline bool +hpdata_in_psset_get(const hpdata_t *hpdata) { + return hpdata->h_in_psset; +} + +static inline void +hpdata_in_psset_set(hpdata_t *hpdata, bool in_psset) { + assert(in_psset != hpdata->h_in_psset); + hpdata->h_in_psset = in_psset; +} + +static inline size_t +hpdata_longest_free_range_get(const hpdata_t *hpdata) { + return hpdata->h_longest_free_range; +} + +static inline void +hpdata_longest_free_range_set(hpdata_t *hpdata, size_t longest_free_range) { + assert(longest_free_range <= HUGEPAGE_PAGES); + hpdata->h_longest_free_range = longest_free_range; +} + +static inline size_t +hpdata_nactive_get(hpdata_t *hpdata) { + return hpdata->h_nactive; +} + +static inline size_t +hpdata_ntouched_get(hpdata_t *hpdata) { + return hpdata->h_ntouched; +} + +static inline size_t +hpdata_ndirty_get(hpdata_t *hpdata) { + return hpdata->h_ntouched - hpdata->h_nactive; +} + +static inline size_t +hpdata_nretained_get(hpdata_t *hpdata) { + return HUGEPAGE_PAGES - hpdata->h_ntouched; +} + +static inline void +hpdata_assert_empty(hpdata_t *hpdata) { + assert(fb_empty(hpdata->active_pages, HUGEPAGE_PAGES)); + assert(hpdata->h_nactive == 0); +} + +/* + * Only used in tests, and in hpdata_assert_consistent, below. Verifies some + * consistency properties of the hpdata (e.g. that cached counts of page stats + * match computed ones). + */ +static inline bool +hpdata_consistent(hpdata_t *hpdata) { + if(fb_urange_longest(hpdata->active_pages, HUGEPAGE_PAGES) + != hpdata_longest_free_range_get(hpdata)) { + return false; + } + if (fb_scount(hpdata->active_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES) + != hpdata->h_nactive) { + return false; + } + if (fb_scount(hpdata->touched_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES) + != hpdata->h_ntouched) { + return false; + } + if (hpdata->h_ntouched < hpdata->h_nactive) { + return false; + } + if (hpdata->h_huge && hpdata->h_ntouched != HUGEPAGE_PAGES) { + return false; + } + if (hpdata_changing_state_get(hpdata) + && ((hpdata->h_purge_allowed) || hpdata->h_hugify_allowed)) { + return false; + } + if (hpdata_hugify_allowed_get(hpdata) + != hpdata_in_psset_hugify_container_get(hpdata)) { + return false; + } + return true; +} + +static inline void +hpdata_assert_consistent(hpdata_t *hpdata) { + assert(hpdata_consistent(hpdata)); +} + +static inline bool +hpdata_empty(hpdata_t *hpdata) { + return hpdata->h_nactive == 0; +} + +static inline bool +hpdata_full(hpdata_t *hpdata) { + return hpdata->h_nactive == HUGEPAGE_PAGES; +} + +void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age); + +/* + * Given an hpdata which can serve an allocation request, pick and reserve an + * offset within that allocation. + */ +void *hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz); +void hpdata_unreserve(hpdata_t *hpdata, void *begin, size_t sz); + +/* + * The hpdata_purge_prepare_t allows grabbing the metadata required to purge + * subranges of a hugepage while holding a lock, drop the lock during the actual + * purging of them, and reacquire it to update the metadata again. + */ +typedef struct hpdata_purge_state_s hpdata_purge_state_t; +struct hpdata_purge_state_s { + size_t npurged; + size_t ndirty_to_purge; + fb_group_t to_purge[FB_NGROUPS(HUGEPAGE_PAGES)]; + size_t next_purge_search_begin; +}; + +/* + * Initializes purge state. The access to hpdata must be externally + * synchronized with other hpdata_* calls. + * + * You can tell whether or not a thread is purging or hugifying a given hpdata + * via hpdata_changing_state_get(hpdata). Racing hugification or purging + * operations aren't allowed. + * + * Once you begin purging, you have to follow through and call hpdata_purge_next + * until you're done, and then end. Allocating out of an hpdata undergoing + * purging is not allowed. + * + * Returns the number of dirty pages that will be purged. + */ +size_t hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state); + +/* + * If there are more extents to purge, sets *r_purge_addr and *r_purge_size to + * true, and returns true. Otherwise, returns false to indicate that we're + * done. + * + * This requires exclusive access to the purge state, but *not* to the hpdata. + * In particular, unreserve calls are allowed while purging (i.e. you can dalloc + * into one part of the hpdata while purging a different part). + */ +bool hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state, + void **r_purge_addr, size_t *r_purge_size); +/* + * Updates the hpdata metadata after all purging is done. Needs external + * synchronization. + */ +void hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state); + +void hpdata_hugify(hpdata_t *hpdata); +void hpdata_dehugify(hpdata_t *hpdata); + +#endif /* JEMALLOC_INTERNAL_HPDATA_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/inspect.h b/BeefRT/JEMalloc/include/jemalloc/internal/inspect.h new file mode 100644 index 00000000..65fef51d --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/inspect.h @@ -0,0 +1,40 @@ +#ifndef JEMALLOC_INTERNAL_INSPECT_H +#define JEMALLOC_INTERNAL_INSPECT_H + +/* + * This module contains the heap introspection capabilities. For now they are + * exposed purely through mallctl APIs in the experimental namespace, but this + * may change over time. + */ + +/* + * The following two structs are for experimental purposes. See + * experimental_utilization_query_ctl and + * experimental_utilization_batch_query_ctl in src/ctl.c. + */ +typedef struct inspect_extent_util_stats_s inspect_extent_util_stats_t; +struct inspect_extent_util_stats_s { + size_t nfree; + size_t nregs; + size_t size; +}; + +typedef struct inspect_extent_util_stats_verbose_s + inspect_extent_util_stats_verbose_t; + +struct inspect_extent_util_stats_verbose_s { + void *slabcur_addr; + size_t nfree; + size_t nregs; + size_t size; + size_t bin_nfree; + size_t bin_nregs; +}; + +void inspect_extent_util_stats_get(tsdn_t *tsdn, const void *ptr, + size_t *nfree, size_t *nregs, size_t *size); +void inspect_extent_util_stats_verbose_get(tsdn_t *tsdn, const void *ptr, + size_t *nfree, size_t *nregs, size_t *size, + size_t *bin_nfree, size_t *bin_nregs, void **slabcur_addr); + +#endif /* JEMALLOC_INTERNAL_INSPECT_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_decls.h b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_decls.h new file mode 100644 index 00000000..983027c8 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_decls.h @@ -0,0 +1,108 @@ +#ifndef JEMALLOC_INTERNAL_DECLS_H +#define JEMALLOC_INTERNAL_DECLS_H + +#include +#ifdef _WIN32 +# include +# include "msvc_compat/windows_extra.h" +# include "msvc_compat/strings.h" +# ifdef _WIN64 +# if LG_VADDR <= 32 +# error Generate the headers using x64 vcargs +# endif +# else +# if LG_VADDR > 32 +# undef LG_VADDR +# define LG_VADDR 32 +# endif +# endif +#else +# include +# include +# if !defined(__pnacl__) && !defined(__native_client__) +# include +# if !defined(SYS_write) && defined(__NR_write) +# define SYS_write __NR_write +# endif +# if defined(SYS_open) && defined(__aarch64__) + /* Android headers may define SYS_open to __NR_open even though + * __NR_open may not exist on AArch64 (superseded by __NR_openat). */ +# undef SYS_open +# endif +# include +# endif +# include +# if defined(__FreeBSD__) || defined(__DragonFly__) +# include +# include +# if defined(__FreeBSD__) +# define cpu_set_t cpuset_t +# endif +# endif +# include +# ifdef JEMALLOC_OS_UNFAIR_LOCK +# include +# endif +# ifdef JEMALLOC_GLIBC_MALLOC_HOOK +# include +# endif +# include +# include +# include +# ifdef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME +# include +# endif +#endif +#include + +#include +#ifndef SIZE_T_MAX +# define SIZE_T_MAX SIZE_MAX +#endif +#ifndef SSIZE_MAX +# define SSIZE_MAX ((ssize_t)(SIZE_T_MAX >> 1)) +#endif +#include +#include +#include +#include +#include +#include +#ifndef offsetof +# define offsetof(type, member) ((size_t)&(((type *)NULL)->member)) +#endif +#include +#include +#include +#ifdef _MSC_VER +# include +typedef intptr_t ssize_t; +# define PATH_MAX 1024 +# define STDERR_FILENO 2 +# define __func__ __FUNCTION__ +# ifdef JEMALLOC_HAS_RESTRICT +# define restrict __restrict +# endif +/* Disable warnings about deprecated system functions. */ +# pragma warning(disable: 4996) +#if _MSC_VER < 1800 +static int +isblank(int c) { + return (c == '\t' || c == ' '); +} +#endif +#else +# include +#endif +#include + +/* + * The Win32 midl compiler has #define small char; we don't use midl, but + * "small" is a nice identifier to have available when talking about size + * classes. + */ +#ifdef small +# undef small +#endif + +#endif /* JEMALLOC_INTERNAL_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_defs.h b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_defs.h new file mode 100644 index 00000000..22fd3584 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_defs.h @@ -0,0 +1,428 @@ +/* include/jemalloc/internal/jemalloc_internal_defs.h. Generated from jemalloc_internal_defs.h.in by configure. */ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +#define JEMALLOC_PREFIX "je_" +#define JEMALLOC_CPREFIX "JE_" + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +/* #undef JEMALLOC_OVERRIDE___LIBC_CALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_FREE */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN */ +/* #undef JEMALLOC_OVERRIDE___LIBC_REALLOC */ +/* #undef JEMALLOC_OVERRIDE___LIBC_VALLOC */ +/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */ + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#define JEMALLOC_PRIVATE_NAMESPACE je_ + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#define CPU_SPINWAIT _mm_pause() +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#define HAVE_CPU_SPINWAIT 1 + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#define LG_VADDR 48 + +/* Defined if C11 atomics are available. */ +/* #undef JEMALLOC_C11_ATOMICS */ + +/* Defined if GCC __atomic atomics are available. */ +/* #undef JEMALLOC_GCC_ATOMIC_ATOMICS */ +/* and the 8-bit variant support. */ +/* #undef JEMALLOC_GCC_U8_ATOMIC_ATOMICS */ + +/* Defined if GCC __sync atomics are available. */ +/* #undef JEMALLOC_GCC_SYNC_ATOMICS */ +/* and the 8-bit variant support. */ +/* #undef JEMALLOC_GCC_U8_SYNC_ATOMICS */ + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +/* #undef JEMALLOC_HAVE_BUILTIN_CLZ */ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +/* #undef JEMALLOC_OS_UNFAIR_LOCK */ + +/* Defined if syscall(2) is usable. */ +/* #undef JEMALLOC_USE_SYSCALL */ + +/* + * Defined if secure_getenv(3) is available. + */ +/* #undef JEMALLOC_HAVE_SECURE_GETENV */ + +/* + * Defined if issetugid(2) is available. + */ +/* #undef JEMALLOC_HAVE_ISSETUGID */ + +/* Defined if pthread_atfork(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_ATFORK */ + +/* Defined if pthread_setname_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_SETNAME_NP */ + +/* Defined if pthread_getname_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GETNAME_NP */ + +/* Defined if pthread_get_name_np(3) is available. */ +/* #undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE */ + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_MONOTONIC */ + +/* + * Defined if mach_absolute_time() is available. + */ +/* #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME */ + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +/* #undef JEMALLOC_HAVE_CLOCK_REALTIME */ + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +/* #undef JEMALLOC_MALLOC_THREAD_CLEANUP */ + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +/* #undef JEMALLOC_THREADED_INIT */ + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +/* #undef JEMALLOC_MUTEX_INIT_CB */ + +/* Non-empty if the tls_model attribute is supported. */ +#define JEMALLOC_TLS_MODEL + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +/* #undef JEMALLOC_DEBUG */ + +/* JEMALLOC_STATS enables statistics calculation. */ +#define JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +/* #undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API */ + +/* JEMALLOC_PROF enables allocation profiling. */ +/* #undef JEMALLOC_PROF */ + +/* Use libunwind for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBUNWIND */ + +/* Use libgcc for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_LIBGCC */ + +/* Use gcc intrinsics for profile backtracing if defined. */ +/* #undef JEMALLOC_PROF_GCC */ + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +/* #undef JEMALLOC_DSS */ + +/* Support memory filling (junk/zero). */ +#define JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +/* #undef JEMALLOC_UTRACE */ + +/* Support utrace(2)-based tracing (label based signature). */ +/* #undef JEMALLOC_UTRACE_LABEL */ + +/* Support optional abort() on OOM. */ +/* #undef JEMALLOC_XMALLOC */ + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +/* #undef JEMALLOC_LAZY_LOCK */ + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +/* #undef LG_QUANTUM */ + +/* One page is 2^LG_PAGE bytes. */ +#define LG_PAGE 12 + +/* Maximum number of regions in a slab. */ +/* #undef CONFIG_LG_SLAB_MAXREGS */ + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#define LG_HUGEPAGE 21 + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +/* #undef JEMALLOC_MAPS_COALESCE */ + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +/* #undef JEMALLOC_RETAIN */ + +/* TLS is used to map arenas and magazine caches to threads. */ +/* #undef JEMALLOC_TLS */ + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#define JEMALLOC_INTERNAL_UNREACHABLE abort + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#define JEMALLOC_INTERNAL_FFSLL ffsll +#define JEMALLOC_INTERNAL_FFSL ffsl +#define JEMALLOC_INTERNAL_FFS ffs + +/* + * popcount*() functions to use for bitmapping. + */ +/* #undef JEMALLOC_INTERNAL_POPCOUNTL */ +/* #undef JEMALLOC_INTERNAL_POPCOUNT */ + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#define JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +/* #undef JEMALLOC_LOG */ + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +/* #undef JEMALLOC_READLINKAT */ + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +/* #undef JEMALLOC_ZONE */ + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +/* #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT */ +/* #undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY */ + +/* Defined if madvise(2) is available. */ +/* #undef JEMALLOC_HAVE_MADVISE */ + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +/* #undef JEMALLOC_HAVE_MADVISE_HUGE */ + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +/* #undef JEMALLOC_PURGE_MADVISE_FREE */ +/* #undef JEMALLOC_PURGE_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS */ + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +/* #undef JEMALLOC_DEFINE_MADVISE_FREE */ + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_DONTDUMP */ + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +/* #undef JEMALLOC_MADVISE_NOCORE */ + +/* Defined if mprotect(2) is available. */ +/* #undef JEMALLOC_HAVE_MPROTECT */ + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +/* #undef JEMALLOC_THP */ + +/* Defined if posix_madvise is available. */ +/* #undef JEMALLOC_HAVE_POSIX_MADVISE */ + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED */ +/* #undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS */ + +/* + * Defined if memcntl page admin call is supported + */ +/* #undef JEMALLOC_HAVE_MEMCNTL */ + +/* + * Defined if malloc_size is supported + */ +/* #undef JEMALLOC_HAVE_MALLOC_SIZE */ + +/* Define if operating system has alloca.h header. */ +/* #undef JEMALLOC_HAS_ALLOCA_H */ + +/* C99 restrict keyword supported. */ +/* #undef JEMALLOC_HAS_RESTRICT */ + +/* For use by hash code. */ +/* #undef JEMALLOC_BIG_ENDIAN */ + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#define LG_SIZEOF_INT 2 + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#define LG_SIZEOF_LONG 2 + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#define LG_SIZEOF_LONG_LONG 3 + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#define LG_SIZEOF_INTMAX_T 3 + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +/* #undef JEMALLOC_GLIBC_MALLOC_HOOK */ + +/* glibc memalign hook. */ +/* #undef JEMALLOC_GLIBC_MEMALIGN_HOOK */ + +/* pthread support */ +/* #undef JEMALLOC_HAVE_PTHREAD */ + +/* dlsym() support */ +/* #undef JEMALLOC_HAVE_DLSYM */ + +/* Adaptive mutex support in pthreads. */ +/* #undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP */ + +/* GNU specific sched_getcpu support */ +/* #undef JEMALLOC_HAVE_SCHED_GETCPU */ + +/* GNU specific sched_setaffinity support */ +/* #undef JEMALLOC_HAVE_SCHED_SETAFFINITY */ + +/* + * If defined, all the features necessary for background threads are present. + */ +/* #undef JEMALLOC_BACKGROUND_THREAD */ + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +/* #undef JEMALLOC_EXPORT */ + +/* config.malloc_conf options string. */ +#define JEMALLOC_CONFIG_MALLOC_CONF "" + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +/* #undef JEMALLOC_IS_MALLOC */ + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +/* #undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE */ + +/* Performs additional safety checks when defined. */ +/* #undef JEMALLOC_OPT_SAFETY_CHECKS */ + +/* Is C++ support being built? */ +/* #undef JEMALLOC_ENABLE_CXX */ + +/* Performs additional size checks when defined. */ +/* #undef JEMALLOC_OPT_SIZE_CHECKS */ + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +/* #undef JEMALLOC_UAF_DETECTION */ + +/* Darwin VM_MAKE_TAG support */ +/* #undef JEMALLOC_HAVE_VM_MAKE_TAG */ + +/* If defined, realloc(ptr, 0) defaults to "free" instead of "alloc". */ +#define JEMALLOC_ZERO_REALLOC_DEFAULT_FREE + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in new file mode 100644 index 00000000..3588072f --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -0,0 +1,427 @@ +#ifndef JEMALLOC_INTERNAL_DEFS_H_ +#define JEMALLOC_INTERNAL_DEFS_H_ +/* + * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all + * public APIs to be prefixed. This makes it possible, with some care, to use + * multiple allocators simultaneously. + */ +#undef JEMALLOC_PREFIX +#undef JEMALLOC_CPREFIX + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +#undef JEMALLOC_OVERRIDE___LIBC_CALLOC +#undef JEMALLOC_OVERRIDE___LIBC_FREE +#undef JEMALLOC_OVERRIDE___LIBC_MALLOC +#undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN +#undef JEMALLOC_OVERRIDE___LIBC_REALLOC +#undef JEMALLOC_OVERRIDE___LIBC_VALLOC +#undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN + +/* + * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs. + * For shared libraries, symbol visibility mechanisms prevent these symbols + * from being exported, but for static libraries, naming collisions are a real + * possibility. + */ +#undef JEMALLOC_PRIVATE_NAMESPACE + +/* + * Hyper-threaded CPUs may need a special instruction inside spin loops in + * order to yield to another virtual CPU. + */ +#undef CPU_SPINWAIT +/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */ +#undef HAVE_CPU_SPINWAIT + +/* + * Number of significant bits in virtual addresses. This may be less than the + * total number of bits in a pointer, e.g. on x64, for which the uppermost 16 + * bits are the same as bit 47. + */ +#undef LG_VADDR + +/* Defined if C11 atomics are available. */ +#undef JEMALLOC_C11_ATOMICS + +/* Defined if GCC __atomic atomics are available. */ +#undef JEMALLOC_GCC_ATOMIC_ATOMICS +/* and the 8-bit variant support. */ +#undef JEMALLOC_GCC_U8_ATOMIC_ATOMICS + +/* Defined if GCC __sync atomics are available. */ +#undef JEMALLOC_GCC_SYNC_ATOMICS +/* and the 8-bit variant support. */ +#undef JEMALLOC_GCC_U8_SYNC_ATOMICS + +/* + * Defined if __builtin_clz() and __builtin_clzl() are available. + */ +#undef JEMALLOC_HAVE_BUILTIN_CLZ + +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +#undef JEMALLOC_OS_UNFAIR_LOCK + +/* Defined if syscall(2) is usable. */ +#undef JEMALLOC_USE_SYSCALL + +/* + * Defined if secure_getenv(3) is available. + */ +#undef JEMALLOC_HAVE_SECURE_GETENV + +/* + * Defined if issetugid(2) is available. + */ +#undef JEMALLOC_HAVE_ISSETUGID + +/* Defined if pthread_atfork(3) is available. */ +#undef JEMALLOC_HAVE_PTHREAD_ATFORK + +/* Defined if pthread_setname_np(3) is available. */ +#undef JEMALLOC_HAVE_PTHREAD_SETNAME_NP + +/* Defined if pthread_getname_np(3) is available. */ +#undef JEMALLOC_HAVE_PTHREAD_GETNAME_NP + +/* Defined if pthread_get_name_np(3) is available. */ +#undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +#undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#undef JEMALLOC_HAVE_CLOCK_MONOTONIC + +/* + * Defined if mach_absolute_time() is available. + */ +#undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME + +/* + * Defined if clock_gettime(CLOCK_REALTIME, ...) is available. + */ +#undef JEMALLOC_HAVE_CLOCK_REALTIME + +/* + * Defined if _malloc_thread_cleanup() exists. At least in the case of + * FreeBSD, pthread_key_create() allocates, which if used during malloc + * bootstrapping will cause recursion into the pthreads library. Therefore, if + * _malloc_thread_cleanup() exists, use it as the basis for thread cleanup in + * malloc_tsd. + */ +#undef JEMALLOC_MALLOC_THREAD_CLEANUP + +/* + * Defined if threaded initialization is known to be safe on this platform. + * Among other things, it must be possible to initialize a mutex without + * triggering allocation in order for threaded allocation to be safe. + */ +#undef JEMALLOC_THREADED_INIT + +/* + * Defined if the pthreads implementation defines + * _pthread_mutex_init_calloc_cb(), in which case the function is used in order + * to avoid recursive allocation during mutex initialization. + */ +#undef JEMALLOC_MUTEX_INIT_CB + +/* Non-empty if the tls_model attribute is supported. */ +#undef JEMALLOC_TLS_MODEL + +/* + * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables + * inline functions. + */ +#undef JEMALLOC_DEBUG + +/* JEMALLOC_STATS enables statistics calculation. */ +#undef JEMALLOC_STATS + +/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */ +#undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API + +/* JEMALLOC_PROF enables allocation profiling. */ +#undef JEMALLOC_PROF + +/* Use libunwind for profile backtracing if defined. */ +#undef JEMALLOC_PROF_LIBUNWIND + +/* Use libgcc for profile backtracing if defined. */ +#undef JEMALLOC_PROF_LIBGCC + +/* Use gcc intrinsics for profile backtracing if defined. */ +#undef JEMALLOC_PROF_GCC + +/* + * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage + * segment (DSS). + */ +#undef JEMALLOC_DSS + +/* Support memory filling (junk/zero). */ +#undef JEMALLOC_FILL + +/* Support utrace(2)-based tracing. */ +#undef JEMALLOC_UTRACE + +/* Support utrace(2)-based tracing (label based signature). */ +#undef JEMALLOC_UTRACE_LABEL + +/* Support optional abort() on OOM. */ +#undef JEMALLOC_XMALLOC + +/* Support lazy locking (avoid locking unless a second thread is launched). */ +#undef JEMALLOC_LAZY_LOCK + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +#undef LG_QUANTUM + +/* One page is 2^LG_PAGE bytes. */ +#undef LG_PAGE + +/* Maximum number of regions in a slab. */ +#undef CONFIG_LG_SLAB_MAXREGS + +/* + * One huge page is 2^LG_HUGEPAGE bytes. Note that this is defined even if the + * system does not explicitly support huge pages; system calls that require + * explicit huge page support are separately configured. + */ +#undef LG_HUGEPAGE + +/* + * If defined, adjacent virtual memory mappings with identical attributes + * automatically coalesce, and they fragment when changes are made to subranges. + * This is the normal order of things for mmap()/munmap(), but on Windows + * VirtualAlloc()/VirtualFree() operations must be precisely matched, i.e. + * mappings do *not* coalesce/fragment. + */ +#undef JEMALLOC_MAPS_COALESCE + +/* + * If defined, retain memory for later reuse by default rather than using e.g. + * munmap() to unmap freed extents. This is enabled on 64-bit Linux because + * common sequences of mmap()/munmap() calls will cause virtual memory map + * holes. + */ +#undef JEMALLOC_RETAIN + +/* TLS is used to map arenas and magazine caches to threads. */ +#undef JEMALLOC_TLS + +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#undef JEMALLOC_INTERNAL_UNREACHABLE + +/* + * ffs*() functions to use for bitmapping. Don't use these directly; instead, + * use ffs_*() from util.h. + */ +#undef JEMALLOC_INTERNAL_FFSLL +#undef JEMALLOC_INTERNAL_FFSL +#undef JEMALLOC_INTERNAL_FFS + +/* + * popcount*() functions to use for bitmapping. + */ +#undef JEMALLOC_INTERNAL_POPCOUNTL +#undef JEMALLOC_INTERNAL_POPCOUNT + +/* + * If defined, explicitly attempt to more uniformly distribute large allocation + * pointer alignments across all cache indices. + */ +#undef JEMALLOC_CACHE_OBLIVIOUS + +/* + * If defined, enable logging facilities. We make this a configure option to + * avoid taking extra branches everywhere. + */ +#undef JEMALLOC_LOG + +/* + * If defined, use readlinkat() (instead of readlink()) to follow + * /etc/malloc_conf. + */ +#undef JEMALLOC_READLINKAT + +/* + * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings. + */ +#undef JEMALLOC_ZONE + +/* + * Methods for determining whether the OS overcommits. + * JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY: Linux's + * /proc/sys/vm.overcommit_memory file. + * JEMALLOC_SYSCTL_VM_OVERCOMMIT: FreeBSD's vm.overcommit sysctl. + */ +#undef JEMALLOC_SYSCTL_VM_OVERCOMMIT +#undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY + +/* Defined if madvise(2) is available. */ +#undef JEMALLOC_HAVE_MADVISE + +/* + * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE + * arguments to madvise(2). + */ +#undef JEMALLOC_HAVE_MADVISE_HUGE + +/* + * Methods for purging unused pages differ between operating systems. + * + * madvise(..., MADV_FREE) : This marks pages as being unused, such that they + * will be discarded rather than swapped out. + * madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is + * defined, this immediately discards pages, + * such that new pages will be demand-zeroed if + * the address region is later touched; + * otherwise this behaves similarly to + * MADV_FREE, though typically with higher + * system overhead. + */ +#undef JEMALLOC_PURGE_MADVISE_FREE +#undef JEMALLOC_PURGE_MADVISE_DONTNEED +#undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS + +/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */ +#undef JEMALLOC_DEFINE_MADVISE_FREE + +/* + * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise. + */ +#undef JEMALLOC_MADVISE_DONTDUMP + +/* + * Defined if MADV_[NO]CORE is supported as an argument to madvise. + */ +#undef JEMALLOC_MADVISE_NOCORE + +/* Defined if mprotect(2) is available. */ +#undef JEMALLOC_HAVE_MPROTECT + +/* + * Defined if transparent huge pages (THPs) are supported via the + * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled. + */ +#undef JEMALLOC_THP + +/* Defined if posix_madvise is available. */ +#undef JEMALLOC_HAVE_POSIX_MADVISE + +/* + * Method for purging unused pages using posix_madvise. + * + * posix_madvise(..., POSIX_MADV_DONTNEED) + */ +#undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED +#undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS + +/* + * Defined if memcntl page admin call is supported + */ +#undef JEMALLOC_HAVE_MEMCNTL + +/* + * Defined if malloc_size is supported + */ +#undef JEMALLOC_HAVE_MALLOC_SIZE + +/* Define if operating system has alloca.h header. */ +#undef JEMALLOC_HAS_ALLOCA_H + +/* C99 restrict keyword supported. */ +#undef JEMALLOC_HAS_RESTRICT + +/* For use by hash code. */ +#undef JEMALLOC_BIG_ENDIAN + +/* sizeof(int) == 2^LG_SIZEOF_INT. */ +#undef LG_SIZEOF_INT + +/* sizeof(long) == 2^LG_SIZEOF_LONG. */ +#undef LG_SIZEOF_LONG + +/* sizeof(long long) == 2^LG_SIZEOF_LONG_LONG. */ +#undef LG_SIZEOF_LONG_LONG + +/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */ +#undef LG_SIZEOF_INTMAX_T + +/* glibc malloc hooks (__malloc_hook, __realloc_hook, __free_hook). */ +#undef JEMALLOC_GLIBC_MALLOC_HOOK + +/* glibc memalign hook. */ +#undef JEMALLOC_GLIBC_MEMALIGN_HOOK + +/* pthread support */ +#undef JEMALLOC_HAVE_PTHREAD + +/* dlsym() support */ +#undef JEMALLOC_HAVE_DLSYM + +/* Adaptive mutex support in pthreads. */ +#undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP + +/* GNU specific sched_getcpu support */ +#undef JEMALLOC_HAVE_SCHED_GETCPU + +/* GNU specific sched_setaffinity support */ +#undef JEMALLOC_HAVE_SCHED_SETAFFINITY + +/* + * If defined, all the features necessary for background threads are present. + */ +#undef JEMALLOC_BACKGROUND_THREAD + +/* + * If defined, jemalloc symbols are not exported (doesn't work when + * JEMALLOC_PREFIX is not defined). + */ +#undef JEMALLOC_EXPORT + +/* config.malloc_conf options string. */ +#undef JEMALLOC_CONFIG_MALLOC_CONF + +/* If defined, jemalloc takes the malloc/free/etc. symbol names. */ +#undef JEMALLOC_IS_MALLOC + +/* + * Defined if strerror_r returns char * if _GNU_SOURCE is defined. + */ +#undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE + +/* Performs additional safety checks when defined. */ +#undef JEMALLOC_OPT_SAFETY_CHECKS + +/* Is C++ support being built? */ +#undef JEMALLOC_ENABLE_CXX + +/* Performs additional size checks when defined. */ +#undef JEMALLOC_OPT_SIZE_CHECKS + +/* Allows sampled junk and stash for checking use-after-free when defined. */ +#undef JEMALLOC_UAF_DETECTION + +/* Darwin VM_MAKE_TAG support */ +#undef JEMALLOC_HAVE_VM_MAKE_TAG + +/* If defined, realloc(ptr, 0) defaults to "free" instead of "alloc". */ +#undef JEMALLOC_ZERO_REALLOC_DEFAULT_FREE + +#endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_externs.h b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_externs.h new file mode 100644 index 00000000..fc834c67 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_externs.h @@ -0,0 +1,75 @@ +#ifndef JEMALLOC_INTERNAL_EXTERNS_H +#define JEMALLOC_INTERNAL_EXTERNS_H + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/hpa_opts.h" +#include "jemalloc/internal/sec_opts.h" +#include "jemalloc/internal/tsd_types.h" +#include "jemalloc/internal/nstime.h" + +/* TSD checks this to set thread local slow state accordingly. */ +extern bool malloc_slow; + +/* Run-time options. */ +extern bool opt_abort; +extern bool opt_abort_conf; +extern bool opt_trust_madvise; +extern bool opt_confirm_conf; +extern bool opt_hpa; +extern hpa_shard_opts_t opt_hpa_opts; +extern sec_opts_t opt_hpa_sec_opts; + +extern const char *opt_junk; +extern bool opt_junk_alloc; +extern bool opt_junk_free; +extern void (*junk_free_callback)(void *ptr, size_t size); +extern void (*junk_alloc_callback)(void *ptr, size_t size); +extern bool opt_utrace; +extern bool opt_xmalloc; +extern bool opt_experimental_infallible_new; +extern bool opt_zero; +extern unsigned opt_narenas; +extern zero_realloc_action_t opt_zero_realloc_action; +extern malloc_init_t malloc_init_state; +extern const char *zero_realloc_mode_names[]; +extern atomic_zu_t zero_realloc_count; +extern bool opt_cache_oblivious; + +/* Escape free-fastpath when ptr & mask == 0 (for sanitization purpose). */ +extern uintptr_t san_cache_bin_nonfast_mask; + +/* Number of CPUs. */ +extern unsigned ncpus; + +/* Number of arenas used for automatic multiplexing of threads and arenas. */ +extern unsigned narenas_auto; + +/* Base index for manual arenas. */ +extern unsigned manual_arena_base; + +/* + * Arenas that are used to service external requests. Not all elements of the + * arenas array are necessarily used; arenas are created lazily as needed. + */ +extern atomic_p_t arenas[]; + +void *a0malloc(size_t size); +void a0dalloc(void *ptr); +void *bootstrap_malloc(size_t size); +void *bootstrap_calloc(size_t num, size_t size); +void bootstrap_free(void *ptr); +void arena_set(unsigned ind, arena_t *arena); +unsigned narenas_total_get(void); +arena_t *arena_init(tsdn_t *tsdn, unsigned ind, const arena_config_t *config); +arena_t *arena_choose_hard(tsd_t *tsd, bool internal); +void arena_migrate(tsd_t *tsd, arena_t *oldarena, arena_t *newarena); +void iarena_cleanup(tsd_t *tsd); +void arena_cleanup(tsd_t *tsd); +size_t batch_alloc(void **ptrs, size_t num, size_t size, int flags); +void jemalloc_prefork(void); +void jemalloc_postfork_parent(void); +void jemalloc_postfork_child(void); +void je_sdallocx_noflags(void *ptr, size_t size); +void *malloc_default(size_t size); + +#endif /* JEMALLOC_INTERNAL_EXTERNS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_includes.h b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_includes.h new file mode 100644 index 00000000..751c112f --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_includes.h @@ -0,0 +1,84 @@ +#ifndef JEMALLOC_INTERNAL_INCLUDES_H +#define JEMALLOC_INTERNAL_INCLUDES_H + +/* + * jemalloc can conceptually be broken into components (arena, tcache, etc.), + * but there are circular dependencies that cannot be broken without + * substantial performance degradation. + * + * Historically, we dealt with this by each header into four sections (types, + * structs, externs, and inlines), and included each header file multiple times + * in this file, picking out the portion we want on each pass using the + * following #defines: + * JEMALLOC_H_TYPES : Preprocessor-defined constants and pseudo-opaque data + * types. + * JEMALLOC_H_STRUCTS : Data structures. + * JEMALLOC_H_EXTERNS : Extern data declarations and function prototypes. + * JEMALLOC_H_INLINES : Inline functions. + * + * We're moving toward a world in which the dependencies are explicit; each file + * will #include the headers it depends on (rather than relying on them being + * implicitly available via this file including every header file in the + * project). + * + * We're now in an intermediate state: we've broken up the header files to avoid + * having to include each one multiple times, but have not yet moved the + * dependency information into the header files (i.e. we still rely on the + * ordering in this file to ensure all a header's dependencies are available in + * its translation unit). Each component is now broken up into multiple header + * files, corresponding to the sections above (e.g. instead of "foo.h", we now + * have "foo_types.h", "foo_structs.h", "foo_externs.h", "foo_inlines.h"). + * + * Those files which have been converted to explicitly include their + * inter-component dependencies are now in the initial HERMETIC HEADERS + * section. All headers may still rely on jemalloc_preamble.h (which, by fiat, + * must be included first in every translation unit) for system headers and + * global jemalloc definitions, however. + */ + +/******************************************************************************/ +/* TYPES */ +/******************************************************************************/ + +#include "jemalloc/internal/arena_types.h" +#include "jemalloc/internal/tcache_types.h" +#include "jemalloc/internal/prof_types.h" + +/******************************************************************************/ +/* STRUCTS */ +/******************************************************************************/ + +#include "jemalloc/internal/prof_structs.h" +#include "jemalloc/internal/arena_structs.h" +#include "jemalloc/internal/tcache_structs.h" +#include "jemalloc/internal/background_thread_structs.h" + +/******************************************************************************/ +/* EXTERNS */ +/******************************************************************************/ + +#include "jemalloc/internal/jemalloc_internal_externs.h" +#include "jemalloc/internal/arena_externs.h" +#include "jemalloc/internal/large_externs.h" +#include "jemalloc/internal/tcache_externs.h" +#include "jemalloc/internal/prof_externs.h" +#include "jemalloc/internal/background_thread_externs.h" + +/******************************************************************************/ +/* INLINES */ +/******************************************************************************/ + +#include "jemalloc/internal/jemalloc_internal_inlines_a.h" +/* + * Include portions of arena code interleaved with tcache code in order to + * resolve circular dependencies. + */ +#include "jemalloc/internal/arena_inlines_a.h" +#include "jemalloc/internal/jemalloc_internal_inlines_b.h" +#include "jemalloc/internal/tcache_inlines.h" +#include "jemalloc/internal/arena_inlines_b.h" +#include "jemalloc/internal/jemalloc_internal_inlines_c.h" +#include "jemalloc/internal/prof_inlines.h" +#include "jemalloc/internal/background_thread_inlines.h" + +#endif /* JEMALLOC_INTERNAL_INCLUDES_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_a.h b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_a.h new file mode 100644 index 00000000..9e27cc30 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_a.h @@ -0,0 +1,122 @@ +#ifndef JEMALLOC_INTERNAL_INLINES_A_H +#define JEMALLOC_INTERNAL_INLINES_A_H + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/bit_util.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/ticker.h" + +JEMALLOC_ALWAYS_INLINE malloc_cpuid_t +malloc_getcpu(void) { + assert(have_percpu_arena); +#if defined(_WIN32) + return GetCurrentProcessorNumber(); +#elif defined(JEMALLOC_HAVE_SCHED_GETCPU) + return (malloc_cpuid_t)sched_getcpu(); +#else + not_reached(); + return -1; +#endif +} + +/* Return the chosen arena index based on current cpu. */ +JEMALLOC_ALWAYS_INLINE unsigned +percpu_arena_choose(void) { + assert(have_percpu_arena && PERCPU_ARENA_ENABLED(opt_percpu_arena)); + + malloc_cpuid_t cpuid = malloc_getcpu(); + assert(cpuid >= 0); + + unsigned arena_ind; + if ((opt_percpu_arena == percpu_arena) || ((unsigned)cpuid < ncpus / + 2)) { + arena_ind = cpuid; + } else { + assert(opt_percpu_arena == per_phycpu_arena); + /* Hyper threads on the same physical CPU share arena. */ + arena_ind = cpuid - ncpus / 2; + } + + return arena_ind; +} + +/* Return the limit of percpu auto arena range, i.e. arenas[0...ind_limit). */ +JEMALLOC_ALWAYS_INLINE unsigned +percpu_arena_ind_limit(percpu_arena_mode_t mode) { + assert(have_percpu_arena && PERCPU_ARENA_ENABLED(mode)); + if (mode == per_phycpu_arena && ncpus > 1) { + if (ncpus % 2) { + /* This likely means a misconfig. */ + return ncpus / 2 + 1; + } + return ncpus / 2; + } else { + return ncpus; + } +} + +static inline arena_t * +arena_get(tsdn_t *tsdn, unsigned ind, bool init_if_missing) { + arena_t *ret; + + assert(ind < MALLOCX_ARENA_LIMIT); + + ret = (arena_t *)atomic_load_p(&arenas[ind], ATOMIC_ACQUIRE); + if (unlikely(ret == NULL)) { + if (init_if_missing) { + ret = arena_init(tsdn, ind, &arena_config_default); + } + } + return ret; +} + +JEMALLOC_ALWAYS_INLINE bool +tcache_available(tsd_t *tsd) { + /* + * Thread specific auto tcache might be unavailable if: 1) during tcache + * initialization, or 2) disabled through thread.tcache.enabled mallctl + * or config options. This check covers all cases. + */ + if (likely(tsd_tcache_enabled_get(tsd))) { + /* Associated arena == NULL implies tcache init in progress. */ + if (config_debug && tsd_tcache_slowp_get(tsd)->arena != NULL) { + tcache_assert_initialized(tsd_tcachep_get(tsd)); + } + return true; + } + + return false; +} + +JEMALLOC_ALWAYS_INLINE tcache_t * +tcache_get(tsd_t *tsd) { + if (!tcache_available(tsd)) { + return NULL; + } + + return tsd_tcachep_get(tsd); +} + +JEMALLOC_ALWAYS_INLINE tcache_slow_t * +tcache_slow_get(tsd_t *tsd) { + if (!tcache_available(tsd)) { + return NULL; + } + + return tsd_tcache_slowp_get(tsd); +} + +static inline void +pre_reentrancy(tsd_t *tsd, arena_t *arena) { + /* arena is the current context. Reentry from a0 is not allowed. */ + assert(arena != arena_get(tsd_tsdn(tsd), 0, false)); + tsd_pre_reentrancy_raw(tsd); +} + +static inline void +post_reentrancy(tsd_t *tsd) { + tsd_post_reentrancy_raw(tsd); +} + +#endif /* JEMALLOC_INTERNAL_INLINES_A_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_b.h b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_b.h new file mode 100644 index 00000000..152f8a03 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_b.h @@ -0,0 +1,103 @@ +#ifndef JEMALLOC_INTERNAL_INLINES_B_H +#define JEMALLOC_INTERNAL_INLINES_B_H + +#include "jemalloc/internal/extent.h" + +static inline void +percpu_arena_update(tsd_t *tsd, unsigned cpu) { + assert(have_percpu_arena); + arena_t *oldarena = tsd_arena_get(tsd); + assert(oldarena != NULL); + unsigned oldind = arena_ind_get(oldarena); + + if (oldind != cpu) { + unsigned newind = cpu; + arena_t *newarena = arena_get(tsd_tsdn(tsd), newind, true); + assert(newarena != NULL); + + /* Set new arena/tcache associations. */ + arena_migrate(tsd, oldarena, newarena); + tcache_t *tcache = tcache_get(tsd); + if (tcache != NULL) { + tcache_slow_t *tcache_slow = tsd_tcache_slowp_get(tsd); + tcache_arena_reassociate(tsd_tsdn(tsd), tcache_slow, + tcache, newarena); + } + } +} + + +/* Choose an arena based on a per-thread value. */ +static inline arena_t * +arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) { + arena_t *ret; + + if (arena != NULL) { + return arena; + } + + /* During reentrancy, arena 0 is the safest bet. */ + if (unlikely(tsd_reentrancy_level_get(tsd) > 0)) { + return arena_get(tsd_tsdn(tsd), 0, true); + } + + ret = internal ? tsd_iarena_get(tsd) : tsd_arena_get(tsd); + if (unlikely(ret == NULL)) { + ret = arena_choose_hard(tsd, internal); + assert(ret); + if (tcache_available(tsd)) { + tcache_slow_t *tcache_slow = tsd_tcache_slowp_get(tsd); + tcache_t *tcache = tsd_tcachep_get(tsd); + if (tcache_slow->arena != NULL) { + /* See comments in tsd_tcache_data_init().*/ + assert(tcache_slow->arena == + arena_get(tsd_tsdn(tsd), 0, false)); + if (tcache_slow->arena != ret) { + tcache_arena_reassociate(tsd_tsdn(tsd), + tcache_slow, tcache, ret); + } + } else { + tcache_arena_associate(tsd_tsdn(tsd), + tcache_slow, tcache, ret); + } + } + } + + /* + * Note that for percpu arena, if the current arena is outside of the + * auto percpu arena range, (i.e. thread is assigned to a manually + * managed arena), then percpu arena is skipped. + */ + if (have_percpu_arena && PERCPU_ARENA_ENABLED(opt_percpu_arena) && + !internal && (arena_ind_get(ret) < + percpu_arena_ind_limit(opt_percpu_arena)) && (ret->last_thd != + tsd_tsdn(tsd))) { + unsigned ind = percpu_arena_choose(); + if (arena_ind_get(ret) != ind) { + percpu_arena_update(tsd, ind); + ret = tsd_arena_get(tsd); + } + ret->last_thd = tsd_tsdn(tsd); + } + + return ret; +} + +static inline arena_t * +arena_choose(tsd_t *tsd, arena_t *arena) { + return arena_choose_impl(tsd, arena, false); +} + +static inline arena_t * +arena_ichoose(tsd_t *tsd, arena_t *arena) { + return arena_choose_impl(tsd, arena, true); +} + +static inline bool +arena_is_auto(arena_t *arena) { + assert(narenas_auto > 0); + + return (arena_ind_get(arena) < manual_arena_base); +} + +#endif /* JEMALLOC_INTERNAL_INLINES_B_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h new file mode 100644 index 00000000..b0868b7d --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h @@ -0,0 +1,340 @@ +#ifndef JEMALLOC_INTERNAL_INLINES_C_H +#define JEMALLOC_INTERNAL_INLINES_C_H + +#include "jemalloc/internal/hook.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/log.h" +#include "jemalloc/internal/sz.h" +#include "jemalloc/internal/thread_event.h" +#include "jemalloc/internal/witness.h" + +/* + * Translating the names of the 'i' functions: + * Abbreviations used in the first part of the function name (before + * alloc/dalloc) describe what that function accomplishes: + * a: arena (query) + * s: size (query, or sized deallocation) + * e: extent (query) + * p: aligned (allocates) + * vs: size (query, without knowing that the pointer is into the heap) + * r: rallocx implementation + * x: xallocx implementation + * Abbreviations used in the second part of the function name (after + * alloc/dalloc) describe the arguments it takes + * z: whether to return zeroed memory + * t: accepts a tcache_t * parameter + * m: accepts an arena_t * parameter + */ + +JEMALLOC_ALWAYS_INLINE arena_t * +iaalloc(tsdn_t *tsdn, const void *ptr) { + assert(ptr != NULL); + + return arena_aalloc(tsdn, ptr); +} + +JEMALLOC_ALWAYS_INLINE size_t +isalloc(tsdn_t *tsdn, const void *ptr) { + assert(ptr != NULL); + + return arena_salloc(tsdn, ptr); +} + +JEMALLOC_ALWAYS_INLINE void * +iallocztm(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, tcache_t *tcache, + bool is_internal, arena_t *arena, bool slow_path) { + void *ret; + + assert(!is_internal || tcache == NULL); + assert(!is_internal || arena == NULL || arena_is_auto(arena)); + if (!tsdn_null(tsdn) && tsd_reentrancy_level_get(tsdn_tsd(tsdn)) == 0) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + } + + ret = arena_malloc(tsdn, arena, size, ind, zero, tcache, slow_path); + if (config_stats && is_internal && likely(ret != NULL)) { + arena_internal_add(iaalloc(tsdn, ret), isalloc(tsdn, ret)); + } + return ret; +} + +JEMALLOC_ALWAYS_INLINE void * +ialloc(tsd_t *tsd, size_t size, szind_t ind, bool zero, bool slow_path) { + return iallocztm(tsd_tsdn(tsd), size, ind, zero, tcache_get(tsd), false, + NULL, slow_path); +} + +JEMALLOC_ALWAYS_INLINE void * +ipallocztm(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero, + tcache_t *tcache, bool is_internal, arena_t *arena) { + void *ret; + + assert(usize != 0); + assert(usize == sz_sa2u(usize, alignment)); + assert(!is_internal || tcache == NULL); + assert(!is_internal || arena == NULL || arena_is_auto(arena)); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + ret = arena_palloc(tsdn, arena, usize, alignment, zero, tcache); + assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret); + if (config_stats && is_internal && likely(ret != NULL)) { + arena_internal_add(iaalloc(tsdn, ret), isalloc(tsdn, ret)); + } + return ret; +} + +JEMALLOC_ALWAYS_INLINE void * +ipalloct(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero, + tcache_t *tcache, arena_t *arena) { + return ipallocztm(tsdn, usize, alignment, zero, tcache, false, arena); +} + +JEMALLOC_ALWAYS_INLINE void * +ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero) { + return ipallocztm(tsd_tsdn(tsd), usize, alignment, zero, + tcache_get(tsd), false, NULL); +} + +JEMALLOC_ALWAYS_INLINE size_t +ivsalloc(tsdn_t *tsdn, const void *ptr) { + return arena_vsalloc(tsdn, ptr); +} + +JEMALLOC_ALWAYS_INLINE void +idalloctm(tsdn_t *tsdn, void *ptr, tcache_t *tcache, + emap_alloc_ctx_t *alloc_ctx, bool is_internal, bool slow_path) { + assert(ptr != NULL); + assert(!is_internal || tcache == NULL); + assert(!is_internal || arena_is_auto(iaalloc(tsdn, ptr))); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + if (config_stats && is_internal) { + arena_internal_sub(iaalloc(tsdn, ptr), isalloc(tsdn, ptr)); + } + if (!is_internal && !tsdn_null(tsdn) && + tsd_reentrancy_level_get(tsdn_tsd(tsdn)) != 0) { + assert(tcache == NULL); + } + arena_dalloc(tsdn, ptr, tcache, alloc_ctx, slow_path); +} + +JEMALLOC_ALWAYS_INLINE void +idalloc(tsd_t *tsd, void *ptr) { + idalloctm(tsd_tsdn(tsd), ptr, tcache_get(tsd), NULL, false, true); +} + +JEMALLOC_ALWAYS_INLINE void +isdalloct(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache, + emap_alloc_ctx_t *alloc_ctx, bool slow_path) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + arena_sdalloc(tsdn, ptr, size, tcache, alloc_ctx, slow_path); +} + +JEMALLOC_ALWAYS_INLINE void * +iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, + size_t alignment, bool zero, tcache_t *tcache, arena_t *arena, + hook_ralloc_args_t *hook_args) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + void *p; + size_t usize, copysize; + + usize = sz_sa2u(size, alignment); + if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { + return NULL; + } + p = ipalloct(tsdn, usize, alignment, zero, tcache, arena); + if (p == NULL) { + return NULL; + } + /* + * Copy at most size bytes (not size+extra), since the caller has no + * expectation that the extra bytes will be reliably preserved. + */ + copysize = (size < oldsize) ? size : oldsize; + memcpy(p, ptr, copysize); + hook_invoke_alloc(hook_args->is_realloc + ? hook_alloc_realloc : hook_alloc_rallocx, p, (uintptr_t)p, + hook_args->args); + hook_invoke_dalloc(hook_args->is_realloc + ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args); + isdalloct(tsdn, ptr, oldsize, tcache, NULL, true); + return p; +} + +/* + * is_realloc threads through the knowledge of whether or not this call comes + * from je_realloc (as opposed to je_rallocx); this ensures that we pass the + * correct entry point into any hooks. + * Note that these functions are all force-inlined, so no actual bool gets + * passed-around anywhere. + */ +JEMALLOC_ALWAYS_INLINE void * +iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment, + bool zero, tcache_t *tcache, arena_t *arena, hook_ralloc_args_t *hook_args) +{ + assert(ptr != NULL); + assert(size != 0); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1)) + != 0) { + /* + * Existing object alignment is inadequate; allocate new space + * and copy. + */ + return iralloct_realign(tsdn, ptr, oldsize, size, alignment, + zero, tcache, arena, hook_args); + } + + return arena_ralloc(tsdn, arena, ptr, oldsize, size, alignment, zero, + tcache, hook_args); +} + +JEMALLOC_ALWAYS_INLINE void * +iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment, + bool zero, hook_ralloc_args_t *hook_args) { + return iralloct(tsd_tsdn(tsd), ptr, oldsize, size, alignment, zero, + tcache_get(tsd), NULL, hook_args); +} + +JEMALLOC_ALWAYS_INLINE bool +ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra, + size_t alignment, bool zero, size_t *newsize) { + assert(ptr != NULL); + assert(size != 0); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1)) + != 0) { + /* Existing object alignment is inadequate. */ + *newsize = oldsize; + return true; + } + + return arena_ralloc_no_move(tsdn, ptr, oldsize, size, extra, zero, + newsize); +} + +JEMALLOC_ALWAYS_INLINE void +fastpath_success_finish(tsd_t *tsd, uint64_t allocated_after, + cache_bin_t *bin, void *ret) { + thread_allocated_set(tsd, allocated_after); + if (config_stats) { + bin->tstats.nrequests++; + } + + LOG("core.malloc.exit", "result: %p", ret); +} + +JEMALLOC_ALWAYS_INLINE bool +malloc_initialized(void) { + return (malloc_init_state == malloc_init_initialized); +} + +/* + * malloc() fastpath. Included here so that we can inline it into operator new; + * function call overhead there is non-negligible as a fraction of total CPU in + * allocation-heavy C++ programs. We take the fallback alloc to allow malloc + * (which can return NULL) to differ in its behavior from operator new (which + * can't). It matches the signature of malloc / operator new so that we can + * tail-call the fallback allocator, allowing us to avoid setting up the call + * frame in the common case. + * + * Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit + * tcache. If either of these is false, we tail-call to the slowpath, + * malloc_default(). Tail-calling is used to avoid any caller-saved + * registers. + * + * fastpath supports ticker and profiling, both of which will also + * tail-call to the slowpath if they fire. + */ +JEMALLOC_ALWAYS_INLINE void * +imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) { + LOG("core.malloc.entry", "size: %zu", size); + if (tsd_get_allocates() && unlikely(!malloc_initialized())) { + return fallback_alloc(size); + } + + tsd_t *tsd = tsd_get(false); + if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) { + return fallback_alloc(size); + } + /* + * The code below till the branch checking the next_event threshold may + * execute before malloc_init(), in which case the threshold is 0 to + * trigger slow path and initialization. + * + * Note that when uninitialized, only the fast-path variants of the sz / + * tsd facilities may be called. + */ + szind_t ind; + /* + * The thread_allocated counter in tsd serves as a general purpose + * accumulator for bytes of allocation to trigger different types of + * events. usize is always needed to advance thread_allocated, though + * it's not always needed in the core allocation logic. + */ + size_t usize; + sz_size2index_usize_fastpath(size, &ind, &usize); + /* Fast path relies on size being a bin. */ + assert(ind < SC_NBINS); + assert((SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS) && + (size <= SC_SMALL_MAXCLASS)); + + uint64_t allocated, threshold; + te_malloc_fastpath_ctx(tsd, &allocated, &threshold); + uint64_t allocated_after = allocated + usize; + /* + * The ind and usize might be uninitialized (or partially) before + * malloc_init(). The assertions check for: 1) full correctness (usize + * & ind) when initialized; and 2) guaranteed slow-path (threshold == 0) + * when !initialized. + */ + if (!malloc_initialized()) { + assert(threshold == 0); + } else { + assert(ind == sz_size2index(size)); + assert(usize > 0 && usize == sz_index2size(ind)); + } + /* + * Check for events and tsd non-nominal (fast_threshold will be set to + * 0) in a single branch. + */ + if (unlikely(allocated_after >= threshold)) { + return fallback_alloc(size); + } + assert(tsd_fast(tsd)); + + tcache_t *tcache = tsd_tcachep_get(tsd); + assert(tcache == tcache_get(tsd)); + cache_bin_t *bin = &tcache->bins[ind]; + bool tcache_success; + void *ret; + + /* + * We split up the code this way so that redundant low-water + * computation doesn't happen on the (more common) case in which we + * don't touch the low water mark. The compiler won't do this + * duplication on its own. + */ + ret = cache_bin_alloc_easy(bin, &tcache_success); + if (tcache_success) { + fastpath_success_finish(tsd, allocated_after, bin, ret); + return ret; + } + ret = cache_bin_alloc(bin, &tcache_success); + if (tcache_success) { + fastpath_success_finish(tsd, allocated_after, bin, ret); + return ret; + } + + return fallback_alloc(size); +} + +#endif /* JEMALLOC_INTERNAL_INLINES_C_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_macros.h b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_macros.h new file mode 100644 index 00000000..e97b5f90 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_macros.h @@ -0,0 +1,111 @@ +#ifndef JEMALLOC_INTERNAL_MACROS_H +#define JEMALLOC_INTERNAL_MACROS_H + +#ifdef JEMALLOC_DEBUG +# define JEMALLOC_ALWAYS_INLINE static inline +#else +# ifdef _MSC_VER +# define JEMALLOC_ALWAYS_INLINE static __forceinline +# else +# define JEMALLOC_ALWAYS_INLINE JEMALLOC_ATTR(always_inline) static inline +# endif +#endif +#ifdef _MSC_VER +# define inline _inline +#endif + +#define UNUSED JEMALLOC_ATTR(unused) + +#define ZU(z) ((size_t)z) +#define ZD(z) ((ssize_t)z) +#define QU(q) ((uint64_t)q) +#define QD(q) ((int64_t)q) + +#define KZU(z) ZU(z##ULL) +#define KZD(z) ZD(z##LL) +#define KQU(q) QU(q##ULL) +#define KQD(q) QI(q##LL) + +#ifndef __DECONST +# define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var)) +#endif + +#if !defined(JEMALLOC_HAS_RESTRICT) || defined(__cplusplus) +# define restrict +#endif + +/* Various function pointers are static and immutable except during testing. */ +#ifdef JEMALLOC_JET +# define JET_MUTABLE +#else +# define JET_MUTABLE const +#endif + +#define JEMALLOC_VA_ARGS_HEAD(head, ...) head +#define JEMALLOC_VA_ARGS_TAIL(head, ...) __VA_ARGS__ + +/* Diagnostic suppression macros */ +#if defined(_MSC_VER) && !defined(__clang__) +# define JEMALLOC_DIAGNOSTIC_PUSH __pragma(warning(push)) +# define JEMALLOC_DIAGNOSTIC_POP __pragma(warning(pop)) +# define JEMALLOC_DIAGNOSTIC_IGNORE(W) __pragma(warning(disable:W)) +# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS +# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS +# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN +# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS +/* #pragma GCC diagnostic first appeared in gcc 4.6. */ +#elif (defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && \ + (__GNUC_MINOR__ > 5)))) || defined(__clang__) +/* + * The JEMALLOC_PRAGMA__ macro is an implementation detail of the GCC and Clang + * diagnostic suppression macros and should not be used anywhere else. + */ +# define JEMALLOC_PRAGMA__(X) _Pragma(#X) +# define JEMALLOC_DIAGNOSTIC_PUSH JEMALLOC_PRAGMA__(GCC diagnostic push) +# define JEMALLOC_DIAGNOSTIC_POP JEMALLOC_PRAGMA__(GCC diagnostic pop) +# define JEMALLOC_DIAGNOSTIC_IGNORE(W) \ + JEMALLOC_PRAGMA__(GCC diagnostic ignored W) + +/* + * The -Wmissing-field-initializers warning is buggy in GCC versions < 5.1 and + * all clang versions up to version 7 (currently trunk, unreleased). This macro + * suppresses the warning for the affected compiler versions only. + */ +# if ((defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 5)) || \ + defined(__clang__) +# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Wmissing-field-initializers") +# else +# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS +# endif + +# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Wtype-limits") +# define JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Wunused-parameter") +# if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 7) +# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN \ + JEMALLOC_DIAGNOSTIC_IGNORE("-Walloc-size-larger-than=") +# else +# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN +# endif +# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS \ + JEMALLOC_DIAGNOSTIC_PUSH \ + JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER +#else +# define JEMALLOC_DIAGNOSTIC_PUSH +# define JEMALLOC_DIAGNOSTIC_POP +# define JEMALLOC_DIAGNOSTIC_IGNORE(W) +# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS +# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS +# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN +# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS +#endif + +/* + * Disables spurious diagnostics for all headers. Since these headers are not + * included by users directly, it does not affect their diagnostic settings. + */ +JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS + +#endif /* JEMALLOC_INTERNAL_MACROS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_types.h b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_types.h new file mode 100644 index 00000000..62c2b59c --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_internal_types.h @@ -0,0 +1,130 @@ +#ifndef JEMALLOC_INTERNAL_TYPES_H +#define JEMALLOC_INTERNAL_TYPES_H + +#include "jemalloc/internal/quantum.h" + +/* Processor / core id type. */ +typedef int malloc_cpuid_t; + +/* When realloc(non-null-ptr, 0) is called, what happens? */ +enum zero_realloc_action_e { + /* Realloc(ptr, 0) is free(ptr); return malloc(0); */ + zero_realloc_action_alloc = 0, + /* Realloc(ptr, 0) is free(ptr); */ + zero_realloc_action_free = 1, + /* Realloc(ptr, 0) aborts. */ + zero_realloc_action_abort = 2 +}; +typedef enum zero_realloc_action_e zero_realloc_action_t; + +/* Signature of write callback. */ +typedef void (write_cb_t)(void *, const char *); + +enum malloc_init_e { + malloc_init_uninitialized = 3, + malloc_init_a0_initialized = 2, + malloc_init_recursible = 1, + malloc_init_initialized = 0 /* Common case --> jnz. */ +}; +typedef enum malloc_init_e malloc_init_t; + +/* + * Flags bits: + * + * a: arena + * t: tcache + * 0: unused + * z: zero + * n: alignment + * + * aaaaaaaa aaaatttt tttttttt 0znnnnnn + */ +#define MALLOCX_ARENA_BITS 12 +#define MALLOCX_TCACHE_BITS 12 +#define MALLOCX_LG_ALIGN_BITS 6 +#define MALLOCX_ARENA_SHIFT 20 +#define MALLOCX_TCACHE_SHIFT 8 +#define MALLOCX_ARENA_MASK \ + (((1 << MALLOCX_ARENA_BITS) - 1) << MALLOCX_ARENA_SHIFT) +/* NB: Arena index bias decreases the maximum number of arenas by 1. */ +#define MALLOCX_ARENA_LIMIT ((1 << MALLOCX_ARENA_BITS) - 1) +#define MALLOCX_TCACHE_MASK \ + (((1 << MALLOCX_TCACHE_BITS) - 1) << MALLOCX_TCACHE_SHIFT) +#define MALLOCX_TCACHE_MAX ((1 << MALLOCX_TCACHE_BITS) - 3) +#define MALLOCX_LG_ALIGN_MASK ((1 << MALLOCX_LG_ALIGN_BITS) - 1) +/* Use MALLOCX_ALIGN_GET() if alignment may not be specified in flags. */ +#define MALLOCX_ALIGN_GET_SPECIFIED(flags) \ + (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK)) +#define MALLOCX_ALIGN_GET(flags) \ + (MALLOCX_ALIGN_GET_SPECIFIED(flags) & (SIZE_T_MAX-1)) +#define MALLOCX_ZERO_GET(flags) \ + ((bool)(flags & MALLOCX_ZERO)) + +#define MALLOCX_TCACHE_GET(flags) \ + (((unsigned)((flags & MALLOCX_TCACHE_MASK) >> MALLOCX_TCACHE_SHIFT)) - 2) +#define MALLOCX_ARENA_GET(flags) \ + (((unsigned)(((unsigned)flags) >> MALLOCX_ARENA_SHIFT)) - 1) + +/* Smallest size class to support. */ +#define TINY_MIN (1U << LG_TINY_MIN) + +#define LONG ((size_t)(1U << LG_SIZEOF_LONG)) +#define LONG_MASK (LONG - 1) + +/* Return the smallest long multiple that is >= a. */ +#define LONG_CEILING(a) \ + (((a) + LONG_MASK) & ~LONG_MASK) + +#define SIZEOF_PTR (1U << LG_SIZEOF_PTR) +#define PTR_MASK (SIZEOF_PTR - 1) + +/* Return the smallest (void *) multiple that is >= a. */ +#define PTR_CEILING(a) \ + (((a) + PTR_MASK) & ~PTR_MASK) + +/* + * Maximum size of L1 cache line. This is used to avoid cache line aliasing. + * In addition, this controls the spacing of cacheline-spaced size classes. + * + * CACHELINE cannot be based on LG_CACHELINE because __declspec(align()) can + * only handle raw constants. + */ +#define LG_CACHELINE 6 +#define CACHELINE 64 +#define CACHELINE_MASK (CACHELINE - 1) + +/* Return the smallest cacheline multiple that is >= s. */ +#define CACHELINE_CEILING(s) \ + (((s) + CACHELINE_MASK) & ~CACHELINE_MASK) + +/* Return the nearest aligned address at or below a. */ +#define ALIGNMENT_ADDR2BASE(a, alignment) \ + ((void *)((uintptr_t)(a) & ((~(alignment)) + 1))) + +/* Return the offset between a and the nearest aligned address at or below a. */ +#define ALIGNMENT_ADDR2OFFSET(a, alignment) \ + ((size_t)((uintptr_t)(a) & (alignment - 1))) + +/* Return the smallest alignment multiple that is >= s. */ +#define ALIGNMENT_CEILING(s, alignment) \ + (((s) + (alignment - 1)) & ((~(alignment)) + 1)) + +/* Declare a variable-length array. */ +#if __STDC_VERSION__ < 199901L +# ifdef _MSC_VER +# include +# define alloca _alloca +# else +# ifdef JEMALLOC_HAS_ALLOCA_H +# include +# else +# include +# endif +# endif +# define VARIABLE_ARRAY(type, name, count) \ + type *name = alloca(sizeof(type) * (count)) +#else +# define VARIABLE_ARRAY(type, name, count) type name[(count)] +#endif + +#endif /* JEMALLOC_INTERNAL_TYPES_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_preamble.h b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_preamble.h new file mode 100644 index 00000000..3be27ed5 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_preamble.h @@ -0,0 +1,263 @@ +#ifndef JEMALLOC_PREAMBLE_H +#define JEMALLOC_PREAMBLE_H + +#include "jemalloc_internal_defs.h" +#include "jemalloc/internal/jemalloc_internal_decls.h" + +#if defined(JEMALLOC_UTRACE) || defined(JEMALLOC_UTRACE_LABEL) +#include +# if defined(JEMALLOC_UTRACE) +# define UTRACE_CALL(p, l) utrace(p, l) +# else +# define UTRACE_CALL(p, l) utrace("jemalloc_process", p, l) +# define JEMALLOC_UTRACE +# endif +#endif + +#define JEMALLOC_NO_DEMANGLE +#ifdef JEMALLOC_JET +# undef JEMALLOC_IS_MALLOC +# define JEMALLOC_N(n) jet_##n +# include "jemalloc/internal/public_namespace.h" +# define JEMALLOC_NO_RENAME +# include "../jemalloc.h" +# undef JEMALLOC_NO_RENAME +#else +# define JEMALLOC_N(n) je_##n +# include "../jemalloc.h" +#endif + +#if defined(JEMALLOC_OSATOMIC) +#include +#endif + +#ifdef JEMALLOC_ZONE +#include +#include +#include +#endif + +#include "jemalloc/internal/jemalloc_internal_macros.h" + +/* + * Note that the ordering matters here; the hook itself is name-mangled. We + * want the inclusion of hooks to happen early, so that we hook as much as + * possible. + */ +#ifndef JEMALLOC_NO_PRIVATE_NAMESPACE +# ifndef JEMALLOC_JET +# include "jemalloc/internal/private_namespace.h" +# else +# include "jemalloc/internal/private_namespace_jet.h" +# endif +#endif +#include "jemalloc/internal/test_hooks.h" + +#ifdef JEMALLOC_DEFINE_MADVISE_FREE +# define JEMALLOC_MADV_FREE 8 +#endif + +static const bool config_debug = +#ifdef JEMALLOC_DEBUG + true +#else + false +#endif + ; +static const bool have_dss = +#ifdef JEMALLOC_DSS + true +#else + false +#endif + ; +static const bool have_madvise_huge = +#ifdef JEMALLOC_HAVE_MADVISE_HUGE + true +#else + false +#endif + ; +static const bool config_fill = +#ifdef JEMALLOC_FILL + true +#else + false +#endif + ; +static const bool config_lazy_lock = +#ifdef JEMALLOC_LAZY_LOCK + true +#else + false +#endif + ; +static const char * const config_malloc_conf = JEMALLOC_CONFIG_MALLOC_CONF; +static const bool config_prof = +#ifdef JEMALLOC_PROF + true +#else + false +#endif + ; +static const bool config_prof_libgcc = +#ifdef JEMALLOC_PROF_LIBGCC + true +#else + false +#endif + ; +static const bool config_prof_libunwind = +#ifdef JEMALLOC_PROF_LIBUNWIND + true +#else + false +#endif + ; +static const bool maps_coalesce = +#ifdef JEMALLOC_MAPS_COALESCE + true +#else + false +#endif + ; +static const bool config_stats = +#ifdef JEMALLOC_STATS + true +#else + false +#endif + ; +static const bool config_tls = +#ifdef JEMALLOC_TLS + true +#else + false +#endif + ; +static const bool config_utrace = +#ifdef JEMALLOC_UTRACE + true +#else + false +#endif + ; +static const bool config_xmalloc = +#ifdef JEMALLOC_XMALLOC + true +#else + false +#endif + ; +static const bool config_cache_oblivious = +#ifdef JEMALLOC_CACHE_OBLIVIOUS + true +#else + false +#endif + ; +/* + * Undocumented, for jemalloc development use only at the moment. See the note + * in jemalloc/internal/log.h. + */ +static const bool config_log = +#ifdef JEMALLOC_LOG + true +#else + false +#endif + ; +/* + * Are extra safety checks enabled; things like checking the size of sized + * deallocations, double-frees, etc. + */ +static const bool config_opt_safety_checks = +#ifdef JEMALLOC_OPT_SAFETY_CHECKS + true +#elif defined(JEMALLOC_DEBUG) + /* + * This lets us only guard safety checks by one flag instead of two; fast + * checks can guard solely by config_opt_safety_checks and run in debug mode + * too. + */ + true +#else + false +#endif + ; + +/* + * Extra debugging of sized deallocations too onerous to be included in the + * general safety checks. + */ +static const bool config_opt_size_checks = +#if defined(JEMALLOC_OPT_SIZE_CHECKS) || defined(JEMALLOC_DEBUG) + true +#else + false +#endif + ; + +static const bool config_uaf_detection = +#if defined(JEMALLOC_UAF_DETECTION) || defined(JEMALLOC_DEBUG) + true +#else + false +#endif + ; + +/* Whether or not the C++ extensions are enabled. */ +static const bool config_enable_cxx = +#ifdef JEMALLOC_ENABLE_CXX + true +#else + false +#endif +; + +#if defined(_WIN32) || defined(JEMALLOC_HAVE_SCHED_GETCPU) +/* Currently percpu_arena depends on sched_getcpu. */ +#define JEMALLOC_PERCPU_ARENA +#endif +static const bool have_percpu_arena = +#ifdef JEMALLOC_PERCPU_ARENA + true +#else + false +#endif + ; +/* + * Undocumented, and not recommended; the application should take full + * responsibility for tracking provenance. + */ +static const bool force_ivsalloc = +#ifdef JEMALLOC_FORCE_IVSALLOC + true +#else + false +#endif + ; +static const bool have_background_thread = +#ifdef JEMALLOC_BACKGROUND_THREAD + true +#else + false +#endif + ; +static const bool config_high_res_timer = +#ifdef JEMALLOC_HAVE_CLOCK_REALTIME + true +#else + false +#endif + ; + +static const bool have_memcntl = +#ifdef JEMALLOC_HAVE_MEMCNTL + true +#else + false +#endif + ; + +#endif /* JEMALLOC_PREAMBLE_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_preamble.h.in b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_preamble.h.in new file mode 100644 index 00000000..5ce77d96 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/jemalloc_preamble.h.in @@ -0,0 +1,263 @@ +#ifndef JEMALLOC_PREAMBLE_H +#define JEMALLOC_PREAMBLE_H + +#include "jemalloc_internal_defs.h" +#include "jemalloc/internal/jemalloc_internal_decls.h" + +#if defined(JEMALLOC_UTRACE) || defined(JEMALLOC_UTRACE_LABEL) +#include +# if defined(JEMALLOC_UTRACE) +# define UTRACE_CALL(p, l) utrace(p, l) +# else +# define UTRACE_CALL(p, l) utrace("jemalloc_process", p, l) +# define JEMALLOC_UTRACE +# endif +#endif + +#define JEMALLOC_NO_DEMANGLE +#ifdef JEMALLOC_JET +# undef JEMALLOC_IS_MALLOC +# define JEMALLOC_N(n) jet_##n +# include "jemalloc/internal/public_namespace.h" +# define JEMALLOC_NO_RENAME +# include "../jemalloc@install_suffix@.h" +# undef JEMALLOC_NO_RENAME +#else +# define JEMALLOC_N(n) @private_namespace@##n +# include "../jemalloc@install_suffix@.h" +#endif + +#if defined(JEMALLOC_OSATOMIC) +#include +#endif + +#ifdef JEMALLOC_ZONE +#include +#include +#include +#endif + +#include "jemalloc/internal/jemalloc_internal_macros.h" + +/* + * Note that the ordering matters here; the hook itself is name-mangled. We + * want the inclusion of hooks to happen early, so that we hook as much as + * possible. + */ +#ifndef JEMALLOC_NO_PRIVATE_NAMESPACE +# ifndef JEMALLOC_JET +# include "jemalloc/internal/private_namespace.h" +# else +# include "jemalloc/internal/private_namespace_jet.h" +# endif +#endif +#include "jemalloc/internal/test_hooks.h" + +#ifdef JEMALLOC_DEFINE_MADVISE_FREE +# define JEMALLOC_MADV_FREE 8 +#endif + +static const bool config_debug = +#ifdef JEMALLOC_DEBUG + true +#else + false +#endif + ; +static const bool have_dss = +#ifdef JEMALLOC_DSS + true +#else + false +#endif + ; +static const bool have_madvise_huge = +#ifdef JEMALLOC_HAVE_MADVISE_HUGE + true +#else + false +#endif + ; +static const bool config_fill = +#ifdef JEMALLOC_FILL + true +#else + false +#endif + ; +static const bool config_lazy_lock = +#ifdef JEMALLOC_LAZY_LOCK + true +#else + false +#endif + ; +static const char * const config_malloc_conf = JEMALLOC_CONFIG_MALLOC_CONF; +static const bool config_prof = +#ifdef JEMALLOC_PROF + true +#else + false +#endif + ; +static const bool config_prof_libgcc = +#ifdef JEMALLOC_PROF_LIBGCC + true +#else + false +#endif + ; +static const bool config_prof_libunwind = +#ifdef JEMALLOC_PROF_LIBUNWIND + true +#else + false +#endif + ; +static const bool maps_coalesce = +#ifdef JEMALLOC_MAPS_COALESCE + true +#else + false +#endif + ; +static const bool config_stats = +#ifdef JEMALLOC_STATS + true +#else + false +#endif + ; +static const bool config_tls = +#ifdef JEMALLOC_TLS + true +#else + false +#endif + ; +static const bool config_utrace = +#ifdef JEMALLOC_UTRACE + true +#else + false +#endif + ; +static const bool config_xmalloc = +#ifdef JEMALLOC_XMALLOC + true +#else + false +#endif + ; +static const bool config_cache_oblivious = +#ifdef JEMALLOC_CACHE_OBLIVIOUS + true +#else + false +#endif + ; +/* + * Undocumented, for jemalloc development use only at the moment. See the note + * in jemalloc/internal/log.h. + */ +static const bool config_log = +#ifdef JEMALLOC_LOG + true +#else + false +#endif + ; +/* + * Are extra safety checks enabled; things like checking the size of sized + * deallocations, double-frees, etc. + */ +static const bool config_opt_safety_checks = +#ifdef JEMALLOC_OPT_SAFETY_CHECKS + true +#elif defined(JEMALLOC_DEBUG) + /* + * This lets us only guard safety checks by one flag instead of two; fast + * checks can guard solely by config_opt_safety_checks and run in debug mode + * too. + */ + true +#else + false +#endif + ; + +/* + * Extra debugging of sized deallocations too onerous to be included in the + * general safety checks. + */ +static const bool config_opt_size_checks = +#if defined(JEMALLOC_OPT_SIZE_CHECKS) || defined(JEMALLOC_DEBUG) + true +#else + false +#endif + ; + +static const bool config_uaf_detection = +#if defined(JEMALLOC_UAF_DETECTION) || defined(JEMALLOC_DEBUG) + true +#else + false +#endif + ; + +/* Whether or not the C++ extensions are enabled. */ +static const bool config_enable_cxx = +#ifdef JEMALLOC_ENABLE_CXX + true +#else + false +#endif +; + +#if defined(_WIN32) || defined(JEMALLOC_HAVE_SCHED_GETCPU) +/* Currently percpu_arena depends on sched_getcpu. */ +#define JEMALLOC_PERCPU_ARENA +#endif +static const bool have_percpu_arena = +#ifdef JEMALLOC_PERCPU_ARENA + true +#else + false +#endif + ; +/* + * Undocumented, and not recommended; the application should take full + * responsibility for tracking provenance. + */ +static const bool force_ivsalloc = +#ifdef JEMALLOC_FORCE_IVSALLOC + true +#else + false +#endif + ; +static const bool have_background_thread = +#ifdef JEMALLOC_BACKGROUND_THREAD + true +#else + false +#endif + ; +static const bool config_high_res_timer = +#ifdef JEMALLOC_HAVE_CLOCK_REALTIME + true +#else + false +#endif + ; + +static const bool have_memcntl = +#ifdef JEMALLOC_HAVE_MEMCNTL + true +#else + false +#endif + ; + +#endif /* JEMALLOC_PREAMBLE_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/large_externs.h b/BeefRT/JEMalloc/include/jemalloc/internal/large_externs.h new file mode 100644 index 00000000..8e09122d --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/large_externs.h @@ -0,0 +1,24 @@ +#ifndef JEMALLOC_INTERNAL_LARGE_EXTERNS_H +#define JEMALLOC_INTERNAL_LARGE_EXTERNS_H + +#include "jemalloc/internal/hook.h" + +void *large_malloc(tsdn_t *tsdn, arena_t *arena, size_t usize, bool zero); +void *large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, + bool zero); +bool large_ralloc_no_move(tsdn_t *tsdn, edata_t *edata, size_t usize_min, + size_t usize_max, bool zero); +void *large_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t usize, + size_t alignment, bool zero, tcache_t *tcache, + hook_ralloc_args_t *hook_args); + +void large_dalloc_prep_locked(tsdn_t *tsdn, edata_t *edata); +void large_dalloc_finish(tsdn_t *tsdn, edata_t *edata); +void large_dalloc(tsdn_t *tsdn, edata_t *edata); +size_t large_salloc(tsdn_t *tsdn, const edata_t *edata); +void large_prof_info_get(tsd_t *tsd, edata_t *edata, prof_info_t *prof_info, + bool reset_recent); +void large_prof_tctx_reset(edata_t *edata); +void large_prof_info_set(edata_t *edata, prof_tctx_t *tctx, size_t size); + +#endif /* JEMALLOC_INTERNAL_LARGE_EXTERNS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/lockedint.h b/BeefRT/JEMalloc/include/jemalloc/internal/lockedint.h new file mode 100644 index 00000000..d020ebec --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/lockedint.h @@ -0,0 +1,204 @@ +#ifndef JEMALLOC_INTERNAL_LOCKEDINT_H +#define JEMALLOC_INTERNAL_LOCKEDINT_H + +/* + * In those architectures that support 64-bit atomics, we use atomic updates for + * our 64-bit values. Otherwise, we use a plain uint64_t and synchronize + * externally. + */ + +typedef struct locked_u64_s locked_u64_t; +#ifdef JEMALLOC_ATOMIC_U64 +struct locked_u64_s { + atomic_u64_t val; +}; +#else +/* Must hold the associated mutex. */ +struct locked_u64_s { + uint64_t val; +}; +#endif + +typedef struct locked_zu_s locked_zu_t; +struct locked_zu_s { + atomic_zu_t val; +}; + +#ifndef JEMALLOC_ATOMIC_U64 +# define LOCKEDINT_MTX_DECLARE(name) malloc_mutex_t name; +# define LOCKEDINT_MTX_INIT(mu, name, rank, rank_mode) \ + malloc_mutex_init(&(mu), name, rank, rank_mode) +# define LOCKEDINT_MTX(mtx) (&(mtx)) +# define LOCKEDINT_MTX_LOCK(tsdn, mu) malloc_mutex_lock(tsdn, &(mu)) +# define LOCKEDINT_MTX_UNLOCK(tsdn, mu) malloc_mutex_unlock(tsdn, &(mu)) +# define LOCKEDINT_MTX_PREFORK(tsdn, mu) malloc_mutex_prefork(tsdn, &(mu)) +# define LOCKEDINT_MTX_POSTFORK_PARENT(tsdn, mu) \ + malloc_mutex_postfork_parent(tsdn, &(mu)) +# define LOCKEDINT_MTX_POSTFORK_CHILD(tsdn, mu) \ + malloc_mutex_postfork_child(tsdn, &(mu)) +#else +# define LOCKEDINT_MTX_DECLARE(name) +# define LOCKEDINT_MTX(mtx) NULL +# define LOCKEDINT_MTX_INIT(mu, name, rank, rank_mode) false +# define LOCKEDINT_MTX_LOCK(tsdn, mu) +# define LOCKEDINT_MTX_UNLOCK(tsdn, mu) +# define LOCKEDINT_MTX_PREFORK(tsdn, mu) +# define LOCKEDINT_MTX_POSTFORK_PARENT(tsdn, mu) +# define LOCKEDINT_MTX_POSTFORK_CHILD(tsdn, mu) +#endif + +#ifdef JEMALLOC_ATOMIC_U64 +# define LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx) assert((mtx) == NULL) +#else +# define LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx) \ + malloc_mutex_assert_owner(tsdn, (mtx)) +#endif + +static inline uint64_t +locked_read_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p) { + LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx); +#ifdef JEMALLOC_ATOMIC_U64 + return atomic_load_u64(&p->val, ATOMIC_RELAXED); +#else + return p->val; +#endif +} + +static inline void +locked_inc_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p, + uint64_t x) { + LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx); +#ifdef JEMALLOC_ATOMIC_U64 + atomic_fetch_add_u64(&p->val, x, ATOMIC_RELAXED); +#else + p->val += x; +#endif +} + +static inline void +locked_dec_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p, + uint64_t x) { + LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx); +#ifdef JEMALLOC_ATOMIC_U64 + uint64_t r = atomic_fetch_sub_u64(&p->val, x, ATOMIC_RELAXED); + assert(r - x <= r); +#else + p->val -= x; + assert(p->val + x >= p->val); +#endif +} + +/* Increment and take modulus. Returns whether the modulo made any change. */ +static inline bool +locked_inc_mod_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p, + const uint64_t x, const uint64_t modulus) { + LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx); + uint64_t before, after; + bool overflow; +#ifdef JEMALLOC_ATOMIC_U64 + before = atomic_load_u64(&p->val, ATOMIC_RELAXED); + do { + after = before + x; + assert(after >= before); + overflow = (after >= modulus); + if (overflow) { + after %= modulus; + } + } while (!atomic_compare_exchange_weak_u64(&p->val, &before, after, + ATOMIC_RELAXED, ATOMIC_RELAXED)); +#else + before = p->val; + after = before + x; + overflow = (after >= modulus); + if (overflow) { + after %= modulus; + } + p->val = after; +#endif + return overflow; +} + +/* + * Non-atomically sets *dst += src. *dst needs external synchronization. + * This lets us avoid the cost of a fetch_add when its unnecessary (note that + * the types here are atomic). + */ +static inline void +locked_inc_u64_unsynchronized(locked_u64_t *dst, uint64_t src) { +#ifdef JEMALLOC_ATOMIC_U64 + uint64_t cur_dst = atomic_load_u64(&dst->val, ATOMIC_RELAXED); + atomic_store_u64(&dst->val, src + cur_dst, ATOMIC_RELAXED); +#else + dst->val += src; +#endif +} + +static inline uint64_t +locked_read_u64_unsynchronized(locked_u64_t *p) { +#ifdef JEMALLOC_ATOMIC_U64 + return atomic_load_u64(&p->val, ATOMIC_RELAXED); +#else + return p->val; +#endif +} + +static inline void +locked_init_u64_unsynchronized(locked_u64_t *p, uint64_t x) { +#ifdef JEMALLOC_ATOMIC_U64 + atomic_store_u64(&p->val, x, ATOMIC_RELAXED); +#else + p->val = x; +#endif +} + +static inline size_t +locked_read_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p) { + LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx); +#ifdef JEMALLOC_ATOMIC_U64 + return atomic_load_zu(&p->val, ATOMIC_RELAXED); +#else + return atomic_load_zu(&p->val, ATOMIC_RELAXED); +#endif +} + +static inline void +locked_inc_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p, + size_t x) { + LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx); +#ifdef JEMALLOC_ATOMIC_U64 + atomic_fetch_add_zu(&p->val, x, ATOMIC_RELAXED); +#else + size_t cur = atomic_load_zu(&p->val, ATOMIC_RELAXED); + atomic_store_zu(&p->val, cur + x, ATOMIC_RELAXED); +#endif +} + +static inline void +locked_dec_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p, + size_t x) { + LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx); +#ifdef JEMALLOC_ATOMIC_U64 + size_t r = atomic_fetch_sub_zu(&p->val, x, ATOMIC_RELAXED); + assert(r - x <= r); +#else + size_t cur = atomic_load_zu(&p->val, ATOMIC_RELAXED); + atomic_store_zu(&p->val, cur - x, ATOMIC_RELAXED); +#endif +} + +/* Like the _u64 variant, needs an externally synchronized *dst. */ +static inline void +locked_inc_zu_unsynchronized(locked_zu_t *dst, size_t src) { + size_t cur_dst = atomic_load_zu(&dst->val, ATOMIC_RELAXED); + atomic_store_zu(&dst->val, src + cur_dst, ATOMIC_RELAXED); +} + +/* + * Unlike the _u64 variant, this is safe to call unconditionally. + */ +static inline size_t +locked_read_atomic_zu(locked_zu_t *p) { + return atomic_load_zu(&p->val, ATOMIC_RELAXED); +} + +#endif /* JEMALLOC_INTERNAL_LOCKEDINT_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/log.h b/BeefRT/JEMalloc/include/jemalloc/internal/log.h new file mode 100644 index 00000000..64208586 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/log.h @@ -0,0 +1,115 @@ +#ifndef JEMALLOC_INTERNAL_LOG_H +#define JEMALLOC_INTERNAL_LOG_H + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/mutex.h" + +#ifdef JEMALLOC_LOG +# define JEMALLOC_LOG_VAR_BUFSIZE 1000 +#else +# define JEMALLOC_LOG_VAR_BUFSIZE 1 +#endif + +#define JEMALLOC_LOG_BUFSIZE 4096 + +/* + * The log malloc_conf option is a '|'-delimited list of log_var name segments + * which should be logged. The names are themselves hierarchical, with '.' as + * the delimiter (a "segment" is just a prefix in the log namespace). So, if + * you have: + * + * log("arena", "log msg for arena"); // 1 + * log("arena.a", "log msg for arena.a"); // 2 + * log("arena.b", "log msg for arena.b"); // 3 + * log("arena.a.a", "log msg for arena.a.a"); // 4 + * log("extent.a", "log msg for extent.a"); // 5 + * log("extent.b", "log msg for extent.b"); // 6 + * + * And your malloc_conf option is "log=arena.a|extent", then lines 2, 4, 5, and + * 6 will print at runtime. You can enable logging from all log vars by + * writing "log=.". + * + * None of this should be regarded as a stable API for right now. It's intended + * as a debugging interface, to let us keep around some of our printf-debugging + * statements. + */ + +extern char log_var_names[JEMALLOC_LOG_VAR_BUFSIZE]; +extern atomic_b_t log_init_done; + +typedef struct log_var_s log_var_t; +struct log_var_s { + /* + * Lowest bit is "inited", second lowest is "enabled". Putting them in + * a single word lets us avoid any fences on weak architectures. + */ + atomic_u_t state; + const char *name; +}; + +#define LOG_NOT_INITIALIZED 0U +#define LOG_INITIALIZED_NOT_ENABLED 1U +#define LOG_ENABLED 2U + +#define LOG_VAR_INIT(name_str) {ATOMIC_INIT(LOG_NOT_INITIALIZED), name_str} + +/* + * Returns the value we should assume for state (which is not necessarily + * accurate; if logging is done before logging has finished initializing, then + * we default to doing the safe thing by logging everything). + */ +unsigned log_var_update_state(log_var_t *log_var); + +/* We factor out the metadata management to allow us to test more easily. */ +#define log_do_begin(log_var) \ +if (config_log) { \ + unsigned log_state = atomic_load_u(&(log_var).state, \ + ATOMIC_RELAXED); \ + if (unlikely(log_state == LOG_NOT_INITIALIZED)) { \ + log_state = log_var_update_state(&(log_var)); \ + assert(log_state != LOG_NOT_INITIALIZED); \ + } \ + if (log_state == LOG_ENABLED) { \ + { + /* User code executes here. */ +#define log_do_end(log_var) \ + } \ + } \ +} + +/* + * MSVC has some preprocessor bugs in its expansion of __VA_ARGS__ during + * preprocessing. To work around this, we take all potential extra arguments in + * a var-args functions. Since a varargs macro needs at least one argument in + * the "...", we accept the format string there, and require that the first + * argument in this "..." is a const char *. + */ +static inline void +log_impl_varargs(const char *name, ...) { + char buf[JEMALLOC_LOG_BUFSIZE]; + va_list ap; + + va_start(ap, name); + const char *format = va_arg(ap, const char *); + size_t dst_offset = 0; + dst_offset += malloc_snprintf(buf, JEMALLOC_LOG_BUFSIZE, "%s: ", name); + dst_offset += malloc_vsnprintf(buf + dst_offset, + JEMALLOC_LOG_BUFSIZE - dst_offset, format, ap); + dst_offset += malloc_snprintf(buf + dst_offset, + JEMALLOC_LOG_BUFSIZE - dst_offset, "\n"); + va_end(ap); + + malloc_write(buf); +} + +/* Call as log("log.var.str", "format_string %d", arg_for_format_string); */ +#define LOG(log_var_str, ...) \ +do { \ + static log_var_t log_var = LOG_VAR_INIT(log_var_str); \ + log_do_begin(log_var) \ + log_impl_varargs((log_var).name, __VA_ARGS__); \ + log_do_end(log_var) \ +} while (0) + +#endif /* JEMALLOC_INTERNAL_LOG_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/malloc_io.h b/BeefRT/JEMalloc/include/jemalloc/internal/malloc_io.h new file mode 100644 index 00000000..a375bdae --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/malloc_io.h @@ -0,0 +1,105 @@ +#ifndef JEMALLOC_INTERNAL_MALLOC_IO_H +#define JEMALLOC_INTERNAL_MALLOC_IO_H + +#include "jemalloc/internal/jemalloc_internal_types.h" + +#ifdef _WIN32 +# ifdef _WIN64 +# define FMT64_PREFIX "ll" +# define FMTPTR_PREFIX "ll" +# else +# define FMT64_PREFIX "ll" +# define FMTPTR_PREFIX "" +# endif +# define FMTd32 "d" +# define FMTu32 "u" +# define FMTx32 "x" +# define FMTd64 FMT64_PREFIX "d" +# define FMTu64 FMT64_PREFIX "u" +# define FMTx64 FMT64_PREFIX "x" +# define FMTdPTR FMTPTR_PREFIX "d" +# define FMTuPTR FMTPTR_PREFIX "u" +# define FMTxPTR FMTPTR_PREFIX "x" +#else +# include +# define FMTd32 PRId32 +# define FMTu32 PRIu32 +# define FMTx32 PRIx32 +# define FMTd64 PRId64 +# define FMTu64 PRIu64 +# define FMTx64 PRIx64 +# define FMTdPTR PRIdPTR +# define FMTuPTR PRIuPTR +# define FMTxPTR PRIxPTR +#endif + +/* Size of stack-allocated buffer passed to buferror(). */ +#define BUFERROR_BUF 64 + +/* + * Size of stack-allocated buffer used by malloc_{,v,vc}printf(). This must be + * large enough for all possible uses within jemalloc. + */ +#define MALLOC_PRINTF_BUFSIZE 4096 + +write_cb_t wrtmessage; +int buferror(int err, char *buf, size_t buflen); +uintmax_t malloc_strtoumax(const char *restrict nptr, char **restrict endptr, + int base); +void malloc_write(const char *s); + +/* + * malloc_vsnprintf() supports a subset of snprintf(3) that avoids floating + * point math. + */ +size_t malloc_vsnprintf(char *str, size_t size, const char *format, + va_list ap); +size_t malloc_snprintf(char *str, size_t size, const char *format, ...) + JEMALLOC_FORMAT_PRINTF(3, 4); +/* + * The caller can set write_cb to null to choose to print with the + * je_malloc_message hook. + */ +void malloc_vcprintf(write_cb_t *write_cb, void *cbopaque, const char *format, + va_list ap); +void malloc_cprintf(write_cb_t *write_cb, void *cbopaque, const char *format, + ...) JEMALLOC_FORMAT_PRINTF(3, 4); +void malloc_printf(const char *format, ...) JEMALLOC_FORMAT_PRINTF(1, 2); + +static inline ssize_t +malloc_write_fd(int fd, const void *buf, size_t count) { +#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_write) + /* + * Use syscall(2) rather than write(2) when possible in order to avoid + * the possibility of memory allocation within libc. This is necessary + * on FreeBSD; most operating systems do not have this problem though. + * + * syscall() returns long or int, depending on platform, so capture the + * result in the widest plausible type to avoid compiler warnings. + */ + long result = syscall(SYS_write, fd, buf, count); +#else + ssize_t result = (ssize_t)write(fd, buf, +#ifdef _WIN32 + (unsigned int) +#endif + count); +#endif + return (ssize_t)result; +} + +static inline ssize_t +malloc_read_fd(int fd, void *buf, size_t count) { +#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_read) + long result = syscall(SYS_read, fd, buf, count); +#else + ssize_t result = read(fd, buf, +#ifdef _WIN32 + (unsigned int) +#endif + count); +#endif + return (ssize_t)result; +} + +#endif /* JEMALLOC_INTERNAL_MALLOC_IO_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/mpsc_queue.h b/BeefRT/JEMalloc/include/jemalloc/internal/mpsc_queue.h new file mode 100644 index 00000000..316ea9b1 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/mpsc_queue.h @@ -0,0 +1,134 @@ +#ifndef JEMALLOC_INTERNAL_MPSC_QUEUE_H +#define JEMALLOC_INTERNAL_MPSC_QUEUE_H + +#include "jemalloc/internal/atomic.h" + +/* + * A concurrent implementation of a multi-producer, single-consumer queue. It + * supports three concurrent operations: + * - Push + * - Push batch + * - Pop batch + * + * These operations are all lock-free. + * + * The implementation is the simple two-stack queue built on a Treiber stack. + * It's not terribly efficient, but this isn't expected to go into anywhere with + * hot code. In fact, we don't really even need queue semantics in any + * anticipated use cases; we could get away with just the stack. But this way + * lets us frame the API in terms of the existing list types, which is a nice + * convenience. We can save on cache misses by introducing our own (parallel) + * single-linked list type here, and dropping FIFO semantics, if we need this to + * get faster. Since we're currently providing queue semantics though, we use + * the prev field in the link rather than the next field for Treiber-stack + * linkage, so that we can preserve order for bash-pushed lists (recall that the + * two-stack tricks reverses orders in the lock-free first stack). + */ + +#define mpsc_queue(a_type) \ +struct { \ + atomic_p_t tail; \ +} + +#define mpsc_queue_proto(a_attr, a_prefix, a_queue_type, a_type, \ + a_list_type) \ +/* Initialize a queue. */ \ +a_attr void \ +a_prefix##new(a_queue_type *queue); \ +/* Insert all items in src into the queue, clearing src. */ \ +a_attr void \ +a_prefix##push_batch(a_queue_type *queue, a_list_type *src); \ +/* Insert node into the queue. */ \ +a_attr void \ +a_prefix##push(a_queue_type *queue, a_type *node); \ +/* \ + * Pop all items in the queue into the list at dst. dst should already \ + * be initialized (and may contain existing items, which then remain \ + * in dst). \ + */ \ +a_attr void \ +a_prefix##pop_batch(a_queue_type *queue, a_list_type *dst); + +#define mpsc_queue_gen(a_attr, a_prefix, a_queue_type, a_type, \ + a_list_type, a_link) \ +a_attr void \ +a_prefix##new(a_queue_type *queue) { \ + atomic_store_p(&queue->tail, NULL, ATOMIC_RELAXED); \ +} \ +a_attr void \ +a_prefix##push_batch(a_queue_type *queue, a_list_type *src) { \ + /* \ + * Reuse the ql list next field as the Treiber stack next \ + * field. \ + */ \ + a_type *first = ql_first(src); \ + a_type *last = ql_last(src, a_link); \ + void* cur_tail = atomic_load_p(&queue->tail, ATOMIC_RELAXED); \ + do { \ + /* \ + * Note that this breaks the queue ring structure; \ + * it's not a ring any more! \ + */ \ + first->a_link.qre_prev = cur_tail; \ + /* \ + * Note: the upcoming CAS doesn't need an atomic; every \ + * push only needs to synchronize with the next pop, \ + * which we get from the release sequence rules. \ + */ \ + } while (!atomic_compare_exchange_weak_p(&queue->tail, \ + &cur_tail, last, ATOMIC_RELEASE, ATOMIC_RELAXED)); \ + ql_new(src); \ +} \ +a_attr void \ +a_prefix##push(a_queue_type *queue, a_type *node) { \ + ql_elm_new(node, a_link); \ + a_list_type list; \ + ql_new(&list); \ + ql_head_insert(&list, node, a_link); \ + a_prefix##push_batch(queue, &list); \ +} \ +a_attr void \ +a_prefix##pop_batch(a_queue_type *queue, a_list_type *dst) { \ + a_type *tail = atomic_load_p(&queue->tail, ATOMIC_RELAXED); \ + if (tail == NULL) { \ + /* \ + * In the common special case where there are no \ + * pending elements, bail early without a costly RMW. \ + */ \ + return; \ + } \ + tail = atomic_exchange_p(&queue->tail, NULL, ATOMIC_ACQUIRE); \ + /* \ + * It's a single-consumer queue, so if cur started non-NULL, \ + * it'd better stay non-NULL. \ + */ \ + assert(tail != NULL); \ + /* \ + * We iterate through the stack and both fix up the link \ + * structure (stack insertion broke the list requirement that \ + * the list be circularly linked). It's just as efficient at \ + * this point to make the queue a "real" queue, so do that as \ + * well. \ + * If this ever gets to be a hot spot, we can omit this fixup \ + * and make the queue a bag (i.e. not necessarily ordered), but \ + * that would mean jettisoning the existing list API as the \ + * batch pushing/popping interface. \ + */ \ + a_list_type reversed; \ + ql_new(&reversed); \ + while (tail != NULL) { \ + /* \ + * Pop an item off the stack, prepend it onto the list \ + * (reversing the order). Recall that we use the \ + * list prev field as the Treiber stack next field to \ + * preserve order of batch-pushed items when reversed. \ + */ \ + a_type *next = tail->a_link.qre_prev; \ + ql_elm_new(tail, a_link); \ + ql_head_insert(&reversed, tail, a_link); \ + tail = next; \ + } \ + ql_concat(dst, &reversed, a_link); \ +} + +#endif /* JEMALLOC_INTERNAL_MPSC_QUEUE_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/mutex.h b/BeefRT/JEMalloc/include/jemalloc/internal/mutex.h new file mode 100644 index 00000000..63a0b1b3 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/mutex.h @@ -0,0 +1,319 @@ +#ifndef JEMALLOC_INTERNAL_MUTEX_H +#define JEMALLOC_INTERNAL_MUTEX_H + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/mutex_prof.h" +#include "jemalloc/internal/tsd.h" +#include "jemalloc/internal/witness.h" + +extern int64_t opt_mutex_max_spin; + +typedef enum { + /* Can only acquire one mutex of a given witness rank at a time. */ + malloc_mutex_rank_exclusive, + /* + * Can acquire multiple mutexes of the same witness rank, but in + * address-ascending order only. + */ + malloc_mutex_address_ordered +} malloc_mutex_lock_order_t; + +typedef struct malloc_mutex_s malloc_mutex_t; +struct malloc_mutex_s { + union { + struct { + /* + * prof_data is defined first to reduce cacheline + * bouncing: the data is not touched by the mutex holder + * during unlocking, while might be modified by + * contenders. Having it before the mutex itself could + * avoid prefetching a modified cacheline (for the + * unlocking thread). + */ + mutex_prof_data_t prof_data; +#ifdef _WIN32 +# if _WIN32_WINNT >= 0x0600 + SRWLOCK lock; +# else + CRITICAL_SECTION lock; +# endif +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) + os_unfair_lock lock; +#elif (defined(JEMALLOC_MUTEX_INIT_CB)) + pthread_mutex_t lock; + malloc_mutex_t *postponed_next; +#else + pthread_mutex_t lock; +#endif + /* + * Hint flag to avoid exclusive cache line contention + * during spin waiting + */ + atomic_b_t locked; + }; + /* + * We only touch witness when configured w/ debug. However we + * keep the field in a union when !debug so that we don't have + * to pollute the code base with #ifdefs, while avoid paying the + * memory cost. + */ +#if !defined(JEMALLOC_DEBUG) + witness_t witness; + malloc_mutex_lock_order_t lock_order; +#endif + }; + +#if defined(JEMALLOC_DEBUG) + witness_t witness; + malloc_mutex_lock_order_t lock_order; +#endif +}; + +#ifdef _WIN32 +# if _WIN32_WINNT >= 0x0600 +# define MALLOC_MUTEX_LOCK(m) AcquireSRWLockExclusive(&(m)->lock) +# define MALLOC_MUTEX_UNLOCK(m) ReleaseSRWLockExclusive(&(m)->lock) +# define MALLOC_MUTEX_TRYLOCK(m) (!TryAcquireSRWLockExclusive(&(m)->lock)) +# else +# define MALLOC_MUTEX_LOCK(m) EnterCriticalSection(&(m)->lock) +# define MALLOC_MUTEX_UNLOCK(m) LeaveCriticalSection(&(m)->lock) +# define MALLOC_MUTEX_TRYLOCK(m) (!TryEnterCriticalSection(&(m)->lock)) +# endif +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) +# define MALLOC_MUTEX_LOCK(m) os_unfair_lock_lock(&(m)->lock) +# define MALLOC_MUTEX_UNLOCK(m) os_unfair_lock_unlock(&(m)->lock) +# define MALLOC_MUTEX_TRYLOCK(m) (!os_unfair_lock_trylock(&(m)->lock)) +#else +# define MALLOC_MUTEX_LOCK(m) pthread_mutex_lock(&(m)->lock) +# define MALLOC_MUTEX_UNLOCK(m) pthread_mutex_unlock(&(m)->lock) +# define MALLOC_MUTEX_TRYLOCK(m) (pthread_mutex_trylock(&(m)->lock) != 0) +#endif + +#define LOCK_PROF_DATA_INITIALIZER \ + {NSTIME_ZERO_INITIALIZER, NSTIME_ZERO_INITIALIZER, 0, 0, 0, \ + ATOMIC_INIT(0), 0, NULL, 0} + +#ifdef _WIN32 +# define MALLOC_MUTEX_INITIALIZER +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) +# if defined(JEMALLOC_DEBUG) +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0} +# else +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} +# endif +#elif (defined(JEMALLOC_MUTEX_INIT_CB)) +# if (defined(JEMALLOC_DEBUG)) +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0} +# else +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} +# endif + +#else +# define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT +# if defined(JEMALLOC_DEBUG) +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0} +# else +# define MALLOC_MUTEX_INITIALIZER \ + {{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \ + WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)} +# endif +#endif + +#ifdef JEMALLOC_LAZY_LOCK +extern bool isthreaded; +#else +# undef isthreaded /* Undo private_namespace.h definition. */ +# define isthreaded true +#endif + +bool malloc_mutex_init(malloc_mutex_t *mutex, const char *name, + witness_rank_t rank, malloc_mutex_lock_order_t lock_order); +void malloc_mutex_prefork(tsdn_t *tsdn, malloc_mutex_t *mutex); +void malloc_mutex_postfork_parent(tsdn_t *tsdn, malloc_mutex_t *mutex); +void malloc_mutex_postfork_child(tsdn_t *tsdn, malloc_mutex_t *mutex); +bool malloc_mutex_boot(void); +void malloc_mutex_prof_data_reset(tsdn_t *tsdn, malloc_mutex_t *mutex); + +void malloc_mutex_lock_slow(malloc_mutex_t *mutex); + +static inline void +malloc_mutex_lock_final(malloc_mutex_t *mutex) { + MALLOC_MUTEX_LOCK(mutex); + atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED); +} + +static inline bool +malloc_mutex_trylock_final(malloc_mutex_t *mutex) { + return MALLOC_MUTEX_TRYLOCK(mutex); +} + +static inline void +mutex_owner_stats_update(tsdn_t *tsdn, malloc_mutex_t *mutex) { + if (config_stats) { + mutex_prof_data_t *data = &mutex->prof_data; + data->n_lock_ops++; + if (data->prev_owner != tsdn) { + data->prev_owner = tsdn; + data->n_owner_switches++; + } + } +} + +/* Trylock: return false if the lock is successfully acquired. */ +static inline bool +malloc_mutex_trylock(tsdn_t *tsdn, malloc_mutex_t *mutex) { + witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness); + if (isthreaded) { + if (malloc_mutex_trylock_final(mutex)) { + atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED); + return true; + } + mutex_owner_stats_update(tsdn, mutex); + } + witness_lock(tsdn_witness_tsdp_get(tsdn), &mutex->witness); + + return false; +} + +/* Aggregate lock prof data. */ +static inline void +malloc_mutex_prof_merge(mutex_prof_data_t *sum, mutex_prof_data_t *data) { + nstime_add(&sum->tot_wait_time, &data->tot_wait_time); + if (nstime_compare(&sum->max_wait_time, &data->max_wait_time) < 0) { + nstime_copy(&sum->max_wait_time, &data->max_wait_time); + } + + sum->n_wait_times += data->n_wait_times; + sum->n_spin_acquired += data->n_spin_acquired; + + if (sum->max_n_thds < data->max_n_thds) { + sum->max_n_thds = data->max_n_thds; + } + uint32_t cur_n_waiting_thds = atomic_load_u32(&sum->n_waiting_thds, + ATOMIC_RELAXED); + uint32_t new_n_waiting_thds = cur_n_waiting_thds + atomic_load_u32( + &data->n_waiting_thds, ATOMIC_RELAXED); + atomic_store_u32(&sum->n_waiting_thds, new_n_waiting_thds, + ATOMIC_RELAXED); + sum->n_owner_switches += data->n_owner_switches; + sum->n_lock_ops += data->n_lock_ops; +} + +static inline void +malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) { + witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness); + if (isthreaded) { + if (malloc_mutex_trylock_final(mutex)) { + malloc_mutex_lock_slow(mutex); + atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED); + } + mutex_owner_stats_update(tsdn, mutex); + } + witness_lock(tsdn_witness_tsdp_get(tsdn), &mutex->witness); +} + +static inline void +malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex) { + atomic_store_b(&mutex->locked, false, ATOMIC_RELAXED); + witness_unlock(tsdn_witness_tsdp_get(tsdn), &mutex->witness); + if (isthreaded) { + MALLOC_MUTEX_UNLOCK(mutex); + } +} + +static inline void +malloc_mutex_assert_owner(tsdn_t *tsdn, malloc_mutex_t *mutex) { + witness_assert_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness); +} + +static inline void +malloc_mutex_assert_not_owner(tsdn_t *tsdn, malloc_mutex_t *mutex) { + witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness); +} + +static inline void +malloc_mutex_prof_copy(mutex_prof_data_t *dst, mutex_prof_data_t *source) { + /* + * Not *really* allowed (we shouldn't be doing non-atomic loads of + * atomic data), but the mutex protection makes this safe, and writing + * a member-for-member copy is tedious for this situation. + */ + *dst = *source; + /* n_wait_thds is not reported (modified w/o locking). */ + atomic_store_u32(&dst->n_waiting_thds, 0, ATOMIC_RELAXED); +} + +/* Copy the prof data from mutex for processing. */ +static inline void +malloc_mutex_prof_read(tsdn_t *tsdn, mutex_prof_data_t *data, + malloc_mutex_t *mutex) { + /* Can only read holding the mutex. */ + malloc_mutex_assert_owner(tsdn, mutex); + malloc_mutex_prof_copy(data, &mutex->prof_data); +} + +static inline void +malloc_mutex_prof_accum(tsdn_t *tsdn, mutex_prof_data_t *data, + malloc_mutex_t *mutex) { + mutex_prof_data_t *source = &mutex->prof_data; + /* Can only read holding the mutex. */ + malloc_mutex_assert_owner(tsdn, mutex); + + nstime_add(&data->tot_wait_time, &source->tot_wait_time); + if (nstime_compare(&source->max_wait_time, &data->max_wait_time) > 0) { + nstime_copy(&data->max_wait_time, &source->max_wait_time); + } + data->n_wait_times += source->n_wait_times; + data->n_spin_acquired += source->n_spin_acquired; + if (data->max_n_thds < source->max_n_thds) { + data->max_n_thds = source->max_n_thds; + } + /* n_wait_thds is not reported. */ + atomic_store_u32(&data->n_waiting_thds, 0, ATOMIC_RELAXED); + data->n_owner_switches += source->n_owner_switches; + data->n_lock_ops += source->n_lock_ops; +} + +/* Compare the prof data and update to the maximum. */ +static inline void +malloc_mutex_prof_max_update(tsdn_t *tsdn, mutex_prof_data_t *data, + malloc_mutex_t *mutex) { + mutex_prof_data_t *source = &mutex->prof_data; + /* Can only read holding the mutex. */ + malloc_mutex_assert_owner(tsdn, mutex); + + if (nstime_compare(&source->tot_wait_time, &data->tot_wait_time) > 0) { + nstime_copy(&data->tot_wait_time, &source->tot_wait_time); + } + if (nstime_compare(&source->max_wait_time, &data->max_wait_time) > 0) { + nstime_copy(&data->max_wait_time, &source->max_wait_time); + } + if (source->n_wait_times > data->n_wait_times) { + data->n_wait_times = source->n_wait_times; + } + if (source->n_spin_acquired > data->n_spin_acquired) { + data->n_spin_acquired = source->n_spin_acquired; + } + if (source->max_n_thds > data->max_n_thds) { + data->max_n_thds = source->max_n_thds; + } + if (source->n_owner_switches > data->n_owner_switches) { + data->n_owner_switches = source->n_owner_switches; + } + if (source->n_lock_ops > data->n_lock_ops) { + data->n_lock_ops = source->n_lock_ops; + } + /* n_wait_thds is not reported. */ +} + +#endif /* JEMALLOC_INTERNAL_MUTEX_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/mutex_prof.h b/BeefRT/JEMalloc/include/jemalloc/internal/mutex_prof.h new file mode 100644 index 00000000..4a526a5a --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/mutex_prof.h @@ -0,0 +1,117 @@ +#ifndef JEMALLOC_INTERNAL_MUTEX_PROF_H +#define JEMALLOC_INTERNAL_MUTEX_PROF_H + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/nstime.h" +#include "jemalloc/internal/tsd_types.h" + +#define MUTEX_PROF_GLOBAL_MUTEXES \ + OP(background_thread) \ + OP(max_per_bg_thd) \ + OP(ctl) \ + OP(prof) \ + OP(prof_thds_data) \ + OP(prof_dump) \ + OP(prof_recent_alloc) \ + OP(prof_recent_dump) \ + OP(prof_stats) + +typedef enum { +#define OP(mtx) global_prof_mutex_##mtx, + MUTEX_PROF_GLOBAL_MUTEXES +#undef OP + mutex_prof_num_global_mutexes +} mutex_prof_global_ind_t; + +#define MUTEX_PROF_ARENA_MUTEXES \ + OP(large) \ + OP(extent_avail) \ + OP(extents_dirty) \ + OP(extents_muzzy) \ + OP(extents_retained) \ + OP(decay_dirty) \ + OP(decay_muzzy) \ + OP(base) \ + OP(tcache_list) \ + OP(hpa_shard) \ + OP(hpa_shard_grow) \ + OP(hpa_sec) + +typedef enum { +#define OP(mtx) arena_prof_mutex_##mtx, + MUTEX_PROF_ARENA_MUTEXES +#undef OP + mutex_prof_num_arena_mutexes +} mutex_prof_arena_ind_t; + +/* + * The forth parameter is a boolean value that is true for derived rate counters + * and false for real ones. + */ +#define MUTEX_PROF_UINT64_COUNTERS \ + OP(num_ops, uint64_t, "n_lock_ops", false, num_ops) \ + OP(num_ops_ps, uint64_t, "(#/sec)", true, num_ops) \ + OP(num_wait, uint64_t, "n_waiting", false, num_wait) \ + OP(num_wait_ps, uint64_t, "(#/sec)", true, num_wait) \ + OP(num_spin_acq, uint64_t, "n_spin_acq", false, num_spin_acq) \ + OP(num_spin_acq_ps, uint64_t, "(#/sec)", true, num_spin_acq) \ + OP(num_owner_switch, uint64_t, "n_owner_switch", false, num_owner_switch) \ + OP(num_owner_switch_ps, uint64_t, "(#/sec)", true, num_owner_switch) \ + OP(total_wait_time, uint64_t, "total_wait_ns", false, total_wait_time) \ + OP(total_wait_time_ps, uint64_t, "(#/sec)", true, total_wait_time) \ + OP(max_wait_time, uint64_t, "max_wait_ns", false, max_wait_time) + +#define MUTEX_PROF_UINT32_COUNTERS \ + OP(max_num_thds, uint32_t, "max_n_thds", false, max_num_thds) + +#define MUTEX_PROF_COUNTERS \ + MUTEX_PROF_UINT64_COUNTERS \ + MUTEX_PROF_UINT32_COUNTERS + +#define OP(counter, type, human, derived, base_counter) mutex_counter_##counter, + +#define COUNTER_ENUM(counter_list, t) \ + typedef enum { \ + counter_list \ + mutex_prof_num_##t##_counters \ + } mutex_prof_##t##_counter_ind_t; + +COUNTER_ENUM(MUTEX_PROF_UINT64_COUNTERS, uint64_t) +COUNTER_ENUM(MUTEX_PROF_UINT32_COUNTERS, uint32_t) + +#undef COUNTER_ENUM +#undef OP + +typedef struct { + /* + * Counters touched on the slow path, i.e. when there is lock + * contention. We update them once we have the lock. + */ + /* Total time (in nano seconds) spent waiting on this mutex. */ + nstime_t tot_wait_time; + /* Max time (in nano seconds) spent on a single lock operation. */ + nstime_t max_wait_time; + /* # of times have to wait for this mutex (after spinning). */ + uint64_t n_wait_times; + /* # of times acquired the mutex through local spinning. */ + uint64_t n_spin_acquired; + /* Max # of threads waiting for the mutex at the same time. */ + uint32_t max_n_thds; + /* Current # of threads waiting on the lock. Atomic synced. */ + atomic_u32_t n_waiting_thds; + + /* + * Data touched on the fast path. These are modified right after we + * grab the lock, so it's placed closest to the end (i.e. right before + * the lock) so that we have a higher chance of them being on the same + * cacheline. + */ + /* # of times the mutex holder is different than the previous one. */ + uint64_t n_owner_switches; + /* Previous mutex holder, to facilitate n_owner_switches. */ + tsdn_t *prev_owner; + /* # of lock() operations in total. */ + uint64_t n_lock_ops; +} mutex_prof_data_t; + +#endif /* JEMALLOC_INTERNAL_MUTEX_PROF_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/nstime.h b/BeefRT/JEMalloc/include/jemalloc/internal/nstime.h new file mode 100644 index 00000000..486e5cca --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/nstime.h @@ -0,0 +1,73 @@ +#ifndef JEMALLOC_INTERNAL_NSTIME_H +#define JEMALLOC_INTERNAL_NSTIME_H + +/* Maximum supported number of seconds (~584 years). */ +#define NSTIME_SEC_MAX KQU(18446744072) + +#define NSTIME_MAGIC ((uint32_t)0xb8a9ce37) +#ifdef JEMALLOC_DEBUG +# define NSTIME_ZERO_INITIALIZER {0, NSTIME_MAGIC} +#else +# define NSTIME_ZERO_INITIALIZER {0} +#endif + +typedef struct { + uint64_t ns; +#ifdef JEMALLOC_DEBUG + uint32_t magic; /* Tracks if initialized. */ +#endif +} nstime_t; + +static const nstime_t nstime_zero = NSTIME_ZERO_INITIALIZER; + +void nstime_init(nstime_t *time, uint64_t ns); +void nstime_init2(nstime_t *time, uint64_t sec, uint64_t nsec); +uint64_t nstime_ns(const nstime_t *time); +uint64_t nstime_sec(const nstime_t *time); +uint64_t nstime_msec(const nstime_t *time); +uint64_t nstime_nsec(const nstime_t *time); +void nstime_copy(nstime_t *time, const nstime_t *source); +int nstime_compare(const nstime_t *a, const nstime_t *b); +void nstime_add(nstime_t *time, const nstime_t *addend); +void nstime_iadd(nstime_t *time, uint64_t addend); +void nstime_subtract(nstime_t *time, const nstime_t *subtrahend); +void nstime_isubtract(nstime_t *time, uint64_t subtrahend); +void nstime_imultiply(nstime_t *time, uint64_t multiplier); +void nstime_idivide(nstime_t *time, uint64_t divisor); +uint64_t nstime_divide(const nstime_t *time, const nstime_t *divisor); +uint64_t nstime_ns_since(const nstime_t *past); + +typedef bool (nstime_monotonic_t)(void); +extern nstime_monotonic_t *JET_MUTABLE nstime_monotonic; + +typedef void (nstime_update_t)(nstime_t *); +extern nstime_update_t *JET_MUTABLE nstime_update; + +typedef void (nstime_prof_update_t)(nstime_t *); +extern nstime_prof_update_t *JET_MUTABLE nstime_prof_update; + +void nstime_init_update(nstime_t *time); +void nstime_prof_init_update(nstime_t *time); + +enum prof_time_res_e { + prof_time_res_default = 0, + prof_time_res_high = 1 +}; +typedef enum prof_time_res_e prof_time_res_t; + +extern prof_time_res_t opt_prof_time_res; +extern const char *prof_time_res_mode_names[]; + +JEMALLOC_ALWAYS_INLINE void +nstime_init_zero(nstime_t *time) { + nstime_copy(time, &nstime_zero); +} + +JEMALLOC_ALWAYS_INLINE bool +nstime_equals_zero(nstime_t *time) { + int diff = nstime_compare(time, &nstime_zero); + assert(diff >= 0); + return diff == 0; +} + +#endif /* JEMALLOC_INTERNAL_NSTIME_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/pa.h b/BeefRT/JEMalloc/include/jemalloc/internal/pa.h new file mode 100644 index 00000000..4748a05b --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/pa.h @@ -0,0 +1,243 @@ +#ifndef JEMALLOC_INTERNAL_PA_H +#define JEMALLOC_INTERNAL_PA_H + +#include "jemalloc/internal/base.h" +#include "jemalloc/internal/decay.h" +#include "jemalloc/internal/ecache.h" +#include "jemalloc/internal/edata_cache.h" +#include "jemalloc/internal/emap.h" +#include "jemalloc/internal/hpa.h" +#include "jemalloc/internal/lockedint.h" +#include "jemalloc/internal/pac.h" +#include "jemalloc/internal/pai.h" +#include "jemalloc/internal/sec.h" + +/* + * The page allocator; responsible for acquiring pages of memory for + * allocations. It picks the implementation of the page allocator interface + * (i.e. a pai_t) to handle a given page-level allocation request. For now, the + * only such implementation is the PAC code ("page allocator classic"), but + * others will be coming soon. + */ + +typedef struct pa_central_s pa_central_t; +struct pa_central_s { + hpa_central_t hpa; +}; + +/* + * The stats for a particular pa_shard. Because of the way the ctl module + * handles stats epoch data collection (it has its own arena_stats, and merges + * the stats from each arena into it), this needs to live in the arena_stats_t; + * hence we define it here and let the pa_shard have a pointer (rather than the + * more natural approach of just embedding it in the pa_shard itself). + * + * We follow the arena_stats_t approach of marking the derived fields. These + * are the ones that are not maintained on their own; instead, their values are + * derived during those stats merges. + */ +typedef struct pa_shard_stats_s pa_shard_stats_t; +struct pa_shard_stats_s { + /* Number of edata_t structs allocated by base, but not being used. */ + size_t edata_avail; /* Derived. */ + /* + * Stats specific to the PAC. For now, these are the only stats that + * exist, but there will eventually be other page allocators. Things + * like edata_avail make sense in a cross-PA sense, but things like + * npurges don't. + */ + pac_stats_t pac_stats; +}; + +/* + * The local allocator handle. Keeps the state necessary to satisfy page-sized + * allocations. + * + * The contents are mostly internal to the PA module. The key exception is that + * arena decay code is allowed to grab pointers to the dirty and muzzy ecaches + * decay_ts, for a couple of queries, passing them back to a PA function, or + * acquiring decay.mtx and looking at decay.purging. The reasoning is that, + * while PA decides what and how to purge, the arena code decides when and where + * (e.g. on what thread). It's allowed to use the presence of another purger to + * decide. + * (The background thread code also touches some other decay internals, but + * that's not fundamental; its' just an artifact of a partial refactoring, and + * its accesses could be straightforwardly moved inside the decay module). + */ +typedef struct pa_shard_s pa_shard_t; +struct pa_shard_s { + /* The central PA this shard is associated with. */ + pa_central_t *central; + + /* + * Number of pages in active extents. + * + * Synchronization: atomic. + */ + atomic_zu_t nactive; + + /* + * Whether or not we should prefer the hugepage allocator. Atomic since + * it may be concurrently modified by a thread setting extent hooks. + * Note that we still may do HPA operations in this arena; if use_hpa is + * changed from true to false, we'll free back to the hugepage allocator + * for those allocations. + */ + atomic_b_t use_hpa; + + /* + * If we never used the HPA to begin with, it wasn't initialized, and so + * we shouldn't try to e.g. acquire its mutexes during fork. This + * tracks that knowledge. + */ + bool ever_used_hpa; + + /* Allocates from a PAC. */ + pac_t pac; + + /* + * We place a small extent cache in front of the HPA, since we intend + * these configurations to use many fewer arenas, and therefore have a + * higher risk of hot locks. + */ + sec_t hpa_sec; + hpa_shard_t hpa_shard; + + /* The source of edata_t objects. */ + edata_cache_t edata_cache; + + unsigned ind; + + malloc_mutex_t *stats_mtx; + pa_shard_stats_t *stats; + + /* The emap this shard is tied to. */ + emap_t *emap; + + /* The base from which we get the ehooks and allocate metadat. */ + base_t *base; +}; + +static inline bool +pa_shard_dont_decay_muzzy(pa_shard_t *shard) { + return ecache_npages_get(&shard->pac.ecache_muzzy) == 0 && + pac_decay_ms_get(&shard->pac, extent_state_muzzy) <= 0; +} + +static inline ehooks_t * +pa_shard_ehooks_get(pa_shard_t *shard) { + return base_ehooks_get(shard->base); +} + +/* Returns true on error. */ +bool pa_central_init(pa_central_t *central, base_t *base, bool hpa, + hpa_hooks_t *hpa_hooks); + +/* Returns true on error. */ +bool pa_shard_init(tsdn_t *tsdn, pa_shard_t *shard, pa_central_t *central, + emap_t *emap, base_t *base, unsigned ind, pa_shard_stats_t *stats, + malloc_mutex_t *stats_mtx, nstime_t *cur_time, size_t oversize_threshold, + ssize_t dirty_decay_ms, ssize_t muzzy_decay_ms); + +/* + * This isn't exposed to users; we allow late enablement of the HPA shard so + * that we can boot without worrying about the HPA, then turn it on in a0. + */ +bool pa_shard_enable_hpa(tsdn_t *tsdn, pa_shard_t *shard, + const hpa_shard_opts_t *hpa_opts, const sec_opts_t *hpa_sec_opts); + +/* + * We stop using the HPA when custom extent hooks are installed, but still + * redirect deallocations to it. + */ +void pa_shard_disable_hpa(tsdn_t *tsdn, pa_shard_t *shard); + +/* + * This does the PA-specific parts of arena reset (i.e. freeing all active + * allocations). + */ +void pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard); + +/* + * Destroy all the remaining retained extents. Should only be called after + * decaying all active, dirty, and muzzy extents to the retained state, as the + * last step in destroying the shard. + */ +void pa_shard_destroy(tsdn_t *tsdn, pa_shard_t *shard); + +/* Gets an edata for the given allocation. */ +edata_t *pa_alloc(tsdn_t *tsdn, pa_shard_t *shard, size_t size, + size_t alignment, bool slab, szind_t szind, bool zero, bool guarded, + bool *deferred_work_generated); +/* Returns true on error, in which case nothing changed. */ +bool pa_expand(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, size_t old_size, + size_t new_size, szind_t szind, bool zero, bool *deferred_work_generated); +/* + * The same. Sets *generated_dirty to true if we produced new dirty pages, and + * false otherwise. + */ +bool pa_shrink(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, size_t old_size, + size_t new_size, szind_t szind, bool *deferred_work_generated); +/* + * Frees the given edata back to the pa. Sets *generated_dirty if we produced + * new dirty pages (well, we always set it for now; but this need not be the + * case). + * (We could make generated_dirty the return value of course, but this is more + * consistent with the shrink pathway and our error codes here). + */ +void pa_dalloc(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, + bool *deferred_work_generated); +bool pa_decay_ms_set(tsdn_t *tsdn, pa_shard_t *shard, extent_state_t state, + ssize_t decay_ms, pac_purge_eagerness_t eagerness); +ssize_t pa_decay_ms_get(pa_shard_t *shard, extent_state_t state); + +/* + * Do deferred work on this PA shard. + * + * Morally, this should do both PAC decay and the HPA deferred work. For now, + * though, the arena, background thread, and PAC modules are tightly interwoven + * in a way that's tricky to extricate, so we only do the HPA-specific parts. + */ +void pa_shard_set_deferral_allowed(tsdn_t *tsdn, pa_shard_t *shard, + bool deferral_allowed); +void pa_shard_do_deferred_work(tsdn_t *tsdn, pa_shard_t *shard); +void pa_shard_try_deferred_work(tsdn_t *tsdn, pa_shard_t *shard); +uint64_t pa_shard_time_until_deferred_work(tsdn_t *tsdn, pa_shard_t *shard); + +/******************************************************************************/ +/* + * Various bits of "boring" functionality that are still part of this module, + * but that we relegate to pa_extra.c, to keep the core logic in pa.c as + * readable as possible. + */ + +/* + * These fork phases are synchronized with the arena fork phase numbering to + * make it easy to keep straight. That's why there's no prefork1. + */ +void pa_shard_prefork0(tsdn_t *tsdn, pa_shard_t *shard); +void pa_shard_prefork2(tsdn_t *tsdn, pa_shard_t *shard); +void pa_shard_prefork3(tsdn_t *tsdn, pa_shard_t *shard); +void pa_shard_prefork4(tsdn_t *tsdn, pa_shard_t *shard); +void pa_shard_prefork5(tsdn_t *tsdn, pa_shard_t *shard); +void pa_shard_postfork_parent(tsdn_t *tsdn, pa_shard_t *shard); +void pa_shard_postfork_child(tsdn_t *tsdn, pa_shard_t *shard); + +void pa_shard_basic_stats_merge(pa_shard_t *shard, size_t *nactive, + size_t *ndirty, size_t *nmuzzy); + +void pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard, + pa_shard_stats_t *pa_shard_stats_out, pac_estats_t *estats_out, + hpa_shard_stats_t *hpa_stats_out, sec_stats_t *sec_stats_out, + size_t *resident); + +/* + * Reads the PA-owned mutex stats into the output stats array, at the + * appropriate positions. Morally, these stats should really live in + * pa_shard_stats_t, but the indices are sort of baked into the various mutex + * prof macros. This would be a good thing to do at some point. + */ +void pa_shard_mtx_stats_read(tsdn_t *tsdn, pa_shard_t *shard, + mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes]); + +#endif /* JEMALLOC_INTERNAL_PA_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/pac.h b/BeefRT/JEMalloc/include/jemalloc/internal/pac.h new file mode 100644 index 00000000..01c4e6af --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/pac.h @@ -0,0 +1,179 @@ +#ifndef JEMALLOC_INTERNAL_PAC_H +#define JEMALLOC_INTERNAL_PAC_H + +#include "jemalloc/internal/exp_grow.h" +#include "jemalloc/internal/pai.h" +#include "san_bump.h" + + +/* + * Page allocator classic; an implementation of the PAI interface that: + * - Can be used for arenas with custom extent hooks. + * - Can always satisfy any allocation request (including highly-fragmentary + * ones). + * - Can use efficient OS-level zeroing primitives for demand-filled pages. + */ + +/* How "eager" decay/purging should be. */ +enum pac_purge_eagerness_e { + PAC_PURGE_ALWAYS, + PAC_PURGE_NEVER, + PAC_PURGE_ON_EPOCH_ADVANCE +}; +typedef enum pac_purge_eagerness_e pac_purge_eagerness_t; + +typedef struct pac_decay_stats_s pac_decay_stats_t; +struct pac_decay_stats_s { + /* Total number of purge sweeps. */ + locked_u64_t npurge; + /* Total number of madvise calls made. */ + locked_u64_t nmadvise; + /* Total number of pages purged. */ + locked_u64_t purged; +}; + +typedef struct pac_estats_s pac_estats_t; +struct pac_estats_s { + /* + * Stats for a given index in the range [0, SC_NPSIZES] in the various + * ecache_ts. + * We track both bytes and # of extents: two extents in the same bucket + * may have different sizes if adjacent size classes differ by more than + * a page, so bytes cannot always be derived from # of extents. + */ + size_t ndirty; + size_t dirty_bytes; + size_t nmuzzy; + size_t muzzy_bytes; + size_t nretained; + size_t retained_bytes; +}; + +typedef struct pac_stats_s pac_stats_t; +struct pac_stats_s { + pac_decay_stats_t decay_dirty; + pac_decay_stats_t decay_muzzy; + + /* + * Number of unused virtual memory bytes currently retained. Retained + * bytes are technically mapped (though always decommitted or purged), + * but they are excluded from the mapped statistic (above). + */ + size_t retained; /* Derived. */ + + /* + * Number of bytes currently mapped, excluding retained memory (and any + * base-allocated memory, which is tracked by the arena stats). + * + * We name this "pac_mapped" to avoid confusion with the arena_stats + * "mapped". + */ + atomic_zu_t pac_mapped; + + /* VM space had to be leaked (undocumented). Normally 0. */ + atomic_zu_t abandoned_vm; +}; + +typedef struct pac_s pac_t; +struct pac_s { + /* + * Must be the first member (we convert it to a PAC given only a + * pointer). The handle to the allocation interface. + */ + pai_t pai; + /* + * Collections of extents that were previously allocated. These are + * used when allocating extents, in an attempt to re-use address space. + * + * Synchronization: internal. + */ + ecache_t ecache_dirty; + ecache_t ecache_muzzy; + ecache_t ecache_retained; + + base_t *base; + emap_t *emap; + edata_cache_t *edata_cache; + + /* The grow info for the retained ecache. */ + exp_grow_t exp_grow; + malloc_mutex_t grow_mtx; + + /* Special allocator for guarded frequently reused extents. */ + san_bump_alloc_t sba; + + /* How large extents should be before getting auto-purged. */ + atomic_zu_t oversize_threshold; + + /* + * Decay-based purging state, responsible for scheduling extent state + * transitions. + * + * Synchronization: via the internal mutex. + */ + decay_t decay_dirty; /* dirty --> muzzy */ + decay_t decay_muzzy; /* muzzy --> retained */ + + malloc_mutex_t *stats_mtx; + pac_stats_t *stats; + + /* Extent serial number generator state. */ + atomic_zu_t extent_sn_next; +}; + +bool pac_init(tsdn_t *tsdn, pac_t *pac, base_t *base, emap_t *emap, + edata_cache_t *edata_cache, nstime_t *cur_time, size_t oversize_threshold, + ssize_t dirty_decay_ms, ssize_t muzzy_decay_ms, pac_stats_t *pac_stats, + malloc_mutex_t *stats_mtx); + +static inline size_t +pac_mapped(pac_t *pac) { + return atomic_load_zu(&pac->stats->pac_mapped, ATOMIC_RELAXED); +} + +static inline ehooks_t * +pac_ehooks_get(pac_t *pac) { + return base_ehooks_get(pac->base); +} + +/* + * All purging functions require holding decay->mtx. This is one of the few + * places external modules are allowed to peek inside pa_shard_t internals. + */ + +/* + * Decays the number of pages currently in the ecache. This might not leave the + * ecache empty if other threads are inserting dirty objects into it + * concurrently with the call. + */ +void pac_decay_all(tsdn_t *tsdn, pac_t *pac, decay_t *decay, + pac_decay_stats_t *decay_stats, ecache_t *ecache, bool fully_decay); +/* + * Updates decay settings for the current time, and conditionally purges in + * response (depending on decay_purge_setting). Returns whether or not the + * epoch advanced. + */ +bool pac_maybe_decay_purge(tsdn_t *tsdn, pac_t *pac, decay_t *decay, + pac_decay_stats_t *decay_stats, ecache_t *ecache, + pac_purge_eagerness_t eagerness); + +/* + * Gets / sets the maximum amount that we'll grow an arena down the + * grow-retained pathways (unless forced to by an allocaction request). + * + * Set new_limit to NULL if it's just a query, or old_limit to NULL if you don't + * care about the previous value. + * + * Returns true on error (if the new limit is not valid). + */ +bool pac_retain_grow_limit_get_set(tsdn_t *tsdn, pac_t *pac, size_t *old_limit, + size_t *new_limit); + +bool pac_decay_ms_set(tsdn_t *tsdn, pac_t *pac, extent_state_t state, + ssize_t decay_ms, pac_purge_eagerness_t eagerness); +ssize_t pac_decay_ms_get(pac_t *pac, extent_state_t state); + +void pac_reset(tsdn_t *tsdn, pac_t *pac); +void pac_destroy(tsdn_t *tsdn, pac_t *pac); + +#endif /* JEMALLOC_INTERNAL_PAC_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/pages.h b/BeefRT/JEMalloc/include/jemalloc/internal/pages.h new file mode 100644 index 00000000..ad1f606a --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/pages.h @@ -0,0 +1,119 @@ +#ifndef JEMALLOC_INTERNAL_PAGES_EXTERNS_H +#define JEMALLOC_INTERNAL_PAGES_EXTERNS_H + +/* Page size. LG_PAGE is determined by the configure script. */ +#ifdef PAGE_MASK +# undef PAGE_MASK +#endif +#define PAGE ((size_t)(1U << LG_PAGE)) +#define PAGE_MASK ((size_t)(PAGE - 1)) +/* Return the page base address for the page containing address a. */ +#define PAGE_ADDR2BASE(a) \ + ((void *)((uintptr_t)(a) & ~PAGE_MASK)) +/* Return the smallest pagesize multiple that is >= s. */ +#define PAGE_CEILING(s) \ + (((s) + PAGE_MASK) & ~PAGE_MASK) +/* Return the largest pagesize multiple that is <=s. */ +#define PAGE_FLOOR(s) \ + ((s) & ~PAGE_MASK) + +/* Huge page size. LG_HUGEPAGE is determined by the configure script. */ +#define HUGEPAGE ((size_t)(1U << LG_HUGEPAGE)) +#define HUGEPAGE_MASK ((size_t)(HUGEPAGE - 1)) + +#if LG_HUGEPAGE != 0 +# define HUGEPAGE_PAGES (HUGEPAGE / PAGE) +#else +/* + * It's convenient to define arrays (or bitmaps) of HUGEPAGE_PAGES lengths. If + * we can't autodetect the hugepage size, it gets treated as 0, in which case + * we'll trigger a compiler error in those arrays. Avoid this case by ensuring + * that this value is at least 1. (We won't ever run in this degraded state; + * hpa_supported() returns false in this case. + */ +# define HUGEPAGE_PAGES 1 +#endif + +/* Return the huge page base address for the huge page containing address a. */ +#define HUGEPAGE_ADDR2BASE(a) \ + ((void *)((uintptr_t)(a) & ~HUGEPAGE_MASK)) +/* Return the smallest pagesize multiple that is >= s. */ +#define HUGEPAGE_CEILING(s) \ + (((s) + HUGEPAGE_MASK) & ~HUGEPAGE_MASK) + +/* PAGES_CAN_PURGE_LAZY is defined if lazy purging is supported. */ +#if defined(_WIN32) || defined(JEMALLOC_PURGE_MADVISE_FREE) +# define PAGES_CAN_PURGE_LAZY +#endif +/* + * PAGES_CAN_PURGE_FORCED is defined if forced purging is supported. + * + * The only supported way to hard-purge on Windows is to decommit and then + * re-commit, but doing so is racy, and if re-commit fails it's a pain to + * propagate the "poisoned" memory state. Since we typically decommit as the + * next step after purging on Windows anyway, there's no point in adding such + * complexity. + */ +#if !defined(_WIN32) && ((defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \ + defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)) || \ + defined(JEMALLOC_MAPS_COALESCE)) +# define PAGES_CAN_PURGE_FORCED +#endif + +static const bool pages_can_purge_lazy = +#ifdef PAGES_CAN_PURGE_LAZY + true +#else + false +#endif + ; +static const bool pages_can_purge_forced = +#ifdef PAGES_CAN_PURGE_FORCED + true +#else + false +#endif + ; + +#if defined(JEMALLOC_HAVE_MADVISE_HUGE) || defined(JEMALLOC_HAVE_MEMCNTL) +# define PAGES_CAN_HUGIFY +#endif + +static const bool pages_can_hugify = +#ifdef PAGES_CAN_HUGIFY + true +#else + false +#endif + ; + +typedef enum { + thp_mode_default = 0, /* Do not change hugepage settings. */ + thp_mode_always = 1, /* Always set MADV_HUGEPAGE. */ + thp_mode_never = 2, /* Always set MADV_NOHUGEPAGE. */ + + thp_mode_names_limit = 3, /* Used for option processing. */ + thp_mode_not_supported = 3 /* No THP support detected. */ +} thp_mode_t; + +#define THP_MODE_DEFAULT thp_mode_default +extern thp_mode_t opt_thp; +extern thp_mode_t init_system_thp_mode; /* Initial system wide state. */ +extern const char *thp_mode_names[]; + +void *pages_map(void *addr, size_t size, size_t alignment, bool *commit); +void pages_unmap(void *addr, size_t size); +bool pages_commit(void *addr, size_t size); +bool pages_decommit(void *addr, size_t size); +bool pages_purge_lazy(void *addr, size_t size); +bool pages_purge_forced(void *addr, size_t size); +bool pages_huge(void *addr, size_t size); +bool pages_nohuge(void *addr, size_t size); +bool pages_dontdump(void *addr, size_t size); +bool pages_dodump(void *addr, size_t size); +bool pages_boot(void); +void pages_set_thp_state (void *ptr, size_t size); +void pages_mark_guards(void *head, void *tail); +void pages_unmark_guards(void *head, void *tail); + +#endif /* JEMALLOC_INTERNAL_PAGES_EXTERNS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/pai.h b/BeefRT/JEMalloc/include/jemalloc/internal/pai.h new file mode 100644 index 00000000..d978cd7d --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/pai.h @@ -0,0 +1,95 @@ +#ifndef JEMALLOC_INTERNAL_PAI_H +#define JEMALLOC_INTERNAL_PAI_H + +/* An interface for page allocation. */ + +typedef struct pai_s pai_t; +struct pai_s { + /* Returns NULL on failure. */ + edata_t *(*alloc)(tsdn_t *tsdn, pai_t *self, size_t size, + size_t alignment, bool zero, bool guarded, bool frequent_reuse, + bool *deferred_work_generated); + /* + * Returns the number of extents added to the list (which may be fewer + * than requested, in case of OOM). The list should already be + * initialized. The only alignment guarantee is page-alignment, and + * the results are not necessarily zeroed. + */ + size_t (*alloc_batch)(tsdn_t *tsdn, pai_t *self, size_t size, + size_t nallocs, edata_list_active_t *results, + bool *deferred_work_generated); + bool (*expand)(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool zero, + bool *deferred_work_generated); + bool (*shrink)(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool *deferred_work_generated); + void (*dalloc)(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated); + /* This function empties out list as a side-effect of being called. */ + void (*dalloc_batch)(tsdn_t *tsdn, pai_t *self, + edata_list_active_t *list, bool *deferred_work_generated); + uint64_t (*time_until_deferred_work)(tsdn_t *tsdn, pai_t *self); +}; + +/* + * These are just simple convenience functions to avoid having to reference the + * same pai_t twice on every invocation. + */ + +static inline edata_t * +pai_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, + bool zero, bool guarded, bool frequent_reuse, + bool *deferred_work_generated) { + return self->alloc(tsdn, self, size, alignment, zero, guarded, + frequent_reuse, deferred_work_generated); +} + +static inline size_t +pai_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs, + edata_list_active_t *results, bool *deferred_work_generated) { + return self->alloc_batch(tsdn, self, size, nallocs, results, + deferred_work_generated); +} + +static inline bool +pai_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, + size_t new_size, bool zero, bool *deferred_work_generated) { + return self->expand(tsdn, self, edata, old_size, new_size, zero, + deferred_work_generated); +} + +static inline bool +pai_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, + size_t new_size, bool *deferred_work_generated) { + return self->shrink(tsdn, self, edata, old_size, new_size, + deferred_work_generated); +} + +static inline void +pai_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated) { + self->dalloc(tsdn, self, edata, deferred_work_generated); +} + +static inline void +pai_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list, + bool *deferred_work_generated) { + self->dalloc_batch(tsdn, self, list, deferred_work_generated); +} + +static inline uint64_t +pai_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) { + return self->time_until_deferred_work(tsdn, self); +} + +/* + * An implementation of batch allocation that simply calls alloc once for + * each item in the list. + */ +size_t pai_alloc_batch_default(tsdn_t *tsdn, pai_t *self, size_t size, + size_t nallocs, edata_list_active_t *results, bool *deferred_work_generated); +/* Ditto, for dalloc. */ +void pai_dalloc_batch_default(tsdn_t *tsdn, pai_t *self, + edata_list_active_t *list, bool *deferred_work_generated); + +#endif /* JEMALLOC_INTERNAL_PAI_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/peak.h b/BeefRT/JEMalloc/include/jemalloc/internal/peak.h new file mode 100644 index 00000000..59da3e41 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/peak.h @@ -0,0 +1,37 @@ +#ifndef JEMALLOC_INTERNAL_PEAK_H +#define JEMALLOC_INTERNAL_PEAK_H + +typedef struct peak_s peak_t; +struct peak_s { + /* The highest recorded peak value, after adjustment (see below). */ + uint64_t cur_max; + /* + * The difference between alloc and dalloc at the last set_zero call; + * this lets us cancel out the appropriate amount of excess. + */ + uint64_t adjustment; +}; + +#define PEAK_INITIALIZER {0, 0} + +static inline uint64_t +peak_max(peak_t *peak) { + return peak->cur_max; +} + +static inline void +peak_update(peak_t *peak, uint64_t alloc, uint64_t dalloc) { + int64_t candidate_max = (int64_t)(alloc - dalloc - peak->adjustment); + if (candidate_max > (int64_t)peak->cur_max) { + peak->cur_max = candidate_max; + } +} + +/* Resets the counter to zero; all peaks are now relative to this point. */ +static inline void +peak_set_zero(peak_t *peak, uint64_t alloc, uint64_t dalloc) { + peak->cur_max = 0; + peak->adjustment = alloc - dalloc; +} + +#endif /* JEMALLOC_INTERNAL_PEAK_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/peak_event.h b/BeefRT/JEMalloc/include/jemalloc/internal/peak_event.h new file mode 100644 index 00000000..b808ce04 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/peak_event.h @@ -0,0 +1,24 @@ +#ifndef JEMALLOC_INTERNAL_PEAK_EVENT_H +#define JEMALLOC_INTERNAL_PEAK_EVENT_H + +/* + * While peak.h contains the simple helper struct that tracks state, this + * contains the allocator tie-ins (and knows about tsd, the event module, etc.). + */ + +/* Update the peak with current tsd state. */ +void peak_event_update(tsd_t *tsd); +/* Set current state to zero. */ +void peak_event_zero(tsd_t *tsd); +uint64_t peak_event_max(tsd_t *tsd); + +/* Manual hooks. */ +/* The activity-triggered hooks. */ +uint64_t peak_alloc_new_event_wait(tsd_t *tsd); +uint64_t peak_alloc_postponed_event_wait(tsd_t *tsd); +void peak_alloc_event_handler(tsd_t *tsd, uint64_t elapsed); +uint64_t peak_dalloc_new_event_wait(tsd_t *tsd); +uint64_t peak_dalloc_postponed_event_wait(tsd_t *tsd); +void peak_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed); + +#endif /* JEMALLOC_INTERNAL_PEAK_EVENT_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/ph.h b/BeefRT/JEMalloc/include/jemalloc/internal/ph.h new file mode 100644 index 00000000..5f091c5f --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/ph.h @@ -0,0 +1,520 @@ +#ifndef JEMALLOC_INTERNAL_PH_H +#define JEMALLOC_INTERNAL_PH_H + +/* + * A Pairing Heap implementation. + * + * "The Pairing Heap: A New Form of Self-Adjusting Heap" + * https://www.cs.cmu.edu/~sleator/papers/pairing-heaps.pdf + * + * With auxiliary twopass list, described in a follow on paper. + * + * "Pairing Heaps: Experiments and Analysis" + * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.106.2988&rep=rep1&type=pdf + * + ******************************************************************************* + * + * We include a non-obvious optimization: + * - First, we introduce a new pop-and-link operation; pop the two most + * recently-inserted items off the aux-list, link them, and push the resulting + * heap. + * - We maintain a count of the number of insertions since the last time we + * merged the aux-list (i.e. via first() or remove_first()). After N inserts, + * we do ffs(N) pop-and-link operations. + * + * One way to think of this is that we're progressively building up a tree in + * the aux-list, rather than a linked-list (think of the series of merges that + * will be performed as the aux-count grows). + * + * There's a couple reasons we benefit from this: + * - Ordinarily, after N insertions, the aux-list is of size N. With our + * strategy, it's of size O(log(N)). So we decrease the worst-case time of + * first() calls, and reduce the average cost of remove_min calls. Since + * these almost always occur while holding a lock, we practically reduce the + * frequency of unusually long hold times. + * - This moves the bulk of the work of merging the aux-list onto the threads + * that are inserting into the heap. In some common scenarios, insertions + * happen in bulk, from a single thread (think tcache flushing; we potentially + * move many slabs from slabs_full to slabs_nonfull). All the nodes in this + * case are in the inserting threads cache, and linking them is very cheap + * (cache misses dominate linking cost). Without this optimization, linking + * happens on the next call to remove_first. Since that remove_first call + * likely happens on a different thread (or at least, after the cache has + * gotten cold if done on the same thread), deferring linking trades cheap + * link operations now for expensive ones later. + * + * The ffs trick keeps amortized insert cost at constant time. Similar + * strategies based on periodically sorting the list after a batch of operations + * perform worse than this in practice, even with various fancy tricks; they + * all took amortized complexity of an insert from O(1) to O(log(n)). + */ + +typedef int (*ph_cmp_t)(void *, void *); + +/* Node structure. */ +typedef struct phn_link_s phn_link_t; +struct phn_link_s { + void *prev; + void *next; + void *lchild; +}; + +typedef struct ph_s ph_t; +struct ph_s { + void *root; + /* + * Inserts done since the last aux-list merge. This is not necessarily + * the size of the aux-list, since it's possible that removals have + * happened since, and we don't track whether or not those removals are + * from the aux list. + */ + size_t auxcount; +}; + +JEMALLOC_ALWAYS_INLINE phn_link_t * +phn_link_get(void *phn, size_t offset) { + return (phn_link_t *)(((uintptr_t)phn) + offset); +} + +JEMALLOC_ALWAYS_INLINE void +phn_link_init(void *phn, size_t offset) { + phn_link_get(phn, offset)->prev = NULL; + phn_link_get(phn, offset)->next = NULL; + phn_link_get(phn, offset)->lchild = NULL; +} + +/* Internal utility helpers. */ +JEMALLOC_ALWAYS_INLINE void * +phn_lchild_get(void *phn, size_t offset) { + return phn_link_get(phn, offset)->lchild; +} + +JEMALLOC_ALWAYS_INLINE void +phn_lchild_set(void *phn, void *lchild, size_t offset) { + phn_link_get(phn, offset)->lchild = lchild; +} + +JEMALLOC_ALWAYS_INLINE void * +phn_next_get(void *phn, size_t offset) { + return phn_link_get(phn, offset)->next; +} + +JEMALLOC_ALWAYS_INLINE void +phn_next_set(void *phn, void *next, size_t offset) { + phn_link_get(phn, offset)->next = next; +} + +JEMALLOC_ALWAYS_INLINE void * +phn_prev_get(void *phn, size_t offset) { + return phn_link_get(phn, offset)->prev; +} + +JEMALLOC_ALWAYS_INLINE void +phn_prev_set(void *phn, void *prev, size_t offset) { + phn_link_get(phn, offset)->prev = prev; +} + +JEMALLOC_ALWAYS_INLINE void +phn_merge_ordered(void *phn0, void *phn1, size_t offset, + ph_cmp_t cmp) { + void *phn0child; + + assert(phn0 != NULL); + assert(phn1 != NULL); + assert(cmp(phn0, phn1) <= 0); + + phn_prev_set(phn1, phn0, offset); + phn0child = phn_lchild_get(phn0, offset); + phn_next_set(phn1, phn0child, offset); + if (phn0child != NULL) { + phn_prev_set(phn0child, phn1, offset); + } + phn_lchild_set(phn0, phn1, offset); +} + +JEMALLOC_ALWAYS_INLINE void * +phn_merge(void *phn0, void *phn1, size_t offset, ph_cmp_t cmp) { + void *result; + if (phn0 == NULL) { + result = phn1; + } else if (phn1 == NULL) { + result = phn0; + } else if (cmp(phn0, phn1) < 0) { + phn_merge_ordered(phn0, phn1, offset, cmp); + result = phn0; + } else { + phn_merge_ordered(phn1, phn0, offset, cmp); + result = phn1; + } + return result; +} + +JEMALLOC_ALWAYS_INLINE void * +phn_merge_siblings(void *phn, size_t offset, ph_cmp_t cmp) { + void *head = NULL; + void *tail = NULL; + void *phn0 = phn; + void *phn1 = phn_next_get(phn0, offset); + + /* + * Multipass merge, wherein the first two elements of a FIFO + * are repeatedly merged, and each result is appended to the + * singly linked FIFO, until the FIFO contains only a single + * element. We start with a sibling list but no reference to + * its tail, so we do a single pass over the sibling list to + * populate the FIFO. + */ + if (phn1 != NULL) { + void *phnrest = phn_next_get(phn1, offset); + if (phnrest != NULL) { + phn_prev_set(phnrest, NULL, offset); + } + phn_prev_set(phn0, NULL, offset); + phn_next_set(phn0, NULL, offset); + phn_prev_set(phn1, NULL, offset); + phn_next_set(phn1, NULL, offset); + phn0 = phn_merge(phn0, phn1, offset, cmp); + head = tail = phn0; + phn0 = phnrest; + while (phn0 != NULL) { + phn1 = phn_next_get(phn0, offset); + if (phn1 != NULL) { + phnrest = phn_next_get(phn1, offset); + if (phnrest != NULL) { + phn_prev_set(phnrest, NULL, offset); + } + phn_prev_set(phn0, NULL, offset); + phn_next_set(phn0, NULL, offset); + phn_prev_set(phn1, NULL, offset); + phn_next_set(phn1, NULL, offset); + phn0 = phn_merge(phn0, phn1, offset, cmp); + phn_next_set(tail, phn0, offset); + tail = phn0; + phn0 = phnrest; + } else { + phn_next_set(tail, phn0, offset); + tail = phn0; + phn0 = NULL; + } + } + phn0 = head; + phn1 = phn_next_get(phn0, offset); + if (phn1 != NULL) { + while (true) { + head = phn_next_get(phn1, offset); + assert(phn_prev_get(phn0, offset) == NULL); + phn_next_set(phn0, NULL, offset); + assert(phn_prev_get(phn1, offset) == NULL); + phn_next_set(phn1, NULL, offset); + phn0 = phn_merge(phn0, phn1, offset, cmp); + if (head == NULL) { + break; + } + phn_next_set(tail, phn0, offset); + tail = phn0; + phn0 = head; + phn1 = phn_next_get(phn0, offset); + } + } + } + return phn0; +} + +JEMALLOC_ALWAYS_INLINE void +ph_merge_aux(ph_t *ph, size_t offset, ph_cmp_t cmp) { + ph->auxcount = 0; + void *phn = phn_next_get(ph->root, offset); + if (phn != NULL) { + phn_prev_set(ph->root, NULL, offset); + phn_next_set(ph->root, NULL, offset); + phn_prev_set(phn, NULL, offset); + phn = phn_merge_siblings(phn, offset, cmp); + assert(phn_next_get(phn, offset) == NULL); + ph->root = phn_merge(ph->root, phn, offset, cmp); + } +} + +JEMALLOC_ALWAYS_INLINE void * +ph_merge_children(void *phn, size_t offset, ph_cmp_t cmp) { + void *result; + void *lchild = phn_lchild_get(phn, offset); + if (lchild == NULL) { + result = NULL; + } else { + result = phn_merge_siblings(lchild, offset, cmp); + } + return result; +} + +JEMALLOC_ALWAYS_INLINE void +ph_new(ph_t *ph) { + ph->root = NULL; + ph->auxcount = 0; +} + +JEMALLOC_ALWAYS_INLINE bool +ph_empty(ph_t *ph) { + return ph->root == NULL; +} + +JEMALLOC_ALWAYS_INLINE void * +ph_first(ph_t *ph, size_t offset, ph_cmp_t cmp) { + if (ph->root == NULL) { + return NULL; + } + ph_merge_aux(ph, offset, cmp); + return ph->root; +} + +JEMALLOC_ALWAYS_INLINE void * +ph_any(ph_t *ph, size_t offset) { + if (ph->root == NULL) { + return NULL; + } + void *aux = phn_next_get(ph->root, offset); + if (aux != NULL) { + return aux; + } + return ph->root; +} + +/* Returns true if we should stop trying to merge. */ +JEMALLOC_ALWAYS_INLINE bool +ph_try_aux_merge_pair(ph_t *ph, size_t offset, ph_cmp_t cmp) { + assert(ph->root != NULL); + void *phn0 = phn_next_get(ph->root, offset); + if (phn0 == NULL) { + return true; + } + void *phn1 = phn_next_get(phn0, offset); + if (phn1 == NULL) { + return true; + } + void *next_phn1 = phn_next_get(phn1, offset); + phn_next_set(phn0, NULL, offset); + phn_prev_set(phn0, NULL, offset); + phn_next_set(phn1, NULL, offset); + phn_prev_set(phn1, NULL, offset); + phn0 = phn_merge(phn0, phn1, offset, cmp); + phn_next_set(phn0, next_phn1, offset); + if (next_phn1 != NULL) { + phn_prev_set(next_phn1, phn0, offset); + } + phn_next_set(ph->root, phn0, offset); + phn_prev_set(phn0, ph->root, offset); + return next_phn1 == NULL; +} + +JEMALLOC_ALWAYS_INLINE void +ph_insert(ph_t *ph, void *phn, size_t offset, ph_cmp_t cmp) { + phn_link_init(phn, offset); + + /* + * Treat the root as an aux list during insertion, and lazily merge + * during a_prefix##remove_first(). For elements that are inserted, + * then removed via a_prefix##remove() before the aux list is ever + * processed, this makes insert/remove constant-time, whereas eager + * merging would make insert O(log n). + */ + if (ph->root == NULL) { + ph->root = phn; + } else { + /* + * As a special case, check to see if we can replace the root. + * This is practically common in some important cases, and lets + * us defer some insertions (hopefully, until the point where + * some of the items in the aux list have been removed, savings + * us from linking them at all). + */ + if (cmp(phn, ph->root) < 0) { + phn_lchild_set(phn, ph->root, offset); + phn_prev_set(ph->root, phn, offset); + ph->root = phn; + ph->auxcount = 0; + return; + } + ph->auxcount++; + phn_next_set(phn, phn_next_get(ph->root, offset), offset); + if (phn_next_get(ph->root, offset) != NULL) { + phn_prev_set(phn_next_get(ph->root, offset), phn, + offset); + } + phn_prev_set(phn, ph->root, offset); + phn_next_set(ph->root, phn, offset); + } + if (ph->auxcount > 1) { + unsigned nmerges = ffs_zu(ph->auxcount - 1); + bool done = false; + for (unsigned i = 0; i < nmerges && !done; i++) { + done = ph_try_aux_merge_pair(ph, offset, cmp); + } + } +} + +JEMALLOC_ALWAYS_INLINE void * +ph_remove_first(ph_t *ph, size_t offset, ph_cmp_t cmp) { + void *ret; + + if (ph->root == NULL) { + return NULL; + } + ph_merge_aux(ph, offset, cmp); + ret = ph->root; + ph->root = ph_merge_children(ph->root, offset, cmp); + + return ret; + +} + +JEMALLOC_ALWAYS_INLINE void +ph_remove(ph_t *ph, void *phn, size_t offset, ph_cmp_t cmp) { + void *replace; + void *parent; + + if (ph->root == phn) { + /* + * We can delete from aux list without merging it, but we need + * to merge if we are dealing with the root node and it has + * children. + */ + if (phn_lchild_get(phn, offset) == NULL) { + ph->root = phn_next_get(phn, offset); + if (ph->root != NULL) { + phn_prev_set(ph->root, NULL, offset); + } + return; + } + ph_merge_aux(ph, offset, cmp); + if (ph->root == phn) { + ph->root = ph_merge_children(ph->root, offset, cmp); + return; + } + } + + /* Get parent (if phn is leftmost child) before mutating. */ + if ((parent = phn_prev_get(phn, offset)) != NULL) { + if (phn_lchild_get(parent, offset) != phn) { + parent = NULL; + } + } + /* Find a possible replacement node, and link to parent. */ + replace = ph_merge_children(phn, offset, cmp); + /* Set next/prev for sibling linked list. */ + if (replace != NULL) { + if (parent != NULL) { + phn_prev_set(replace, parent, offset); + phn_lchild_set(parent, replace, offset); + } else { + phn_prev_set(replace, phn_prev_get(phn, offset), + offset); + if (phn_prev_get(phn, offset) != NULL) { + phn_next_set(phn_prev_get(phn, offset), replace, + offset); + } + } + phn_next_set(replace, phn_next_get(phn, offset), offset); + if (phn_next_get(phn, offset) != NULL) { + phn_prev_set(phn_next_get(phn, offset), replace, + offset); + } + } else { + if (parent != NULL) { + void *next = phn_next_get(phn, offset); + phn_lchild_set(parent, next, offset); + if (next != NULL) { + phn_prev_set(next, parent, offset); + } + } else { + assert(phn_prev_get(phn, offset) != NULL); + phn_next_set( + phn_prev_get(phn, offset), + phn_next_get(phn, offset), offset); + } + if (phn_next_get(phn, offset) != NULL) { + phn_prev_set( + phn_next_get(phn, offset), + phn_prev_get(phn, offset), offset); + } + } +} + +#define ph_structs(a_prefix, a_type) \ +typedef struct { \ + phn_link_t link; \ +} a_prefix##_link_t; \ + \ +typedef struct { \ + ph_t ph; \ +} a_prefix##_t; + +/* + * The ph_proto() macro generates function prototypes that correspond to the + * functions generated by an equivalently parameterized call to ph_gen(). + */ +#define ph_proto(a_attr, a_prefix, a_type) \ + \ +a_attr void a_prefix##_new(a_prefix##_t *ph); \ +a_attr bool a_prefix##_empty(a_prefix##_t *ph); \ +a_attr a_type *a_prefix##_first(a_prefix##_t *ph); \ +a_attr a_type *a_prefix##_any(a_prefix##_t *ph); \ +a_attr void a_prefix##_insert(a_prefix##_t *ph, a_type *phn); \ +a_attr a_type *a_prefix##_remove_first(a_prefix##_t *ph); \ +a_attr void a_prefix##_remove(a_prefix##_t *ph, a_type *phn); \ +a_attr a_type *a_prefix##_remove_any(a_prefix##_t *ph); + +/* The ph_gen() macro generates a type-specific pairing heap implementation. */ +#define ph_gen(a_attr, a_prefix, a_type, a_field, a_cmp) \ +JEMALLOC_ALWAYS_INLINE int \ +a_prefix##_ph_cmp(void *a, void *b) { \ + return a_cmp((a_type *)a, (a_type *)b); \ +} \ + \ +a_attr void \ +a_prefix##_new(a_prefix##_t *ph) { \ + ph_new(&ph->ph); \ +} \ + \ +a_attr bool \ +a_prefix##_empty(a_prefix##_t *ph) { \ + return ph_empty(&ph->ph); \ +} \ + \ +a_attr a_type * \ +a_prefix##_first(a_prefix##_t *ph) { \ + return ph_first(&ph->ph, offsetof(a_type, a_field), \ + &a_prefix##_ph_cmp); \ +} \ + \ +a_attr a_type * \ +a_prefix##_any(a_prefix##_t *ph) { \ + return ph_any(&ph->ph, offsetof(a_type, a_field)); \ +} \ + \ +a_attr void \ +a_prefix##_insert(a_prefix##_t *ph, a_type *phn) { \ + ph_insert(&ph->ph, phn, offsetof(a_type, a_field), \ + a_prefix##_ph_cmp); \ +} \ + \ +a_attr a_type * \ +a_prefix##_remove_first(a_prefix##_t *ph) { \ + return ph_remove_first(&ph->ph, offsetof(a_type, a_field), \ + a_prefix##_ph_cmp); \ +} \ + \ +a_attr void \ +a_prefix##_remove(a_prefix##_t *ph, a_type *phn) { \ + ph_remove(&ph->ph, phn, offsetof(a_type, a_field), \ + a_prefix##_ph_cmp); \ +} \ + \ +a_attr a_type * \ +a_prefix##_remove_any(a_prefix##_t *ph) { \ + a_type *ret = a_prefix##_any(ph); \ + if (ret != NULL) { \ + a_prefix##_remove(ph, ret); \ + } \ + return ret; \ +} + +#endif /* JEMALLOC_INTERNAL_PH_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/private_namespace.sh b/BeefRT/JEMalloc/include/jemalloc/internal/private_namespace.sh new file mode 100644 index 00000000..6ef1346a --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/private_namespace.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +for symbol in `cat "$@"` ; do + echo "#define ${symbol} JEMALLOC_N(${symbol})" +done diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/private_symbols.awk b/BeefRT/JEMalloc/include/jemalloc/internal/private_symbols.awk new file mode 100644 index 00000000..68d68856 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/private_symbols.awk @@ -0,0 +1,52 @@ +#!/usr/bin/env awk -f + +BEGIN { + sym_prefix = "" + split("\ + je_aligned_alloc \ + je_calloc \ + je_dallocx \ + je_free \ + je_mallctl \ + je_mallctlbymib \ + je_mallctlnametomib \ + je_malloc \ + je_malloc_conf \ + je_malloc_conf_2_conf_harder \ + je_malloc_message \ + je_malloc_stats_print \ + je_malloc_usable_size \ + je_mallocx \ + je_smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c \ + je_nallocx \ + je_posix_memalign \ + je_rallocx \ + je_realloc \ + je_sallocx \ + je_sdallocx \ + je_xallocx \ + tls_callback \ + ", exported_symbol_names) + # Store exported symbol names as keys in exported_symbols. + for (i in exported_symbol_names) { + exported_symbols[exported_symbol_names[i]] = 1 + } +} + +# Process 'nm -a ' output. +# +# Handle lines like: +# 0000000000000008 D opt_junk +# 0000000000007574 T malloc_initialized +(NF == 3 && $2 ~ /^[ABCDGRSTVW]$/ && !($3 in exported_symbols) && $3 ~ /^[A-Za-z0-9_]+$/) { + print substr($3, 1+length(sym_prefix), length($3)-length(sym_prefix)) +} + +# Process 'dumpbin /SYMBOLS ' output. +# +# Handle lines like: +# 353 00008098 SECT4 notype External | opt_junk +# 3F1 00000000 SECT7 notype () External | malloc_initialized +($3 ~ /^SECT[0-9]+/ && $(NF-2) == "External" && !($NF in exported_symbols)) { + print $NF +} diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/private_symbols.sh b/BeefRT/JEMalloc/include/jemalloc/internal/private_symbols.sh new file mode 100644 index 00000000..442a259f --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/private_symbols.sh @@ -0,0 +1,51 @@ +#!/bin/sh +# +# Generate private_symbols[_jet].awk. +# +# Usage: private_symbols.sh * +# +# is typically "" or "_". + +sym_prefix=$1 +shift + +cat <' output. +# +# Handle lines like: +# 0000000000000008 D opt_junk +# 0000000000007574 T malloc_initialized +(NF == 3 && $2 ~ /^[ABCDGRSTVW]$/ && !($3 in exported_symbols) && $3 ~ /^[A-Za-z0-9_]+$/) { + print substr($3, 1+length(sym_prefix), length($3)-length(sym_prefix)) +} + +# Process 'dumpbin /SYMBOLS ' output. +# +# Handle lines like: +# 353 00008098 SECT4 notype External | opt_junk +# 3F1 00000000 SECT7 notype () External | malloc_initialized +($3 ~ /^SECT[0-9]+/ && $(NF-2) == "External" && !($NF in exported_symbols)) { + print $NF +} +EOF diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/private_symbols_jet.awk b/BeefRT/JEMalloc/include/jemalloc/internal/private_symbols_jet.awk new file mode 100644 index 00000000..7ab31d7c --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/private_symbols_jet.awk @@ -0,0 +1,52 @@ +#!/usr/bin/env awk -f + +BEGIN { + sym_prefix = "" + split("\ + jet_aligned_alloc \ + jet_calloc \ + jet_dallocx \ + jet_free \ + jet_mallctl \ + jet_mallctlbymib \ + jet_mallctlnametomib \ + jet_malloc \ + jet_malloc_conf \ + jet_malloc_conf_2_conf_harder \ + jet_malloc_message \ + jet_malloc_stats_print \ + jet_malloc_usable_size \ + jet_mallocx \ + jet_smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c \ + jet_nallocx \ + jet_posix_memalign \ + jet_rallocx \ + jet_realloc \ + jet_sallocx \ + jet_sdallocx \ + jet_xallocx \ + tls_callback \ + ", exported_symbol_names) + # Store exported symbol names as keys in exported_symbols. + for (i in exported_symbol_names) { + exported_symbols[exported_symbol_names[i]] = 1 + } +} + +# Process 'nm -a ' output. +# +# Handle lines like: +# 0000000000000008 D opt_junk +# 0000000000007574 T malloc_initialized +(NF == 3 && $2 ~ /^[ABCDGRSTVW]$/ && !($3 in exported_symbols) && $3 ~ /^[A-Za-z0-9_]+$/) { + print substr($3, 1+length(sym_prefix), length($3)-length(sym_prefix)) +} + +# Process 'dumpbin /SYMBOLS ' output. +# +# Handle lines like: +# 353 00008098 SECT4 notype External | opt_junk +# 3F1 00000000 SECT7 notype () External | malloc_initialized +($3 ~ /^SECT[0-9]+/ && $(NF-2) == "External" && !($NF in exported_symbols)) { + print $NF +} diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/prng.h b/BeefRT/JEMalloc/include/jemalloc/internal/prng.h new file mode 100644 index 00000000..14542aa1 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/prng.h @@ -0,0 +1,168 @@ +#ifndef JEMALLOC_INTERNAL_PRNG_H +#define JEMALLOC_INTERNAL_PRNG_H + +#include "jemalloc/internal/bit_util.h" + +/* + * Simple linear congruential pseudo-random number generator: + * + * prng(y) = (a*x + c) % m + * + * where the following constants ensure maximal period: + * + * a == Odd number (relatively prime to 2^n), and (a-1) is a multiple of 4. + * c == Odd number (relatively prime to 2^n). + * m == 2^32 + * + * See Knuth's TAOCP 3rd Ed., Vol. 2, pg. 17 for details on these constraints. + * + * This choice of m has the disadvantage that the quality of the bits is + * proportional to bit position. For example, the lowest bit has a cycle of 2, + * the next has a cycle of 4, etc. For this reason, we prefer to use the upper + * bits. + */ + +/******************************************************************************/ +/* INTERNAL DEFINITIONS -- IGNORE */ +/******************************************************************************/ +#define PRNG_A_32 UINT32_C(1103515241) +#define PRNG_C_32 UINT32_C(12347) + +#define PRNG_A_64 UINT64_C(6364136223846793005) +#define PRNG_C_64 UINT64_C(1442695040888963407) + +JEMALLOC_ALWAYS_INLINE uint32_t +prng_state_next_u32(uint32_t state) { + return (state * PRNG_A_32) + PRNG_C_32; +} + +JEMALLOC_ALWAYS_INLINE uint64_t +prng_state_next_u64(uint64_t state) { + return (state * PRNG_A_64) + PRNG_C_64; +} + +JEMALLOC_ALWAYS_INLINE size_t +prng_state_next_zu(size_t state) { +#if LG_SIZEOF_PTR == 2 + return (state * PRNG_A_32) + PRNG_C_32; +#elif LG_SIZEOF_PTR == 3 + return (state * PRNG_A_64) + PRNG_C_64; +#else +#error Unsupported pointer size +#endif +} + +/******************************************************************************/ +/* BEGIN PUBLIC API */ +/******************************************************************************/ + +/* + * The prng_lg_range functions give a uniform int in the half-open range [0, + * 2**lg_range). + */ + +JEMALLOC_ALWAYS_INLINE uint32_t +prng_lg_range_u32(uint32_t *state, unsigned lg_range) { + assert(lg_range > 0); + assert(lg_range <= 32); + + *state = prng_state_next_u32(*state); + uint32_t ret = *state >> (32 - lg_range); + + return ret; +} + +JEMALLOC_ALWAYS_INLINE uint64_t +prng_lg_range_u64(uint64_t *state, unsigned lg_range) { + assert(lg_range > 0); + assert(lg_range <= 64); + + *state = prng_state_next_u64(*state); + uint64_t ret = *state >> (64 - lg_range); + + return ret; +} + +JEMALLOC_ALWAYS_INLINE size_t +prng_lg_range_zu(size_t *state, unsigned lg_range) { + assert(lg_range > 0); + assert(lg_range <= ZU(1) << (3 + LG_SIZEOF_PTR)); + + *state = prng_state_next_zu(*state); + size_t ret = *state >> ((ZU(1) << (3 + LG_SIZEOF_PTR)) - lg_range); + + return ret; +} + +/* + * The prng_range functions behave like the prng_lg_range, but return a result + * in [0, range) instead of [0, 2**lg_range). + */ + +JEMALLOC_ALWAYS_INLINE uint32_t +prng_range_u32(uint32_t *state, uint32_t range) { + assert(range != 0); + /* + * If range were 1, lg_range would be 0, so the shift in + * prng_lg_range_u32 would be a shift of a 32-bit variable by 32 bits, + * which is UB. Just handle this case as a one-off. + */ + if (range == 1) { + return 0; + } + + /* Compute the ceiling of lg(range). */ + unsigned lg_range = ffs_u32(pow2_ceil_u32(range)); + + /* Generate a result in [0..range) via repeated trial. */ + uint32_t ret; + do { + ret = prng_lg_range_u32(state, lg_range); + } while (ret >= range); + + return ret; +} + +JEMALLOC_ALWAYS_INLINE uint64_t +prng_range_u64(uint64_t *state, uint64_t range) { + assert(range != 0); + + /* See the note in prng_range_u32. */ + if (range == 1) { + return 0; + } + + /* Compute the ceiling of lg(range). */ + unsigned lg_range = ffs_u64(pow2_ceil_u64(range)); + + /* Generate a result in [0..range) via repeated trial. */ + uint64_t ret; + do { + ret = prng_lg_range_u64(state, lg_range); + } while (ret >= range); + + return ret; +} + +JEMALLOC_ALWAYS_INLINE size_t +prng_range_zu(size_t *state, size_t range) { + assert(range != 0); + + /* See the note in prng_range_u32. */ + if (range == 1) { + return 0; + } + + /* Compute the ceiling of lg(range). */ + unsigned lg_range = ffs_u64(pow2_ceil_u64(range)); + + /* Generate a result in [0..range) via repeated trial. */ + size_t ret; + do { + ret = prng_lg_range_zu(state, lg_range); + } while (ret >= range); + + return ret; +} + +#endif /* JEMALLOC_INTERNAL_PRNG_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/prof_data.h b/BeefRT/JEMalloc/include/jemalloc/internal/prof_data.h new file mode 100644 index 00000000..4c8e22c7 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/prof_data.h @@ -0,0 +1,37 @@ +#ifndef JEMALLOC_INTERNAL_PROF_DATA_H +#define JEMALLOC_INTERNAL_PROF_DATA_H + +#include "jemalloc/internal/mutex.h" + +extern malloc_mutex_t bt2gctx_mtx; +extern malloc_mutex_t tdatas_mtx; +extern malloc_mutex_t prof_dump_mtx; + +extern malloc_mutex_t *gctx_locks; +extern malloc_mutex_t *tdata_locks; + +extern size_t prof_unbiased_sz[PROF_SC_NSIZES]; +extern size_t prof_shifted_unbiased_cnt[PROF_SC_NSIZES]; + +void prof_bt_hash(const void *key, size_t r_hash[2]); +bool prof_bt_keycomp(const void *k1, const void *k2); + +bool prof_data_init(tsd_t *tsd); +prof_tctx_t *prof_lookup(tsd_t *tsd, prof_bt_t *bt); +char *prof_thread_name_alloc(tsd_t *tsd, const char *thread_name); +int prof_thread_name_set_impl(tsd_t *tsd, const char *thread_name); +void prof_unbias_map_init(); +void prof_dump_impl(tsd_t *tsd, write_cb_t *prof_dump_write, void *cbopaque, + prof_tdata_t *tdata, bool leakcheck); +prof_tdata_t * prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, + uint64_t thr_discrim, char *thread_name, bool active); +void prof_tdata_detach(tsd_t *tsd, prof_tdata_t *tdata); +void prof_reset(tsd_t *tsd, size_t lg_sample); +void prof_tctx_try_destroy(tsd_t *tsd, prof_tctx_t *tctx); + +/* Used in unit tests. */ +size_t prof_tdata_count(void); +size_t prof_bt_count(void); +void prof_cnt_all(prof_cnt_t *cnt_all); + +#endif /* JEMALLOC_INTERNAL_PROF_DATA_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/prof_externs.h b/BeefRT/JEMalloc/include/jemalloc/internal/prof_externs.h new file mode 100644 index 00000000..bdff1349 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/prof_externs.h @@ -0,0 +1,95 @@ +#ifndef JEMALLOC_INTERNAL_PROF_EXTERNS_H +#define JEMALLOC_INTERNAL_PROF_EXTERNS_H + +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/prof_hook.h" + +extern bool opt_prof; +extern bool opt_prof_active; +extern bool opt_prof_thread_active_init; +extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */ +extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */ +extern bool opt_prof_gdump; /* High-water memory dumping. */ +extern bool opt_prof_final; /* Final profile dumping. */ +extern bool opt_prof_leak; /* Dump leak summary at exit. */ +extern bool opt_prof_leak_error; /* Exit with error code if memory leaked */ +extern bool opt_prof_accum; /* Report cumulative bytes. */ +extern bool opt_prof_log; /* Turn logging on at boot. */ +extern char opt_prof_prefix[ + /* Minimize memory bloat for non-prof builds. */ +#ifdef JEMALLOC_PROF + PATH_MAX + +#endif + 1]; +extern bool opt_prof_unbias; + +/* For recording recent allocations */ +extern ssize_t opt_prof_recent_alloc_max; + +/* Whether to use thread name provided by the system or by mallctl. */ +extern bool opt_prof_sys_thread_name; + +/* Whether to record per size class counts and request size totals. */ +extern bool opt_prof_stats; + +/* Accessed via prof_active_[gs]et{_unlocked,}(). */ +extern bool prof_active_state; + +/* Accessed via prof_gdump_[gs]et{_unlocked,}(). */ +extern bool prof_gdump_val; + +/* Profile dump interval, measured in bytes allocated. */ +extern uint64_t prof_interval; + +/* + * Initialized as opt_lg_prof_sample, and potentially modified during profiling + * resets. + */ +extern size_t lg_prof_sample; + +extern bool prof_booted; + +void prof_backtrace_hook_set(prof_backtrace_hook_t hook); +prof_backtrace_hook_t prof_backtrace_hook_get(); + +void prof_dump_hook_set(prof_dump_hook_t hook); +prof_dump_hook_t prof_dump_hook_get(); + +/* Functions only accessed in prof_inlines.h */ +prof_tdata_t *prof_tdata_init(tsd_t *tsd); +prof_tdata_t *prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata); + +void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx); +void prof_malloc_sample_object(tsd_t *tsd, const void *ptr, size_t size, + size_t usize, prof_tctx_t *tctx); +void prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_info_t *prof_info); +prof_tctx_t *prof_tctx_create(tsd_t *tsd); +void prof_idump(tsdn_t *tsdn); +bool prof_mdump(tsd_t *tsd, const char *filename); +void prof_gdump(tsdn_t *tsdn); + +void prof_tdata_cleanup(tsd_t *tsd); +bool prof_active_get(tsdn_t *tsdn); +bool prof_active_set(tsdn_t *tsdn, bool active); +const char *prof_thread_name_get(tsd_t *tsd); +int prof_thread_name_set(tsd_t *tsd, const char *thread_name); +bool prof_thread_active_get(tsd_t *tsd); +bool prof_thread_active_set(tsd_t *tsd, bool active); +bool prof_thread_active_init_get(tsdn_t *tsdn); +bool prof_thread_active_init_set(tsdn_t *tsdn, bool active_init); +bool prof_gdump_get(tsdn_t *tsdn); +bool prof_gdump_set(tsdn_t *tsdn, bool active); +void prof_boot0(void); +void prof_boot1(void); +bool prof_boot2(tsd_t *tsd, base_t *base); +void prof_prefork0(tsdn_t *tsdn); +void prof_prefork1(tsdn_t *tsdn); +void prof_postfork_parent(tsdn_t *tsdn); +void prof_postfork_child(tsdn_t *tsdn); + +/* Only accessed by thread event. */ +uint64_t prof_sample_new_event_wait(tsd_t *tsd); +uint64_t prof_sample_postponed_event_wait(tsd_t *tsd); +void prof_sample_event_handler(tsd_t *tsd, uint64_t elapsed); + +#endif /* JEMALLOC_INTERNAL_PROF_EXTERNS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/prof_hook.h b/BeefRT/JEMalloc/include/jemalloc/internal/prof_hook.h new file mode 100644 index 00000000..150d19d3 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/prof_hook.h @@ -0,0 +1,21 @@ +#ifndef JEMALLOC_INTERNAL_PROF_HOOK_H +#define JEMALLOC_INTERNAL_PROF_HOOK_H + +/* + * The hooks types of which are declared in this file are experimental and + * undocumented, thus the typedefs are located in an 'internal' header. + */ + +/* + * A hook to mock out backtrace functionality. This can be handy, since it's + * otherwise difficult to guarantee that two allocations are reported as coming + * from the exact same stack trace in the presence of an optimizing compiler. + */ +typedef void (*prof_backtrace_hook_t)(void **, unsigned *, unsigned); + +/* + * A callback hook that notifies about recently dumped heap profile. + */ +typedef void (*prof_dump_hook_t)(const char *filename); + +#endif /* JEMALLOC_INTERNAL_PROF_HOOK_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/prof_inlines.h b/BeefRT/JEMalloc/include/jemalloc/internal/prof_inlines.h new file mode 100644 index 00000000..a8e7e7fb --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/prof_inlines.h @@ -0,0 +1,261 @@ +#ifndef JEMALLOC_INTERNAL_PROF_INLINES_H +#define JEMALLOC_INTERNAL_PROF_INLINES_H + +#include "jemalloc/internal/safety_check.h" +#include "jemalloc/internal/sz.h" +#include "jemalloc/internal/thread_event.h" + +JEMALLOC_ALWAYS_INLINE void +prof_active_assert() { + cassert(config_prof); + /* + * If opt_prof is off, then prof_active must always be off, regardless + * of whether prof_active_mtx is in effect or not. + */ + assert(opt_prof || !prof_active_state); +} + +JEMALLOC_ALWAYS_INLINE bool +prof_active_get_unlocked(void) { + prof_active_assert(); + /* + * Even if opt_prof is true, sampling can be temporarily disabled by + * setting prof_active to false. No locking is used when reading + * prof_active in the fast path, so there are no guarantees regarding + * how long it will take for all threads to notice state changes. + */ + return prof_active_state; +} + +JEMALLOC_ALWAYS_INLINE bool +prof_gdump_get_unlocked(void) { + /* + * No locking is used when reading prof_gdump_val in the fast path, so + * there are no guarantees regarding how long it will take for all + * threads to notice state changes. + */ + return prof_gdump_val; +} + +JEMALLOC_ALWAYS_INLINE prof_tdata_t * +prof_tdata_get(tsd_t *tsd, bool create) { + prof_tdata_t *tdata; + + cassert(config_prof); + + tdata = tsd_prof_tdata_get(tsd); + if (create) { + assert(tsd_reentrancy_level_get(tsd) == 0); + if (unlikely(tdata == NULL)) { + if (tsd_nominal(tsd)) { + tdata = prof_tdata_init(tsd); + tsd_prof_tdata_set(tsd, tdata); + } + } else if (unlikely(tdata->expired)) { + tdata = prof_tdata_reinit(tsd, tdata); + tsd_prof_tdata_set(tsd, tdata); + } + assert(tdata == NULL || tdata->attached); + } + + return tdata; +} + +JEMALLOC_ALWAYS_INLINE void +prof_info_get(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx, + prof_info_t *prof_info) { + cassert(config_prof); + assert(ptr != NULL); + assert(prof_info != NULL); + + arena_prof_info_get(tsd, ptr, alloc_ctx, prof_info, false); +} + +JEMALLOC_ALWAYS_INLINE void +prof_info_get_and_reset_recent(tsd_t *tsd, const void *ptr, + emap_alloc_ctx_t *alloc_ctx, prof_info_t *prof_info) { + cassert(config_prof); + assert(ptr != NULL); + assert(prof_info != NULL); + + arena_prof_info_get(tsd, ptr, alloc_ctx, prof_info, true); +} + +JEMALLOC_ALWAYS_INLINE void +prof_tctx_reset(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx) { + cassert(config_prof); + assert(ptr != NULL); + + arena_prof_tctx_reset(tsd, ptr, alloc_ctx); +} + +JEMALLOC_ALWAYS_INLINE void +prof_tctx_reset_sampled(tsd_t *tsd, const void *ptr) { + cassert(config_prof); + assert(ptr != NULL); + + arena_prof_tctx_reset_sampled(tsd, ptr); +} + +JEMALLOC_ALWAYS_INLINE void +prof_info_set(tsd_t *tsd, edata_t *edata, prof_tctx_t *tctx, size_t size) { + cassert(config_prof); + assert(edata != NULL); + assert((uintptr_t)tctx > (uintptr_t)1U); + + arena_prof_info_set(tsd, edata, tctx, size); +} + +JEMALLOC_ALWAYS_INLINE bool +prof_sample_should_skip(tsd_t *tsd, bool sample_event) { + cassert(config_prof); + + /* Fastpath: no need to load tdata */ + if (likely(!sample_event)) { + return true; + } + + /* + * sample_event is always obtained from the thread event module, and + * whenever it's true, it means that the thread event module has + * already checked the reentrancy level. + */ + assert(tsd_reentrancy_level_get(tsd) == 0); + + prof_tdata_t *tdata = prof_tdata_get(tsd, true); + if (unlikely(tdata == NULL)) { + return true; + } + + return !tdata->active; +} + +JEMALLOC_ALWAYS_INLINE prof_tctx_t * +prof_alloc_prep(tsd_t *tsd, bool prof_active, bool sample_event) { + prof_tctx_t *ret; + + if (!prof_active || + likely(prof_sample_should_skip(tsd, sample_event))) { + ret = (prof_tctx_t *)(uintptr_t)1U; + } else { + ret = prof_tctx_create(tsd); + } + + return ret; +} + +JEMALLOC_ALWAYS_INLINE void +prof_malloc(tsd_t *tsd, const void *ptr, size_t size, size_t usize, + emap_alloc_ctx_t *alloc_ctx, prof_tctx_t *tctx) { + cassert(config_prof); + assert(ptr != NULL); + assert(usize == isalloc(tsd_tsdn(tsd), ptr)); + + if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) { + prof_malloc_sample_object(tsd, ptr, size, usize, tctx); + } else { + prof_tctx_reset(tsd, ptr, alloc_ctx); + } +} + +JEMALLOC_ALWAYS_INLINE void +prof_realloc(tsd_t *tsd, const void *ptr, size_t size, size_t usize, + prof_tctx_t *tctx, bool prof_active, const void *old_ptr, size_t old_usize, + prof_info_t *old_prof_info, bool sample_event) { + bool sampled, old_sampled, moved; + + cassert(config_prof); + assert(ptr != NULL || (uintptr_t)tctx <= (uintptr_t)1U); + + if (prof_active && ptr != NULL) { + assert(usize == isalloc(tsd_tsdn(tsd), ptr)); + if (prof_sample_should_skip(tsd, sample_event)) { + /* + * Don't sample. The usize passed to prof_alloc_prep() + * was larger than what actually got allocated, so a + * backtrace was captured for this allocation, even + * though its actual usize was insufficient to cross the + * sample threshold. + */ + prof_alloc_rollback(tsd, tctx); + tctx = (prof_tctx_t *)(uintptr_t)1U; + } + } + + sampled = ((uintptr_t)tctx > (uintptr_t)1U); + old_sampled = ((uintptr_t)old_prof_info->alloc_tctx > (uintptr_t)1U); + moved = (ptr != old_ptr); + + if (unlikely(sampled)) { + prof_malloc_sample_object(tsd, ptr, size, usize, tctx); + } else if (moved) { + prof_tctx_reset(tsd, ptr, NULL); + } else if (unlikely(old_sampled)) { + /* + * prof_tctx_reset() would work for the !moved case as well, + * but prof_tctx_reset_sampled() is slightly cheaper, and the + * proper thing to do here in the presence of explicit + * knowledge re: moved state. + */ + prof_tctx_reset_sampled(tsd, ptr); + } else { + prof_info_t prof_info; + prof_info_get(tsd, ptr, NULL, &prof_info); + assert((uintptr_t)prof_info.alloc_tctx == (uintptr_t)1U); + } + + /* + * The prof_free_sampled_object() call must come after the + * prof_malloc_sample_object() call, because tctx and old_tctx may be + * the same, in which case reversing the call order could cause the tctx + * to be prematurely destroyed as a side effect of momentarily zeroed + * counters. + */ + if (unlikely(old_sampled)) { + prof_free_sampled_object(tsd, old_usize, old_prof_info); + } +} + +JEMALLOC_ALWAYS_INLINE size_t +prof_sample_align(size_t orig_align) { + /* + * Enforce page alignment, so that sampled allocations can be identified + * w/o metadata lookup. + */ + assert(opt_prof); + return (opt_cache_oblivious && orig_align < PAGE) ? PAGE : + orig_align; +} + +JEMALLOC_ALWAYS_INLINE bool +prof_sample_aligned(const void *ptr) { + return ((uintptr_t)ptr & PAGE_MASK) == 0; +} + +JEMALLOC_ALWAYS_INLINE bool +prof_sampled(tsd_t *tsd, const void *ptr) { + prof_info_t prof_info; + prof_info_get(tsd, ptr, NULL, &prof_info); + bool sampled = (uintptr_t)prof_info.alloc_tctx > (uintptr_t)1U; + if (sampled) { + assert(prof_sample_aligned(ptr)); + } + return sampled; +} + +JEMALLOC_ALWAYS_INLINE void +prof_free(tsd_t *tsd, const void *ptr, size_t usize, + emap_alloc_ctx_t *alloc_ctx) { + prof_info_t prof_info; + prof_info_get_and_reset_recent(tsd, ptr, alloc_ctx, &prof_info); + + cassert(config_prof); + assert(usize == isalloc(tsd_tsdn(tsd), ptr)); + + if (unlikely((uintptr_t)prof_info.alloc_tctx > (uintptr_t)1U)) { + assert(prof_sample_aligned(ptr)); + prof_free_sampled_object(tsd, usize, &prof_info); + } +} + +#endif /* JEMALLOC_INTERNAL_PROF_INLINES_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/prof_log.h b/BeefRT/JEMalloc/include/jemalloc/internal/prof_log.h new file mode 100644 index 00000000..ccb557dd --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/prof_log.h @@ -0,0 +1,22 @@ +#ifndef JEMALLOC_INTERNAL_PROF_LOG_H +#define JEMALLOC_INTERNAL_PROF_LOG_H + +#include "jemalloc/internal/mutex.h" + +extern malloc_mutex_t log_mtx; + +void prof_try_log(tsd_t *tsd, size_t usize, prof_info_t *prof_info); +bool prof_log_init(tsd_t *tsdn); + +/* Used in unit tests. */ +size_t prof_log_bt_count(void); +size_t prof_log_alloc_count(void); +size_t prof_log_thr_count(void); +bool prof_log_is_logging(void); +bool prof_log_rep_check(void); +void prof_log_dummy_set(bool new_value); + +bool prof_log_start(tsdn_t *tsdn, const char *filename); +bool prof_log_stop(tsdn_t *tsdn); + +#endif /* JEMALLOC_INTERNAL_PROF_LOG_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/prof_recent.h b/BeefRT/JEMalloc/include/jemalloc/internal/prof_recent.h new file mode 100644 index 00000000..df410236 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/prof_recent.h @@ -0,0 +1,23 @@ +#ifndef JEMALLOC_INTERNAL_PROF_RECENT_H +#define JEMALLOC_INTERNAL_PROF_RECENT_H + +extern malloc_mutex_t prof_recent_alloc_mtx; +extern malloc_mutex_t prof_recent_dump_mtx; + +bool prof_recent_alloc_prepare(tsd_t *tsd, prof_tctx_t *tctx); +void prof_recent_alloc(tsd_t *tsd, edata_t *edata, size_t size, size_t usize); +void prof_recent_alloc_reset(tsd_t *tsd, edata_t *edata); +bool prof_recent_init(); +void edata_prof_recent_alloc_init(edata_t *edata); + +/* Used in unit tests. */ +typedef ql_head(prof_recent_t) prof_recent_list_t; +extern prof_recent_list_t prof_recent_alloc_list; +edata_t *prof_recent_alloc_edata_get_no_lock_test(const prof_recent_t *node); +prof_recent_t *edata_prof_recent_alloc_get_no_lock_test(const edata_t *edata); + +ssize_t prof_recent_alloc_max_ctl_read(); +ssize_t prof_recent_alloc_max_ctl_write(tsd_t *tsd, ssize_t max); +void prof_recent_alloc_dump(tsd_t *tsd, write_cb_t *write_cb, void *cbopaque); + +#endif /* JEMALLOC_INTERNAL_PROF_RECENT_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/prof_stats.h b/BeefRT/JEMalloc/include/jemalloc/internal/prof_stats.h new file mode 100644 index 00000000..7954e82d --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/prof_stats.h @@ -0,0 +1,17 @@ +#ifndef JEMALLOC_INTERNAL_PROF_STATS_H +#define JEMALLOC_INTERNAL_PROF_STATS_H + +typedef struct prof_stats_s prof_stats_t; +struct prof_stats_s { + uint64_t req_sum; + uint64_t count; +}; + +extern malloc_mutex_t prof_stats_mtx; + +void prof_stats_inc(tsd_t *tsd, szind_t ind, size_t size); +void prof_stats_dec(tsd_t *tsd, szind_t ind, size_t size); +void prof_stats_get_live(tsd_t *tsd, szind_t ind, prof_stats_t *stats); +void prof_stats_get_accum(tsd_t *tsd, szind_t ind, prof_stats_t *stats); + +#endif /* JEMALLOC_INTERNAL_PROF_STATS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/prof_structs.h b/BeefRT/JEMalloc/include/jemalloc/internal/prof_structs.h new file mode 100644 index 00000000..dd22115f --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/prof_structs.h @@ -0,0 +1,221 @@ +#ifndef JEMALLOC_INTERNAL_PROF_STRUCTS_H +#define JEMALLOC_INTERNAL_PROF_STRUCTS_H + +#include "jemalloc/internal/ckh.h" +#include "jemalloc/internal/edata.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/prng.h" +#include "jemalloc/internal/rb.h" + +struct prof_bt_s { + /* Backtrace, stored as len program counters. */ + void **vec; + unsigned len; +}; + +#ifdef JEMALLOC_PROF_LIBGCC +/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */ +typedef struct { + void **vec; + unsigned *len; + unsigned max; +} prof_unwind_data_t; +#endif + +struct prof_cnt_s { + /* Profiling counters. */ + uint64_t curobjs; + uint64_t curobjs_shifted_unbiased; + uint64_t curbytes; + uint64_t curbytes_unbiased; + uint64_t accumobjs; + uint64_t accumobjs_shifted_unbiased; + uint64_t accumbytes; + uint64_t accumbytes_unbiased; +}; + +typedef enum { + prof_tctx_state_initializing, + prof_tctx_state_nominal, + prof_tctx_state_dumping, + prof_tctx_state_purgatory /* Dumper must finish destroying. */ +} prof_tctx_state_t; + +struct prof_tctx_s { + /* Thread data for thread that performed the allocation. */ + prof_tdata_t *tdata; + + /* + * Copy of tdata->thr_{uid,discrim}, necessary because tdata may be + * defunct during teardown. + */ + uint64_t thr_uid; + uint64_t thr_discrim; + + /* + * Reference count of how many times this tctx object is referenced in + * recent allocation / deallocation records, protected by tdata->lock. + */ + uint64_t recent_count; + + /* Profiling counters, protected by tdata->lock. */ + prof_cnt_t cnts; + + /* Associated global context. */ + prof_gctx_t *gctx; + + /* + * UID that distinguishes multiple tctx's created by the same thread, + * but coexisting in gctx->tctxs. There are two ways that such + * coexistence can occur: + * - A dumper thread can cause a tctx to be retained in the purgatory + * state. + * - Although a single "producer" thread must create all tctx's which + * share the same thr_uid, multiple "consumers" can each concurrently + * execute portions of prof_tctx_destroy(). prof_tctx_destroy() only + * gets called once each time cnts.cur{objs,bytes} drop to 0, but this + * threshold can be hit again before the first consumer finishes + * executing prof_tctx_destroy(). + */ + uint64_t tctx_uid; + + /* Linkage into gctx's tctxs. */ + rb_node(prof_tctx_t) tctx_link; + + /* + * True during prof_alloc_prep()..prof_malloc_sample_object(), prevents + * sample vs destroy race. + */ + bool prepared; + + /* Current dump-related state, protected by gctx->lock. */ + prof_tctx_state_t state; + + /* + * Copy of cnts snapshotted during early dump phase, protected by + * dump_mtx. + */ + prof_cnt_t dump_cnts; +}; +typedef rb_tree(prof_tctx_t) prof_tctx_tree_t; + +struct prof_info_s { + /* Time when the allocation was made. */ + nstime_t alloc_time; + /* Points to the prof_tctx_t corresponding to the allocation. */ + prof_tctx_t *alloc_tctx; + /* Allocation request size. */ + size_t alloc_size; +}; + +struct prof_gctx_s { + /* Protects nlimbo, cnt_summed, and tctxs. */ + malloc_mutex_t *lock; + + /* + * Number of threads that currently cause this gctx to be in a state of + * limbo due to one of: + * - Initializing this gctx. + * - Initializing per thread counters associated with this gctx. + * - Preparing to destroy this gctx. + * - Dumping a heap profile that includes this gctx. + * nlimbo must be 1 (single destroyer) in order to safely destroy the + * gctx. + */ + unsigned nlimbo; + + /* + * Tree of profile counters, one for each thread that has allocated in + * this context. + */ + prof_tctx_tree_t tctxs; + + /* Linkage for tree of contexts to be dumped. */ + rb_node(prof_gctx_t) dump_link; + + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; + + /* Associated backtrace. */ + prof_bt_t bt; + + /* Backtrace vector, variable size, referred to by bt. */ + void *vec[1]; +}; +typedef rb_tree(prof_gctx_t) prof_gctx_tree_t; + +struct prof_tdata_s { + malloc_mutex_t *lock; + + /* Monotonically increasing unique thread identifier. */ + uint64_t thr_uid; + + /* + * Monotonically increasing discriminator among tdata structures + * associated with the same thr_uid. + */ + uint64_t thr_discrim; + + /* Included in heap profile dumps if non-NULL. */ + char *thread_name; + + bool attached; + bool expired; + + rb_node(prof_tdata_t) tdata_link; + + /* + * Counter used to initialize prof_tctx_t's tctx_uid. No locking is + * necessary when incrementing this field, because only one thread ever + * does so. + */ + uint64_t tctx_uid_next; + + /* + * Hash of (prof_bt_t *)-->(prof_tctx_t *). Each thread tracks + * backtraces for which it has non-zero allocation/deallocation counters + * associated with thread-specific prof_tctx_t objects. Other threads + * may write to prof_tctx_t contents when freeing associated objects. + */ + ckh_t bt2tctx; + + /* State used to avoid dumping while operating on prof internals. */ + bool enq; + bool enq_idump; + bool enq_gdump; + + /* + * Set to true during an early dump phase for tdata's which are + * currently being dumped. New threads' tdata's have this initialized + * to false so that they aren't accidentally included in later dump + * phases. + */ + bool dumping; + + /* + * True if profiling is active for this tdata's thread + * (thread.prof.active mallctl). + */ + bool active; + + /* Temporary storage for summation during dump. */ + prof_cnt_t cnt_summed; + + /* Backtrace vector, used for calls to prof_backtrace(). */ + void *vec[PROF_BT_MAX]; +}; +typedef rb_tree(prof_tdata_t) prof_tdata_tree_t; + +struct prof_recent_s { + nstime_t alloc_time; + nstime_t dalloc_time; + + ql_elm(prof_recent_t) link; + size_t size; + size_t usize; + atomic_p_t alloc_edata; /* NULL means allocation has been freed. */ + prof_tctx_t *alloc_tctx; + prof_tctx_t *dalloc_tctx; +}; + +#endif /* JEMALLOC_INTERNAL_PROF_STRUCTS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/prof_sys.h b/BeefRT/JEMalloc/include/jemalloc/internal/prof_sys.h new file mode 100644 index 00000000..3d25a429 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/prof_sys.h @@ -0,0 +1,30 @@ +#ifndef JEMALLOC_INTERNAL_PROF_SYS_H +#define JEMALLOC_INTERNAL_PROF_SYS_H + +extern malloc_mutex_t prof_dump_filename_mtx; +extern base_t *prof_base; + +void bt_init(prof_bt_t *bt, void **vec); +void prof_backtrace(tsd_t *tsd, prof_bt_t *bt); +void prof_hooks_init(); +void prof_unwind_init(); +void prof_sys_thread_name_fetch(tsd_t *tsd); +int prof_getpid(void); +void prof_get_default_filename(tsdn_t *tsdn, char *filename, uint64_t ind); +bool prof_prefix_set(tsdn_t *tsdn, const char *prefix); +void prof_fdump_impl(tsd_t *tsd); +void prof_idump_impl(tsd_t *tsd); +bool prof_mdump_impl(tsd_t *tsd, const char *filename); +void prof_gdump_impl(tsd_t *tsd); + +/* Used in unit tests. */ +typedef int (prof_sys_thread_name_read_t)(char *buf, size_t limit); +extern prof_sys_thread_name_read_t *JET_MUTABLE prof_sys_thread_name_read; +typedef int (prof_dump_open_file_t)(const char *, int); +extern prof_dump_open_file_t *JET_MUTABLE prof_dump_open_file; +typedef ssize_t (prof_dump_write_file_t)(int, const void *, size_t); +extern prof_dump_write_file_t *JET_MUTABLE prof_dump_write_file; +typedef int (prof_dump_open_maps_t)(); +extern prof_dump_open_maps_t *JET_MUTABLE prof_dump_open_maps; + +#endif /* JEMALLOC_INTERNAL_PROF_SYS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/prof_types.h b/BeefRT/JEMalloc/include/jemalloc/internal/prof_types.h new file mode 100644 index 00000000..ba628654 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/prof_types.h @@ -0,0 +1,75 @@ +#ifndef JEMALLOC_INTERNAL_PROF_TYPES_H +#define JEMALLOC_INTERNAL_PROF_TYPES_H + +typedef struct prof_bt_s prof_bt_t; +typedef struct prof_cnt_s prof_cnt_t; +typedef struct prof_tctx_s prof_tctx_t; +typedef struct prof_info_s prof_info_t; +typedef struct prof_gctx_s prof_gctx_t; +typedef struct prof_tdata_s prof_tdata_t; +typedef struct prof_recent_s prof_recent_t; + +/* Option defaults. */ +#ifdef JEMALLOC_PROF +# define PROF_PREFIX_DEFAULT "jeprof" +#else +# define PROF_PREFIX_DEFAULT "" +#endif +#define LG_PROF_SAMPLE_DEFAULT 19 +#define LG_PROF_INTERVAL_DEFAULT -1 + +/* + * Hard limit on stack backtrace depth. The version of prof_backtrace() that + * is based on __builtin_return_address() necessarily has a hard-coded number + * of backtrace frame handlers, and should be kept in sync with this setting. + */ +#define PROF_BT_MAX 128 + +/* Initial hash table size. */ +#define PROF_CKH_MINITEMS 64 + +/* Size of memory buffer to use when writing dump files. */ +#ifndef JEMALLOC_PROF +/* Minimize memory bloat for non-prof builds. */ +# define PROF_DUMP_BUFSIZE 1 +#elif defined(JEMALLOC_DEBUG) +/* Use a small buffer size in debug build, mainly to facilitate testing. */ +# define PROF_DUMP_BUFSIZE 16 +#else +# define PROF_DUMP_BUFSIZE 65536 +#endif + +/* Size of size class related tables */ +#ifdef JEMALLOC_PROF +# define PROF_SC_NSIZES SC_NSIZES +#else +/* Minimize memory bloat for non-prof builds. */ +# define PROF_SC_NSIZES 1 +#endif + +/* Size of stack-allocated buffer used by prof_printf(). */ +#define PROF_PRINTF_BUFSIZE 128 + +/* + * Number of mutexes shared among all gctx's. No space is allocated for these + * unless profiling is enabled, so it's okay to over-provision. + */ +#define PROF_NCTX_LOCKS 1024 + +/* + * Number of mutexes shared among all tdata's. No space is allocated for these + * unless profiling is enabled, so it's okay to over-provision. + */ +#define PROF_NTDATA_LOCKS 256 + +/* Minimize memory bloat for non-prof builds. */ +#ifdef JEMALLOC_PROF +#define PROF_DUMP_FILENAME_LEN (PATH_MAX + 1) +#else +#define PROF_DUMP_FILENAME_LEN 1 +#endif + +/* Default number of recent allocations to record. */ +#define PROF_RECENT_ALLOC_MAX_DEFAULT 0 + +#endif /* JEMALLOC_INTERNAL_PROF_TYPES_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/psset.h b/BeefRT/JEMalloc/include/jemalloc/internal/psset.h new file mode 100644 index 00000000..e1d64970 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/psset.h @@ -0,0 +1,131 @@ +#ifndef JEMALLOC_INTERNAL_PSSET_H +#define JEMALLOC_INTERNAL_PSSET_H + +#include "jemalloc/internal/hpdata.h" + +/* + * A page-slab set. What the eset is to PAC, the psset is to HPA. It maintains + * a collection of page-slabs (the intent being that they are backed by + * hugepages, or at least could be), and handles allocation and deallocation + * requests. + */ + +/* + * One more than the maximum pszind_t we will serve out of the HPA. + * Practically, we expect only the first few to be actually used. This + * corresponds to a maximum size of of 512MB on systems with 4k pages and + * SC_NGROUP == 4, which is already an unreasonably large maximum. Morally, you + * can think of this as being SC_NPSIZES, but there's no sense in wasting that + * much space in the arena, making bitmaps that much larger, etc. + */ +#define PSSET_NPSIZES 64 + +/* + * We keep two purge lists per page size class; one for hugified hpdatas (at + * index 2*pszind), and one for the non-hugified hpdatas (at index 2*pszind + + * 1). This lets us implement a preference for purging non-hugified hpdatas + * among similarly-dirty ones. + * We reserve the last two indices for empty slabs, in that case purging + * hugified ones (which are definitionally all waste) before non-hugified ones + * (i.e. reversing the order). + */ +#define PSSET_NPURGE_LISTS (2 * PSSET_NPSIZES) + +typedef struct psset_bin_stats_s psset_bin_stats_t; +struct psset_bin_stats_s { + /* How many pageslabs are in this bin? */ + size_t npageslabs; + /* Of them, how many pages are active? */ + size_t nactive; + /* And how many are dirty? */ + size_t ndirty; +}; + +typedef struct psset_stats_s psset_stats_t; +struct psset_stats_s { + /* + * The second index is huge stats; nonfull_slabs[pszind][0] contains + * stats for the non-huge slabs in bucket pszind, while + * nonfull_slabs[pszind][1] contains stats for the huge slabs. + */ + psset_bin_stats_t nonfull_slabs[PSSET_NPSIZES][2]; + + /* + * Full slabs don't live in any edata heap, but we still track their + * stats. + */ + psset_bin_stats_t full_slabs[2]; + + /* Empty slabs are similar. */ + psset_bin_stats_t empty_slabs[2]; +}; + +typedef struct psset_s psset_t; +struct psset_s { + /* + * The pageslabs, quantized by the size class of the largest contiguous + * free run of pages in a pageslab. + */ + hpdata_age_heap_t pageslabs[PSSET_NPSIZES]; + /* Bitmap for which set bits correspond to non-empty heaps. */ + fb_group_t pageslab_bitmap[FB_NGROUPS(PSSET_NPSIZES)]; + /* + * The sum of all bin stats in stats. This lets us quickly answer + * queries for the number of dirty, active, and retained pages in the + * entire set. + */ + psset_bin_stats_t merged_stats; + psset_stats_t stats; + /* + * Slabs with no active allocations, but which are allowed to serve new + * allocations. + */ + hpdata_empty_list_t empty; + /* + * Slabs which are available to be purged, ordered by how much we want + * to purge them (with later indices indicating slabs we want to purge + * more). + */ + hpdata_purge_list_t to_purge[PSSET_NPURGE_LISTS]; + /* Bitmap for which set bits correspond to non-empty purge lists. */ + fb_group_t purge_bitmap[FB_NGROUPS(PSSET_NPURGE_LISTS)]; + /* Slabs which are available to be hugified. */ + hpdata_hugify_list_t to_hugify; +}; + +void psset_init(psset_t *psset); +void psset_stats_accum(psset_stats_t *dst, psset_stats_t *src); + +/* + * Begin or end updating the given pageslab's metadata. While the pageslab is + * being updated, it won't be returned from psset_fit calls. + */ +void psset_update_begin(psset_t *psset, hpdata_t *ps); +void psset_update_end(psset_t *psset, hpdata_t *ps); + +/* Analogous to the eset_fit; pick a hpdata to serve the request. */ +hpdata_t *psset_pick_alloc(psset_t *psset, size_t size); +/* Pick one to purge. */ +hpdata_t *psset_pick_purge(psset_t *psset); +/* Pick one to hugify. */ +hpdata_t *psset_pick_hugify(psset_t *psset); + +void psset_insert(psset_t *psset, hpdata_t *ps); +void psset_remove(psset_t *psset, hpdata_t *ps); + +static inline size_t +psset_npageslabs(psset_t *psset) { + return psset->merged_stats.npageslabs; +} + +static inline size_t +psset_nactive(psset_t *psset) { + return psset->merged_stats.nactive; +} + +static inline size_t +psset_ndirty(psset_t *psset) { + return psset->merged_stats.ndirty; +} + +#endif /* JEMALLOC_INTERNAL_PSSET_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/public_namespace.h b/BeefRT/JEMalloc/include/jemalloc/internal/public_namespace.h new file mode 100644 index 00000000..cd4f4ba5 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/public_namespace.h @@ -0,0 +1,22 @@ +#define je_aligned_alloc JEMALLOC_N(aligned_alloc) +#define je_calloc JEMALLOC_N(calloc) +#define je_dallocx JEMALLOC_N(dallocx) +#define je_free JEMALLOC_N(free) +#define je_mallctl JEMALLOC_N(mallctl) +#define je_mallctlbymib JEMALLOC_N(mallctlbymib) +#define je_mallctlnametomib JEMALLOC_N(mallctlnametomib) +#define je_malloc JEMALLOC_N(malloc) +#define je_malloc_conf JEMALLOC_N(malloc_conf) +#define je_malloc_conf_2_conf_harder JEMALLOC_N(malloc_conf_2_conf_harder) +#define je_malloc_message JEMALLOC_N(malloc_message) +#define je_malloc_stats_print JEMALLOC_N(malloc_stats_print) +#define je_malloc_usable_size JEMALLOC_N(malloc_usable_size) +#define je_mallocx JEMALLOC_N(mallocx) +#define je_smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c JEMALLOC_N(smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c) +#define je_nallocx JEMALLOC_N(nallocx) +#define je_posix_memalign JEMALLOC_N(posix_memalign) +#define je_rallocx JEMALLOC_N(rallocx) +#define je_realloc JEMALLOC_N(realloc) +#define je_sallocx JEMALLOC_N(sallocx) +#define je_sdallocx JEMALLOC_N(sdallocx) +#define je_xallocx JEMALLOC_N(xallocx) diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/public_namespace.sh b/BeefRT/JEMalloc/include/jemalloc/internal/public_namespace.sh new file mode 100644 index 00000000..4d415ba0 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/public_namespace.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +for nm in `cat $1` ; do + n=`echo ${nm} |tr ':' ' ' |awk '{print $1}'` + echo "#define je_${n} JEMALLOC_N(${n})" +done diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/public_symbols.txt b/BeefRT/JEMalloc/include/jemalloc/internal/public_symbols.txt new file mode 100644 index 00000000..624f14cd --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/public_symbols.txt @@ -0,0 +1,22 @@ +aligned_alloc:je_aligned_alloc +calloc:je_calloc +dallocx:je_dallocx +free:je_free +mallctl:je_mallctl +mallctlbymib:je_mallctlbymib +mallctlnametomib:je_mallctlnametomib +malloc:je_malloc +malloc_conf:je_malloc_conf +malloc_conf_2_conf_harder:je_malloc_conf_2_conf_harder +malloc_message:je_malloc_message +malloc_stats_print:je_malloc_stats_print +malloc_usable_size:je_malloc_usable_size +mallocx:je_mallocx +smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c:je_smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c +nallocx:je_nallocx +posix_memalign:je_posix_memalign +rallocx:je_rallocx +realloc:je_realloc +sallocx:je_sallocx +sdallocx:je_sdallocx +xallocx:je_xallocx diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/public_unnamespace.h b/BeefRT/JEMalloc/include/jemalloc/internal/public_unnamespace.h new file mode 100644 index 00000000..842137e7 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/public_unnamespace.h @@ -0,0 +1,22 @@ +#undef je_aligned_alloc +#undef je_calloc +#undef je_dallocx +#undef je_free +#undef je_mallctl +#undef je_mallctlbymib +#undef je_mallctlnametomib +#undef je_malloc +#undef je_malloc_conf +#undef je_malloc_conf_2_conf_harder +#undef je_malloc_message +#undef je_malloc_stats_print +#undef je_malloc_usable_size +#undef je_mallocx +#undef je_smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c +#undef je_nallocx +#undef je_posix_memalign +#undef je_rallocx +#undef je_realloc +#undef je_sallocx +#undef je_sdallocx +#undef je_xallocx diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/public_unnamespace.sh b/BeefRT/JEMalloc/include/jemalloc/internal/public_unnamespace.sh new file mode 100644 index 00000000..4239d177 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/public_unnamespace.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +for nm in `cat $1` ; do + n=`echo ${nm} |tr ':' ' ' |awk '{print $1}'` + echo "#undef je_${n}" +done diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/ql.h b/BeefRT/JEMalloc/include/jemalloc/internal/ql.h new file mode 100644 index 00000000..c7f52f86 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/ql.h @@ -0,0 +1,197 @@ +#ifndef JEMALLOC_INTERNAL_QL_H +#define JEMALLOC_INTERNAL_QL_H + +#include "jemalloc/internal/qr.h" + +/* + * A linked-list implementation. + * + * This is built on top of the ring implementation, but that can be viewed as an + * implementation detail (i.e. trying to advance past the tail of the list + * doesn't wrap around). + * + * You define a struct like so: + * typedef strucy my_s my_t; + * struct my_s { + * int data; + * ql_elm(my_t) my_link; + * }; + * + * // We wobble between "list" and "head" for this type; we're now mostly + * // heading towards "list". + * typedef ql_head(my_t) my_list_t; + * + * You then pass a my_list_t * for a_head arguments, a my_t * for a_elm + * arguments, the token "my_link" for a_field arguments, and the token "my_t" + * for a_type arguments. + */ + +/* List definitions. */ +#define ql_head(a_type) \ +struct { \ + a_type *qlh_first; \ +} + +/* Static initializer for an empty list. */ +#define ql_head_initializer(a_head) {NULL} + +/* The field definition. */ +#define ql_elm(a_type) qr(a_type) + +/* A pointer to the first element in the list, or NULL if the list is empty. */ +#define ql_first(a_head) ((a_head)->qlh_first) + +/* Dynamically initializes a list. */ +#define ql_new(a_head) do { \ + ql_first(a_head) = NULL; \ +} while (0) + +/* + * Sets dest to be the contents of src (overwriting any elements there), leaving + * src empty. + */ +#define ql_move(a_head_dest, a_head_src) do { \ + ql_first(a_head_dest) = ql_first(a_head_src); \ + ql_new(a_head_src); \ +} while (0) + +/* True if the list is empty, otherwise false. */ +#define ql_empty(a_head) (ql_first(a_head) == NULL) + +/* + * Initializes a ql_elm. Must be called even if the field is about to be + * overwritten. + */ +#define ql_elm_new(a_elm, a_field) qr_new((a_elm), a_field) + +/* + * Obtains the last item in the list. + */ +#define ql_last(a_head, a_field) \ + (ql_empty(a_head) ? NULL : qr_prev(ql_first(a_head), a_field)) + +/* + * Gets a pointer to the next/prev element in the list. Trying to advance past + * the end or retreat before the beginning of the list returns NULL. + */ +#define ql_next(a_head, a_elm, a_field) \ + ((ql_last(a_head, a_field) != (a_elm)) \ + ? qr_next((a_elm), a_field) : NULL) +#define ql_prev(a_head, a_elm, a_field) \ + ((ql_first(a_head) != (a_elm)) ? qr_prev((a_elm), a_field) \ + : NULL) + +/* Inserts a_elm before a_qlelm in the list. */ +#define ql_before_insert(a_head, a_qlelm, a_elm, a_field) do { \ + qr_before_insert((a_qlelm), (a_elm), a_field); \ + if (ql_first(a_head) == (a_qlelm)) { \ + ql_first(a_head) = (a_elm); \ + } \ +} while (0) + +/* Inserts a_elm after a_qlelm in the list. */ +#define ql_after_insert(a_qlelm, a_elm, a_field) \ + qr_after_insert((a_qlelm), (a_elm), a_field) + +/* Inserts a_elm as the first item in the list. */ +#define ql_head_insert(a_head, a_elm, a_field) do { \ + if (!ql_empty(a_head)) { \ + qr_before_insert(ql_first(a_head), (a_elm), a_field); \ + } \ + ql_first(a_head) = (a_elm); \ +} while (0) + +/* Inserts a_elm as the last item in the list. */ +#define ql_tail_insert(a_head, a_elm, a_field) do { \ + if (!ql_empty(a_head)) { \ + qr_before_insert(ql_first(a_head), (a_elm), a_field); \ + } \ + ql_first(a_head) = qr_next((a_elm), a_field); \ +} while (0) + +/* + * Given lists a = [a_1, ..., a_n] and [b_1, ..., b_n], results in: + * a = [a1, ..., a_n, b_1, ..., b_n] and b = []. + */ +#define ql_concat(a_head_a, a_head_b, a_field) do { \ + if (ql_empty(a_head_a)) { \ + ql_move(a_head_a, a_head_b); \ + } else if (!ql_empty(a_head_b)) { \ + qr_meld(ql_first(a_head_a), ql_first(a_head_b), \ + a_field); \ + ql_new(a_head_b); \ + } \ +} while (0) + +/* Removes a_elm from the list. */ +#define ql_remove(a_head, a_elm, a_field) do { \ + if (ql_first(a_head) == (a_elm)) { \ + ql_first(a_head) = qr_next(ql_first(a_head), a_field); \ + } \ + if (ql_first(a_head) != (a_elm)) { \ + qr_remove((a_elm), a_field); \ + } else { \ + ql_new(a_head); \ + } \ +} while (0) + +/* Removes the first item in the list. */ +#define ql_head_remove(a_head, a_type, a_field) do { \ + a_type *t = ql_first(a_head); \ + ql_remove((a_head), t, a_field); \ +} while (0) + +/* Removes the last item in the list. */ +#define ql_tail_remove(a_head, a_type, a_field) do { \ + a_type *t = ql_last(a_head, a_field); \ + ql_remove((a_head), t, a_field); \ +} while (0) + +/* + * Given a = [a_1, a_2, ..., a_n-1, a_n, a_n+1, ...], + * ql_split(a, a_n, b, some_field) results in + * a = [a_1, a_2, ..., a_n-1] + * and replaces b's contents with: + * b = [a_n, a_n+1, ...] + */ +#define ql_split(a_head_a, a_elm, a_head_b, a_field) do { \ + if (ql_first(a_head_a) == (a_elm)) { \ + ql_move(a_head_b, a_head_a); \ + } else { \ + qr_split(ql_first(a_head_a), (a_elm), a_field); \ + ql_first(a_head_b) = (a_elm); \ + } \ +} while (0) + +/* + * An optimized version of: + * a_type *t = ql_first(a_head); + * ql_remove((a_head), t, a_field); + * ql_tail_insert((a_head), t, a_field); + */ +#define ql_rotate(a_head, a_field) do { \ + ql_first(a_head) = qr_next(ql_first(a_head), a_field); \ +} while (0) + +/* + * Helper macro to iterate over each element in a list in order, starting from + * the head (or in reverse order, starting from the tail). The usage is + * (assuming my_t and my_list_t defined as above). + * + * int sum(my_list_t *list) { + * int sum = 0; + * my_t *iter; + * ql_foreach(iter, list, link) { + * sum += iter->data; + * } + * return sum; + * } + */ + +#define ql_foreach(a_var, a_head, a_field) \ + qr_foreach((a_var), ql_first(a_head), a_field) + +#define ql_reverse_foreach(a_var, a_head, a_field) \ + qr_reverse_foreach((a_var), ql_first(a_head), a_field) + +#endif /* JEMALLOC_INTERNAL_QL_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/qr.h b/BeefRT/JEMalloc/include/jemalloc/internal/qr.h new file mode 100644 index 00000000..ece4f556 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/qr.h @@ -0,0 +1,140 @@ +#ifndef JEMALLOC_INTERNAL_QR_H +#define JEMALLOC_INTERNAL_QR_H + +/* + * A ring implementation based on an embedded circular doubly-linked list. + * + * You define your struct like so: + * + * typedef struct my_s my_t; + * struct my_s { + * int data; + * qr(my_t) my_link; + * }; + * + * And then pass a my_t * into macros for a_qr arguments, and the token + * "my_link" into a_field fields. + */ + +/* Ring definitions. */ +#define qr(a_type) \ +struct { \ + a_type *qre_next; \ + a_type *qre_prev; \ +} + +/* + * Initialize a qr link. Every link must be initialized before being used, even + * if that initialization is going to be immediately overwritten (say, by being + * passed into an insertion macro). + */ +#define qr_new(a_qr, a_field) do { \ + (a_qr)->a_field.qre_next = (a_qr); \ + (a_qr)->a_field.qre_prev = (a_qr); \ +} while (0) + +/* + * Go forwards or backwards in the ring. Note that (the ring being circular), this + * always succeeds -- you just keep looping around and around the ring if you + * chase pointers without end. + */ +#define qr_next(a_qr, a_field) ((a_qr)->a_field.qre_next) +#define qr_prev(a_qr, a_field) ((a_qr)->a_field.qre_prev) + +/* + * Given two rings: + * a -> a_1 -> ... -> a_n -- + * ^ | + * |------------------------ + * + * b -> b_1 -> ... -> b_n -- + * ^ | + * |------------------------ + * + * Results in the ring: + * a -> a_1 -> ... -> a_n -> b -> b_1 -> ... -> b_n -- + * ^ | + * |-------------------------------------------------| + * + * a_qr_a can directly be a qr_next() macro, but a_qr_b cannot. + */ +#define qr_meld(a_qr_a, a_qr_b, a_field) do { \ + (a_qr_b)->a_field.qre_prev->a_field.qre_next = \ + (a_qr_a)->a_field.qre_prev; \ + (a_qr_a)->a_field.qre_prev = (a_qr_b)->a_field.qre_prev; \ + (a_qr_b)->a_field.qre_prev = \ + (a_qr_b)->a_field.qre_prev->a_field.qre_next; \ + (a_qr_a)->a_field.qre_prev->a_field.qre_next = (a_qr_a); \ + (a_qr_b)->a_field.qre_prev->a_field.qre_next = (a_qr_b); \ +} while (0) + +/* + * Logically, this is just a meld. The intent, though, is that a_qrelm is a + * single-element ring, so that "before" has a more obvious interpretation than + * meld. + */ +#define qr_before_insert(a_qrelm, a_qr, a_field) \ + qr_meld((a_qrelm), (a_qr), a_field) + +/* Ditto, but inserting after rather than before. */ +#define qr_after_insert(a_qrelm, a_qr, a_field) \ + qr_before_insert(qr_next(a_qrelm, a_field), (a_qr), a_field) + +/* + * Inverts meld; given the ring: + * a -> a_1 -> ... -> a_n -> b -> b_1 -> ... -> b_n -- + * ^ | + * |-------------------------------------------------| + * + * Results in two rings: + * a -> a_1 -> ... -> a_n -- + * ^ | + * |------------------------ + * + * b -> b_1 -> ... -> b_n -- + * ^ | + * |------------------------ + * + * qr_meld() and qr_split() are functionally equivalent, so there's no need to + * have two copies of the code. + */ +#define qr_split(a_qr_a, a_qr_b, a_field) \ + qr_meld((a_qr_a), (a_qr_b), a_field) + +/* + * Splits off a_qr from the rest of its ring, so that it becomes a + * single-element ring. + */ +#define qr_remove(a_qr, a_field) \ + qr_split(qr_next(a_qr, a_field), (a_qr), a_field) + +/* + * Helper macro to iterate over each element in a ring exactly once, starting + * with a_qr. The usage is (assuming my_t defined as above): + * + * int sum(my_t *item) { + * int sum = 0; + * my_t *iter; + * qr_foreach(iter, item, link) { + * sum += iter->data; + * } + * return sum; + * } + */ +#define qr_foreach(var, a_qr, a_field) \ + for ((var) = (a_qr); \ + (var) != NULL; \ + (var) = (((var)->a_field.qre_next != (a_qr)) \ + ? (var)->a_field.qre_next : NULL)) + +/* + * The same (and with the same usage) as qr_foreach, but in the opposite order, + * ending with a_qr. + */ +#define qr_reverse_foreach(var, a_qr, a_field) \ + for ((var) = ((a_qr) != NULL) ? qr_prev(a_qr, a_field) : NULL; \ + (var) != NULL; \ + (var) = (((var) != (a_qr)) \ + ? (var)->a_field.qre_prev : NULL)) + +#endif /* JEMALLOC_INTERNAL_QR_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/quantum.h b/BeefRT/JEMalloc/include/jemalloc/internal/quantum.h new file mode 100644 index 00000000..c22d753a --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/quantum.h @@ -0,0 +1,87 @@ +#ifndef JEMALLOC_INTERNAL_QUANTUM_H +#define JEMALLOC_INTERNAL_QUANTUM_H + +/* + * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size + * classes). + */ +#ifndef LG_QUANTUM +# if (defined(__i386__) || defined(_M_IX86)) +# define LG_QUANTUM 4 +# endif +# ifdef __ia64__ +# define LG_QUANTUM 4 +# endif +# ifdef __alpha__ +# define LG_QUANTUM 4 +# endif +# if (defined(__sparc64__) || defined(__sparcv9) || defined(__sparc_v9__)) +# define LG_QUANTUM 4 +# endif +# if (defined(__amd64__) || defined(__x86_64__) || defined(_M_X64)) +# define LG_QUANTUM 4 +# endif +# ifdef __arm__ +# define LG_QUANTUM 3 +# endif +# ifdef __aarch64__ +# define LG_QUANTUM 4 +# endif +# ifdef __hppa__ +# define LG_QUANTUM 4 +# endif +# ifdef __loongarch__ +# define LG_QUANTUM 4 +# endif +# ifdef __m68k__ +# define LG_QUANTUM 3 +# endif +# ifdef __mips__ +# if defined(__mips_n32) || defined(__mips_n64) +# define LG_QUANTUM 4 +# else +# define LG_QUANTUM 3 +# endif +# endif +# ifdef __nios2__ +# define LG_QUANTUM 3 +# endif +# ifdef __or1k__ +# define LG_QUANTUM 3 +# endif +# ifdef __powerpc__ +# define LG_QUANTUM 4 +# endif +# if defined(__riscv) || defined(__riscv__) +# define LG_QUANTUM 4 +# endif +# ifdef __s390__ +# define LG_QUANTUM 4 +# endif +# if (defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || \ + defined(__SH4_SINGLE_ONLY__)) +# define LG_QUANTUM 4 +# endif +# ifdef __tile__ +# define LG_QUANTUM 4 +# endif +# ifdef __le32__ +# define LG_QUANTUM 4 +# endif +# ifdef __arc__ +# define LG_QUANTUM 3 +# endif +# ifndef LG_QUANTUM +# error "Unknown minimum alignment for architecture; specify via " + "--with-lg-quantum" +# endif +#endif + +#define QUANTUM ((size_t)(1U << LG_QUANTUM)) +#define QUANTUM_MASK (QUANTUM - 1) + +/* Return the smallest quantum multiple that is >= a. */ +#define QUANTUM_CEILING(a) \ + (((a) + QUANTUM_MASK) & ~QUANTUM_MASK) + +#endif /* JEMALLOC_INTERNAL_QUANTUM_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/rb.h b/BeefRT/JEMalloc/include/jemalloc/internal/rb.h new file mode 100644 index 00000000..a9a51cb6 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/rb.h @@ -0,0 +1,1856 @@ +#ifndef JEMALLOC_INTERNAL_RB_H +#define JEMALLOC_INTERNAL_RB_H + +/*- + ******************************************************************************* + * + * cpp macro implementation of left-leaning 2-3 red-black trees. Parent + * pointers are not used, and color bits are stored in the least significant + * bit of right-child pointers (if RB_COMPACT is defined), thus making node + * linkage as compact as is possible for red-black trees. + * + * Usage: + * + * #include + * #include + * #define NDEBUG // (Optional, see assert(3).) + * #include + * #define RB_COMPACT // (Optional, embed color bits in right-child pointers.) + * #include + * ... + * + ******************************************************************************* + */ + +#ifndef __PGI +#define RB_COMPACT +#endif + +/* + * Each node in the RB tree consumes at least 1 byte of space (for the linkage + * if nothing else, so there are a maximum of sizeof(void *) << 3 rb tree nodes + * in any process (and thus, at most sizeof(void *) << 3 nodes in any rb tree). + * The choice of algorithm bounds the depth of a tree to twice the binary log of + * the number of elements in the tree; the following bound follows. + */ +#define RB_MAX_DEPTH (sizeof(void *) << 4) + +#ifdef RB_COMPACT +/* Node structure. */ +#define rb_node(a_type) \ +struct { \ + a_type *rbn_left; \ + a_type *rbn_right_red; \ +} +#else +#define rb_node(a_type) \ +struct { \ + a_type *rbn_left; \ + a_type *rbn_right; \ + bool rbn_red; \ +} +#endif + +/* Root structure. */ +#define rb_tree(a_type) \ +struct { \ + a_type *rbt_root; \ +} + +/* Left accessors. */ +#define rbtn_left_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_left) +#define rbtn_left_set(a_type, a_field, a_node, a_left) do { \ + (a_node)->a_field.rbn_left = a_left; \ +} while (0) + +#ifdef RB_COMPACT +/* Right accessors. */ +#define rbtn_right_get(a_type, a_field, a_node) \ + ((a_type *) (((intptr_t) (a_node)->a_field.rbn_right_red) \ + & ((ssize_t)-2))) +#define rbtn_right_set(a_type, a_field, a_node, a_right) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((uintptr_t) a_right) \ + | (((uintptr_t) (a_node)->a_field.rbn_right_red) & ((size_t)1))); \ +} while (0) + +/* Color accessors. */ +#define rbtn_red_get(a_type, a_field, a_node) \ + ((bool) (((uintptr_t) (a_node)->a_field.rbn_right_red) \ + & ((size_t)1))) +#define rbtn_color_set(a_type, a_field, a_node, a_red) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) ((((intptr_t) \ + (a_node)->a_field.rbn_right_red) & ((ssize_t)-2)) \ + | ((ssize_t)a_red)); \ +} while (0) +#define rbtn_red_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((uintptr_t) \ + (a_node)->a_field.rbn_right_red) | ((size_t)1)); \ +} while (0) +#define rbtn_black_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((intptr_t) \ + (a_node)->a_field.rbn_right_red) & ((ssize_t)-2)); \ +} while (0) + +/* Node initializer. */ +#define rbt_node_new(a_type, a_field, a_rbt, a_node) do { \ + /* Bookkeeping bit cannot be used by node pointer. */ \ + assert(((uintptr_t)(a_node) & 0x1) == 0); \ + rbtn_left_set(a_type, a_field, (a_node), NULL); \ + rbtn_right_set(a_type, a_field, (a_node), NULL); \ + rbtn_red_set(a_type, a_field, (a_node)); \ +} while (0) +#else +/* Right accessors. */ +#define rbtn_right_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_right) +#define rbtn_right_set(a_type, a_field, a_node, a_right) do { \ + (a_node)->a_field.rbn_right = a_right; \ +} while (0) + +/* Color accessors. */ +#define rbtn_red_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_red) +#define rbtn_color_set(a_type, a_field, a_node, a_red) do { \ + (a_node)->a_field.rbn_red = (a_red); \ +} while (0) +#define rbtn_red_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_red = true; \ +} while (0) +#define rbtn_black_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_red = false; \ +} while (0) + +/* Node initializer. */ +#define rbt_node_new(a_type, a_field, a_rbt, a_node) do { \ + rbtn_left_set(a_type, a_field, (a_node), NULL); \ + rbtn_right_set(a_type, a_field, (a_node), NULL); \ + rbtn_red_set(a_type, a_field, (a_node)); \ +} while (0) +#endif + +/* Tree initializer. */ +#define rb_new(a_type, a_field, a_rbt) do { \ + (a_rbt)->rbt_root = NULL; \ +} while (0) + +/* Internal utility macros. */ +#define rbtn_first(a_type, a_field, a_rbt, a_root, r_node) do { \ + (r_node) = (a_root); \ + if ((r_node) != NULL) { \ + for (; \ + rbtn_left_get(a_type, a_field, (r_node)) != NULL; \ + (r_node) = rbtn_left_get(a_type, a_field, (r_node))) { \ + } \ + } \ +} while (0) + +#define rbtn_last(a_type, a_field, a_rbt, a_root, r_node) do { \ + (r_node) = (a_root); \ + if ((r_node) != NULL) { \ + for (; rbtn_right_get(a_type, a_field, (r_node)) != NULL; \ + (r_node) = rbtn_right_get(a_type, a_field, (r_node))) { \ + } \ + } \ +} while (0) + +#define rbtn_rotate_left(a_type, a_field, a_node, r_node) do { \ + (r_node) = rbtn_right_get(a_type, a_field, (a_node)); \ + rbtn_right_set(a_type, a_field, (a_node), \ + rbtn_left_get(a_type, a_field, (r_node))); \ + rbtn_left_set(a_type, a_field, (r_node), (a_node)); \ +} while (0) + +#define rbtn_rotate_right(a_type, a_field, a_node, r_node) do { \ + (r_node) = rbtn_left_get(a_type, a_field, (a_node)); \ + rbtn_left_set(a_type, a_field, (a_node), \ + rbtn_right_get(a_type, a_field, (r_node))); \ + rbtn_right_set(a_type, a_field, (r_node), (a_node)); \ +} while (0) + +#define rb_summarized_only_false(...) +#define rb_summarized_only_true(...) __VA_ARGS__ +#define rb_empty_summarize(a_node, a_lchild, a_rchild) false + +/* + * The rb_proto() and rb_summarized_proto() macros generate function prototypes + * that correspond to the functions generated by an equivalently parameterized + * call to rb_gen() or rb_summarized_gen(), respectively. + */ + +#define rb_proto(a_attr, a_prefix, a_rbt_type, a_type) \ + rb_proto_impl(a_attr, a_prefix, a_rbt_type, a_type, false) +#define rb_summarized_proto(a_attr, a_prefix, a_rbt_type, a_type) \ + rb_proto_impl(a_attr, a_prefix, a_rbt_type, a_type, true) +#define rb_proto_impl(a_attr, a_prefix, a_rbt_type, a_type, \ + a_is_summarized) \ +a_attr void \ +a_prefix##new(a_rbt_type *rbtree); \ +a_attr bool \ +a_prefix##empty(a_rbt_type *rbtree); \ +a_attr a_type * \ +a_prefix##first(a_rbt_type *rbtree); \ +a_attr a_type * \ +a_prefix##last(a_rbt_type *rbtree); \ +a_attr a_type * \ +a_prefix##next(a_rbt_type *rbtree, a_type *node); \ +a_attr a_type * \ +a_prefix##prev(a_rbt_type *rbtree, a_type *node); \ +a_attr a_type * \ +a_prefix##search(a_rbt_type *rbtree, const a_type *key); \ +a_attr a_type * \ +a_prefix##nsearch(a_rbt_type *rbtree, const a_type *key); \ +a_attr a_type * \ +a_prefix##psearch(a_rbt_type *rbtree, const a_type *key); \ +a_attr void \ +a_prefix##insert(a_rbt_type *rbtree, a_type *node); \ +a_attr void \ +a_prefix##remove(a_rbt_type *rbtree, a_type *node); \ +a_attr a_type * \ +a_prefix##iter(a_rbt_type *rbtree, a_type *start, a_type *(*cb)( \ + a_rbt_type *, a_type *, void *), void *arg); \ +a_attr a_type * \ +a_prefix##reverse_iter(a_rbt_type *rbtree, a_type *start, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg); \ +a_attr void \ +a_prefix##destroy(a_rbt_type *rbtree, void (*cb)(a_type *, void *), \ + void *arg); \ +/* Extended API */ \ +rb_summarized_only_##a_is_summarized( \ +a_attr void \ +a_prefix##update_summaries(a_rbt_type *rbtree, a_type *node); \ +a_attr bool \ +a_prefix##empty_filtered(a_rbt_type *rbtree, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##first_filtered(a_rbt_type *rbtree, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##last_filtered(a_rbt_type *rbtree, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##next_filtered(a_rbt_type *rbtree, a_type *node, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##prev_filtered(a_rbt_type *rbtree, a_type *node, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##search_filtered(a_rbt_type *rbtree, const a_type *key, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##nsearch_filtered(a_rbt_type *rbtree, const a_type *key, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##psearch_filtered(a_rbt_type *rbtree, const a_type *key, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##iter_filtered(a_rbt_type *rbtree, a_type *start, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +a_attr a_type * \ +a_prefix##reverse_iter_filtered(a_rbt_type *rbtree, a_type *start, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx); \ +) + +/* + * The rb_gen() macro generates a type-specific red-black tree implementation, + * based on the above cpp macros. + * Arguments: + * + * a_attr: + * Function attribute for generated functions (ex: static). + * a_prefix: + * Prefix for generated functions (ex: ex_). + * a_rb_type: + * Type for red-black tree data structure (ex: ex_t). + * a_type: + * Type for red-black tree node data structure (ex: ex_node_t). + * a_field: + * Name of red-black tree node linkage (ex: ex_link). + * a_cmp: + * Node comparison function name, with the following prototype: + * + * int a_cmp(a_type *a_node, a_type *a_other); + * ^^^^^^ + * or a_key + * Interpretation of comparison function return values: + * -1 : a_node < a_other + * 0 : a_node == a_other + * 1 : a_node > a_other + * In all cases, the a_node or a_key macro argument is the first argument to + * the comparison function, which makes it possible to write comparison + * functions that treat the first argument specially. a_cmp must be a total + * order on values inserted into the tree -- duplicates are not allowed. + * + * Assuming the following setup: + * + * typedef struct ex_node_s ex_node_t; + * struct ex_node_s { + * rb_node(ex_node_t) ex_link; + * }; + * typedef rb_tree(ex_node_t) ex_t; + * rb_gen(static, ex_, ex_t, ex_node_t, ex_link, ex_cmp) + * + * The following API is generated: + * + * static void + * ex_new(ex_t *tree); + * Description: Initialize a red-black tree structure. + * Args: + * tree: Pointer to an uninitialized red-black tree object. + * + * static bool + * ex_empty(ex_t *tree); + * Description: Determine whether tree is empty. + * Args: + * tree: Pointer to an initialized red-black tree object. + * Ret: True if tree is empty, false otherwise. + * + * static ex_node_t * + * ex_first(ex_t *tree); + * static ex_node_t * + * ex_last(ex_t *tree); + * Description: Get the first/last node in tree. + * Args: + * tree: Pointer to an initialized red-black tree object. + * Ret: First/last node in tree, or NULL if tree is empty. + * + * static ex_node_t * + * ex_next(ex_t *tree, ex_node_t *node); + * static ex_node_t * + * ex_prev(ex_t *tree, ex_node_t *node); + * Description: Get node's successor/predecessor. + * Args: + * tree: Pointer to an initialized red-black tree object. + * node: A node in tree. + * Ret: node's successor/predecessor in tree, or NULL if node is + * last/first. + * + * static ex_node_t * + * ex_search(ex_t *tree, const ex_node_t *key); + * Description: Search for node that matches key. + * Args: + * tree: Pointer to an initialized red-black tree object. + * key : Search key. + * Ret: Node in tree that matches key, or NULL if no match. + * + * static ex_node_t * + * ex_nsearch(ex_t *tree, const ex_node_t *key); + * static ex_node_t * + * ex_psearch(ex_t *tree, const ex_node_t *key); + * Description: Search for node that matches key. If no match is found, + * return what would be key's successor/predecessor, were + * key in tree. + * Args: + * tree: Pointer to an initialized red-black tree object. + * key : Search key. + * Ret: Node in tree that matches key, or if no match, hypothetical node's + * successor/predecessor (NULL if no successor/predecessor). + * + * static void + * ex_insert(ex_t *tree, ex_node_t *node); + * Description: Insert node into tree. + * Args: + * tree: Pointer to an initialized red-black tree object. + * node: Node to be inserted into tree. + * + * static void + * ex_remove(ex_t *tree, ex_node_t *node); + * Description: Remove node from tree. + * Args: + * tree: Pointer to an initialized red-black tree object. + * node: Node in tree to be removed. + * + * static ex_node_t * + * ex_iter(ex_t *tree, ex_node_t *start, ex_node_t *(*cb)(ex_t *, + * ex_node_t *, void *), void *arg); + * static ex_node_t * + * ex_reverse_iter(ex_t *tree, ex_node_t *start, ex_node *(*cb)(ex_t *, + * ex_node_t *, void *), void *arg); + * Description: Iterate forward/backward over tree, starting at node. If + * tree is modified, iteration must be immediately + * terminated by the callback function that causes the + * modification. + * Args: + * tree : Pointer to an initialized red-black tree object. + * start: Node at which to start iteration, or NULL to start at + * first/last node. + * cb : Callback function, which is called for each node during + * iteration. Under normal circumstances the callback function + * should return NULL, which causes iteration to continue. If a + * callback function returns non-NULL, iteration is immediately + * terminated and the non-NULL return value is returned by the + * iterator. This is useful for re-starting iteration after + * modifying tree. + * arg : Opaque pointer passed to cb(). + * Ret: NULL if iteration completed, or the non-NULL callback return value + * that caused termination of the iteration. + * + * static void + * ex_destroy(ex_t *tree, void (*cb)(ex_node_t *, void *), void *arg); + * Description: Iterate over the tree with post-order traversal, remove + * each node, and run the callback if non-null. This is + * used for destroying a tree without paying the cost to + * rebalance it. The tree must not be otherwise altered + * during traversal. + * Args: + * tree: Pointer to an initialized red-black tree object. + * cb : Callback function, which, if non-null, is called for each node + * during iteration. There is no way to stop iteration once it + * has begun. + * arg : Opaque pointer passed to cb(). + * + * The rb_summarized_gen() macro generates all the functions above, but has an + * expanded interface. In introduces the notion of summarizing subtrees, and of + * filtering searches in the tree according to the information contained in + * those summaries. + * The extra macro argument is: + * a_summarize: + * Tree summarization function name, with the following prototype: + * + * bool a_summarize(a_type *a_node, const a_type *a_left_child, + * const a_type *a_right_child); + * + * This function should update a_node with the summary of the subtree rooted + * there, using the data contained in it and the summaries in a_left_child + * and a_right_child. One or both of them may be NULL. When the tree + * changes due to an insertion or removal, it updates the summaries of all + * nodes whose subtrees have changed (always updating the summaries of + * children before their parents). If the user alters a node in the tree in + * a way that may change its summary, they can call the generated + * update_summaries function to bubble up the summary changes to the root. + * It should return true if the summary changed (or may have changed), and + * false if it didn't (which will allow the implementation to terminate + * "bubbling up" the summaries early). + * As the parameter names indicate, the children are ordered as they are in + * the tree, a_left_child, if it is not NULL, compares less than a_node, + * which in turn compares less than a_right_child (if a_right_child is not + * NULL). + * + * Using the same setup as above but replacing the macro with + * rb_summarized_gen(static, ex_, ex_t, ex_node_t, ex_link, ex_cmp, + * ex_summarize) + * + * Generates all the previous functions, but adds some more: + * + * static void + * ex_update_summaries(ex_t *tree, ex_node_t *node); + * Description: Recompute all summaries of ancestors of node. + * Args: + * tree: Pointer to an initialized red-black tree object. + * node: The element of the tree whose summary may have changed. + * + * For each of ex_empty, ex_first, ex_last, ex_next, ex_prev, ex_search, + * ex_nsearch, ex_psearch, ex_iter, and ex_reverse_iter, an additional function + * is generated as well, with the suffix _filtered (e.g. ex_empty_filtered, + * ex_first_filtered, etc.). These use the concept of a "filter"; a binary + * property some node either satisfies or does not satisfy. Clever use of the + * a_summary argument to rb_summarized_gen can allow efficient computation of + * these predicates across whole subtrees of the tree. + * The extended API functions accept three additional arguments after the + * arguments to the corresponding non-extended equivalent. + * + * ex_fn(..., bool (*filter_node)(void *, ex_node_t *), + * bool (*filter_subtree)(void *, ex_node_t *), void *filter_ctx); + * filter_node : Returns true if the node passes the filter. + * filter_subtree : Returns true if some node in the subtree rooted at + * node passes the filter. + * filter_ctx : A context argument passed to the filters. + * + * For a more concrete example of summarizing and filtering, suppose we're using + * the red-black tree to track a set of integers: + * + * struct ex_node_s { + * rb_node(ex_node_t) ex_link; + * unsigned data; + * }; + * + * Suppose, for some application-specific reason, we want to be able to quickly + * find numbers in the set which are divisible by large powers of 2 (say, for + * aligned allocation purposes). We augment the node with a summary field: + * + * struct ex_node_s { + * rb_node(ex_node_t) ex_link; + * unsigned data; + * unsigned max_subtree_ffs; + * } + * + * and define our summarization function as follows: + * + * bool + * ex_summarize(ex_node_t *node, const ex_node_t *lchild, + * const ex_node_t *rchild) { + * unsigned new_max_subtree_ffs = ffs(node->data); + * if (lchild != NULL && lchild->max_subtree_ffs > new_max_subtree_ffs) { + * new_max_subtree_ffs = lchild->max_subtree_ffs; + * } + * if (rchild != NULL && rchild->max_subtree_ffs > new_max_subtree_ffs) { + * new_max_subtree_ffs = rchild->max_subtree_ffs; + * } + * bool changed = (node->max_subtree_ffs != new_max_subtree_ffs) + * node->max_subtree_ffs = new_max_subtree_ffs; + * // This could be "return true" without any correctness or big-O + * // performance changes; but practically, precisely reporting summary + * // changes reduces the amount of work that has to be done when "bubbling + * // up" summary changes. + * return changed; + * } + * + * We can now implement our filter functions as follows: + * bool + * ex_filter_node(void *filter_ctx, ex_node_t *node) { + * unsigned required_ffs = *(unsigned *)filter_ctx; + * return ffs(node->data) >= required_ffs; + * } + * bool + * ex_filter_subtree(void *filter_ctx, ex_node_t *node) { + * unsigned required_ffs = *(unsigned *)filter_ctx; + * return node->max_subtree_ffs >= required_ffs; + * } + * + * We can now easily search for, e.g., the smallest integer in the set that's + * divisible by 128: + * ex_node_t * + * find_div_128(ex_tree_t *tree) { + * unsigned min_ffs = 7; + * return ex_first_filtered(tree, &ex_filter_node, &ex_filter_subtree, + * &min_ffs); + * } + * + * We could with similar ease: + * - Fnd the next multiple of 128 in the set that's larger than 12345 (with + * ex_nsearch_filtered) + * - Iterate over just those multiples of 64 that are in the set (with + * ex_iter_filtered) + * - Determine if the set contains any multiples of 1024 (with + * ex_empty_filtered). + * + * Some possibly subtle API notes: + * - The node argument to ex_next_filtered and ex_prev_filtered need not pass + * the filter; it will find the next/prev node that passes the filter. + * - ex_search_filtered will fail even for a node in the tree, if that node does + * not pass the filter. ex_psearch_filtered and ex_nsearch_filtered behave + * similarly; they may return a node larger/smaller than the key, even if a + * node equivalent to the key is in the tree (but does not pass the filter). + * - Similarly, if the start argument to a filtered iteration function does not + * pass the filter, the callback won't be invoked on it. + * + * These should make sense after a moment's reflection; each post-condition is + * the same as with the unfiltered version, with the added constraint that the + * returned node must pass the filter. + */ +#define rb_gen(a_attr, a_prefix, a_rbt_type, a_type, a_field, a_cmp) \ + rb_gen_impl(a_attr, a_prefix, a_rbt_type, a_type, a_field, a_cmp, \ + rb_empty_summarize, false) +#define rb_summarized_gen(a_attr, a_prefix, a_rbt_type, a_type, \ + a_field, a_cmp, a_summarize) \ + rb_gen_impl(a_attr, a_prefix, a_rbt_type, a_type, a_field, a_cmp, \ + a_summarize, true) + +#define rb_gen_impl(a_attr, a_prefix, a_rbt_type, a_type, \ + a_field, a_cmp, a_summarize, a_is_summarized) \ +typedef struct { \ + a_type *node; \ + int cmp; \ +} a_prefix##path_entry_t; \ +static inline void \ +a_prefix##summarize_range(a_prefix##path_entry_t *rfirst, \ + a_prefix##path_entry_t *rlast) { \ + while ((uintptr_t)rlast >= (uintptr_t)rfirst) { \ + a_type *node = rlast->node; \ + /* Avoid a warning when a_summarize is rb_empty_summarize. */ \ + (void)node; \ + bool changed = a_summarize(node, rbtn_left_get(a_type, a_field, \ + node), rbtn_right_get(a_type, a_field, node)); \ + if (!changed) { \ + break; \ + } \ + rlast--; \ + } \ +} \ +/* On the remove pathways, we sometimes swap the node being removed */\ +/* and its first successor; in such cases we need to do two range */\ +/* updates; one from the node to its (former) swapped successor, the */\ +/* next from that successor to the root (with either allowed to */\ +/* bail out early if appropriate. */\ +static inline void \ +a_prefix##summarize_swapped_range(a_prefix##path_entry_t *rfirst, \ + a_prefix##path_entry_t *rlast, a_prefix##path_entry_t *swap_loc) { \ + if (swap_loc == NULL || rlast <= swap_loc) { \ + a_prefix##summarize_range(rfirst, rlast); \ + } else { \ + a_prefix##summarize_range(swap_loc + 1, rlast); \ + (void)a_summarize(swap_loc->node, \ + rbtn_left_get(a_type, a_field, swap_loc->node), \ + rbtn_right_get(a_type, a_field, swap_loc->node)); \ + a_prefix##summarize_range(rfirst, swap_loc - 1); \ + } \ +} \ +a_attr void \ +a_prefix##new(a_rbt_type *rbtree) { \ + rb_new(a_type, a_field, rbtree); \ +} \ +a_attr bool \ +a_prefix##empty(a_rbt_type *rbtree) { \ + return (rbtree->rbt_root == NULL); \ +} \ +a_attr a_type * \ +a_prefix##first(a_rbt_type *rbtree) { \ + a_type *ret; \ + rbtn_first(a_type, a_field, rbtree, rbtree->rbt_root, ret); \ + return ret; \ +} \ +a_attr a_type * \ +a_prefix##last(a_rbt_type *rbtree) { \ + a_type *ret; \ + rbtn_last(a_type, a_field, rbtree, rbtree->rbt_root, ret); \ + return ret; \ +} \ +a_attr a_type * \ +a_prefix##next(a_rbt_type *rbtree, a_type *node) { \ + a_type *ret; \ + if (rbtn_right_get(a_type, a_field, node) != NULL) { \ + rbtn_first(a_type, a_field, rbtree, rbtn_right_get(a_type, \ + a_field, node), ret); \ + } else { \ + a_type *tnode = rbtree->rbt_root; \ + assert(tnode != NULL); \ + ret = NULL; \ + while (true) { \ + int cmp = (a_cmp)(node, tnode); \ + if (cmp < 0) { \ + ret = tnode; \ + tnode = rbtn_left_get(a_type, a_field, tnode); \ + } else if (cmp > 0) { \ + tnode = rbtn_right_get(a_type, a_field, tnode); \ + } else { \ + break; \ + } \ + assert(tnode != NULL); \ + } \ + } \ + return ret; \ +} \ +a_attr a_type * \ +a_prefix##prev(a_rbt_type *rbtree, a_type *node) { \ + a_type *ret; \ + if (rbtn_left_get(a_type, a_field, node) != NULL) { \ + rbtn_last(a_type, a_field, rbtree, rbtn_left_get(a_type, \ + a_field, node), ret); \ + } else { \ + a_type *tnode = rbtree->rbt_root; \ + assert(tnode != NULL); \ + ret = NULL; \ + while (true) { \ + int cmp = (a_cmp)(node, tnode); \ + if (cmp < 0) { \ + tnode = rbtn_left_get(a_type, a_field, tnode); \ + } else if (cmp > 0) { \ + ret = tnode; \ + tnode = rbtn_right_get(a_type, a_field, tnode); \ + } else { \ + break; \ + } \ + assert(tnode != NULL); \ + } \ + } \ + return ret; \ +} \ +a_attr a_type * \ +a_prefix##search(a_rbt_type *rbtree, const a_type *key) { \ + a_type *ret; \ + int cmp; \ + ret = rbtree->rbt_root; \ + while (ret != NULL \ + && (cmp = (a_cmp)(key, ret)) != 0) { \ + if (cmp < 0) { \ + ret = rbtn_left_get(a_type, a_field, ret); \ + } else { \ + ret = rbtn_right_get(a_type, a_field, ret); \ + } \ + } \ + return ret; \ +} \ +a_attr a_type * \ +a_prefix##nsearch(a_rbt_type *rbtree, const a_type *key) { \ + a_type *ret; \ + a_type *tnode = rbtree->rbt_root; \ + ret = NULL; \ + while (tnode != NULL) { \ + int cmp = (a_cmp)(key, tnode); \ + if (cmp < 0) { \ + ret = tnode; \ + tnode = rbtn_left_get(a_type, a_field, tnode); \ + } else if (cmp > 0) { \ + tnode = rbtn_right_get(a_type, a_field, tnode); \ + } else { \ + ret = tnode; \ + break; \ + } \ + } \ + return ret; \ +} \ +a_attr a_type * \ +a_prefix##psearch(a_rbt_type *rbtree, const a_type *key) { \ + a_type *ret; \ + a_type *tnode = rbtree->rbt_root; \ + ret = NULL; \ + while (tnode != NULL) { \ + int cmp = (a_cmp)(key, tnode); \ + if (cmp < 0) { \ + tnode = rbtn_left_get(a_type, a_field, tnode); \ + } else if (cmp > 0) { \ + ret = tnode; \ + tnode = rbtn_right_get(a_type, a_field, tnode); \ + } else { \ + ret = tnode; \ + break; \ + } \ + } \ + return ret; \ +} \ +a_attr void \ +a_prefix##insert(a_rbt_type *rbtree, a_type *node) { \ + a_prefix##path_entry_t path[RB_MAX_DEPTH]; \ + a_prefix##path_entry_t *pathp; \ + rbt_node_new(a_type, a_field, rbtree, node); \ + /* Wind. */ \ + path->node = rbtree->rbt_root; \ + for (pathp = path; pathp->node != NULL; pathp++) { \ + int cmp = pathp->cmp = a_cmp(node, pathp->node); \ + assert(cmp != 0); \ + if (cmp < 0) { \ + pathp[1].node = rbtn_left_get(a_type, a_field, \ + pathp->node); \ + } else { \ + pathp[1].node = rbtn_right_get(a_type, a_field, \ + pathp->node); \ + } \ + } \ + pathp->node = node; \ + /* A loop invariant we maintain is that all nodes with */\ + /* out-of-date summaries live in path[0], path[1], ..., *pathp. */\ + /* To maintain this, we have to summarize node, since we */\ + /* decrement pathp before the first iteration. */\ + assert(rbtn_left_get(a_type, a_field, node) == NULL); \ + assert(rbtn_right_get(a_type, a_field, node) == NULL); \ + (void)a_summarize(node, NULL, NULL); \ + /* Unwind. */ \ + for (pathp--; (uintptr_t)pathp >= (uintptr_t)path; pathp--) { \ + a_type *cnode = pathp->node; \ + if (pathp->cmp < 0) { \ + a_type *left = pathp[1].node; \ + rbtn_left_set(a_type, a_field, cnode, left); \ + if (rbtn_red_get(a_type, a_field, left)) { \ + a_type *leftleft = rbtn_left_get(a_type, a_field, left);\ + if (leftleft != NULL && rbtn_red_get(a_type, a_field, \ + leftleft)) { \ + /* Fix up 4-node. */ \ + a_type *tnode; \ + rbtn_black_set(a_type, a_field, leftleft); \ + rbtn_rotate_right(a_type, a_field, cnode, tnode); \ + (void)a_summarize(cnode, \ + rbtn_left_get(a_type, a_field, cnode), \ + rbtn_right_get(a_type, a_field, cnode)); \ + cnode = tnode; \ + } \ + } else { \ + a_prefix##summarize_range(path, pathp); \ + return; \ + } \ + } else { \ + a_type *right = pathp[1].node; \ + rbtn_right_set(a_type, a_field, cnode, right); \ + if (rbtn_red_get(a_type, a_field, right)) { \ + a_type *left = rbtn_left_get(a_type, a_field, cnode); \ + if (left != NULL && rbtn_red_get(a_type, a_field, \ + left)) { \ + /* Split 4-node. */ \ + rbtn_black_set(a_type, a_field, left); \ + rbtn_black_set(a_type, a_field, right); \ + rbtn_red_set(a_type, a_field, cnode); \ + } else { \ + /* Lean left. */ \ + a_type *tnode; \ + bool tred = rbtn_red_get(a_type, a_field, cnode); \ + rbtn_rotate_left(a_type, a_field, cnode, tnode); \ + rbtn_color_set(a_type, a_field, tnode, tred); \ + rbtn_red_set(a_type, a_field, cnode); \ + (void)a_summarize(cnode, \ + rbtn_left_get(a_type, a_field, cnode), \ + rbtn_right_get(a_type, a_field, cnode)); \ + cnode = tnode; \ + } \ + } else { \ + a_prefix##summarize_range(path, pathp); \ + return; \ + } \ + } \ + pathp->node = cnode; \ + (void)a_summarize(cnode, \ + rbtn_left_get(a_type, a_field, cnode), \ + rbtn_right_get(a_type, a_field, cnode)); \ + } \ + /* Set root, and make it black. */ \ + rbtree->rbt_root = path->node; \ + rbtn_black_set(a_type, a_field, rbtree->rbt_root); \ +} \ +a_attr void \ +a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \ + a_prefix##path_entry_t path[RB_MAX_DEPTH]; \ + a_prefix##path_entry_t *pathp; \ + a_prefix##path_entry_t *nodep; \ + a_prefix##path_entry_t *swap_loc; \ + /* This is a "real" sentinel -- NULL means we didn't swap the */\ + /* node to be pruned with one of its successors, and so */\ + /* summarization can terminate early whenever some summary */\ + /* doesn't change. */\ + swap_loc = NULL; \ + /* This is just to silence a compiler warning. */ \ + nodep = NULL; \ + /* Wind. */ \ + path->node = rbtree->rbt_root; \ + for (pathp = path; pathp->node != NULL; pathp++) { \ + int cmp = pathp->cmp = a_cmp(node, pathp->node); \ + if (cmp < 0) { \ + pathp[1].node = rbtn_left_get(a_type, a_field, \ + pathp->node); \ + } else { \ + pathp[1].node = rbtn_right_get(a_type, a_field, \ + pathp->node); \ + if (cmp == 0) { \ + /* Find node's successor, in preparation for swap. */ \ + pathp->cmp = 1; \ + nodep = pathp; \ + for (pathp++; pathp->node != NULL; pathp++) { \ + pathp->cmp = -1; \ + pathp[1].node = rbtn_left_get(a_type, a_field, \ + pathp->node); \ + } \ + break; \ + } \ + } \ + } \ + assert(nodep->node == node); \ + pathp--; \ + if (pathp->node != node) { \ + /* Swap node with its successor. */ \ + swap_loc = nodep; \ + bool tred = rbtn_red_get(a_type, a_field, pathp->node); \ + rbtn_color_set(a_type, a_field, pathp->node, \ + rbtn_red_get(a_type, a_field, node)); \ + rbtn_left_set(a_type, a_field, pathp->node, \ + rbtn_left_get(a_type, a_field, node)); \ + /* If node's successor is its right child, the following code */\ + /* will do the wrong thing for the right child pointer. */\ + /* However, it doesn't matter, because the pointer will be */\ + /* properly set when the successor is pruned. */\ + rbtn_right_set(a_type, a_field, pathp->node, \ + rbtn_right_get(a_type, a_field, node)); \ + rbtn_color_set(a_type, a_field, node, tred); \ + /* The pruned leaf node's child pointers are never accessed */\ + /* again, so don't bother setting them to nil. */\ + nodep->node = pathp->node; \ + pathp->node = node; \ + if (nodep == path) { \ + rbtree->rbt_root = nodep->node; \ + } else { \ + if (nodep[-1].cmp < 0) { \ + rbtn_left_set(a_type, a_field, nodep[-1].node, \ + nodep->node); \ + } else { \ + rbtn_right_set(a_type, a_field, nodep[-1].node, \ + nodep->node); \ + } \ + } \ + } else { \ + a_type *left = rbtn_left_get(a_type, a_field, node); \ + if (left != NULL) { \ + /* node has no successor, but it has a left child. */\ + /* Splice node out, without losing the left child. */\ + assert(!rbtn_red_get(a_type, a_field, node)); \ + assert(rbtn_red_get(a_type, a_field, left)); \ + rbtn_black_set(a_type, a_field, left); \ + if (pathp == path) { \ + rbtree->rbt_root = left; \ + /* Nothing to summarize -- the subtree rooted at the */\ + /* node's left child hasn't changed, and it's now the */\ + /* root. */\ + } else { \ + if (pathp[-1].cmp < 0) { \ + rbtn_left_set(a_type, a_field, pathp[-1].node, \ + left); \ + } else { \ + rbtn_right_set(a_type, a_field, pathp[-1].node, \ + left); \ + } \ + a_prefix##summarize_swapped_range(path, &pathp[-1], \ + swap_loc); \ + } \ + return; \ + } else if (pathp == path) { \ + /* The tree only contained one node. */ \ + rbtree->rbt_root = NULL; \ + return; \ + } \ + } \ + /* We've now established the invariant that the node has no right */\ + /* child (well, morally; we didn't bother nulling it out if we */\ + /* swapped it with its successor), and that the only nodes with */\ + /* out-of-date summaries live in path[0], path[1], ..., pathp[-1].*/\ + if (rbtn_red_get(a_type, a_field, pathp->node)) { \ + /* Prune red node, which requires no fixup. */ \ + assert(pathp[-1].cmp < 0); \ + rbtn_left_set(a_type, a_field, pathp[-1].node, NULL); \ + a_prefix##summarize_swapped_range(path, &pathp[-1], swap_loc); \ + return; \ + } \ + /* The node to be pruned is black, so unwind until balance is */\ + /* restored. */\ + pathp->node = NULL; \ + for (pathp--; (uintptr_t)pathp >= (uintptr_t)path; pathp--) { \ + assert(pathp->cmp != 0); \ + if (pathp->cmp < 0) { \ + rbtn_left_set(a_type, a_field, pathp->node, \ + pathp[1].node); \ + if (rbtn_red_get(a_type, a_field, pathp->node)) { \ + a_type *right = rbtn_right_get(a_type, a_field, \ + pathp->node); \ + a_type *rightleft = rbtn_left_get(a_type, a_field, \ + right); \ + a_type *tnode; \ + if (rightleft != NULL && rbtn_red_get(a_type, a_field, \ + rightleft)) { \ + /* In the following diagrams, ||, //, and \\ */\ + /* indicate the path to the removed node. */\ + /* */\ + /* || */\ + /* pathp(r) */\ + /* // \ */\ + /* (b) (b) */\ + /* / */\ + /* (r) */\ + /* */\ + rbtn_black_set(a_type, a_field, pathp->node); \ + rbtn_rotate_right(a_type, a_field, right, tnode); \ + rbtn_right_set(a_type, a_field, pathp->node, tnode);\ + rbtn_rotate_left(a_type, a_field, pathp->node, \ + tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + (void)a_summarize(right, \ + rbtn_left_get(a_type, a_field, right), \ + rbtn_right_get(a_type, a_field, right)); \ + } else { \ + /* || */\ + /* pathp(r) */\ + /* // \ */\ + /* (b) (b) */\ + /* / */\ + /* (b) */\ + /* */\ + rbtn_rotate_left(a_type, a_field, pathp->node, \ + tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + } \ + (void)a_summarize(tnode, rbtn_left_get(a_type, a_field, \ + tnode), rbtn_right_get(a_type, a_field, tnode)); \ + /* Balance restored, but rotation modified subtree */\ + /* root. */\ + assert((uintptr_t)pathp > (uintptr_t)path); \ + if (pathp[-1].cmp < 0) { \ + rbtn_left_set(a_type, a_field, pathp[-1].node, \ + tnode); \ + } else { \ + rbtn_right_set(a_type, a_field, pathp[-1].node, \ + tnode); \ + } \ + a_prefix##summarize_swapped_range(path, &pathp[-1], \ + swap_loc); \ + return; \ + } else { \ + a_type *right = rbtn_right_get(a_type, a_field, \ + pathp->node); \ + a_type *rightleft = rbtn_left_get(a_type, a_field, \ + right); \ + if (rightleft != NULL && rbtn_red_get(a_type, a_field, \ + rightleft)) { \ + /* || */\ + /* pathp(b) */\ + /* // \ */\ + /* (b) (b) */\ + /* / */\ + /* (r) */\ + a_type *tnode; \ + rbtn_black_set(a_type, a_field, rightleft); \ + rbtn_rotate_right(a_type, a_field, right, tnode); \ + rbtn_right_set(a_type, a_field, pathp->node, tnode);\ + rbtn_rotate_left(a_type, a_field, pathp->node, \ + tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + (void)a_summarize(right, \ + rbtn_left_get(a_type, a_field, right), \ + rbtn_right_get(a_type, a_field, right)); \ + (void)a_summarize(tnode, \ + rbtn_left_get(a_type, a_field, tnode), \ + rbtn_right_get(a_type, a_field, tnode)); \ + /* Balance restored, but rotation modified */\ + /* subtree root, which may actually be the tree */\ + /* root. */\ + if (pathp == path) { \ + /* Set root. */ \ + rbtree->rbt_root = tnode; \ + } else { \ + if (pathp[-1].cmp < 0) { \ + rbtn_left_set(a_type, a_field, \ + pathp[-1].node, tnode); \ + } else { \ + rbtn_right_set(a_type, a_field, \ + pathp[-1].node, tnode); \ + } \ + a_prefix##summarize_swapped_range(path, \ + &pathp[-1], swap_loc); \ + } \ + return; \ + } else { \ + /* || */\ + /* pathp(b) */\ + /* // \ */\ + /* (b) (b) */\ + /* / */\ + /* (b) */\ + a_type *tnode; \ + rbtn_red_set(a_type, a_field, pathp->node); \ + rbtn_rotate_left(a_type, a_field, pathp->node, \ + tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + (void)a_summarize(tnode, \ + rbtn_left_get(a_type, a_field, tnode), \ + rbtn_right_get(a_type, a_field, tnode)); \ + pathp->node = tnode; \ + } \ + } \ + } else { \ + a_type *left; \ + rbtn_right_set(a_type, a_field, pathp->node, \ + pathp[1].node); \ + left = rbtn_left_get(a_type, a_field, pathp->node); \ + if (rbtn_red_get(a_type, a_field, left)) { \ + a_type *tnode; \ + a_type *leftright = rbtn_right_get(a_type, a_field, \ + left); \ + a_type *leftrightleft = rbtn_left_get(a_type, a_field, \ + leftright); \ + if (leftrightleft != NULL && rbtn_red_get(a_type, \ + a_field, leftrightleft)) { \ + /* || */\ + /* pathp(b) */\ + /* / \\ */\ + /* (r) (b) */\ + /* \ */\ + /* (b) */\ + /* / */\ + /* (r) */\ + a_type *unode; \ + rbtn_black_set(a_type, a_field, leftrightleft); \ + rbtn_rotate_right(a_type, a_field, pathp->node, \ + unode); \ + rbtn_rotate_right(a_type, a_field, pathp->node, \ + tnode); \ + rbtn_right_set(a_type, a_field, unode, tnode); \ + rbtn_rotate_left(a_type, a_field, unode, tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + (void)a_summarize(unode, \ + rbtn_left_get(a_type, a_field, unode), \ + rbtn_right_get(a_type, a_field, unode)); \ + } else { \ + /* || */\ + /* pathp(b) */\ + /* / \\ */\ + /* (r) (b) */\ + /* \ */\ + /* (b) */\ + /* / */\ + /* (b) */\ + assert(leftright != NULL); \ + rbtn_red_set(a_type, a_field, leftright); \ + rbtn_rotate_right(a_type, a_field, pathp->node, \ + tnode); \ + rbtn_black_set(a_type, a_field, tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + } \ + (void)a_summarize(tnode, \ + rbtn_left_get(a_type, a_field, tnode), \ + rbtn_right_get(a_type, a_field, tnode)); \ + /* Balance restored, but rotation modified subtree */\ + /* root, which may actually be the tree root. */\ + if (pathp == path) { \ + /* Set root. */ \ + rbtree->rbt_root = tnode; \ + } else { \ + if (pathp[-1].cmp < 0) { \ + rbtn_left_set(a_type, a_field, pathp[-1].node, \ + tnode); \ + } else { \ + rbtn_right_set(a_type, a_field, pathp[-1].node, \ + tnode); \ + } \ + a_prefix##summarize_swapped_range(path, &pathp[-1], \ + swap_loc); \ + } \ + return; \ + } else if (rbtn_red_get(a_type, a_field, pathp->node)) { \ + a_type *leftleft = rbtn_left_get(a_type, a_field, left);\ + if (leftleft != NULL && rbtn_red_get(a_type, a_field, \ + leftleft)) { \ + /* || */\ + /* pathp(r) */\ + /* / \\ */\ + /* (b) (b) */\ + /* / */\ + /* (r) */\ + a_type *tnode; \ + rbtn_black_set(a_type, a_field, pathp->node); \ + rbtn_red_set(a_type, a_field, left); \ + rbtn_black_set(a_type, a_field, leftleft); \ + rbtn_rotate_right(a_type, a_field, pathp->node, \ + tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + (void)a_summarize(tnode, \ + rbtn_left_get(a_type, a_field, tnode), \ + rbtn_right_get(a_type, a_field, tnode)); \ + /* Balance restored, but rotation modified */\ + /* subtree root. */\ + assert((uintptr_t)pathp > (uintptr_t)path); \ + if (pathp[-1].cmp < 0) { \ + rbtn_left_set(a_type, a_field, pathp[-1].node, \ + tnode); \ + } else { \ + rbtn_right_set(a_type, a_field, pathp[-1].node, \ + tnode); \ + } \ + a_prefix##summarize_swapped_range(path, &pathp[-1], \ + swap_loc); \ + return; \ + } else { \ + /* || */\ + /* pathp(r) */\ + /* / \\ */\ + /* (b) (b) */\ + /* / */\ + /* (b) */\ + rbtn_red_set(a_type, a_field, left); \ + rbtn_black_set(a_type, a_field, pathp->node); \ + /* Balance restored. */ \ + a_prefix##summarize_swapped_range(path, pathp, \ + swap_loc); \ + return; \ + } \ + } else { \ + a_type *leftleft = rbtn_left_get(a_type, a_field, left);\ + if (leftleft != NULL && rbtn_red_get(a_type, a_field, \ + leftleft)) { \ + /* || */\ + /* pathp(b) */\ + /* / \\ */\ + /* (b) (b) */\ + /* / */\ + /* (r) */\ + a_type *tnode; \ + rbtn_black_set(a_type, a_field, leftleft); \ + rbtn_rotate_right(a_type, a_field, pathp->node, \ + tnode); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + (void)a_summarize(tnode, \ + rbtn_left_get(a_type, a_field, tnode), \ + rbtn_right_get(a_type, a_field, tnode)); \ + /* Balance restored, but rotation modified */\ + /* subtree root, which may actually be the tree */\ + /* root. */\ + if (pathp == path) { \ + /* Set root. */ \ + rbtree->rbt_root = tnode; \ + } else { \ + if (pathp[-1].cmp < 0) { \ + rbtn_left_set(a_type, a_field, \ + pathp[-1].node, tnode); \ + } else { \ + rbtn_right_set(a_type, a_field, \ + pathp[-1].node, tnode); \ + } \ + a_prefix##summarize_swapped_range(path, \ + &pathp[-1], swap_loc); \ + } \ + return; \ + } else { \ + /* || */\ + /* pathp(b) */\ + /* / \\ */\ + /* (b) (b) */\ + /* / */\ + /* (b) */\ + rbtn_red_set(a_type, a_field, left); \ + (void)a_summarize(pathp->node, \ + rbtn_left_get(a_type, a_field, pathp->node), \ + rbtn_right_get(a_type, a_field, pathp->node)); \ + } \ + } \ + } \ + } \ + /* Set root. */ \ + rbtree->rbt_root = path->node; \ + assert(!rbtn_red_get(a_type, a_field, rbtree->rbt_root)); \ +} \ +a_attr a_type * \ +a_prefix##iter_recurse(a_rbt_type *rbtree, a_type *node, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg) { \ + if (node == NULL) { \ + return NULL; \ + } else { \ + a_type *ret; \ + if ((ret = a_prefix##iter_recurse(rbtree, rbtn_left_get(a_type, \ + a_field, node), cb, arg)) != NULL || (ret = cb(rbtree, node, \ + arg)) != NULL) { \ + return ret; \ + } \ + return a_prefix##iter_recurse(rbtree, rbtn_right_get(a_type, \ + a_field, node), cb, arg); \ + } \ +} \ +a_attr a_type * \ +a_prefix##iter_start(a_rbt_type *rbtree, a_type *start, a_type *node, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg) { \ + int cmp = a_cmp(start, node); \ + if (cmp < 0) { \ + a_type *ret; \ + if ((ret = a_prefix##iter_start(rbtree, start, \ + rbtn_left_get(a_type, a_field, node), cb, arg)) != NULL || \ + (ret = cb(rbtree, node, arg)) != NULL) { \ + return ret; \ + } \ + return a_prefix##iter_recurse(rbtree, rbtn_right_get(a_type, \ + a_field, node), cb, arg); \ + } else if (cmp > 0) { \ + return a_prefix##iter_start(rbtree, start, \ + rbtn_right_get(a_type, a_field, node), cb, arg); \ + } else { \ + a_type *ret; \ + if ((ret = cb(rbtree, node, arg)) != NULL) { \ + return ret; \ + } \ + return a_prefix##iter_recurse(rbtree, rbtn_right_get(a_type, \ + a_field, node), cb, arg); \ + } \ +} \ +a_attr a_type * \ +a_prefix##iter(a_rbt_type *rbtree, a_type *start, a_type *(*cb)( \ + a_rbt_type *, a_type *, void *), void *arg) { \ + a_type *ret; \ + if (start != NULL) { \ + ret = a_prefix##iter_start(rbtree, start, rbtree->rbt_root, \ + cb, arg); \ + } else { \ + ret = a_prefix##iter_recurse(rbtree, rbtree->rbt_root, cb, arg);\ + } \ + return ret; \ +} \ +a_attr a_type * \ +a_prefix##reverse_iter_recurse(a_rbt_type *rbtree, a_type *node, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg) { \ + if (node == NULL) { \ + return NULL; \ + } else { \ + a_type *ret; \ + if ((ret = a_prefix##reverse_iter_recurse(rbtree, \ + rbtn_right_get(a_type, a_field, node), cb, arg)) != NULL || \ + (ret = cb(rbtree, node, arg)) != NULL) { \ + return ret; \ + } \ + return a_prefix##reverse_iter_recurse(rbtree, \ + rbtn_left_get(a_type, a_field, node), cb, arg); \ + } \ +} \ +a_attr a_type * \ +a_prefix##reverse_iter_start(a_rbt_type *rbtree, a_type *start, \ + a_type *node, a_type *(*cb)(a_rbt_type *, a_type *, void *), \ + void *arg) { \ + int cmp = a_cmp(start, node); \ + if (cmp > 0) { \ + a_type *ret; \ + if ((ret = a_prefix##reverse_iter_start(rbtree, start, \ + rbtn_right_get(a_type, a_field, node), cb, arg)) != NULL || \ + (ret = cb(rbtree, node, arg)) != NULL) { \ + return ret; \ + } \ + return a_prefix##reverse_iter_recurse(rbtree, \ + rbtn_left_get(a_type, a_field, node), cb, arg); \ + } else if (cmp < 0) { \ + return a_prefix##reverse_iter_start(rbtree, start, \ + rbtn_left_get(a_type, a_field, node), cb, arg); \ + } else { \ + a_type *ret; \ + if ((ret = cb(rbtree, node, arg)) != NULL) { \ + return ret; \ + } \ + return a_prefix##reverse_iter_recurse(rbtree, \ + rbtn_left_get(a_type, a_field, node), cb, arg); \ + } \ +} \ +a_attr a_type * \ +a_prefix##reverse_iter(a_rbt_type *rbtree, a_type *start, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg) { \ + a_type *ret; \ + if (start != NULL) { \ + ret = a_prefix##reverse_iter_start(rbtree, start, \ + rbtree->rbt_root, cb, arg); \ + } else { \ + ret = a_prefix##reverse_iter_recurse(rbtree, rbtree->rbt_root, \ + cb, arg); \ + } \ + return ret; \ +} \ +a_attr void \ +a_prefix##destroy_recurse(a_rbt_type *rbtree, a_type *node, void (*cb)( \ + a_type *, void *), void *arg) { \ + if (node == NULL) { \ + return; \ + } \ + a_prefix##destroy_recurse(rbtree, rbtn_left_get(a_type, a_field, \ + node), cb, arg); \ + rbtn_left_set(a_type, a_field, (node), NULL); \ + a_prefix##destroy_recurse(rbtree, rbtn_right_get(a_type, a_field, \ + node), cb, arg); \ + rbtn_right_set(a_type, a_field, (node), NULL); \ + if (cb) { \ + cb(node, arg); \ + } \ +} \ +a_attr void \ +a_prefix##destroy(a_rbt_type *rbtree, void (*cb)(a_type *, void *), \ + void *arg) { \ + a_prefix##destroy_recurse(rbtree, rbtree->rbt_root, cb, arg); \ + rbtree->rbt_root = NULL; \ +} \ +/* BEGIN SUMMARIZED-ONLY IMPLEMENTATION */ \ +rb_summarized_only_##a_is_summarized( \ +static inline a_prefix##path_entry_t * \ +a_prefix##wind(a_rbt_type *rbtree, \ + a_prefix##path_entry_t path[RB_MAX_DEPTH], a_type *node) { \ + a_prefix##path_entry_t *pathp; \ + path->node = rbtree->rbt_root; \ + for (pathp = path; ; pathp++) { \ + assert((size_t)(pathp - path) < RB_MAX_DEPTH); \ + pathp->cmp = a_cmp(node, pathp->node); \ + if (pathp->cmp < 0) { \ + pathp[1].node = rbtn_left_get(a_type, a_field, \ + pathp->node); \ + } else if (pathp->cmp == 0) { \ + return pathp; \ + } else { \ + pathp[1].node = rbtn_right_get(a_type, a_field, \ + pathp->node); \ + } \ + } \ + unreachable(); \ +} \ +a_attr void \ +a_prefix##update_summaries(a_rbt_type *rbtree, a_type *node) { \ + a_prefix##path_entry_t path[RB_MAX_DEPTH]; \ + a_prefix##path_entry_t *pathp = a_prefix##wind(rbtree, path, node); \ + a_prefix##summarize_range(path, pathp); \ +} \ +a_attr bool \ +a_prefix##empty_filtered(a_rbt_type *rbtree, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *node = rbtree->rbt_root; \ + return node == NULL || !filter_subtree(filter_ctx, node); \ +} \ +static inline a_type * \ +a_prefix##first_filtered_from_node(a_type *node, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + assert(node != NULL && filter_subtree(filter_ctx, node)); \ + while (true) { \ + a_type *left = rbtn_left_get(a_type, a_field, node); \ + a_type *right = rbtn_right_get(a_type, a_field, node); \ + if (left != NULL && filter_subtree(filter_ctx, left)) { \ + node = left; \ + } else if (filter_node(filter_ctx, node)) { \ + return node; \ + } else { \ + assert(right != NULL \ + && filter_subtree(filter_ctx, right)); \ + node = right; \ + } \ + } \ + unreachable(); \ +} \ +a_attr a_type * \ +a_prefix##first_filtered(a_rbt_type *rbtree, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *node = rbtree->rbt_root; \ + if (node == NULL || !filter_subtree(filter_ctx, node)) { \ + return NULL; \ + } \ + return a_prefix##first_filtered_from_node(node, filter_node, \ + filter_subtree, filter_ctx); \ +} \ +static inline a_type * \ +a_prefix##last_filtered_from_node(a_type *node, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + assert(node != NULL && filter_subtree(filter_ctx, node)); \ + while (true) { \ + a_type *left = rbtn_left_get(a_type, a_field, node); \ + a_type *right = rbtn_right_get(a_type, a_field, node); \ + if (right != NULL && filter_subtree(filter_ctx, right)) { \ + node = right; \ + } else if (filter_node(filter_ctx, node)) { \ + return node; \ + } else { \ + assert(left != NULL \ + && filter_subtree(filter_ctx, left)); \ + node = left; \ + } \ + } \ + unreachable(); \ +} \ +a_attr a_type * \ +a_prefix##last_filtered(a_rbt_type *rbtree, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *node = rbtree->rbt_root; \ + if (node == NULL || !filter_subtree(filter_ctx, node)) { \ + return NULL; \ + } \ + return a_prefix##last_filtered_from_node(node, filter_node, \ + filter_subtree, filter_ctx); \ +} \ +/* Internal implementation function. Search for a node comparing */\ +/* equal to key matching the filter. If such a node is in the tree, */\ +/* return it. Additionally, the caller has the option to ask for */\ +/* bounds on the next / prev node in the tree passing the filter. */\ +/* If nextbound is true, then this function will do one of the */\ +/* following: */\ +/* - Fill in *nextbound_node with the smallest node in the tree */\ +/* greater than key passing the filter, and NULL-out */\ +/* *nextbound_subtree. */\ +/* - Fill in *nextbound_subtree with a parent of that node which is */\ +/* not a parent of the searched-for node, and NULL-out */\ +/* *nextbound_node. */\ +/* - NULL-out both *nextbound_node and *nextbound_subtree, in which */\ +/* case no node greater than key but passing the filter is in the */\ +/* tree. */\ +/* The prevbound case is similar. If the caller knows that key is in */\ +/* the tree and that the subtree rooted at key does not contain a */\ +/* node satisfying the bound being searched for, then they can pass */\ +/* false for include_subtree, in which case we won't bother searching */\ +/* there (risking a cache miss). */\ +/* */\ +/* This API is unfortunately complex; but the logic for filtered */\ +/* searches is very subtle, and otherwise we would have to repeat it */\ +/* multiple times for filtered search, nsearch, psearch, next, and */\ +/* prev. */\ +static inline a_type * \ +a_prefix##search_with_filter_bounds(a_rbt_type *rbtree, \ + const a_type *key, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx, \ + bool include_subtree, \ + bool nextbound, a_type **nextbound_node, a_type **nextbound_subtree, \ + bool prevbound, a_type **prevbound_node, a_type **prevbound_subtree) {\ + if (nextbound) { \ + *nextbound_node = NULL; \ + *nextbound_subtree = NULL; \ + } \ + if (prevbound) { \ + *prevbound_node = NULL; \ + *prevbound_subtree = NULL; \ + } \ + a_type *tnode = rbtree->rbt_root; \ + while (tnode != NULL && filter_subtree(filter_ctx, tnode)) { \ + int cmp = a_cmp(key, tnode); \ + a_type *tleft = rbtn_left_get(a_type, a_field, tnode); \ + a_type *tright = rbtn_right_get(a_type, a_field, tnode); \ + if (cmp < 0) { \ + if (nextbound) { \ + if (filter_node(filter_ctx, tnode)) { \ + *nextbound_node = tnode; \ + *nextbound_subtree = NULL; \ + } else if (tright != NULL && filter_subtree( \ + filter_ctx, tright)) { \ + *nextbound_node = NULL; \ + *nextbound_subtree = tright; \ + } \ + } \ + tnode = tleft; \ + } else if (cmp > 0) { \ + if (prevbound) { \ + if (filter_node(filter_ctx, tnode)) { \ + *prevbound_node = tnode; \ + *prevbound_subtree = NULL; \ + } else if (tleft != NULL && filter_subtree( \ + filter_ctx, tleft)) { \ + *prevbound_node = NULL; \ + *prevbound_subtree = tleft; \ + } \ + } \ + tnode = tright; \ + } else { \ + if (filter_node(filter_ctx, tnode)) { \ + return tnode; \ + } \ + if (include_subtree) { \ + if (prevbound && tleft != NULL && filter_subtree( \ + filter_ctx, tleft)) { \ + *prevbound_node = NULL; \ + *prevbound_subtree = tleft; \ + } \ + if (nextbound && tright != NULL && filter_subtree( \ + filter_ctx, tright)) { \ + *nextbound_node = NULL; \ + *nextbound_subtree = tright; \ + } \ + } \ + return NULL; \ + } \ + } \ + return NULL; \ +} \ +a_attr a_type * \ +a_prefix##next_filtered(a_rbt_type *rbtree, a_type *node, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *nright = rbtn_right_get(a_type, a_field, node); \ + if (nright != NULL && filter_subtree(filter_ctx, nright)) { \ + return a_prefix##first_filtered_from_node(nright, filter_node, \ + filter_subtree, filter_ctx); \ + } \ + a_type *node_candidate; \ + a_type *subtree_candidate; \ + a_type *search_result = a_prefix##search_with_filter_bounds( \ + rbtree, node, filter_node, filter_subtree, filter_ctx, \ + /* include_subtree */ false, \ + /* nextbound */ true, &node_candidate, &subtree_candidate, \ + /* prevbound */ false, NULL, NULL); \ + assert(node == search_result \ + || !filter_node(filter_ctx, node)); \ + if (node_candidate != NULL) { \ + return node_candidate; \ + } \ + if (subtree_candidate != NULL) { \ + return a_prefix##first_filtered_from_node( \ + subtree_candidate, filter_node, filter_subtree, \ + filter_ctx); \ + } \ + return NULL; \ +} \ +a_attr a_type * \ +a_prefix##prev_filtered(a_rbt_type *rbtree, a_type *node, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *nleft = rbtn_left_get(a_type, a_field, node); \ + if (nleft != NULL && filter_subtree(filter_ctx, nleft)) { \ + return a_prefix##last_filtered_from_node(nleft, filter_node, \ + filter_subtree, filter_ctx); \ + } \ + a_type *node_candidate; \ + a_type *subtree_candidate; \ + a_type *search_result = a_prefix##search_with_filter_bounds( \ + rbtree, node, filter_node, filter_subtree, filter_ctx, \ + /* include_subtree */ false, \ + /* nextbound */ false, NULL, NULL, \ + /* prevbound */ true, &node_candidate, &subtree_candidate); \ + assert(node == search_result \ + || !filter_node(filter_ctx, node)); \ + if (node_candidate != NULL) { \ + return node_candidate; \ + } \ + if (subtree_candidate != NULL) { \ + return a_prefix##last_filtered_from_node( \ + subtree_candidate, filter_node, filter_subtree, \ + filter_ctx); \ + } \ + return NULL; \ +} \ +a_attr a_type * \ +a_prefix##search_filtered(a_rbt_type *rbtree, const a_type *key, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *result = a_prefix##search_with_filter_bounds(rbtree, key, \ + filter_node, filter_subtree, filter_ctx, \ + /* include_subtree */ false, \ + /* nextbound */ false, NULL, NULL, \ + /* prevbound */ false, NULL, NULL); \ + return result; \ +} \ +a_attr a_type * \ +a_prefix##nsearch_filtered(a_rbt_type *rbtree, const a_type *key, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *node_candidate; \ + a_type *subtree_candidate; \ + a_type *result = a_prefix##search_with_filter_bounds(rbtree, key, \ + filter_node, filter_subtree, filter_ctx, \ + /* include_subtree */ true, \ + /* nextbound */ true, &node_candidate, &subtree_candidate, \ + /* prevbound */ false, NULL, NULL); \ + if (result != NULL) { \ + return result; \ + } \ + if (node_candidate != NULL) { \ + return node_candidate; \ + } \ + if (subtree_candidate != NULL) { \ + return a_prefix##first_filtered_from_node( \ + subtree_candidate, filter_node, filter_subtree, \ + filter_ctx); \ + } \ + return NULL; \ +} \ +a_attr a_type * \ +a_prefix##psearch_filtered(a_rbt_type *rbtree, const a_type *key, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *node_candidate; \ + a_type *subtree_candidate; \ + a_type *result = a_prefix##search_with_filter_bounds(rbtree, key, \ + filter_node, filter_subtree, filter_ctx, \ + /* include_subtree */ true, \ + /* nextbound */ false, NULL, NULL, \ + /* prevbound */ true, &node_candidate, &subtree_candidate); \ + if (result != NULL) { \ + return result; \ + } \ + if (node_candidate != NULL) { \ + return node_candidate; \ + } \ + if (subtree_candidate != NULL) { \ + return a_prefix##last_filtered_from_node( \ + subtree_candidate, filter_node, filter_subtree, \ + filter_ctx); \ + } \ + return NULL; \ +} \ +a_attr a_type * \ +a_prefix##iter_recurse_filtered(a_rbt_type *rbtree, a_type *node, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + if (node == NULL || !filter_subtree(filter_ctx, node)) { \ + return NULL; \ + } \ + a_type *ret; \ + a_type *left = rbtn_left_get(a_type, a_field, node); \ + a_type *right = rbtn_right_get(a_type, a_field, node); \ + ret = a_prefix##iter_recurse_filtered(rbtree, left, cb, arg, \ + filter_node, filter_subtree, filter_ctx); \ + if (ret != NULL) { \ + return ret; \ + } \ + if (filter_node(filter_ctx, node)) { \ + ret = cb(rbtree, node, arg); \ + } \ + if (ret != NULL) { \ + return ret; \ + } \ + return a_prefix##iter_recurse_filtered(rbtree, right, cb, arg, \ + filter_node, filter_subtree, filter_ctx); \ +} \ +a_attr a_type * \ +a_prefix##iter_start_filtered(a_rbt_type *rbtree, a_type *start, \ + a_type *node, a_type *(*cb)(a_rbt_type *, a_type *, void *), \ + void *arg, bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + if (!filter_subtree(filter_ctx, node)) { \ + return NULL; \ + } \ + int cmp = a_cmp(start, node); \ + a_type *ret; \ + a_type *left = rbtn_left_get(a_type, a_field, node); \ + a_type *right = rbtn_right_get(a_type, a_field, node); \ + if (cmp < 0) { \ + ret = a_prefix##iter_start_filtered(rbtree, start, left, cb, \ + arg, filter_node, filter_subtree, filter_ctx); \ + if (ret != NULL) { \ + return ret; \ + } \ + if (filter_node(filter_ctx, node)) { \ + ret = cb(rbtree, node, arg); \ + if (ret != NULL) { \ + return ret; \ + } \ + } \ + return a_prefix##iter_recurse_filtered(rbtree, right, cb, arg, \ + filter_node, filter_subtree, filter_ctx); \ + } else if (cmp > 0) { \ + return a_prefix##iter_start_filtered(rbtree, start, right, \ + cb, arg, filter_node, filter_subtree, filter_ctx); \ + } else { \ + if (filter_node(filter_ctx, node)) { \ + ret = cb(rbtree, node, arg); \ + if (ret != NULL) { \ + return ret; \ + } \ + } \ + return a_prefix##iter_recurse_filtered(rbtree, right, cb, arg, \ + filter_node, filter_subtree, filter_ctx); \ + } \ +} \ +a_attr a_type * \ +a_prefix##iter_filtered(a_rbt_type *rbtree, a_type *start, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *ret; \ + if (start != NULL) { \ + ret = a_prefix##iter_start_filtered(rbtree, start, \ + rbtree->rbt_root, cb, arg, filter_node, filter_subtree, \ + filter_ctx); \ + } else { \ + ret = a_prefix##iter_recurse_filtered(rbtree, rbtree->rbt_root, \ + cb, arg, filter_node, filter_subtree, filter_ctx); \ + } \ + return ret; \ +} \ +a_attr a_type * \ +a_prefix##reverse_iter_recurse_filtered(a_rbt_type *rbtree, \ + a_type *node, a_type *(*cb)(a_rbt_type *, a_type *, void *), \ + void *arg, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + if (node == NULL || !filter_subtree(filter_ctx, node)) { \ + return NULL; \ + } \ + a_type *ret; \ + a_type *left = rbtn_left_get(a_type, a_field, node); \ + a_type *right = rbtn_right_get(a_type, a_field, node); \ + ret = a_prefix##reverse_iter_recurse_filtered(rbtree, right, cb, \ + arg, filter_node, filter_subtree, filter_ctx); \ + if (ret != NULL) { \ + return ret; \ + } \ + if (filter_node(filter_ctx, node)) { \ + ret = cb(rbtree, node, arg); \ + } \ + if (ret != NULL) { \ + return ret; \ + } \ + return a_prefix##reverse_iter_recurse_filtered(rbtree, left, cb, \ + arg, filter_node, filter_subtree, filter_ctx); \ +} \ +a_attr a_type * \ +a_prefix##reverse_iter_start_filtered(a_rbt_type *rbtree, a_type *start,\ + a_type *node, a_type *(*cb)(a_rbt_type *, a_type *, void *), \ + void *arg, bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + if (!filter_subtree(filter_ctx, node)) { \ + return NULL; \ + } \ + int cmp = a_cmp(start, node); \ + a_type *ret; \ + a_type *left = rbtn_left_get(a_type, a_field, node); \ + a_type *right = rbtn_right_get(a_type, a_field, node); \ + if (cmp > 0) { \ + ret = a_prefix##reverse_iter_start_filtered(rbtree, start, \ + right, cb, arg, filter_node, filter_subtree, filter_ctx); \ + if (ret != NULL) { \ + return ret; \ + } \ + if (filter_node(filter_ctx, node)) { \ + ret = cb(rbtree, node, arg); \ + if (ret != NULL) { \ + return ret; \ + } \ + } \ + return a_prefix##reverse_iter_recurse_filtered(rbtree, left, cb,\ + arg, filter_node, filter_subtree, filter_ctx); \ + } else if (cmp < 0) { \ + return a_prefix##reverse_iter_start_filtered(rbtree, start, \ + left, cb, arg, filter_node, filter_subtree, filter_ctx); \ + } else { \ + if (filter_node(filter_ctx, node)) { \ + ret = cb(rbtree, node, arg); \ + if (ret != NULL) { \ + return ret; \ + } \ + } \ + return a_prefix##reverse_iter_recurse_filtered(rbtree, left, cb,\ + arg, filter_node, filter_subtree, filter_ctx); \ + } \ +} \ +a_attr a_type * \ +a_prefix##reverse_iter_filtered(a_rbt_type *rbtree, a_type *start, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg, \ + bool (*filter_node)(void *, a_type *), \ + bool (*filter_subtree)(void *, a_type *), \ + void *filter_ctx) { \ + a_type *ret; \ + if (start != NULL) { \ + ret = a_prefix##reverse_iter_start_filtered(rbtree, start, \ + rbtree->rbt_root, cb, arg, filter_node, filter_subtree, \ + filter_ctx); \ + } else { \ + ret = a_prefix##reverse_iter_recurse_filtered(rbtree, \ + rbtree->rbt_root, cb, arg, filter_node, filter_subtree, \ + filter_ctx); \ + } \ + return ret; \ +} \ +) /* end rb_summarized_only */ + +#endif /* JEMALLOC_INTERNAL_RB_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/rtree.h b/BeefRT/JEMalloc/include/jemalloc/internal/rtree.h new file mode 100644 index 00000000..a00adb29 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/rtree.h @@ -0,0 +1,554 @@ +#ifndef JEMALLOC_INTERNAL_RTREE_H +#define JEMALLOC_INTERNAL_RTREE_H + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/rtree_tsd.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/tsd.h" + +/* + * This radix tree implementation is tailored to the singular purpose of + * associating metadata with extents that are currently owned by jemalloc. + * + ******************************************************************************* + */ + +/* Number of high insignificant bits. */ +#define RTREE_NHIB ((1U << (LG_SIZEOF_PTR+3)) - LG_VADDR) +/* Number of low insigificant bits. */ +#define RTREE_NLIB LG_PAGE +/* Number of significant bits. */ +#define RTREE_NSB (LG_VADDR - RTREE_NLIB) +/* Number of levels in radix tree. */ +#if RTREE_NSB <= 10 +# define RTREE_HEIGHT 1 +#elif RTREE_NSB <= 36 +# define RTREE_HEIGHT 2 +#elif RTREE_NSB <= 52 +# define RTREE_HEIGHT 3 +#else +# error Unsupported number of significant virtual address bits +#endif +/* Use compact leaf representation if virtual address encoding allows. */ +#if RTREE_NHIB >= LG_CEIL(SC_NSIZES) +# define RTREE_LEAF_COMPACT +#endif + +typedef struct rtree_node_elm_s rtree_node_elm_t; +struct rtree_node_elm_s { + atomic_p_t child; /* (rtree_{node,leaf}_elm_t *) */ +}; + +typedef struct rtree_metadata_s rtree_metadata_t; +struct rtree_metadata_s { + szind_t szind; + extent_state_t state; /* Mirrors edata->state. */ + bool is_head; /* Mirrors edata->is_head. */ + bool slab; +}; + +typedef struct rtree_contents_s rtree_contents_t; +struct rtree_contents_s { + edata_t *edata; + rtree_metadata_t metadata; +}; + +#define RTREE_LEAF_STATE_WIDTH EDATA_BITS_STATE_WIDTH +#define RTREE_LEAF_STATE_SHIFT 2 +#define RTREE_LEAF_STATE_MASK MASK(RTREE_LEAF_STATE_WIDTH, RTREE_LEAF_STATE_SHIFT) + +struct rtree_leaf_elm_s { +#ifdef RTREE_LEAF_COMPACT + /* + * Single pointer-width field containing all three leaf element fields. + * For example, on a 64-bit x64 system with 48 significant virtual + * memory address bits, the index, edata, and slab fields are packed as + * such: + * + * x: index + * e: edata + * s: state + * h: is_head + * b: slab + * + * 00000000 xxxxxxxx eeeeeeee [...] eeeeeeee e00ssshb + */ + atomic_p_t le_bits; +#else + atomic_p_t le_edata; /* (edata_t *) */ + /* + * From high to low bits: szind (8 bits), state (4 bits), is_head, slab + */ + atomic_u_t le_metadata; +#endif +}; + +typedef struct rtree_level_s rtree_level_t; +struct rtree_level_s { + /* Number of key bits distinguished by this level. */ + unsigned bits; + /* + * Cumulative number of key bits distinguished by traversing to + * corresponding tree level. + */ + unsigned cumbits; +}; + +typedef struct rtree_s rtree_t; +struct rtree_s { + base_t *base; + malloc_mutex_t init_lock; + /* Number of elements based on rtree_levels[0].bits. */ +#if RTREE_HEIGHT > 1 + rtree_node_elm_t root[1U << (RTREE_NSB/RTREE_HEIGHT)]; +#else + rtree_leaf_elm_t root[1U << (RTREE_NSB/RTREE_HEIGHT)]; +#endif +}; + +/* + * Split the bits into one to three partitions depending on number of + * significant bits. It the number of bits does not divide evenly into the + * number of levels, place one remainder bit per level starting at the leaf + * level. + */ +static const rtree_level_t rtree_levels[] = { +#if RTREE_HEIGHT == 1 + {RTREE_NSB, RTREE_NHIB + RTREE_NSB} +#elif RTREE_HEIGHT == 2 + {RTREE_NSB/2, RTREE_NHIB + RTREE_NSB/2}, + {RTREE_NSB/2 + RTREE_NSB%2, RTREE_NHIB + RTREE_NSB} +#elif RTREE_HEIGHT == 3 + {RTREE_NSB/3, RTREE_NHIB + RTREE_NSB/3}, + {RTREE_NSB/3 + RTREE_NSB%3/2, + RTREE_NHIB + RTREE_NSB/3*2 + RTREE_NSB%3/2}, + {RTREE_NSB/3 + RTREE_NSB%3 - RTREE_NSB%3/2, RTREE_NHIB + RTREE_NSB} +#else +# error Unsupported rtree height +#endif +}; + +bool rtree_new(rtree_t *rtree, base_t *base, bool zeroed); + +rtree_leaf_elm_t *rtree_leaf_elm_lookup_hard(tsdn_t *tsdn, rtree_t *rtree, + rtree_ctx_t *rtree_ctx, uintptr_t key, bool dependent, bool init_missing); + +JEMALLOC_ALWAYS_INLINE unsigned +rtree_leaf_maskbits(void) { + unsigned ptrbits = ZU(1) << (LG_SIZEOF_PTR+3); + unsigned cumbits = (rtree_levels[RTREE_HEIGHT-1].cumbits - + rtree_levels[RTREE_HEIGHT-1].bits); + return ptrbits - cumbits; +} + +JEMALLOC_ALWAYS_INLINE uintptr_t +rtree_leafkey(uintptr_t key) { + uintptr_t mask = ~((ZU(1) << rtree_leaf_maskbits()) - 1); + return (key & mask); +} + +JEMALLOC_ALWAYS_INLINE size_t +rtree_cache_direct_map(uintptr_t key) { + return (size_t)((key >> rtree_leaf_maskbits()) & + (RTREE_CTX_NCACHE - 1)); +} + +JEMALLOC_ALWAYS_INLINE uintptr_t +rtree_subkey(uintptr_t key, unsigned level) { + unsigned ptrbits = ZU(1) << (LG_SIZEOF_PTR+3); + unsigned cumbits = rtree_levels[level].cumbits; + unsigned shiftbits = ptrbits - cumbits; + unsigned maskbits = rtree_levels[level].bits; + uintptr_t mask = (ZU(1) << maskbits) - 1; + return ((key >> shiftbits) & mask); +} + +/* + * Atomic getters. + * + * dependent: Reading a value on behalf of a pointer to a valid allocation + * is guaranteed to be a clean read even without synchronization, + * because the rtree update became visible in memory before the + * pointer came into existence. + * !dependent: An arbitrary read, e.g. on behalf of ivsalloc(), may not be + * dependent on a previous rtree write, which means a stale read + * could result if synchronization were omitted here. + */ +# ifdef RTREE_LEAF_COMPACT +JEMALLOC_ALWAYS_INLINE uintptr_t +rtree_leaf_elm_bits_read(tsdn_t *tsdn, rtree_t *rtree, + rtree_leaf_elm_t *elm, bool dependent) { + return (uintptr_t)atomic_load_p(&elm->le_bits, dependent + ? ATOMIC_RELAXED : ATOMIC_ACQUIRE); +} + +JEMALLOC_ALWAYS_INLINE uintptr_t +rtree_leaf_elm_bits_encode(rtree_contents_t contents) { + assert((uintptr_t)contents.edata % (uintptr_t)EDATA_ALIGNMENT == 0); + uintptr_t edata_bits = (uintptr_t)contents.edata + & (((uintptr_t)1 << LG_VADDR) - 1); + + uintptr_t szind_bits = (uintptr_t)contents.metadata.szind << LG_VADDR; + uintptr_t slab_bits = (uintptr_t)contents.metadata.slab; + uintptr_t is_head_bits = (uintptr_t)contents.metadata.is_head << 1; + uintptr_t state_bits = (uintptr_t)contents.metadata.state << + RTREE_LEAF_STATE_SHIFT; + uintptr_t metadata_bits = szind_bits | state_bits | is_head_bits | + slab_bits; + assert((edata_bits & metadata_bits) == 0); + + return edata_bits | metadata_bits; +} + +JEMALLOC_ALWAYS_INLINE rtree_contents_t +rtree_leaf_elm_bits_decode(uintptr_t bits) { + rtree_contents_t contents; + /* Do the easy things first. */ + contents.metadata.szind = bits >> LG_VADDR; + contents.metadata.slab = (bool)(bits & 1); + contents.metadata.is_head = (bool)(bits & (1 << 1)); + + uintptr_t state_bits = (bits & RTREE_LEAF_STATE_MASK) >> + RTREE_LEAF_STATE_SHIFT; + assert(state_bits <= extent_state_max); + contents.metadata.state = (extent_state_t)state_bits; + + uintptr_t low_bit_mask = ~((uintptr_t)EDATA_ALIGNMENT - 1); +# ifdef __aarch64__ + /* + * aarch64 doesn't sign extend the highest virtual address bit to set + * the higher ones. Instead, the high bits get zeroed. + */ + uintptr_t high_bit_mask = ((uintptr_t)1 << LG_VADDR) - 1; + /* Mask off metadata. */ + uintptr_t mask = high_bit_mask & low_bit_mask; + contents.edata = (edata_t *)(bits & mask); +# else + /* Restore sign-extended high bits, mask metadata bits. */ + contents.edata = (edata_t *)((uintptr_t)((intptr_t)(bits << RTREE_NHIB) + >> RTREE_NHIB) & low_bit_mask); +# endif + assert((uintptr_t)contents.edata % (uintptr_t)EDATA_ALIGNMENT == 0); + return contents; +} + +# endif /* RTREE_LEAF_COMPACT */ + +JEMALLOC_ALWAYS_INLINE rtree_contents_t +rtree_leaf_elm_read(tsdn_t *tsdn, rtree_t *rtree, rtree_leaf_elm_t *elm, + bool dependent) { +#ifdef RTREE_LEAF_COMPACT + uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm, dependent); + rtree_contents_t contents = rtree_leaf_elm_bits_decode(bits); + return contents; +#else + rtree_contents_t contents; + unsigned metadata_bits = atomic_load_u(&elm->le_metadata, dependent + ? ATOMIC_RELAXED : ATOMIC_ACQUIRE); + contents.metadata.slab = (bool)(metadata_bits & 1); + contents.metadata.is_head = (bool)(metadata_bits & (1 << 1)); + + uintptr_t state_bits = (metadata_bits & RTREE_LEAF_STATE_MASK) >> + RTREE_LEAF_STATE_SHIFT; + assert(state_bits <= extent_state_max); + contents.metadata.state = (extent_state_t)state_bits; + contents.metadata.szind = metadata_bits >> (RTREE_LEAF_STATE_SHIFT + + RTREE_LEAF_STATE_WIDTH); + + contents.edata = (edata_t *)atomic_load_p(&elm->le_edata, dependent + ? ATOMIC_RELAXED : ATOMIC_ACQUIRE); + + return contents; +#endif +} + +JEMALLOC_ALWAYS_INLINE void +rtree_contents_encode(rtree_contents_t contents, void **bits, + unsigned *additional) { +#ifdef RTREE_LEAF_COMPACT + *bits = (void *)rtree_leaf_elm_bits_encode(contents); +#else + *additional = (unsigned)contents.metadata.slab + | ((unsigned)contents.metadata.is_head << 1) + | ((unsigned)contents.metadata.state << RTREE_LEAF_STATE_SHIFT) + | ((unsigned)contents.metadata.szind << (RTREE_LEAF_STATE_SHIFT + + RTREE_LEAF_STATE_WIDTH)); + *bits = contents.edata; +#endif +} + +JEMALLOC_ALWAYS_INLINE void +rtree_leaf_elm_write_commit(tsdn_t *tsdn, rtree_t *rtree, + rtree_leaf_elm_t *elm, void *bits, unsigned additional) { +#ifdef RTREE_LEAF_COMPACT + atomic_store_p(&elm->le_bits, bits, ATOMIC_RELEASE); +#else + atomic_store_u(&elm->le_metadata, additional, ATOMIC_RELEASE); + /* + * Write edata last, since the element is atomically considered valid + * as soon as the edata field is non-NULL. + */ + atomic_store_p(&elm->le_edata, bits, ATOMIC_RELEASE); +#endif +} + +JEMALLOC_ALWAYS_INLINE void +rtree_leaf_elm_write(tsdn_t *tsdn, rtree_t *rtree, + rtree_leaf_elm_t *elm, rtree_contents_t contents) { + assert((uintptr_t)contents.edata % EDATA_ALIGNMENT == 0); + void *bits; + unsigned additional; + + rtree_contents_encode(contents, &bits, &additional); + rtree_leaf_elm_write_commit(tsdn, rtree, elm, bits, additional); +} + +/* The state field can be updated independently (and more frequently). */ +JEMALLOC_ALWAYS_INLINE void +rtree_leaf_elm_state_update(tsdn_t *tsdn, rtree_t *rtree, + rtree_leaf_elm_t *elm1, rtree_leaf_elm_t *elm2, extent_state_t state) { + assert(elm1 != NULL); +#ifdef RTREE_LEAF_COMPACT + uintptr_t bits = rtree_leaf_elm_bits_read(tsdn, rtree, elm1, + /* dependent */ true); + bits &= ~RTREE_LEAF_STATE_MASK; + bits |= state << RTREE_LEAF_STATE_SHIFT; + atomic_store_p(&elm1->le_bits, (void *)bits, ATOMIC_RELEASE); + if (elm2 != NULL) { + atomic_store_p(&elm2->le_bits, (void *)bits, ATOMIC_RELEASE); + } +#else + unsigned bits = atomic_load_u(&elm1->le_metadata, ATOMIC_RELAXED); + bits &= ~RTREE_LEAF_STATE_MASK; + bits |= state << RTREE_LEAF_STATE_SHIFT; + atomic_store_u(&elm1->le_metadata, bits, ATOMIC_RELEASE); + if (elm2 != NULL) { + atomic_store_u(&elm2->le_metadata, bits, ATOMIC_RELEASE); + } +#endif +} + +/* + * Tries to look up the key in the L1 cache, returning false if there's a hit, or + * true if there's a miss. + * Key is allowed to be NULL; returns true in this case. + */ +JEMALLOC_ALWAYS_INLINE bool +rtree_leaf_elm_lookup_fast(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key, rtree_leaf_elm_t **elm) { + size_t slot = rtree_cache_direct_map(key); + uintptr_t leafkey = rtree_leafkey(key); + assert(leafkey != RTREE_LEAFKEY_INVALID); + + if (unlikely(rtree_ctx->cache[slot].leafkey != leafkey)) { + return true; + } + + rtree_leaf_elm_t *leaf = rtree_ctx->cache[slot].leaf; + assert(leaf != NULL); + uintptr_t subkey = rtree_subkey(key, RTREE_HEIGHT-1); + *elm = &leaf[subkey]; + + return false; +} + +JEMALLOC_ALWAYS_INLINE rtree_leaf_elm_t * +rtree_leaf_elm_lookup(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key, bool dependent, bool init_missing) { + assert(key != 0); + assert(!dependent || !init_missing); + + size_t slot = rtree_cache_direct_map(key); + uintptr_t leafkey = rtree_leafkey(key); + assert(leafkey != RTREE_LEAFKEY_INVALID); + + /* Fast path: L1 direct mapped cache. */ + if (likely(rtree_ctx->cache[slot].leafkey == leafkey)) { + rtree_leaf_elm_t *leaf = rtree_ctx->cache[slot].leaf; + assert(leaf != NULL); + uintptr_t subkey = rtree_subkey(key, RTREE_HEIGHT-1); + return &leaf[subkey]; + } + /* + * Search the L2 LRU cache. On hit, swap the matching element into the + * slot in L1 cache, and move the position in L2 up by 1. + */ +#define RTREE_CACHE_CHECK_L2(i) do { \ + if (likely(rtree_ctx->l2_cache[i].leafkey == leafkey)) { \ + rtree_leaf_elm_t *leaf = rtree_ctx->l2_cache[i].leaf; \ + assert(leaf != NULL); \ + if (i > 0) { \ + /* Bubble up by one. */ \ + rtree_ctx->l2_cache[i].leafkey = \ + rtree_ctx->l2_cache[i - 1].leafkey; \ + rtree_ctx->l2_cache[i].leaf = \ + rtree_ctx->l2_cache[i - 1].leaf; \ + rtree_ctx->l2_cache[i - 1].leafkey = \ + rtree_ctx->cache[slot].leafkey; \ + rtree_ctx->l2_cache[i - 1].leaf = \ + rtree_ctx->cache[slot].leaf; \ + } else { \ + rtree_ctx->l2_cache[0].leafkey = \ + rtree_ctx->cache[slot].leafkey; \ + rtree_ctx->l2_cache[0].leaf = \ + rtree_ctx->cache[slot].leaf; \ + } \ + rtree_ctx->cache[slot].leafkey = leafkey; \ + rtree_ctx->cache[slot].leaf = leaf; \ + uintptr_t subkey = rtree_subkey(key, RTREE_HEIGHT-1); \ + return &leaf[subkey]; \ + } \ +} while (0) + /* Check the first cache entry. */ + RTREE_CACHE_CHECK_L2(0); + /* Search the remaining cache elements. */ + for (unsigned i = 1; i < RTREE_CTX_NCACHE_L2; i++) { + RTREE_CACHE_CHECK_L2(i); + } +#undef RTREE_CACHE_CHECK_L2 + + return rtree_leaf_elm_lookup_hard(tsdn, rtree, rtree_ctx, key, + dependent, init_missing); +} + +/* + * Returns true on lookup failure. + */ +static inline bool +rtree_read_independent(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key, rtree_contents_t *r_contents) { + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, + key, /* dependent */ false, /* init_missing */ false); + if (elm == NULL) { + return true; + } + *r_contents = rtree_leaf_elm_read(tsdn, rtree, elm, + /* dependent */ false); + return false; +} + +static inline rtree_contents_t +rtree_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key) { + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, + key, /* dependent */ true, /* init_missing */ false); + assert(elm != NULL); + return rtree_leaf_elm_read(tsdn, rtree, elm, /* dependent */ true); +} + +static inline rtree_metadata_t +rtree_metadata_read(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key) { + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, + key, /* dependent */ true, /* init_missing */ false); + assert(elm != NULL); + return rtree_leaf_elm_read(tsdn, rtree, elm, + /* dependent */ true).metadata; +} + +/* + * Returns true when the request cannot be fulfilled by fastpath. + */ +static inline bool +rtree_metadata_try_read_fast(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key, rtree_metadata_t *r_rtree_metadata) { + rtree_leaf_elm_t *elm; + /* + * Should check the bool return value (lookup success or not) instead of + * elm == NULL (which will result in an extra branch). This is because + * when the cache lookup succeeds, there will never be a NULL pointer + * returned (which is unknown to the compiler). + */ + if (rtree_leaf_elm_lookup_fast(tsdn, rtree, rtree_ctx, key, &elm)) { + return true; + } + assert(elm != NULL); + *r_rtree_metadata = rtree_leaf_elm_read(tsdn, rtree, elm, + /* dependent */ true).metadata; + return false; +} + +JEMALLOC_ALWAYS_INLINE void +rtree_write_range_impl(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t base, uintptr_t end, rtree_contents_t contents, bool clearing) { + assert((base & PAGE_MASK) == 0 && (end & PAGE_MASK) == 0); + /* + * Only used for emap_(de)register_interior, which implies the + * boundaries have been registered already. Therefore all the lookups + * are dependent w/o init_missing, assuming the range spans across at + * most 2 rtree leaf nodes (each covers 1 GiB of vaddr). + */ + void *bits; + unsigned additional; + rtree_contents_encode(contents, &bits, &additional); + + rtree_leaf_elm_t *elm = NULL; /* Dead store. */ + for (uintptr_t addr = base; addr <= end; addr += PAGE) { + if (addr == base || + (addr & ((ZU(1) << rtree_leaf_maskbits()) - 1)) == 0) { + elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, addr, + /* dependent */ true, /* init_missing */ false); + assert(elm != NULL); + } + assert(elm == rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, addr, + /* dependent */ true, /* init_missing */ false)); + assert(!clearing || rtree_leaf_elm_read(tsdn, rtree, elm, + /* dependent */ true).edata != NULL); + rtree_leaf_elm_write_commit(tsdn, rtree, elm, bits, additional); + elm++; + } +} + +JEMALLOC_ALWAYS_INLINE void +rtree_write_range(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t base, uintptr_t end, rtree_contents_t contents) { + rtree_write_range_impl(tsdn, rtree, rtree_ctx, base, end, contents, + /* clearing */ false); +} + +JEMALLOC_ALWAYS_INLINE bool +rtree_write(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, uintptr_t key, + rtree_contents_t contents) { + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, + key, /* dependent */ false, /* init_missing */ true); + if (elm == NULL) { + return true; + } + + rtree_leaf_elm_write(tsdn, rtree, elm, contents); + + return false; +} + +static inline void +rtree_clear(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key) { + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, rtree, rtree_ctx, + key, /* dependent */ true, /* init_missing */ false); + assert(elm != NULL); + assert(rtree_leaf_elm_read(tsdn, rtree, elm, + /* dependent */ true).edata != NULL); + rtree_contents_t contents; + contents.edata = NULL; + contents.metadata.szind = SC_NSIZES; + contents.metadata.slab = false; + contents.metadata.is_head = false; + contents.metadata.state = (extent_state_t)0; + rtree_leaf_elm_write(tsdn, rtree, elm, contents); +} + +static inline void +rtree_clear_range(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t base, uintptr_t end) { + rtree_contents_t contents; + contents.edata = NULL; + contents.metadata.szind = SC_NSIZES; + contents.metadata.slab = false; + contents.metadata.is_head = false; + contents.metadata.state = (extent_state_t)0; + rtree_write_range_impl(tsdn, rtree, rtree_ctx, base, end, contents, + /* clearing */ true); +} + +#endif /* JEMALLOC_INTERNAL_RTREE_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/rtree_tsd.h b/BeefRT/JEMalloc/include/jemalloc/internal/rtree_tsd.h new file mode 100644 index 00000000..e45525c5 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/rtree_tsd.h @@ -0,0 +1,62 @@ +#ifndef JEMALLOC_INTERNAL_RTREE_CTX_H +#define JEMALLOC_INTERNAL_RTREE_CTX_H + +/* + * Number of leafkey/leaf pairs to cache in L1 and L2 level respectively. Each + * entry supports an entire leaf, so the cache hit rate is typically high even + * with a small number of entries. In rare cases extent activity will straddle + * the boundary between two leaf nodes. Furthermore, an arena may use a + * combination of dss and mmap. Note that as memory usage grows past the amount + * that this cache can directly cover, the cache will become less effective if + * locality of reference is low, but the consequence is merely cache misses + * while traversing the tree nodes. + * + * The L1 direct mapped cache offers consistent and low cost on cache hit. + * However collision could affect hit rate negatively. This is resolved by + * combining with a L2 LRU cache, which requires linear search and re-ordering + * on access but suffers no collision. Note that, the cache will itself suffer + * cache misses if made overly large, plus the cost of linear search in the LRU + * cache. + */ +#define RTREE_CTX_NCACHE 16 +#define RTREE_CTX_NCACHE_L2 8 + +/* Needed for initialization only. */ +#define RTREE_LEAFKEY_INVALID ((uintptr_t)1) +#define RTREE_CTX_CACHE_ELM_INVALID {RTREE_LEAFKEY_INVALID, NULL} + +#define RTREE_CTX_INIT_ELM_1 RTREE_CTX_CACHE_ELM_INVALID +#define RTREE_CTX_INIT_ELM_2 RTREE_CTX_INIT_ELM_1, RTREE_CTX_INIT_ELM_1 +#define RTREE_CTX_INIT_ELM_4 RTREE_CTX_INIT_ELM_2, RTREE_CTX_INIT_ELM_2 +#define RTREE_CTX_INIT_ELM_8 RTREE_CTX_INIT_ELM_4, RTREE_CTX_INIT_ELM_4 +#define RTREE_CTX_INIT_ELM_16 RTREE_CTX_INIT_ELM_8, RTREE_CTX_INIT_ELM_8 + +#define _RTREE_CTX_INIT_ELM_DATA(n) RTREE_CTX_INIT_ELM_##n +#define RTREE_CTX_INIT_ELM_DATA(n) _RTREE_CTX_INIT_ELM_DATA(n) + +/* + * Static initializer (to invalidate the cache entries) is required because the + * free fastpath may access the rtree cache before a full tsd initialization. + */ +#define RTREE_CTX_INITIALIZER {{RTREE_CTX_INIT_ELM_DATA(RTREE_CTX_NCACHE)}, \ + {RTREE_CTX_INIT_ELM_DATA(RTREE_CTX_NCACHE_L2)}} + +typedef struct rtree_leaf_elm_s rtree_leaf_elm_t; + +typedef struct rtree_ctx_cache_elm_s rtree_ctx_cache_elm_t; +struct rtree_ctx_cache_elm_s { + uintptr_t leafkey; + rtree_leaf_elm_t *leaf; +}; + +typedef struct rtree_ctx_s rtree_ctx_t; +struct rtree_ctx_s { + /* Direct mapped cache. */ + rtree_ctx_cache_elm_t cache[RTREE_CTX_NCACHE]; + /* L2 LRU cache. */ + rtree_ctx_cache_elm_t l2_cache[RTREE_CTX_NCACHE_L2]; +}; + +void rtree_ctx_data_init(rtree_ctx_t *ctx); + +#endif /* JEMALLOC_INTERNAL_RTREE_CTX_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/safety_check.h b/BeefRT/JEMalloc/include/jemalloc/internal/safety_check.h new file mode 100644 index 00000000..f1a74f17 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/safety_check.h @@ -0,0 +1,31 @@ +#ifndef JEMALLOC_INTERNAL_SAFETY_CHECK_H +#define JEMALLOC_INTERNAL_SAFETY_CHECK_H + +void safety_check_fail_sized_dealloc(bool current_dealloc, const void *ptr, + size_t true_size, size_t input_size); +void safety_check_fail(const char *format, ...); + +typedef void (*safety_check_abort_hook_t)(const char *message); + +/* Can set to NULL for a default. */ +void safety_check_set_abort(safety_check_abort_hook_t abort_fn); + +JEMALLOC_ALWAYS_INLINE void +safety_check_set_redzone(void *ptr, size_t usize, size_t bumped_usize) { + assert(usize < bumped_usize); + for (size_t i = usize; i < bumped_usize && i < usize + 32; ++i) { + *((unsigned char *)ptr + i) = 0xBC; + } +} + +JEMALLOC_ALWAYS_INLINE void +safety_check_verify_redzone(const void *ptr, size_t usize, size_t bumped_usize) +{ + for (size_t i = usize; i < bumped_usize && i < usize + 32; ++i) { + if (unlikely(*((unsigned char *)ptr + i) != 0xBC)) { + safety_check_fail("Use after free error\n"); + } + } +} + +#endif /*JEMALLOC_INTERNAL_SAFETY_CHECK_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/san.h b/BeefRT/JEMalloc/include/jemalloc/internal/san.h new file mode 100644 index 00000000..8813d6bb --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/san.h @@ -0,0 +1,191 @@ +#ifndef JEMALLOC_INTERNAL_GUARD_H +#define JEMALLOC_INTERNAL_GUARD_H + +#include "jemalloc/internal/ehooks.h" +#include "jemalloc/internal/emap.h" + +#define SAN_PAGE_GUARD PAGE +#define SAN_PAGE_GUARDS_SIZE (SAN_PAGE_GUARD * 2) + +#define SAN_GUARD_LARGE_EVERY_N_EXTENTS_DEFAULT 0 +#define SAN_GUARD_SMALL_EVERY_N_EXTENTS_DEFAULT 0 + +#define SAN_LG_UAF_ALIGN_DEFAULT (-1) +#define SAN_CACHE_BIN_NONFAST_MASK_DEFAULT (uintptr_t)(-1) + +static const uintptr_t uaf_detect_junk = (uintptr_t)0x5b5b5b5b5b5b5b5bULL; + +/* 0 means disabled, i.e. never guarded. */ +extern size_t opt_san_guard_large; +extern size_t opt_san_guard_small; +/* -1 means disabled, i.e. never check for use-after-free. */ +extern ssize_t opt_lg_san_uaf_align; + +void san_guard_pages(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + emap_t *emap, bool left, bool right, bool remap); +void san_unguard_pages(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + emap_t *emap, bool left, bool right); +/* + * Unguard the extent, but don't modify emap boundaries. Must be called on an + * extent that has been erased from emap and shouldn't be placed back. + */ +void san_unguard_pages_pre_destroy(tsdn_t *tsdn, ehooks_t *ehooks, + edata_t *edata, emap_t *emap); +void san_check_stashed_ptrs(void **ptrs, size_t nstashed, size_t usize); + +void tsd_san_init(tsd_t *tsd); +void san_init(ssize_t lg_san_uaf_align); + +static inline void +san_guard_pages_two_sided(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + emap_t *emap, bool remap) { + san_guard_pages(tsdn, ehooks, edata, emap, true, true, remap); +} + +static inline void +san_unguard_pages_two_sided(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + emap_t *emap) { + san_unguard_pages(tsdn, ehooks, edata, emap, true, true); +} + +static inline size_t +san_two_side_unguarded_sz(size_t size) { + assert(size % PAGE == 0); + assert(size >= SAN_PAGE_GUARDS_SIZE); + return size - SAN_PAGE_GUARDS_SIZE; +} + +static inline size_t +san_two_side_guarded_sz(size_t size) { + assert(size % PAGE == 0); + return size + SAN_PAGE_GUARDS_SIZE; +} + +static inline size_t +san_one_side_unguarded_sz(size_t size) { + assert(size % PAGE == 0); + assert(size >= SAN_PAGE_GUARD); + return size - SAN_PAGE_GUARD; +} + +static inline size_t +san_one_side_guarded_sz(size_t size) { + assert(size % PAGE == 0); + return size + SAN_PAGE_GUARD; +} + +static inline bool +san_guard_enabled(void) { + return (opt_san_guard_large != 0 || opt_san_guard_small != 0); +} + +static inline bool +san_large_extent_decide_guard(tsdn_t *tsdn, ehooks_t *ehooks, size_t size, + size_t alignment) { + if (opt_san_guard_large == 0 || ehooks_guard_will_fail(ehooks) || + tsdn_null(tsdn)) { + return false; + } + + tsd_t *tsd = tsdn_tsd(tsdn); + uint64_t n = tsd_san_extents_until_guard_large_get(tsd); + assert(n >= 1); + if (n > 1) { + /* + * Subtract conditionally because the guard may not happen due + * to alignment or size restriction below. + */ + *tsd_san_extents_until_guard_largep_get(tsd) = n - 1; + } + + if (n == 1 && (alignment <= PAGE) && + (san_two_side_guarded_sz(size) <= SC_LARGE_MAXCLASS)) { + *tsd_san_extents_until_guard_largep_get(tsd) = + opt_san_guard_large; + return true; + } else { + assert(tsd_san_extents_until_guard_large_get(tsd) >= 1); + return false; + } +} + +static inline bool +san_slab_extent_decide_guard(tsdn_t *tsdn, ehooks_t *ehooks) { + if (opt_san_guard_small == 0 || ehooks_guard_will_fail(ehooks) || + tsdn_null(tsdn)) { + return false; + } + + tsd_t *tsd = tsdn_tsd(tsdn); + uint64_t n = tsd_san_extents_until_guard_small_get(tsd); + assert(n >= 1); + if (n == 1) { + *tsd_san_extents_until_guard_smallp_get(tsd) = + opt_san_guard_small; + return true; + } else { + *tsd_san_extents_until_guard_smallp_get(tsd) = n - 1; + assert(tsd_san_extents_until_guard_small_get(tsd) >= 1); + return false; + } +} + +static inline void +san_junk_ptr_locations(void *ptr, size_t usize, void **first, void **mid, + void **last) { + size_t ptr_sz = sizeof(void *); + + *first = ptr; + + *mid = (void *)((uintptr_t)ptr + ((usize >> 1) & ~(ptr_sz - 1))); + assert(*first != *mid || usize == ptr_sz); + assert((uintptr_t)*first <= (uintptr_t)*mid); + + /* + * When usize > 32K, the gap between requested_size and usize might be + * greater than 4K -- this means the last write may access an + * likely-untouched page (default settings w/ 4K pages). However by + * default the tcache only goes up to the 32K size class, and is usually + * tuned lower instead of higher, which makes it less of a concern. + */ + *last = (void *)((uintptr_t)ptr + usize - sizeof(uaf_detect_junk)); + assert(*first != *last || usize == ptr_sz); + assert(*mid != *last || usize <= ptr_sz * 2); + assert((uintptr_t)*mid <= (uintptr_t)*last); +} + +static inline bool +san_junk_ptr_should_slow(void) { + /* + * The latter condition (pointer size greater than the min size class) + * is not expected -- fall back to the slow path for simplicity. + */ + return config_debug || (LG_SIZEOF_PTR > SC_LG_TINY_MIN); +} + +static inline void +san_junk_ptr(void *ptr, size_t usize) { + if (san_junk_ptr_should_slow()) { + memset(ptr, (char)uaf_detect_junk, usize); + return; + } + + void *first, *mid, *last; + san_junk_ptr_locations(ptr, usize, &first, &mid, &last); + *(uintptr_t *)first = uaf_detect_junk; + *(uintptr_t *)mid = uaf_detect_junk; + *(uintptr_t *)last = uaf_detect_junk; +} + +static inline bool +san_uaf_detection_enabled(void) { + bool ret = config_uaf_detection && (opt_lg_san_uaf_align != -1); + if (config_uaf_detection && ret) { + assert(san_cache_bin_nonfast_mask == ((uintptr_t)1 << + opt_lg_san_uaf_align) - 1); + } + + return ret; +} + +#endif /* JEMALLOC_INTERNAL_GUARD_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/san_bump.h b/BeefRT/JEMalloc/include/jemalloc/internal/san_bump.h new file mode 100644 index 00000000..8ec4a710 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/san_bump.h @@ -0,0 +1,52 @@ +#ifndef JEMALLOC_INTERNAL_SAN_BUMP_H +#define JEMALLOC_INTERNAL_SAN_BUMP_H + +#include "jemalloc/internal/edata.h" +#include "jemalloc/internal/exp_grow.h" +#include "jemalloc/internal/mutex.h" + +#define SBA_RETAINED_ALLOC_SIZE ((size_t)4 << 20) + +extern bool opt_retain; + +typedef struct ehooks_s ehooks_t; +typedef struct pac_s pac_t; + +typedef struct san_bump_alloc_s san_bump_alloc_t; +struct san_bump_alloc_s { + malloc_mutex_t mtx; + + edata_t *curr_reg; +}; + +static inline bool +san_bump_enabled() { + /* + * We enable san_bump allocator only when it's possible to break up a + * mapping and unmap a part of it (maps_coalesce). This is needed to + * ensure the arena destruction process can destroy all retained guarded + * extents one by one and to unmap a trailing part of a retained guarded + * region when it's too small to fit a pending allocation. + * opt_retain is required, because this allocator retains a large + * virtual memory mapping and returns smaller parts of it. + */ + return maps_coalesce && opt_retain; +} + +static inline bool +san_bump_alloc_init(san_bump_alloc_t* sba) { + bool err = malloc_mutex_init(&sba->mtx, "sanitizer_bump_allocator", + WITNESS_RANK_SAN_BUMP_ALLOC, malloc_mutex_rank_exclusive); + if (err) { + return true; + } + sba->curr_reg = NULL; + + return false; +} + +edata_t * +san_bump_alloc(tsdn_t *tsdn, san_bump_alloc_t* sba, pac_t *pac, ehooks_t *ehooks, + size_t size, bool zero); + +#endif /* JEMALLOC_INTERNAL_SAN_BUMP_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/sc.h b/BeefRT/JEMalloc/include/jemalloc/internal/sc.h new file mode 100644 index 00000000..9bab347b --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/sc.h @@ -0,0 +1,357 @@ +#ifndef JEMALLOC_INTERNAL_SC_H +#define JEMALLOC_INTERNAL_SC_H + +#include "jemalloc/internal/jemalloc_internal_types.h" + +/* + * Size class computations: + * + * These are a little tricky; we'll first start by describing how things + * generally work, and then describe some of the details. + * + * Ignore the first few size classes for a moment. We can then split all the + * remaining size classes into groups. The size classes in a group are spaced + * such that they cover allocation request sizes in a power-of-2 range. The + * power of two is called the base of the group, and the size classes in it + * satisfy allocations in the half-open range (base, base * 2]. There are + * SC_NGROUP size classes in each group, equally spaced in the range, so that + * each one covers allocations for base / SC_NGROUP possible allocation sizes. + * We call that value (base / SC_NGROUP) the delta of the group. Each size class + * is delta larger than the one before it (including the initial size class in a + * group, which is delta larger than base, the largest size class in the + * previous group). + * To make the math all work out nicely, we require that SC_NGROUP is a power of + * two, and define it in terms of SC_LG_NGROUP. We'll often talk in terms of + * lg_base and lg_delta. For each of these groups then, we have that + * lg_delta == lg_base - SC_LG_NGROUP. + * The size classes in a group with a given lg_base and lg_delta (which, recall, + * can be computed from lg_base for these groups) are therefore: + * base + 1 * delta + * which covers allocations in (base, base + 1 * delta] + * base + 2 * delta + * which covers allocations in (base + 1 * delta, base + 2 * delta]. + * base + 3 * delta + * which covers allocations in (base + 2 * delta, base + 3 * delta]. + * ... + * base + SC_NGROUP * delta ( == 2 * base) + * which covers allocations in (base + (SC_NGROUP - 1) * delta, 2 * base]. + * (Note that currently SC_NGROUP is always 4, so the "..." is empty in + * practice.) + * Note that the last size class in the group is the next power of two (after + * base), so that we've set up the induction correctly for the next group's + * selection of delta. + * + * Now, let's start considering the first few size classes. Two extra constants + * come into play here: LG_QUANTUM and SC_LG_TINY_MIN. LG_QUANTUM ensures + * correct platform alignment; all objects of size (1 << LG_QUANTUM) or larger + * are at least (1 << LG_QUANTUM) aligned; this can be used to ensure that we + * never return improperly aligned memory, by making (1 << LG_QUANTUM) equal the + * highest required alignment of a platform. For allocation sizes smaller than + * (1 << LG_QUANTUM) though, we can be more relaxed (since we don't support + * platforms with types with alignment larger than their size). To allow such + * allocations (without wasting space unnecessarily), we introduce tiny size + * classes; one per power of two, up until we hit the quantum size. There are + * therefore LG_QUANTUM - SC_LG_TINY_MIN such size classes. + * + * Next, we have a size class of size (1 << LG_QUANTUM). This can't be the + * start of a group in the sense we described above (covering a power of two + * range) since, if we divided into it to pick a value of delta, we'd get a + * delta smaller than (1 << LG_QUANTUM) for sizes >= (1 << LG_QUANTUM), which + * is against the rules. + * + * The first base we can divide by SC_NGROUP while still being at least + * (1 << LG_QUANTUM) is SC_NGROUP * (1 << LG_QUANTUM). We can get there by + * having SC_NGROUP size classes, spaced (1 << LG_QUANTUM) apart. These size + * classes are: + * 1 * (1 << LG_QUANTUM) + * 2 * (1 << LG_QUANTUM) + * 3 * (1 << LG_QUANTUM) + * ... (although, as above, this "..." is empty in practice) + * SC_NGROUP * (1 << LG_QUANTUM). + * + * There are SC_NGROUP of these size classes, so we can regard it as a sort of + * pseudo-group, even though it spans multiple powers of 2, is divided + * differently, and both starts and ends on a power of 2 (as opposed to just + * ending). SC_NGROUP is itself a power of two, so the first group after the + * pseudo-group has the power-of-two base SC_NGROUP * (1 << LG_QUANTUM), for a + * lg_base of LG_QUANTUM + SC_LG_NGROUP. We can divide this base into SC_NGROUP + * sizes without violating our LG_QUANTUM requirements, so we can safely set + * lg_delta = lg_base - SC_LG_GROUP (== LG_QUANTUM). + * + * So, in order, the size classes are: + * + * Tiny size classes: + * - Count: LG_QUANTUM - SC_LG_TINY_MIN. + * - Sizes: + * 1 << SC_LG_TINY_MIN + * 1 << (SC_LG_TINY_MIN + 1) + * 1 << (SC_LG_TINY_MIN + 2) + * ... + * 1 << (LG_QUANTUM - 1) + * + * Initial pseudo-group: + * - Count: SC_NGROUP + * - Sizes: + * 1 * (1 << LG_QUANTUM) + * 2 * (1 << LG_QUANTUM) + * 3 * (1 << LG_QUANTUM) + * ... + * SC_NGROUP * (1 << LG_QUANTUM) + * + * Regular group 0: + * - Count: SC_NGROUP + * - Sizes: + * (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP and lg_delta of + * lg_base - SC_LG_NGROUP) + * (1 << lg_base) + 1 * (1 << lg_delta) + * (1 << lg_base) + 2 * (1 << lg_delta) + * (1 << lg_base) + 3 * (1 << lg_delta) + * ... + * (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ] + * + * Regular group 1: + * - Count: SC_NGROUP + * - Sizes: + * (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP + 1 and lg_delta of + * lg_base - SC_LG_NGROUP) + * (1 << lg_base) + 1 * (1 << lg_delta) + * (1 << lg_base) + 2 * (1 << lg_delta) + * (1 << lg_base) + 3 * (1 << lg_delta) + * ... + * (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ] + * + * ... + * + * Regular group N: + * - Count: SC_NGROUP + * - Sizes: + * (relative to lg_base of LG_QUANTUM + SC_LG_NGROUP + N and lg_delta of + * lg_base - SC_LG_NGROUP) + * (1 << lg_base) + 1 * (1 << lg_delta) + * (1 << lg_base) + 2 * (1 << lg_delta) + * (1 << lg_base) + 3 * (1 << lg_delta) + * ... + * (1 << lg_base) + SC_NGROUP * (1 << lg_delta) [ == (1 << (lg_base + 1)) ] + * + * + * Representation of metadata: + * To make the math easy, we'll mostly work in lg quantities. We record lg_base, + * lg_delta, and ndelta (i.e. number of deltas above the base) on a + * per-size-class basis, and maintain the invariant that, across all size + * classes, size == (1 << lg_base) + ndelta * (1 << lg_delta). + * + * For regular groups (i.e. those with lg_base >= LG_QUANTUM + SC_LG_NGROUP), + * lg_delta is lg_base - SC_LG_NGROUP, and ndelta goes from 1 to SC_NGROUP. + * + * For the initial tiny size classes (if any), lg_base is lg(size class size). + * lg_delta is lg_base for the first size class, and lg_base - 1 for all + * subsequent ones. ndelta is always 0. + * + * For the pseudo-group, if there are no tiny size classes, then we set + * lg_base == LG_QUANTUM, lg_delta == LG_QUANTUM, and have ndelta range from 0 + * to SC_NGROUP - 1. (Note that delta == base, so base + (SC_NGROUP - 1) * delta + * is just SC_NGROUP * base, or (1 << (SC_LG_NGROUP + LG_QUANTUM)), so we do + * indeed get a power of two that way). If there *are* tiny size classes, then + * the first size class needs to have lg_delta relative to the largest tiny size + * class. We therefore set lg_base == LG_QUANTUM - 1, + * lg_delta == LG_QUANTUM - 1, and ndelta == 1, keeping the rest of the + * pseudo-group the same. + * + * + * Other terminology: + * "Small" size classes mean those that are allocated out of bins, which is the + * same as those that are slab allocated. + * "Large" size classes are those that are not small. The cutoff for counting as + * large is page size * group size. + */ + +/* + * Size class N + (1 << SC_LG_NGROUP) twice the size of size class N. + */ +#define SC_LG_NGROUP 2 +#define SC_LG_TINY_MIN 3 + +#if SC_LG_TINY_MIN == 0 +/* The div module doesn't support division by 1, which this would require. */ +#error "Unsupported LG_TINY_MIN" +#endif + +/* + * The definitions below are all determined by the above settings and system + * characteristics. + */ +#define SC_NGROUP (1ULL << SC_LG_NGROUP) +#define SC_PTR_BITS ((1ULL << LG_SIZEOF_PTR) * 8) +#define SC_NTINY (LG_QUANTUM - SC_LG_TINY_MIN) +#define SC_LG_TINY_MAXCLASS (LG_QUANTUM > SC_LG_TINY_MIN ? LG_QUANTUM - 1 : -1) +#define SC_NPSEUDO SC_NGROUP +#define SC_LG_FIRST_REGULAR_BASE (LG_QUANTUM + SC_LG_NGROUP) +/* + * We cap allocations to be less than 2 ** (ptr_bits - 1), so the highest base + * we need is 2 ** (ptr_bits - 2). (This also means that the last group is 1 + * size class shorter than the others). + * We could probably save some space in arenas by capping this at LG_VADDR size. + */ +#define SC_LG_BASE_MAX (SC_PTR_BITS - 2) +#define SC_NREGULAR (SC_NGROUP * \ + (SC_LG_BASE_MAX - SC_LG_FIRST_REGULAR_BASE + 1) - 1) +#define SC_NSIZES (SC_NTINY + SC_NPSEUDO + SC_NREGULAR) + +/* + * The number of size classes that are a multiple of the page size. + * + * Here are the first few bases that have a page-sized SC. + * + * lg(base) | base | highest SC | page-multiple SCs + * --------------|------------------------------------------ + * LG_PAGE - 1 | PAGE / 2 | PAGE | 1 + * LG_PAGE | PAGE | 2 * PAGE | 1 + * LG_PAGE + 1 | 2 * PAGE | 4 * PAGE | 2 + * LG_PAGE + 2 | 4 * PAGE | 8 * PAGE | 4 + * + * The number of page-multiple SCs continues to grow in powers of two, up until + * lg_delta == lg_page, which corresponds to setting lg_base to lg_page + + * SC_LG_NGROUP. So, then, the number of size classes that are multiples of the + * page size whose lg_delta is less than the page size are + * is 1 + (2**0 + 2**1 + ... + 2**(lg_ngroup - 1) == 2**lg_ngroup. + * + * For each base with lg_base in [lg_page + lg_ngroup, lg_base_max), there are + * NGROUP page-sized size classes, and when lg_base == lg_base_max, there are + * NGROUP - 1. + * + * This gives us the quantity we seek. + */ +#define SC_NPSIZES ( \ + SC_NGROUP \ + + (SC_LG_BASE_MAX - (LG_PAGE + SC_LG_NGROUP)) * SC_NGROUP \ + + SC_NGROUP - 1) + +/* + * We declare a size class is binnable if size < page size * group. Or, in other + * words, lg(size) < lg(page size) + lg(group size). + */ +#define SC_NBINS ( \ + /* Sub-regular size classes. */ \ + SC_NTINY + SC_NPSEUDO \ + /* Groups with lg_regular_min_base <= lg_base <= lg_base_max */ \ + + SC_NGROUP * (LG_PAGE + SC_LG_NGROUP - SC_LG_FIRST_REGULAR_BASE) \ + /* Last SC of the last group hits the bound exactly; exclude it. */ \ + - 1) + +/* + * The size2index_tab lookup table uses uint8_t to encode each bin index, so we + * cannot support more than 256 small size classes. + */ +#if (SC_NBINS > 256) +# error "Too many small size classes" +#endif + +/* The largest size class in the lookup table, and its binary log. */ +#define SC_LG_MAX_LOOKUP 12 +#define SC_LOOKUP_MAXCLASS (1 << SC_LG_MAX_LOOKUP) + +/* Internal, only used for the definition of SC_SMALL_MAXCLASS. */ +#define SC_SMALL_MAX_BASE (1 << (LG_PAGE + SC_LG_NGROUP - 1)) +#define SC_SMALL_MAX_DELTA (1 << (LG_PAGE - 1)) + +/* The largest size class allocated out of a slab. */ +#define SC_SMALL_MAXCLASS (SC_SMALL_MAX_BASE \ + + (SC_NGROUP - 1) * SC_SMALL_MAX_DELTA) + +/* The fastpath assumes all lookup-able sizes are small. */ +#if (SC_SMALL_MAXCLASS < SC_LOOKUP_MAXCLASS) +# error "Lookup table sizes must be small" +#endif + +/* The smallest size class not allocated out of a slab. */ +#define SC_LARGE_MINCLASS ((size_t)1ULL << (LG_PAGE + SC_LG_NGROUP)) +#define SC_LG_LARGE_MINCLASS (LG_PAGE + SC_LG_NGROUP) + +/* Internal; only used for the definition of SC_LARGE_MAXCLASS. */ +#define SC_MAX_BASE ((size_t)1 << (SC_PTR_BITS - 2)) +#define SC_MAX_DELTA ((size_t)1 << (SC_PTR_BITS - 2 - SC_LG_NGROUP)) + +/* The largest size class supported. */ +#define SC_LARGE_MAXCLASS (SC_MAX_BASE + (SC_NGROUP - 1) * SC_MAX_DELTA) + +/* Maximum number of regions in one slab. */ +#ifndef CONFIG_LG_SLAB_MAXREGS +# define SC_LG_SLAB_MAXREGS (LG_PAGE - SC_LG_TINY_MIN) +#else +# if CONFIG_LG_SLAB_MAXREGS < (LG_PAGE - SC_LG_TINY_MIN) +# error "Unsupported SC_LG_SLAB_MAXREGS" +# else +# define SC_LG_SLAB_MAXREGS CONFIG_LG_SLAB_MAXREGS +# endif +#endif + +#define SC_SLAB_MAXREGS (1U << SC_LG_SLAB_MAXREGS) + +typedef struct sc_s sc_t; +struct sc_s { + /* Size class index, or -1 if not a valid size class. */ + int index; + /* Lg group base size (no deltas added). */ + int lg_base; + /* Lg delta to previous size class. */ + int lg_delta; + /* Delta multiplier. size == 1<bytes += src->bytes; +} + +/* A collections of free extents, all of the same size. */ +typedef struct sec_bin_s sec_bin_t; +struct sec_bin_s { + /* + * When we fail to fulfill an allocation, we do a batch-alloc on the + * underlying allocator to fill extra items, as well. We drop the SEC + * lock while doing so, to allow operations on other bins to succeed. + * That introduces the possibility of other threads also trying to + * allocate out of this bin, failing, and also going to the backing + * allocator. To avoid a thundering herd problem in which lots of + * threads do batch allocs and overfill this bin as a result, we only + * allow one batch allocation at a time for a bin. This bool tracks + * whether or not some thread is already batch allocating. + * + * Eventually, the right answer may be a smarter sharding policy for the + * bins (e.g. a mutex per bin, which would also be more scalable + * generally; the batch-allocating thread could hold it while + * batch-allocating). + */ + bool being_batch_filled; + + /* + * Number of bytes in this particular bin (as opposed to the + * sec_shard_t's bytes_cur. This isn't user visible or reported in + * stats; rather, it allows us to quickly determine the change in the + * centralized counter when flushing. + */ + size_t bytes_cur; + edata_list_active_t freelist; +}; + +typedef struct sec_shard_s sec_shard_t; +struct sec_shard_s { + /* + * We don't keep per-bin mutexes, even though that would allow more + * sharding; this allows global cache-eviction, which in turn allows for + * better balancing across free lists. + */ + malloc_mutex_t mtx; + /* + * A SEC may need to be shut down (i.e. flushed of its contents and + * prevented from further caching). To avoid tricky synchronization + * issues, we just track enabled-status in each shard, guarded by a + * mutex. In practice, this is only ever checked during brief races, + * since the arena-level atomic boolean tracking HPA enabled-ness means + * that we won't go down these pathways very often after custom extent + * hooks are installed. + */ + bool enabled; + sec_bin_t *bins; + /* Number of bytes in all bins in the shard. */ + size_t bytes_cur; + /* The next pszind to flush in the flush-some pathways. */ + pszind_t to_flush_next; +}; + +typedef struct sec_s sec_t; +struct sec_s { + pai_t pai; + pai_t *fallback; + + sec_opts_t opts; + sec_shard_t *shards; + pszind_t npsizes; +}; + +bool sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, pai_t *fallback, + const sec_opts_t *opts); +void sec_flush(tsdn_t *tsdn, sec_t *sec); +void sec_disable(tsdn_t *tsdn, sec_t *sec); + +/* + * Morally, these two stats methods probably ought to be a single one (and the + * mutex_prof_data ought to live in the sec_stats_t. But splitting them apart + * lets them fit easily into the pa_shard stats framework (which also has this + * split), which simplifies the stats management. + */ +void sec_stats_merge(tsdn_t *tsdn, sec_t *sec, sec_stats_t *stats); +void sec_mutex_stats_read(tsdn_t *tsdn, sec_t *sec, + mutex_prof_data_t *mutex_prof_data); + +/* + * We use the arena lock ordering; these are acquired in phase 2 of forking, but + * should be acquired before the underlying allocator mutexes. + */ +void sec_prefork2(tsdn_t *tsdn, sec_t *sec); +void sec_postfork_parent(tsdn_t *tsdn, sec_t *sec); +void sec_postfork_child(tsdn_t *tsdn, sec_t *sec); + +#endif /* JEMALLOC_INTERNAL_SEC_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/sec_opts.h b/BeefRT/JEMalloc/include/jemalloc/internal/sec_opts.h new file mode 100644 index 00000000..a3ad72fb --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/sec_opts.h @@ -0,0 +1,59 @@ +#ifndef JEMALLOC_INTERNAL_SEC_OPTS_H +#define JEMALLOC_INTERNAL_SEC_OPTS_H + +/* + * The configuration settings used by an sec_t. Morally, this is part of the + * SEC interface, but we put it here for header-ordering reasons. + */ + +typedef struct sec_opts_s sec_opts_t; +struct sec_opts_s { + /* + * We don't necessarily always use all the shards; requests are + * distributed across shards [0, nshards - 1). + */ + size_t nshards; + /* + * We'll automatically refuse to cache any objects in this sec if + * they're larger than max_alloc bytes, instead forwarding such objects + * directly to the fallback. + */ + size_t max_alloc; + /* + * Exceeding this amount of cached extents in a shard causes us to start + * flushing bins in that shard until we fall below bytes_after_flush. + */ + size_t max_bytes; + /* + * The number of bytes (in all bins) we flush down to when we exceed + * bytes_cur. We want this to be less than bytes_cur, because + * otherwise we could get into situations where a shard undergoing + * net-deallocation keeps bytes_cur very near to max_bytes, so that + * most deallocations get immediately forwarded to the underlying PAI + * implementation, defeating the point of the SEC. + */ + size_t bytes_after_flush; + /* + * When we can't satisfy an allocation out of the SEC because there are + * no available ones cached, we allocate multiple of that size out of + * the fallback allocator. Eventually we might want to do something + * cleverer, but for now we just grab a fixed number. + */ + size_t batch_fill_extra; +}; + +#define SEC_OPTS_DEFAULT { \ + /* nshards */ \ + 4, \ + /* max_alloc */ \ + (32 * 1024) < PAGE ? PAGE : (32 * 1024), \ + /* max_bytes */ \ + 256 * 1024, \ + /* bytes_after_flush */ \ + 128 * 1024, \ + /* batch_fill_extra */ \ + 0 \ +} + + +#endif /* JEMALLOC_INTERNAL_SEC_OPTS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/seq.h b/BeefRT/JEMalloc/include/jemalloc/internal/seq.h new file mode 100644 index 00000000..ef2df4c6 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/seq.h @@ -0,0 +1,55 @@ +#ifndef JEMALLOC_INTERNAL_SEQ_H +#define JEMALLOC_INTERNAL_SEQ_H + +#include "jemalloc/internal/atomic.h" + +/* + * A simple seqlock implementation. + */ + +#define seq_define(type, short_type) \ +typedef struct { \ + atomic_zu_t seq; \ + atomic_zu_t data[ \ + (sizeof(type) + sizeof(size_t) - 1) / sizeof(size_t)]; \ +} seq_##short_type##_t; \ + \ +/* \ + * No internal synchronization -- the caller must ensure that there's \ + * only a single writer at a time. \ + */ \ +static inline void \ +seq_store_##short_type(seq_##short_type##_t *dst, type *src) { \ + size_t buf[sizeof(dst->data) / sizeof(size_t)]; \ + buf[sizeof(buf) / sizeof(size_t) - 1] = 0; \ + memcpy(buf, src, sizeof(type)); \ + size_t old_seq = atomic_load_zu(&dst->seq, ATOMIC_RELAXED); \ + atomic_store_zu(&dst->seq, old_seq + 1, ATOMIC_RELAXED); \ + atomic_fence(ATOMIC_RELEASE); \ + for (size_t i = 0; i < sizeof(buf) / sizeof(size_t); i++) { \ + atomic_store_zu(&dst->data[i], buf[i], ATOMIC_RELAXED); \ + } \ + atomic_store_zu(&dst->seq, old_seq + 2, ATOMIC_RELEASE); \ +} \ + \ +/* Returns whether or not the read was consistent. */ \ +static inline bool \ +seq_try_load_##short_type(type *dst, seq_##short_type##_t *src) { \ + size_t buf[sizeof(src->data) / sizeof(size_t)]; \ + size_t seq1 = atomic_load_zu(&src->seq, ATOMIC_ACQUIRE); \ + if (seq1 % 2 != 0) { \ + return false; \ + } \ + for (size_t i = 0; i < sizeof(buf) / sizeof(size_t); i++) { \ + buf[i] = atomic_load_zu(&src->data[i], ATOMIC_RELAXED); \ + } \ + atomic_fence(ATOMIC_ACQUIRE); \ + size_t seq2 = atomic_load_zu(&src->seq, ATOMIC_RELAXED); \ + if (seq1 != seq2) { \ + return false; \ + } \ + memcpy(dst, buf, sizeof(type)); \ + return true; \ +} + +#endif /* JEMALLOC_INTERNAL_SEQ_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/slab_data.h b/BeefRT/JEMalloc/include/jemalloc/internal/slab_data.h new file mode 100644 index 00000000..e821863d --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/slab_data.h @@ -0,0 +1,12 @@ +#ifndef JEMALLOC_INTERNAL_SLAB_DATA_H +#define JEMALLOC_INTERNAL_SLAB_DATA_H + +#include "jemalloc/internal/bitmap.h" + +typedef struct slab_data_s slab_data_t; +struct slab_data_s { + /* Per region allocated/deallocated bitmap. */ + bitmap_t bitmap[BITMAP_GROUPS_MAX]; +}; + +#endif /* JEMALLOC_INTERNAL_SLAB_DATA_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/smoothstep.h b/BeefRT/JEMalloc/include/jemalloc/internal/smoothstep.h new file mode 100644 index 00000000..2e14430f --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/smoothstep.h @@ -0,0 +1,232 @@ +#ifndef JEMALLOC_INTERNAL_SMOOTHSTEP_H +#define JEMALLOC_INTERNAL_SMOOTHSTEP_H + +/* + * This file was generated by the following command: + * sh smoothstep.sh smoother 200 24 3 15 + */ +/******************************************************************************/ + +/* + * This header defines a precomputed table based on the smoothstep family of + * sigmoidal curves (https://en.wikipedia.org/wiki/Smoothstep) that grow from 0 + * to 1 in 0 <= x <= 1. The table is stored as integer fixed point values so + * that floating point math can be avoided. + * + * 3 2 + * smoothstep(x) = -2x + 3x + * + * 5 4 3 + * smootherstep(x) = 6x - 15x + 10x + * + * 7 6 5 4 + * smootheststep(x) = -20x + 70x - 84x + 35x + */ + +#define SMOOTHSTEP_VARIANT "smoother" +#define SMOOTHSTEP_NSTEPS 200 +#define SMOOTHSTEP_BFP 24 +#define SMOOTHSTEP \ + /* STEP(step, h, x, y) */ \ + STEP( 1, UINT64_C(0x0000000000000014), 0.005, 0.000001240643750) \ + STEP( 2, UINT64_C(0x00000000000000a5), 0.010, 0.000009850600000) \ + STEP( 3, UINT64_C(0x0000000000000229), 0.015, 0.000032995181250) \ + STEP( 4, UINT64_C(0x0000000000000516), 0.020, 0.000077619200000) \ + STEP( 5, UINT64_C(0x00000000000009dc), 0.025, 0.000150449218750) \ + STEP( 6, UINT64_C(0x00000000000010e8), 0.030, 0.000257995800000) \ + STEP( 7, UINT64_C(0x0000000000001aa4), 0.035, 0.000406555756250) \ + STEP( 8, UINT64_C(0x0000000000002777), 0.040, 0.000602214400000) \ + STEP( 9, UINT64_C(0x00000000000037c2), 0.045, 0.000850847793750) \ + STEP( 10, UINT64_C(0x0000000000004be6), 0.050, 0.001158125000000) \ + STEP( 11, UINT64_C(0x000000000000643c), 0.055, 0.001529510331250) \ + STEP( 12, UINT64_C(0x000000000000811f), 0.060, 0.001970265600000) \ + STEP( 13, UINT64_C(0x000000000000a2e2), 0.065, 0.002485452368750) \ + STEP( 14, UINT64_C(0x000000000000c9d8), 0.070, 0.003079934200000) \ + STEP( 15, UINT64_C(0x000000000000f64f), 0.075, 0.003758378906250) \ + STEP( 16, UINT64_C(0x0000000000012891), 0.080, 0.004525260800000) \ + STEP( 17, UINT64_C(0x00000000000160e7), 0.085, 0.005384862943750) \ + STEP( 18, UINT64_C(0x0000000000019f95), 0.090, 0.006341279400000) \ + STEP( 19, UINT64_C(0x000000000001e4dc), 0.095, 0.007398417481250) \ + STEP( 20, UINT64_C(0x00000000000230fc), 0.100, 0.008560000000000) \ + STEP( 21, UINT64_C(0x0000000000028430), 0.105, 0.009829567518750) \ + STEP( 22, UINT64_C(0x000000000002deb0), 0.110, 0.011210480600000) \ + STEP( 23, UINT64_C(0x00000000000340b1), 0.115, 0.012705922056250) \ + STEP( 24, UINT64_C(0x000000000003aa67), 0.120, 0.014318899200000) \ + STEP( 25, UINT64_C(0x0000000000041c00), 0.125, 0.016052246093750) \ + STEP( 26, UINT64_C(0x00000000000495a8), 0.130, 0.017908625800000) \ + STEP( 27, UINT64_C(0x000000000005178b), 0.135, 0.019890532631250) \ + STEP( 28, UINT64_C(0x000000000005a1cf), 0.140, 0.022000294400000) \ + STEP( 29, UINT64_C(0x0000000000063498), 0.145, 0.024240074668750) \ + STEP( 30, UINT64_C(0x000000000006d009), 0.150, 0.026611875000000) \ + STEP( 31, UINT64_C(0x000000000007743f), 0.155, 0.029117537206250) \ + STEP( 32, UINT64_C(0x0000000000082157), 0.160, 0.031758745600000) \ + STEP( 33, UINT64_C(0x000000000008d76b), 0.165, 0.034537029243750) \ + STEP( 34, UINT64_C(0x0000000000099691), 0.170, 0.037453764200000) \ + STEP( 35, UINT64_C(0x00000000000a5edf), 0.175, 0.040510175781250) \ + STEP( 36, UINT64_C(0x00000000000b3067), 0.180, 0.043707340800000) \ + STEP( 37, UINT64_C(0x00000000000c0b38), 0.185, 0.047046189818750) \ + STEP( 38, UINT64_C(0x00000000000cef5e), 0.190, 0.050527509400000) \ + STEP( 39, UINT64_C(0x00000000000ddce6), 0.195, 0.054151944356250) \ + STEP( 40, UINT64_C(0x00000000000ed3d8), 0.200, 0.057920000000000) \ + STEP( 41, UINT64_C(0x00000000000fd439), 0.205, 0.061832044393750) \ + STEP( 42, UINT64_C(0x000000000010de0e), 0.210, 0.065888310600000) \ + STEP( 43, UINT64_C(0x000000000011f158), 0.215, 0.070088898931250) \ + STEP( 44, UINT64_C(0x0000000000130e17), 0.220, 0.074433779200000) \ + STEP( 45, UINT64_C(0x0000000000143448), 0.225, 0.078922792968750) \ + STEP( 46, UINT64_C(0x00000000001563e7), 0.230, 0.083555655800000) \ + STEP( 47, UINT64_C(0x0000000000169cec), 0.235, 0.088331959506250) \ + STEP( 48, UINT64_C(0x000000000017df4f), 0.240, 0.093251174400000) \ + STEP( 49, UINT64_C(0x0000000000192b04), 0.245, 0.098312651543750) \ + STEP( 50, UINT64_C(0x00000000001a8000), 0.250, 0.103515625000000) \ + STEP( 51, UINT64_C(0x00000000001bde32), 0.255, 0.108859214081250) \ + STEP( 52, UINT64_C(0x00000000001d458b), 0.260, 0.114342425600000) \ + STEP( 53, UINT64_C(0x00000000001eb5f8), 0.265, 0.119964156118750) \ + STEP( 54, UINT64_C(0x0000000000202f65), 0.270, 0.125723194200000) \ + STEP( 55, UINT64_C(0x000000000021b1bb), 0.275, 0.131618222656250) \ + STEP( 56, UINT64_C(0x0000000000233ce3), 0.280, 0.137647820800000) \ + STEP( 57, UINT64_C(0x000000000024d0c3), 0.285, 0.143810466693750) \ + STEP( 58, UINT64_C(0x0000000000266d40), 0.290, 0.150104539400000) \ + STEP( 59, UINT64_C(0x000000000028123d), 0.295, 0.156528321231250) \ + STEP( 60, UINT64_C(0x000000000029bf9c), 0.300, 0.163080000000000) \ + STEP( 61, UINT64_C(0x00000000002b753d), 0.305, 0.169757671268750) \ + STEP( 62, UINT64_C(0x00000000002d32fe), 0.310, 0.176559340600000) \ + STEP( 63, UINT64_C(0x00000000002ef8bc), 0.315, 0.183482925806250) \ + STEP( 64, UINT64_C(0x000000000030c654), 0.320, 0.190526259200000) \ + STEP( 65, UINT64_C(0x0000000000329b9f), 0.325, 0.197687089843750) \ + STEP( 66, UINT64_C(0x0000000000347875), 0.330, 0.204963085800000) \ + STEP( 67, UINT64_C(0x0000000000365cb0), 0.335, 0.212351836381250) \ + STEP( 68, UINT64_C(0x0000000000384825), 0.340, 0.219850854400000) \ + STEP( 69, UINT64_C(0x00000000003a3aa8), 0.345, 0.227457578418750) \ + STEP( 70, UINT64_C(0x00000000003c340f), 0.350, 0.235169375000000) \ + STEP( 71, UINT64_C(0x00000000003e342b), 0.355, 0.242983540956250) \ + STEP( 72, UINT64_C(0x0000000000403ace), 0.360, 0.250897305600000) \ + STEP( 73, UINT64_C(0x00000000004247c8), 0.365, 0.258907832993750) \ + STEP( 74, UINT64_C(0x0000000000445ae9), 0.370, 0.267012224200000) \ + STEP( 75, UINT64_C(0x0000000000467400), 0.375, 0.275207519531250) \ + STEP( 76, UINT64_C(0x00000000004892d8), 0.380, 0.283490700800000) \ + STEP( 77, UINT64_C(0x00000000004ab740), 0.385, 0.291858693568750) \ + STEP( 78, UINT64_C(0x00000000004ce102), 0.390, 0.300308369400000) \ + STEP( 79, UINT64_C(0x00000000004f0fe9), 0.395, 0.308836548106250) \ + STEP( 80, UINT64_C(0x00000000005143bf), 0.400, 0.317440000000000) \ + STEP( 81, UINT64_C(0x0000000000537c4d), 0.405, 0.326115448143750) \ + STEP( 82, UINT64_C(0x000000000055b95b), 0.410, 0.334859570600000) \ + STEP( 83, UINT64_C(0x000000000057fab1), 0.415, 0.343669002681250) \ + STEP( 84, UINT64_C(0x00000000005a4015), 0.420, 0.352540339200000) \ + STEP( 85, UINT64_C(0x00000000005c894e), 0.425, 0.361470136718750) \ + STEP( 86, UINT64_C(0x00000000005ed622), 0.430, 0.370454915800000) \ + STEP( 87, UINT64_C(0x0000000000612655), 0.435, 0.379491163256250) \ + STEP( 88, UINT64_C(0x00000000006379ac), 0.440, 0.388575334400000) \ + STEP( 89, UINT64_C(0x000000000065cfeb), 0.445, 0.397703855293750) \ + STEP( 90, UINT64_C(0x00000000006828d6), 0.450, 0.406873125000000) \ + STEP( 91, UINT64_C(0x00000000006a842f), 0.455, 0.416079517831250) \ + STEP( 92, UINT64_C(0x00000000006ce1bb), 0.460, 0.425319385600000) \ + STEP( 93, UINT64_C(0x00000000006f413a), 0.465, 0.434589059868750) \ + STEP( 94, UINT64_C(0x000000000071a270), 0.470, 0.443884854200000) \ + STEP( 95, UINT64_C(0x000000000074051d), 0.475, 0.453203066406250) \ + STEP( 96, UINT64_C(0x0000000000766905), 0.480, 0.462539980800000) \ + STEP( 97, UINT64_C(0x000000000078cde7), 0.485, 0.471891870443750) \ + STEP( 98, UINT64_C(0x00000000007b3387), 0.490, 0.481254999400000) \ + STEP( 99, UINT64_C(0x00000000007d99a4), 0.495, 0.490625624981250) \ + STEP( 100, UINT64_C(0x0000000000800000), 0.500, 0.500000000000000) \ + STEP( 101, UINT64_C(0x000000000082665b), 0.505, 0.509374375018750) \ + STEP( 102, UINT64_C(0x000000000084cc78), 0.510, 0.518745000600000) \ + STEP( 103, UINT64_C(0x0000000000873218), 0.515, 0.528108129556250) \ + STEP( 104, UINT64_C(0x00000000008996fa), 0.520, 0.537460019200000) \ + STEP( 105, UINT64_C(0x00000000008bfae2), 0.525, 0.546796933593750) \ + STEP( 106, UINT64_C(0x00000000008e5d8f), 0.530, 0.556115145800000) \ + STEP( 107, UINT64_C(0x000000000090bec5), 0.535, 0.565410940131250) \ + STEP( 108, UINT64_C(0x0000000000931e44), 0.540, 0.574680614400000) \ + STEP( 109, UINT64_C(0x0000000000957bd0), 0.545, 0.583920482168750) \ + STEP( 110, UINT64_C(0x000000000097d729), 0.550, 0.593126875000000) \ + STEP( 111, UINT64_C(0x00000000009a3014), 0.555, 0.602296144706250) \ + STEP( 112, UINT64_C(0x00000000009c8653), 0.560, 0.611424665600000) \ + STEP( 113, UINT64_C(0x00000000009ed9aa), 0.565, 0.620508836743750) \ + STEP( 114, UINT64_C(0x0000000000a129dd), 0.570, 0.629545084200000) \ + STEP( 115, UINT64_C(0x0000000000a376b1), 0.575, 0.638529863281250) \ + STEP( 116, UINT64_C(0x0000000000a5bfea), 0.580, 0.647459660800000) \ + STEP( 117, UINT64_C(0x0000000000a8054e), 0.585, 0.656330997318750) \ + STEP( 118, UINT64_C(0x0000000000aa46a4), 0.590, 0.665140429400000) \ + STEP( 119, UINT64_C(0x0000000000ac83b2), 0.595, 0.673884551856250) \ + STEP( 120, UINT64_C(0x0000000000aebc40), 0.600, 0.682560000000000) \ + STEP( 121, UINT64_C(0x0000000000b0f016), 0.605, 0.691163451893750) \ + STEP( 122, UINT64_C(0x0000000000b31efd), 0.610, 0.699691630600000) \ + STEP( 123, UINT64_C(0x0000000000b548bf), 0.615, 0.708141306431250) \ + STEP( 124, UINT64_C(0x0000000000b76d27), 0.620, 0.716509299200000) \ + STEP( 125, UINT64_C(0x0000000000b98c00), 0.625, 0.724792480468750) \ + STEP( 126, UINT64_C(0x0000000000bba516), 0.630, 0.732987775800000) \ + STEP( 127, UINT64_C(0x0000000000bdb837), 0.635, 0.741092167006250) \ + STEP( 128, UINT64_C(0x0000000000bfc531), 0.640, 0.749102694400000) \ + STEP( 129, UINT64_C(0x0000000000c1cbd4), 0.645, 0.757016459043750) \ + STEP( 130, UINT64_C(0x0000000000c3cbf0), 0.650, 0.764830625000000) \ + STEP( 131, UINT64_C(0x0000000000c5c557), 0.655, 0.772542421581250) \ + STEP( 132, UINT64_C(0x0000000000c7b7da), 0.660, 0.780149145600000) \ + STEP( 133, UINT64_C(0x0000000000c9a34f), 0.665, 0.787648163618750) \ + STEP( 134, UINT64_C(0x0000000000cb878a), 0.670, 0.795036914200000) \ + STEP( 135, UINT64_C(0x0000000000cd6460), 0.675, 0.802312910156250) \ + STEP( 136, UINT64_C(0x0000000000cf39ab), 0.680, 0.809473740800000) \ + STEP( 137, UINT64_C(0x0000000000d10743), 0.685, 0.816517074193750) \ + STEP( 138, UINT64_C(0x0000000000d2cd01), 0.690, 0.823440659400000) \ + STEP( 139, UINT64_C(0x0000000000d48ac2), 0.695, 0.830242328731250) \ + STEP( 140, UINT64_C(0x0000000000d64063), 0.700, 0.836920000000000) \ + STEP( 141, UINT64_C(0x0000000000d7edc2), 0.705, 0.843471678768750) \ + STEP( 142, UINT64_C(0x0000000000d992bf), 0.710, 0.849895460600000) \ + STEP( 143, UINT64_C(0x0000000000db2f3c), 0.715, 0.856189533306250) \ + STEP( 144, UINT64_C(0x0000000000dcc31c), 0.720, 0.862352179200000) \ + STEP( 145, UINT64_C(0x0000000000de4e44), 0.725, 0.868381777343750) \ + STEP( 146, UINT64_C(0x0000000000dfd09a), 0.730, 0.874276805800000) \ + STEP( 147, UINT64_C(0x0000000000e14a07), 0.735, 0.880035843881250) \ + STEP( 148, UINT64_C(0x0000000000e2ba74), 0.740, 0.885657574400000) \ + STEP( 149, UINT64_C(0x0000000000e421cd), 0.745, 0.891140785918750) \ + STEP( 150, UINT64_C(0x0000000000e58000), 0.750, 0.896484375000000) \ + STEP( 151, UINT64_C(0x0000000000e6d4fb), 0.755, 0.901687348456250) \ + STEP( 152, UINT64_C(0x0000000000e820b0), 0.760, 0.906748825600000) \ + STEP( 153, UINT64_C(0x0000000000e96313), 0.765, 0.911668040493750) \ + STEP( 154, UINT64_C(0x0000000000ea9c18), 0.770, 0.916444344200000) \ + STEP( 155, UINT64_C(0x0000000000ebcbb7), 0.775, 0.921077207031250) \ + STEP( 156, UINT64_C(0x0000000000ecf1e8), 0.780, 0.925566220800000) \ + STEP( 157, UINT64_C(0x0000000000ee0ea7), 0.785, 0.929911101068750) \ + STEP( 158, UINT64_C(0x0000000000ef21f1), 0.790, 0.934111689400000) \ + STEP( 159, UINT64_C(0x0000000000f02bc6), 0.795, 0.938167955606250) \ + STEP( 160, UINT64_C(0x0000000000f12c27), 0.800, 0.942080000000000) \ + STEP( 161, UINT64_C(0x0000000000f22319), 0.805, 0.945848055643750) \ + STEP( 162, UINT64_C(0x0000000000f310a1), 0.810, 0.949472490600000) \ + STEP( 163, UINT64_C(0x0000000000f3f4c7), 0.815, 0.952953810181250) \ + STEP( 164, UINT64_C(0x0000000000f4cf98), 0.820, 0.956292659200000) \ + STEP( 165, UINT64_C(0x0000000000f5a120), 0.825, 0.959489824218750) \ + STEP( 166, UINT64_C(0x0000000000f6696e), 0.830, 0.962546235800000) \ + STEP( 167, UINT64_C(0x0000000000f72894), 0.835, 0.965462970756250) \ + STEP( 168, UINT64_C(0x0000000000f7dea8), 0.840, 0.968241254400000) \ + STEP( 169, UINT64_C(0x0000000000f88bc0), 0.845, 0.970882462793750) \ + STEP( 170, UINT64_C(0x0000000000f92ff6), 0.850, 0.973388125000000) \ + STEP( 171, UINT64_C(0x0000000000f9cb67), 0.855, 0.975759925331250) \ + STEP( 172, UINT64_C(0x0000000000fa5e30), 0.860, 0.977999705600000) \ + STEP( 173, UINT64_C(0x0000000000fae874), 0.865, 0.980109467368750) \ + STEP( 174, UINT64_C(0x0000000000fb6a57), 0.870, 0.982091374200000) \ + STEP( 175, UINT64_C(0x0000000000fbe400), 0.875, 0.983947753906250) \ + STEP( 176, UINT64_C(0x0000000000fc5598), 0.880, 0.985681100800000) \ + STEP( 177, UINT64_C(0x0000000000fcbf4e), 0.885, 0.987294077943750) \ + STEP( 178, UINT64_C(0x0000000000fd214f), 0.890, 0.988789519400000) \ + STEP( 179, UINT64_C(0x0000000000fd7bcf), 0.895, 0.990170432481250) \ + STEP( 180, UINT64_C(0x0000000000fdcf03), 0.900, 0.991440000000000) \ + STEP( 181, UINT64_C(0x0000000000fe1b23), 0.905, 0.992601582518750) \ + STEP( 182, UINT64_C(0x0000000000fe606a), 0.910, 0.993658720600000) \ + STEP( 183, UINT64_C(0x0000000000fe9f18), 0.915, 0.994615137056250) \ + STEP( 184, UINT64_C(0x0000000000fed76e), 0.920, 0.995474739200000) \ + STEP( 185, UINT64_C(0x0000000000ff09b0), 0.925, 0.996241621093750) \ + STEP( 186, UINT64_C(0x0000000000ff3627), 0.930, 0.996920065800000) \ + STEP( 187, UINT64_C(0x0000000000ff5d1d), 0.935, 0.997514547631250) \ + STEP( 188, UINT64_C(0x0000000000ff7ee0), 0.940, 0.998029734400000) \ + STEP( 189, UINT64_C(0x0000000000ff9bc3), 0.945, 0.998470489668750) \ + STEP( 190, UINT64_C(0x0000000000ffb419), 0.950, 0.998841875000000) \ + STEP( 191, UINT64_C(0x0000000000ffc83d), 0.955, 0.999149152206250) \ + STEP( 192, UINT64_C(0x0000000000ffd888), 0.960, 0.999397785600000) \ + STEP( 193, UINT64_C(0x0000000000ffe55b), 0.965, 0.999593444243750) \ + STEP( 194, UINT64_C(0x0000000000ffef17), 0.970, 0.999742004200000) \ + STEP( 195, UINT64_C(0x0000000000fff623), 0.975, 0.999849550781250) \ + STEP( 196, UINT64_C(0x0000000000fffae9), 0.980, 0.999922380800000) \ + STEP( 197, UINT64_C(0x0000000000fffdd6), 0.985, 0.999967004818750) \ + STEP( 198, UINT64_C(0x0000000000ffff5a), 0.990, 0.999990149400000) \ + STEP( 199, UINT64_C(0x0000000000ffffeb), 0.995, 0.999998759356250) \ + STEP( 200, UINT64_C(0x0000000001000000), 1.000, 1.000000000000000) \ + +#endif /* JEMALLOC_INTERNAL_SMOOTHSTEP_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/smoothstep.sh b/BeefRT/JEMalloc/include/jemalloc/internal/smoothstep.sh new file mode 100644 index 00000000..65de97bf --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/smoothstep.sh @@ -0,0 +1,101 @@ +#!/bin/sh +# +# Generate a discrete lookup table for a sigmoid function in the smoothstep +# family (https://en.wikipedia.org/wiki/Smoothstep), where the lookup table +# entries correspond to x in [1/nsteps, 2/nsteps, ..., nsteps/nsteps]. Encode +# the entries using a binary fixed point representation. +# +# Usage: smoothstep.sh +# +# is in {smooth, smoother, smoothest}. +# must be greater than zero. +# must be in [0..62]; reasonable values are roughly [10..30]. +# is x decimal precision. +# is y decimal precision. + +#set -x + +cmd="sh smoothstep.sh $*" +variant=$1 +nsteps=$2 +bfp=$3 +xprec=$4 +yprec=$5 + +case "${variant}" in + smooth) + ;; + smoother) + ;; + smoothest) + ;; + *) + echo "Unsupported variant" + exit 1 + ;; +esac + +smooth() { + step=$1 + y=`echo ${yprec} k ${step} ${nsteps} / sx _2 lx 3 ^ '*' 3 lx 2 ^ '*' + p | dc | tr -d '\\\\\n' | sed -e 's#^\.#0.#g'` + h=`echo ${yprec} k 2 ${bfp} ^ ${y} '*' p | dc | tr -d '\\\\\n' | sed -e 's#^\.#0.#g' | tr '.' ' ' | awk '{print $1}' ` +} + +smoother() { + step=$1 + y=`echo ${yprec} k ${step} ${nsteps} / sx 6 lx 5 ^ '*' _15 lx 4 ^ '*' + 10 lx 3 ^ '*' + p | dc | tr -d '\\\\\n' | sed -e 's#^\.#0.#g'` + h=`echo ${yprec} k 2 ${bfp} ^ ${y} '*' p | dc | tr -d '\\\\\n' | sed -e 's#^\.#0.#g' | tr '.' ' ' | awk '{print $1}' ` +} + +smoothest() { + step=$1 + y=`echo ${yprec} k ${step} ${nsteps} / sx _20 lx 7 ^ '*' 70 lx 6 ^ '*' + _84 lx 5 ^ '*' + 35 lx 4 ^ '*' + p | dc | tr -d '\\\\\n' | sed -e 's#^\.#0.#g'` + h=`echo ${yprec} k 2 ${bfp} ^ ${y} '*' p | dc | tr -d '\\\\\n' | sed -e 's#^\.#0.#g' | tr '.' ' ' | awk '{print $1}' ` +} + +cat <iteration < 5) { + for (i = 0; i < (1U << spin->iteration); i++) { + spin_cpu_spinwait(); + } + spin->iteration++; + } else { +#ifdef _WIN32 + SwitchToThread(); +#else + sched_yield(); +#endif + } +} + +#undef SPIN_INLINE + +#endif /* JEMALLOC_INTERNAL_SPIN_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/stats.h b/BeefRT/JEMalloc/include/jemalloc/internal/stats.h new file mode 100644 index 00000000..727f7dcb --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/stats.h @@ -0,0 +1,54 @@ +#ifndef JEMALLOC_INTERNAL_STATS_H +#define JEMALLOC_INTERNAL_STATS_H + +/* OPTION(opt, var_name, default, set_value_to) */ +#define STATS_PRINT_OPTIONS \ + OPTION('J', json, false, true) \ + OPTION('g', general, true, false) \ + OPTION('m', merged, config_stats, false) \ + OPTION('d', destroyed, config_stats, false) \ + OPTION('a', unmerged, config_stats, false) \ + OPTION('b', bins, true, false) \ + OPTION('l', large, true, false) \ + OPTION('x', mutex, true, false) \ + OPTION('e', extents, true, false) \ + OPTION('h', hpa, config_stats, false) + +enum { +#define OPTION(o, v, d, s) stats_print_option_num_##v, + STATS_PRINT_OPTIONS +#undef OPTION + stats_print_tot_num_options +}; + +/* Options for stats_print. */ +extern bool opt_stats_print; +extern char opt_stats_print_opts[stats_print_tot_num_options+1]; + +/* Utilities for stats_interval. */ +extern int64_t opt_stats_interval; +extern char opt_stats_interval_opts[stats_print_tot_num_options+1]; + +#define STATS_INTERVAL_DEFAULT -1 +/* + * Batch-increment the counter to reduce synchronization overhead. Each thread + * merges after (interval >> LG_BATCH_SIZE) bytes of allocations; also limit the + * BATCH_MAX for accuracy when the interval is huge (which is expected). + */ +#define STATS_INTERVAL_ACCUM_LG_BATCH_SIZE 6 +#define STATS_INTERVAL_ACCUM_BATCH_MAX (4 << 20) + +/* Only accessed by thread event. */ +uint64_t stats_interval_new_event_wait(tsd_t *tsd); +uint64_t stats_interval_postponed_event_wait(tsd_t *tsd); +void stats_interval_event_handler(tsd_t *tsd, uint64_t elapsed); + +/* Implements je_malloc_stats_print. */ +void stats_print(write_cb_t *write_cb, void *cbopaque, const char *opts); + +bool stats_boot(void); +void stats_prefork(tsdn_t *tsdn); +void stats_postfork_parent(tsdn_t *tsdn); +void stats_postfork_child(tsdn_t *tsdn); + +#endif /* JEMALLOC_INTERNAL_STATS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/sz.h b/BeefRT/JEMalloc/include/jemalloc/internal/sz.h new file mode 100644 index 00000000..3c0fc1da --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/sz.h @@ -0,0 +1,371 @@ +#ifndef JEMALLOC_INTERNAL_SIZE_H +#define JEMALLOC_INTERNAL_SIZE_H + +#include "jemalloc/internal/bit_util.h" +#include "jemalloc/internal/pages.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/util.h" + +/* + * sz module: Size computations. + * + * Some abbreviations used here: + * p: Page + * ind: Index + * s, sz: Size + * u: Usable size + * a: Aligned + * + * These are not always used completely consistently, but should be enough to + * interpret function names. E.g. sz_psz2ind converts page size to page size + * index; sz_sa2u converts a (size, alignment) allocation request to the usable + * size that would result from such an allocation. + */ + +/* Page size index type. */ +typedef unsigned pszind_t; + +/* Size class index type. */ +typedef unsigned szind_t; + +/* + * sz_pind2sz_tab encodes the same information as could be computed by + * sz_pind2sz_compute(). + */ +extern size_t sz_pind2sz_tab[SC_NPSIZES + 1]; +/* + * sz_index2size_tab encodes the same information as could be computed (at + * unacceptable cost in some code paths) by sz_index2size_compute(). + */ +extern size_t sz_index2size_tab[SC_NSIZES]; +/* + * sz_size2index_tab is a compact lookup table that rounds request sizes up to + * size classes. In order to reduce cache footprint, the table is compressed, + * and all accesses are via sz_size2index(). + */ +extern uint8_t sz_size2index_tab[]; + +/* + * Padding for large allocations: PAGE when opt_cache_oblivious == true (to + * enable cache index randomization); 0 otherwise. + */ +extern size_t sz_large_pad; + +extern void sz_boot(const sc_data_t *sc_data, bool cache_oblivious); + +JEMALLOC_ALWAYS_INLINE pszind_t +sz_psz2ind(size_t psz) { + assert(psz > 0); + if (unlikely(psz > SC_LARGE_MAXCLASS)) { + return SC_NPSIZES; + } + /* x is the lg of the first base >= psz. */ + pszind_t x = lg_ceil(psz); + /* + * sc.h introduces a lot of size classes. These size classes are divided + * into different size class groups. There is a very special size class + * group, each size class in or after it is an integer multiple of PAGE. + * We call it first_ps_rg. It means first page size regular group. The + * range of first_ps_rg is (base, base * 2], and base == PAGE * + * SC_NGROUP. off_to_first_ps_rg begins from 1, instead of 0. e.g. + * off_to_first_ps_rg is 1 when psz is (PAGE * SC_NGROUP + 1). + */ + pszind_t off_to_first_ps_rg = (x < SC_LG_NGROUP + LG_PAGE) ? + 0 : x - (SC_LG_NGROUP + LG_PAGE); + + /* + * Same as sc_s::lg_delta. + * Delta for off_to_first_ps_rg == 1 is PAGE, + * for each increase in offset, it's multiplied by two. + * Therefore, lg_delta = LG_PAGE + (off_to_first_ps_rg - 1). + */ + pszind_t lg_delta = (off_to_first_ps_rg == 0) ? + LG_PAGE : LG_PAGE + (off_to_first_ps_rg - 1); + + /* + * Let's write psz in binary, e.g. 0011 for 0x3, 0111 for 0x7. + * The leftmost bits whose len is lg_base decide the base of psz. + * The rightmost bits whose len is lg_delta decide (pgz % PAGE). + * The middle bits whose len is SC_LG_NGROUP decide ndelta. + * ndelta is offset to the first size class in the size class group, + * starts from 1. + * If you don't know lg_base, ndelta or lg_delta, see sc.h. + * |xxxxxxxxxxxxxxxxxxxx|------------------------|yyyyyyyyyyyyyyyyyyyyy| + * |<-- len: lg_base -->|<-- len: SC_LG_NGROUP-->|<-- len: lg_delta -->| + * |<-- ndelta -->| + * rg_inner_off = ndelta - 1 + * Why use (psz - 1)? + * To handle case: psz % (1 << lg_delta) == 0. + */ + pszind_t rg_inner_off = (((psz - 1)) >> lg_delta) & (SC_NGROUP - 1); + + pszind_t base_ind = off_to_first_ps_rg << SC_LG_NGROUP; + pszind_t ind = base_ind + rg_inner_off; + return ind; +} + +static inline size_t +sz_pind2sz_compute(pszind_t pind) { + if (unlikely(pind == SC_NPSIZES)) { + return SC_LARGE_MAXCLASS + PAGE; + } + size_t grp = pind >> SC_LG_NGROUP; + size_t mod = pind & ((ZU(1) << SC_LG_NGROUP) - 1); + + size_t grp_size_mask = ~((!!grp)-1); + size_t grp_size = ((ZU(1) << (LG_PAGE + (SC_LG_NGROUP-1))) << grp) + & grp_size_mask; + + size_t shift = (grp == 0) ? 1 : grp; + size_t lg_delta = shift + (LG_PAGE-1); + size_t mod_size = (mod+1) << lg_delta; + + size_t sz = grp_size + mod_size; + return sz; +} + +static inline size_t +sz_pind2sz_lookup(pszind_t pind) { + size_t ret = (size_t)sz_pind2sz_tab[pind]; + assert(ret == sz_pind2sz_compute(pind)); + return ret; +} + +static inline size_t +sz_pind2sz(pszind_t pind) { + assert(pind < SC_NPSIZES + 1); + return sz_pind2sz_lookup(pind); +} + +static inline size_t +sz_psz2u(size_t psz) { + if (unlikely(psz > SC_LARGE_MAXCLASS)) { + return SC_LARGE_MAXCLASS + PAGE; + } + size_t x = lg_floor((psz<<1)-1); + size_t lg_delta = (x < SC_LG_NGROUP + LG_PAGE + 1) ? + LG_PAGE : x - SC_LG_NGROUP - 1; + size_t delta = ZU(1) << lg_delta; + size_t delta_mask = delta - 1; + size_t usize = (psz + delta_mask) & ~delta_mask; + return usize; +} + +static inline szind_t +sz_size2index_compute(size_t size) { + if (unlikely(size > SC_LARGE_MAXCLASS)) { + return SC_NSIZES; + } + + if (size == 0) { + return 0; + } +#if (SC_NTINY != 0) + if (size <= (ZU(1) << SC_LG_TINY_MAXCLASS)) { + szind_t lg_tmin = SC_LG_TINY_MAXCLASS - SC_NTINY + 1; + szind_t lg_ceil = lg_floor(pow2_ceil_zu(size)); + return (lg_ceil < lg_tmin ? 0 : lg_ceil - lg_tmin); + } +#endif + { + szind_t x = lg_floor((size<<1)-1); + szind_t shift = (x < SC_LG_NGROUP + LG_QUANTUM) ? 0 : + x - (SC_LG_NGROUP + LG_QUANTUM); + szind_t grp = shift << SC_LG_NGROUP; + + szind_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1) + ? LG_QUANTUM : x - SC_LG_NGROUP - 1; + + size_t delta_inverse_mask = ZU(-1) << lg_delta; + szind_t mod = ((((size-1) & delta_inverse_mask) >> lg_delta)) & + ((ZU(1) << SC_LG_NGROUP) - 1); + + szind_t index = SC_NTINY + grp + mod; + return index; + } +} + +JEMALLOC_ALWAYS_INLINE szind_t +sz_size2index_lookup_impl(size_t size) { + assert(size <= SC_LOOKUP_MAXCLASS); + return sz_size2index_tab[(size + (ZU(1) << SC_LG_TINY_MIN) - 1) + >> SC_LG_TINY_MIN]; +} + +JEMALLOC_ALWAYS_INLINE szind_t +sz_size2index_lookup(size_t size) { + szind_t ret = sz_size2index_lookup_impl(size); + assert(ret == sz_size2index_compute(size)); + return ret; +} + +JEMALLOC_ALWAYS_INLINE szind_t +sz_size2index(size_t size) { + if (likely(size <= SC_LOOKUP_MAXCLASS)) { + return sz_size2index_lookup(size); + } + return sz_size2index_compute(size); +} + +static inline size_t +sz_index2size_compute(szind_t index) { +#if (SC_NTINY > 0) + if (index < SC_NTINY) { + return (ZU(1) << (SC_LG_TINY_MAXCLASS - SC_NTINY + 1 + index)); + } +#endif + { + size_t reduced_index = index - SC_NTINY; + size_t grp = reduced_index >> SC_LG_NGROUP; + size_t mod = reduced_index & ((ZU(1) << SC_LG_NGROUP) - + 1); + + size_t grp_size_mask = ~((!!grp)-1); + size_t grp_size = ((ZU(1) << (LG_QUANTUM + + (SC_LG_NGROUP-1))) << grp) & grp_size_mask; + + size_t shift = (grp == 0) ? 1 : grp; + size_t lg_delta = shift + (LG_QUANTUM-1); + size_t mod_size = (mod+1) << lg_delta; + + size_t usize = grp_size + mod_size; + return usize; + } +} + +JEMALLOC_ALWAYS_INLINE size_t +sz_index2size_lookup_impl(szind_t index) { + return sz_index2size_tab[index]; +} + +JEMALLOC_ALWAYS_INLINE size_t +sz_index2size_lookup(szind_t index) { + size_t ret = sz_index2size_lookup_impl(index); + assert(ret == sz_index2size_compute(index)); + return ret; +} + +JEMALLOC_ALWAYS_INLINE size_t +sz_index2size(szind_t index) { + assert(index < SC_NSIZES); + return sz_index2size_lookup(index); +} + +JEMALLOC_ALWAYS_INLINE void +sz_size2index_usize_fastpath(size_t size, szind_t *ind, size_t *usize) { + *ind = sz_size2index_lookup_impl(size); + *usize = sz_index2size_lookup_impl(*ind); +} + +JEMALLOC_ALWAYS_INLINE size_t +sz_s2u_compute(size_t size) { + if (unlikely(size > SC_LARGE_MAXCLASS)) { + return 0; + } + + if (size == 0) { + size++; + } +#if (SC_NTINY > 0) + if (size <= (ZU(1) << SC_LG_TINY_MAXCLASS)) { + size_t lg_tmin = SC_LG_TINY_MAXCLASS - SC_NTINY + 1; + size_t lg_ceil = lg_floor(pow2_ceil_zu(size)); + return (lg_ceil < lg_tmin ? (ZU(1) << lg_tmin) : + (ZU(1) << lg_ceil)); + } +#endif + { + size_t x = lg_floor((size<<1)-1); + size_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1) + ? LG_QUANTUM : x - SC_LG_NGROUP - 1; + size_t delta = ZU(1) << lg_delta; + size_t delta_mask = delta - 1; + size_t usize = (size + delta_mask) & ~delta_mask; + return usize; + } +} + +JEMALLOC_ALWAYS_INLINE size_t +sz_s2u_lookup(size_t size) { + size_t ret = sz_index2size_lookup(sz_size2index_lookup(size)); + + assert(ret == sz_s2u_compute(size)); + return ret; +} + +/* + * Compute usable size that would result from allocating an object with the + * specified size. + */ +JEMALLOC_ALWAYS_INLINE size_t +sz_s2u(size_t size) { + if (likely(size <= SC_LOOKUP_MAXCLASS)) { + return sz_s2u_lookup(size); + } + return sz_s2u_compute(size); +} + +/* + * Compute usable size that would result from allocating an object with the + * specified size and alignment. + */ +JEMALLOC_ALWAYS_INLINE size_t +sz_sa2u(size_t size, size_t alignment) { + size_t usize; + + assert(alignment != 0 && ((alignment - 1) & alignment) == 0); + + /* Try for a small size class. */ + if (size <= SC_SMALL_MAXCLASS && alignment <= PAGE) { + /* + * Round size up to the nearest multiple of alignment. + * + * This done, we can take advantage of the fact that for each + * small size class, every object is aligned at the smallest + * power of two that is non-zero in the base two representation + * of the size. For example: + * + * Size | Base 2 | Minimum alignment + * -----+----------+------------------ + * 96 | 1100000 | 32 + * 144 | 10100000 | 32 + * 192 | 11000000 | 64 + */ + usize = sz_s2u(ALIGNMENT_CEILING(size, alignment)); + if (usize < SC_LARGE_MINCLASS) { + return usize; + } + } + + /* Large size class. Beware of overflow. */ + + if (unlikely(alignment > SC_LARGE_MAXCLASS)) { + return 0; + } + + /* Make sure result is a large size class. */ + if (size <= SC_LARGE_MINCLASS) { + usize = SC_LARGE_MINCLASS; + } else { + usize = sz_s2u(size); + if (usize < size) { + /* size_t overflow. */ + return 0; + } + } + + /* + * Calculate the multi-page mapping that large_palloc() would need in + * order to guarantee the alignment. + */ + if (usize + sz_large_pad + PAGE_CEILING(alignment) - PAGE < usize) { + /* size_t overflow. */ + return 0; + } + return usize; +} + +size_t sz_psz_quantize_floor(size_t size); +size_t sz_psz_quantize_ceil(size_t size); + +#endif /* JEMALLOC_INTERNAL_SIZE_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/tcache_externs.h b/BeefRT/JEMalloc/include/jemalloc/internal/tcache_externs.h new file mode 100644 index 00000000..a2ab7101 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/tcache_externs.h @@ -0,0 +1,75 @@ +#ifndef JEMALLOC_INTERNAL_TCACHE_EXTERNS_H +#define JEMALLOC_INTERNAL_TCACHE_EXTERNS_H + +extern bool opt_tcache; +extern size_t opt_tcache_max; +extern ssize_t opt_lg_tcache_nslots_mul; +extern unsigned opt_tcache_nslots_small_min; +extern unsigned opt_tcache_nslots_small_max; +extern unsigned opt_tcache_nslots_large; +extern ssize_t opt_lg_tcache_shift; +extern size_t opt_tcache_gc_incr_bytes; +extern size_t opt_tcache_gc_delay_bytes; +extern unsigned opt_lg_tcache_flush_small_div; +extern unsigned opt_lg_tcache_flush_large_div; + +/* + * Number of tcache bins. There are SC_NBINS small-object bins, plus 0 or more + * large-object bins. + */ +extern unsigned nhbins; + +/* Maximum cached size class. */ +extern size_t tcache_maxclass; + +extern cache_bin_info_t *tcache_bin_info; + +/* + * Explicit tcaches, managed via the tcache.{create,flush,destroy} mallctls and + * usable via the MALLOCX_TCACHE() flag. The automatic per thread tcaches are + * completely disjoint from this data structure. tcaches starts off as a sparse + * array, so it has no physical memory footprint until individual pages are + * touched. This allows the entire array to be allocated the first time an + * explicit tcache is created without a disproportionate impact on memory usage. + */ +extern tcaches_t *tcaches; + +size_t tcache_salloc(tsdn_t *tsdn, const void *ptr); +void *tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, + cache_bin_t *tbin, szind_t binind, bool *tcache_success); + +void tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, + szind_t binind, unsigned rem); +void tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *tbin, + szind_t binind, unsigned rem); +void tcache_bin_flush_stashed(tsd_t *tsd, tcache_t *tcache, cache_bin_t *bin, + szind_t binind, bool is_small); +void tcache_arena_reassociate(tsdn_t *tsdn, tcache_slow_t *tcache_slow, + tcache_t *tcache, arena_t *arena); +tcache_t *tcache_create_explicit(tsd_t *tsd); +void tcache_cleanup(tsd_t *tsd); +void tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena); +bool tcaches_create(tsd_t *tsd, base_t *base, unsigned *r_ind); +void tcaches_flush(tsd_t *tsd, unsigned ind); +void tcaches_destroy(tsd_t *tsd, unsigned ind); +bool tcache_boot(tsdn_t *tsdn, base_t *base); +void tcache_arena_associate(tsdn_t *tsdn, tcache_slow_t *tcache_slow, + tcache_t *tcache, arena_t *arena); +void tcache_prefork(tsdn_t *tsdn); +void tcache_postfork_parent(tsdn_t *tsdn); +void tcache_postfork_child(tsdn_t *tsdn); +void tcache_flush(tsd_t *tsd); +bool tsd_tcache_data_init(tsd_t *tsd); +bool tsd_tcache_enabled_data_init(tsd_t *tsd); + +void tcache_assert_initialized(tcache_t *tcache); + +/* Only accessed by thread event. */ +uint64_t tcache_gc_new_event_wait(tsd_t *tsd); +uint64_t tcache_gc_postponed_event_wait(tsd_t *tsd); +void tcache_gc_event_handler(tsd_t *tsd, uint64_t elapsed); +uint64_t tcache_gc_dalloc_new_event_wait(tsd_t *tsd); +uint64_t tcache_gc_dalloc_postponed_event_wait(tsd_t *tsd); +void tcache_gc_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed); + +#endif /* JEMALLOC_INTERNAL_TCACHE_EXTERNS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/tcache_inlines.h b/BeefRT/JEMalloc/include/jemalloc/internal/tcache_inlines.h new file mode 100644 index 00000000..2634f145 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/tcache_inlines.h @@ -0,0 +1,193 @@ +#ifndef JEMALLOC_INTERNAL_TCACHE_INLINES_H +#define JEMALLOC_INTERNAL_TCACHE_INLINES_H + +#include "jemalloc/internal/bin.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/sz.h" +#include "jemalloc/internal/util.h" + +static inline bool +tcache_enabled_get(tsd_t *tsd) { + return tsd_tcache_enabled_get(tsd); +} + +static inline void +tcache_enabled_set(tsd_t *tsd, bool enabled) { + bool was_enabled = tsd_tcache_enabled_get(tsd); + + if (!was_enabled && enabled) { + tsd_tcache_data_init(tsd); + } else if (was_enabled && !enabled) { + tcache_cleanup(tsd); + } + /* Commit the state last. Above calls check current state. */ + tsd_tcache_enabled_set(tsd, enabled); + tsd_slow_update(tsd); +} + +JEMALLOC_ALWAYS_INLINE bool +tcache_small_bin_disabled(szind_t ind, cache_bin_t *bin) { + assert(ind < SC_NBINS); + bool ret = (cache_bin_info_ncached_max(&tcache_bin_info[ind]) == 0); + if (ret && bin != NULL) { + /* small size class but cache bin disabled. */ + assert(ind >= nhbins); + assert((uintptr_t)(*bin->stack_head) == + cache_bin_preceding_junk); + } + + return ret; +} + +JEMALLOC_ALWAYS_INLINE void * +tcache_alloc_small(tsd_t *tsd, arena_t *arena, tcache_t *tcache, + size_t size, szind_t binind, bool zero, bool slow_path) { + void *ret; + bool tcache_success; + + assert(binind < SC_NBINS); + cache_bin_t *bin = &tcache->bins[binind]; + ret = cache_bin_alloc(bin, &tcache_success); + assert(tcache_success == (ret != NULL)); + if (unlikely(!tcache_success)) { + bool tcache_hard_success; + arena = arena_choose(tsd, arena); + if (unlikely(arena == NULL)) { + return NULL; + } + if (unlikely(tcache_small_bin_disabled(binind, bin))) { + /* stats and zero are handled directly by the arena. */ + return arena_malloc_hard(tsd_tsdn(tsd), arena, size, + binind, zero); + } + tcache_bin_flush_stashed(tsd, tcache, bin, binind, + /* is_small */ true); + + ret = tcache_alloc_small_hard(tsd_tsdn(tsd), arena, tcache, + bin, binind, &tcache_hard_success); + if (tcache_hard_success == false) { + return NULL; + } + } + + assert(ret); + if (unlikely(zero)) { + size_t usize = sz_index2size(binind); + assert(tcache_salloc(tsd_tsdn(tsd), ret) == usize); + memset(ret, 0, usize); + } + if (config_stats) { + bin->tstats.nrequests++; + } + return ret; +} + +JEMALLOC_ALWAYS_INLINE void * +tcache_alloc_large(tsd_t *tsd, arena_t *arena, tcache_t *tcache, size_t size, + szind_t binind, bool zero, bool slow_path) { + void *ret; + bool tcache_success; + + assert(binind >= SC_NBINS && binind < nhbins); + cache_bin_t *bin = &tcache->bins[binind]; + ret = cache_bin_alloc(bin, &tcache_success); + assert(tcache_success == (ret != NULL)); + if (unlikely(!tcache_success)) { + /* + * Only allocate one large object at a time, because it's quite + * expensive to create one and not use it. + */ + arena = arena_choose(tsd, arena); + if (unlikely(arena == NULL)) { + return NULL; + } + tcache_bin_flush_stashed(tsd, tcache, bin, binind, + /* is_small */ false); + + ret = large_malloc(tsd_tsdn(tsd), arena, sz_s2u(size), zero); + if (ret == NULL) { + return NULL; + } + } else { + if (unlikely(zero)) { + size_t usize = sz_index2size(binind); + assert(usize <= tcache_maxclass); + memset(ret, 0, usize); + } + + if (config_stats) { + bin->tstats.nrequests++; + } + } + + return ret; +} + +JEMALLOC_ALWAYS_INLINE void +tcache_dalloc_small(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind, + bool slow_path) { + assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= SC_SMALL_MAXCLASS); + + cache_bin_t *bin = &tcache->bins[binind]; + /* + * Not marking the branch unlikely because this is past free_fastpath() + * (which handles the most common cases), i.e. at this point it's often + * uncommon cases. + */ + if (cache_bin_nonfast_aligned(ptr)) { + /* Junk unconditionally, even if bin is full. */ + san_junk_ptr(ptr, sz_index2size(binind)); + if (cache_bin_stash(bin, ptr)) { + return; + } + assert(cache_bin_full(bin)); + /* Bin full; fall through into the flush branch. */ + } + + if (unlikely(!cache_bin_dalloc_easy(bin, ptr))) { + if (unlikely(tcache_small_bin_disabled(binind, bin))) { + arena_dalloc_small(tsd_tsdn(tsd), ptr); + return; + } + cache_bin_sz_t max = cache_bin_info_ncached_max( + &tcache_bin_info[binind]); + unsigned remain = max >> opt_lg_tcache_flush_small_div; + tcache_bin_flush_small(tsd, tcache, bin, binind, remain); + bool ret = cache_bin_dalloc_easy(bin, ptr); + assert(ret); + } +} + +JEMALLOC_ALWAYS_INLINE void +tcache_dalloc_large(tsd_t *tsd, tcache_t *tcache, void *ptr, szind_t binind, + bool slow_path) { + + assert(tcache_salloc(tsd_tsdn(tsd), ptr) + > SC_SMALL_MAXCLASS); + assert(tcache_salloc(tsd_tsdn(tsd), ptr) <= tcache_maxclass); + + cache_bin_t *bin = &tcache->bins[binind]; + if (unlikely(!cache_bin_dalloc_easy(bin, ptr))) { + unsigned remain = cache_bin_info_ncached_max( + &tcache_bin_info[binind]) >> opt_lg_tcache_flush_large_div; + tcache_bin_flush_large(tsd, tcache, bin, binind, remain); + bool ret = cache_bin_dalloc_easy(bin, ptr); + assert(ret); + } +} + +JEMALLOC_ALWAYS_INLINE tcache_t * +tcaches_get(tsd_t *tsd, unsigned ind) { + tcaches_t *elm = &tcaches[ind]; + if (unlikely(elm->tcache == NULL)) { + malloc_printf(": invalid tcache id (%u).\n", ind); + abort(); + } else if (unlikely(elm->tcache == TCACHES_ELM_NEED_REINIT)) { + elm->tcache = tcache_create_explicit(tsd); + } + return elm->tcache; +} + +#endif /* JEMALLOC_INTERNAL_TCACHE_INLINES_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/tcache_structs.h b/BeefRT/JEMalloc/include/jemalloc/internal/tcache_structs.h new file mode 100644 index 00000000..176d73de --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/tcache_structs.h @@ -0,0 +1,68 @@ +#ifndef JEMALLOC_INTERNAL_TCACHE_STRUCTS_H +#define JEMALLOC_INTERNAL_TCACHE_STRUCTS_H + +#include "jemalloc/internal/cache_bin.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/ticker.h" +#include "jemalloc/internal/tsd_types.h" + +/* + * The tcache state is split into the slow and hot path data. Each has a + * pointer to the other, and the data always comes in pairs. The layout of each + * of them varies in practice; tcache_slow lives in the TSD for the automatic + * tcache, and as part of a dynamic allocation for manual allocations. Keeping + * a pointer to tcache_slow lets us treat these cases uniformly, rather than + * splitting up the tcache [de]allocation code into those paths called with the + * TSD tcache and those called with a manual tcache. + */ + +struct tcache_slow_s { + /* Lets us track all the tcaches in an arena. */ + ql_elm(tcache_slow_t) link; + + /* + * The descriptor lets the arena find our cache bins without seeing the + * tcache definition. This enables arenas to aggregate stats across + * tcaches without having a tcache dependency. + */ + cache_bin_array_descriptor_t cache_bin_array_descriptor; + + /* The arena this tcache is associated with. */ + arena_t *arena; + /* Next bin to GC. */ + szind_t next_gc_bin; + /* For small bins, fill (ncached_max >> lg_fill_div). */ + uint8_t lg_fill_div[SC_NBINS]; + /* For small bins, whether has been refilled since last GC. */ + bool bin_refilled[SC_NBINS]; + /* + * For small bins, the number of items we can pretend to flush before + * actually flushing. + */ + uint8_t bin_flush_delay_items[SC_NBINS]; + /* + * The start of the allocation containing the dynamic allocation for + * either the cache bins alone, or the cache bin memory as well as this + * tcache_slow_t and its associated tcache_t. + */ + void *dyn_alloc; + + /* The associated bins. */ + tcache_t *tcache; +}; + +struct tcache_s { + tcache_slow_t *tcache_slow; + cache_bin_t bins[TCACHE_NBINS_MAX]; +}; + +/* Linkage for list of available (previously used) explicit tcache IDs. */ +struct tcaches_s { + union { + tcache_t *tcache; + tcaches_t *next; + }; +}; + +#endif /* JEMALLOC_INTERNAL_TCACHE_STRUCTS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/tcache_types.h b/BeefRT/JEMalloc/include/jemalloc/internal/tcache_types.h new file mode 100644 index 00000000..583677ea --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/tcache_types.h @@ -0,0 +1,35 @@ +#ifndef JEMALLOC_INTERNAL_TCACHE_TYPES_H +#define JEMALLOC_INTERNAL_TCACHE_TYPES_H + +#include "jemalloc/internal/sc.h" + +typedef struct tcache_slow_s tcache_slow_t; +typedef struct tcache_s tcache_t; +typedef struct tcaches_s tcaches_t; + +/* + * tcache pointers close to NULL are used to encode state information that is + * used for two purposes: preventing thread caching on a per thread basis and + * cleaning up during thread shutdown. + */ +#define TCACHE_STATE_DISABLED ((tcache_t *)(uintptr_t)1) +#define TCACHE_STATE_REINCARNATED ((tcache_t *)(uintptr_t)2) +#define TCACHE_STATE_PURGATORY ((tcache_t *)(uintptr_t)3) +#define TCACHE_STATE_MAX TCACHE_STATE_PURGATORY + +/* Used in TSD static initializer only. Real init in tsd_tcache_data_init(). */ +#define TCACHE_ZERO_INITIALIZER {0} +#define TCACHE_SLOW_ZERO_INITIALIZER {0} + +/* Used in TSD static initializer only. Will be initialized to opt_tcache. */ +#define TCACHE_ENABLED_ZERO_INITIALIZER false + +/* Used for explicit tcache only. Means flushed but not destroyed. */ +#define TCACHES_ELM_NEED_REINIT ((tcache_t *)(uintptr_t)1) + +#define TCACHE_LG_MAXCLASS_LIMIT 23 /* tcache_maxclass = 8M */ +#define TCACHE_MAXCLASS_LIMIT ((size_t)1 << TCACHE_LG_MAXCLASS_LIMIT) +#define TCACHE_NBINS_MAX (SC_NBINS + SC_NGROUP * \ + (TCACHE_LG_MAXCLASS_LIMIT - SC_LG_LARGE_MINCLASS) + 1) + +#endif /* JEMALLOC_INTERNAL_TCACHE_TYPES_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/test_hooks.h b/BeefRT/JEMalloc/include/jemalloc/internal/test_hooks.h new file mode 100644 index 00000000..3d530b5c --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/test_hooks.h @@ -0,0 +1,24 @@ +#ifndef JEMALLOC_INTERNAL_TEST_HOOKS_H +#define JEMALLOC_INTERNAL_TEST_HOOKS_H + +extern JEMALLOC_EXPORT void (*test_hooks_arena_new_hook)(); +extern JEMALLOC_EXPORT void (*test_hooks_libc_hook)(); + +#if defined(JEMALLOC_JET) || defined(JEMALLOC_UNIT_TEST) +# define JEMALLOC_TEST_HOOK(fn, hook) ((void)(hook != NULL && (hook(), 0)), fn) + +# define open JEMALLOC_TEST_HOOK(open, test_hooks_libc_hook) +# define read JEMALLOC_TEST_HOOK(read, test_hooks_libc_hook) +# define write JEMALLOC_TEST_HOOK(write, test_hooks_libc_hook) +# define readlink JEMALLOC_TEST_HOOK(readlink, test_hooks_libc_hook) +# define close JEMALLOC_TEST_HOOK(close, test_hooks_libc_hook) +# define creat JEMALLOC_TEST_HOOK(creat, test_hooks_libc_hook) +# define secure_getenv JEMALLOC_TEST_HOOK(secure_getenv, test_hooks_libc_hook) +/* Note that this is undef'd and re-define'd in src/prof.c. */ +# define _Unwind_Backtrace JEMALLOC_TEST_HOOK(_Unwind_Backtrace, test_hooks_libc_hook) +#else +# define JEMALLOC_TEST_HOOK(fn, hook) fn +#endif + + +#endif /* JEMALLOC_INTERNAL_TEST_HOOKS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/thread_event.h b/BeefRT/JEMalloc/include/jemalloc/internal/thread_event.h new file mode 100644 index 00000000..2f4e1b39 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/thread_event.h @@ -0,0 +1,301 @@ +#ifndef JEMALLOC_INTERNAL_THREAD_EVENT_H +#define JEMALLOC_INTERNAL_THREAD_EVENT_H + +#include "jemalloc/internal/tsd.h" + +/* "te" is short for "thread_event" */ + +/* + * TE_MIN_START_WAIT should not exceed the minimal allocation usize. + */ +#define TE_MIN_START_WAIT ((uint64_t)1U) +#define TE_MAX_START_WAIT UINT64_MAX + +/* + * Maximum threshold on thread_(de)allocated_next_event_fast, so that there is + * no need to check overflow in malloc fast path. (The allocation size in malloc + * fast path never exceeds SC_LOOKUP_MAXCLASS.) + */ +#define TE_NEXT_EVENT_FAST_MAX (UINT64_MAX - SC_LOOKUP_MAXCLASS + 1U) + +/* + * The max interval helps make sure that malloc stays on the fast path in the + * common case, i.e. thread_allocated < thread_allocated_next_event_fast. When + * thread_allocated is within an event's distance to TE_NEXT_EVENT_FAST_MAX + * above, thread_allocated_next_event_fast is wrapped around and we fall back to + * the medium-fast path. The max interval makes sure that we're not staying on + * the fallback case for too long, even if there's no active event or if all + * active events have long wait times. + */ +#define TE_MAX_INTERVAL ((uint64_t)(4U << 20)) + +/* + * Invalid elapsed time, for situations where elapsed time is not needed. See + * comments in thread_event.c for more info. + */ +#define TE_INVALID_ELAPSED UINT64_MAX + +typedef struct te_ctx_s { + bool is_alloc; + uint64_t *current; + uint64_t *last_event; + uint64_t *next_event; + uint64_t *next_event_fast; +} te_ctx_t; + +void te_assert_invariants_debug(tsd_t *tsd); +void te_event_trigger(tsd_t *tsd, te_ctx_t *ctx); +void te_recompute_fast_threshold(tsd_t *tsd); +void tsd_te_init(tsd_t *tsd); + +/* + * List of all events, in the following format: + * E(event, (condition), is_alloc_event) + */ +#define ITERATE_OVER_ALL_EVENTS \ + E(tcache_gc, (opt_tcache_gc_incr_bytes > 0), true) \ + E(prof_sample, (config_prof && opt_prof), true) \ + E(stats_interval, (opt_stats_interval >= 0), true) \ + E(tcache_gc_dalloc, (opt_tcache_gc_incr_bytes > 0), false) \ + E(peak_alloc, config_stats, true) \ + E(peak_dalloc, config_stats, false) + +#define E(event, condition_unused, is_alloc_event_unused) \ + C(event##_event_wait) + +/* List of all thread event counters. */ +#define ITERATE_OVER_ALL_COUNTERS \ + C(thread_allocated) \ + C(thread_allocated_last_event) \ + ITERATE_OVER_ALL_EVENTS \ + C(prof_sample_last_event) \ + C(stats_interval_last_event) + +/* Getters directly wrap TSD getters. */ +#define C(counter) \ +JEMALLOC_ALWAYS_INLINE uint64_t \ +counter##_get(tsd_t *tsd) { \ + return tsd_##counter##_get(tsd); \ +} + +ITERATE_OVER_ALL_COUNTERS +#undef C + +/* + * Setters call the TSD pointer getters rather than the TSD setters, so that + * the counters can be modified even when TSD state is reincarnated or + * minimal_initialized: if an event is triggered in such cases, we will + * temporarily delay the event and let it be immediately triggered at the next + * allocation call. + */ +#define C(counter) \ +JEMALLOC_ALWAYS_INLINE void \ +counter##_set(tsd_t *tsd, uint64_t v) { \ + *tsd_##counter##p_get(tsd) = v; \ +} + +ITERATE_OVER_ALL_COUNTERS +#undef C + +/* + * For generating _event_wait getter / setter functions for each individual + * event. + */ +#undef E + +/* + * The malloc and free fastpath getters -- use the unsafe getters since tsd may + * be non-nominal, in which case the fast_threshold will be set to 0. This + * allows checking for events and tsd non-nominal in a single branch. + * + * Note that these can only be used on the fastpath. + */ +JEMALLOC_ALWAYS_INLINE void +te_malloc_fastpath_ctx(tsd_t *tsd, uint64_t *allocated, uint64_t *threshold) { + *allocated = *tsd_thread_allocatedp_get_unsafe(tsd); + *threshold = *tsd_thread_allocated_next_event_fastp_get_unsafe(tsd); + assert(*threshold <= TE_NEXT_EVENT_FAST_MAX); +} + +JEMALLOC_ALWAYS_INLINE void +te_free_fastpath_ctx(tsd_t *tsd, uint64_t *deallocated, uint64_t *threshold) { + /* Unsafe getters since this may happen before tsd_init. */ + *deallocated = *tsd_thread_deallocatedp_get_unsafe(tsd); + *threshold = *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd); + assert(*threshold <= TE_NEXT_EVENT_FAST_MAX); +} + +JEMALLOC_ALWAYS_INLINE bool +te_ctx_is_alloc(te_ctx_t *ctx) { + return ctx->is_alloc; +} + +JEMALLOC_ALWAYS_INLINE uint64_t +te_ctx_current_bytes_get(te_ctx_t *ctx) { + return *ctx->current; +} + +JEMALLOC_ALWAYS_INLINE void +te_ctx_current_bytes_set(te_ctx_t *ctx, uint64_t v) { + *ctx->current = v; +} + +JEMALLOC_ALWAYS_INLINE uint64_t +te_ctx_last_event_get(te_ctx_t *ctx) { + return *ctx->last_event; +} + +JEMALLOC_ALWAYS_INLINE void +te_ctx_last_event_set(te_ctx_t *ctx, uint64_t v) { + *ctx->last_event = v; +} + +/* Below 3 for next_event_fast. */ +JEMALLOC_ALWAYS_INLINE uint64_t +te_ctx_next_event_fast_get(te_ctx_t *ctx) { + uint64_t v = *ctx->next_event_fast; + assert(v <= TE_NEXT_EVENT_FAST_MAX); + return v; +} + +JEMALLOC_ALWAYS_INLINE void +te_ctx_next_event_fast_set(te_ctx_t *ctx, uint64_t v) { + assert(v <= TE_NEXT_EVENT_FAST_MAX); + *ctx->next_event_fast = v; +} + +JEMALLOC_ALWAYS_INLINE void +te_next_event_fast_set_non_nominal(tsd_t *tsd) { + /* + * Set the fast thresholds to zero when tsd is non-nominal. Use the + * unsafe getter as this may get called during tsd init and clean up. + */ + *tsd_thread_allocated_next_event_fastp_get_unsafe(tsd) = 0; + *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd) = 0; +} + +/* For next_event. Setter also updates the fast threshold. */ +JEMALLOC_ALWAYS_INLINE uint64_t +te_ctx_next_event_get(te_ctx_t *ctx) { + return *ctx->next_event; +} + +JEMALLOC_ALWAYS_INLINE void +te_ctx_next_event_set(tsd_t *tsd, te_ctx_t *ctx, uint64_t v) { + *ctx->next_event = v; + te_recompute_fast_threshold(tsd); +} + +/* + * The function checks in debug mode whether the thread event counters are in + * a consistent state, which forms the invariants before and after each round + * of thread event handling that we can rely on and need to promise. + * The invariants are only temporarily violated in the middle of + * te_event_advance() if an event is triggered (the te_event_trigger() call at + * the end will restore the invariants). + */ +JEMALLOC_ALWAYS_INLINE void +te_assert_invariants(tsd_t *tsd) { + if (config_debug) { + te_assert_invariants_debug(tsd); + } +} + +JEMALLOC_ALWAYS_INLINE void +te_ctx_get(tsd_t *tsd, te_ctx_t *ctx, bool is_alloc) { + ctx->is_alloc = is_alloc; + if (is_alloc) { + ctx->current = tsd_thread_allocatedp_get(tsd); + ctx->last_event = tsd_thread_allocated_last_eventp_get(tsd); + ctx->next_event = tsd_thread_allocated_next_eventp_get(tsd); + ctx->next_event_fast = + tsd_thread_allocated_next_event_fastp_get(tsd); + } else { + ctx->current = tsd_thread_deallocatedp_get(tsd); + ctx->last_event = tsd_thread_deallocated_last_eventp_get(tsd); + ctx->next_event = tsd_thread_deallocated_next_eventp_get(tsd); + ctx->next_event_fast = + tsd_thread_deallocated_next_event_fastp_get(tsd); + } +} + +/* + * The lookahead functionality facilitates events to be able to lookahead, i.e. + * without touching the event counters, to determine whether an event would be + * triggered. The event counters are not advanced until the end of the + * allocation / deallocation calls, so the lookahead can be useful if some + * preparation work for some event must be done early in the allocation / + * deallocation calls. + * + * Currently only the profiling sampling event needs the lookahead + * functionality, so we don't yet define general purpose lookahead functions. + * + * Surplus is a terminology referring to the amount of bytes beyond what's + * needed for triggering an event, which can be a useful quantity to have in + * general when lookahead is being called. + */ + +JEMALLOC_ALWAYS_INLINE bool +te_prof_sample_event_lookahead_surplus(tsd_t *tsd, size_t usize, + size_t *surplus) { + if (surplus != NULL) { + /* + * This is a dead store: the surplus will be overwritten before + * any read. The initialization suppresses compiler warnings. + * Meanwhile, using SIZE_MAX to initialize is good for + * debugging purpose, because a valid surplus value is strictly + * less than usize, which is at most SIZE_MAX. + */ + *surplus = SIZE_MAX; + } + if (unlikely(!tsd_nominal(tsd) || tsd_reentrancy_level_get(tsd) > 0)) { + return false; + } + /* The subtraction is intentionally susceptible to underflow. */ + uint64_t accumbytes = tsd_thread_allocated_get(tsd) + usize - + tsd_thread_allocated_last_event_get(tsd); + uint64_t sample_wait = tsd_prof_sample_event_wait_get(tsd); + if (accumbytes < sample_wait) { + return false; + } + assert(accumbytes - sample_wait < (uint64_t)usize); + if (surplus != NULL) { + *surplus = (size_t)(accumbytes - sample_wait); + } + return true; +} + +JEMALLOC_ALWAYS_INLINE bool +te_prof_sample_event_lookahead(tsd_t *tsd, size_t usize) { + return te_prof_sample_event_lookahead_surplus(tsd, usize, NULL); +} + +JEMALLOC_ALWAYS_INLINE void +te_event_advance(tsd_t *tsd, size_t usize, bool is_alloc) { + te_assert_invariants(tsd); + + te_ctx_t ctx; + te_ctx_get(tsd, &ctx, is_alloc); + + uint64_t bytes_before = te_ctx_current_bytes_get(&ctx); + te_ctx_current_bytes_set(&ctx, bytes_before + usize); + + /* The subtraction is intentionally susceptible to underflow. */ + if (likely(usize < te_ctx_next_event_get(&ctx) - bytes_before)) { + te_assert_invariants(tsd); + } else { + te_event_trigger(tsd, &ctx); + } +} + +JEMALLOC_ALWAYS_INLINE void +thread_dalloc_event(tsd_t *tsd, size_t usize) { + te_event_advance(tsd, usize, false); +} + +JEMALLOC_ALWAYS_INLINE void +thread_alloc_event(tsd_t *tsd, size_t usize) { + te_event_advance(tsd, usize, true); +} + +#endif /* JEMALLOC_INTERNAL_THREAD_EVENT_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/ticker.h b/BeefRT/JEMalloc/include/jemalloc/internal/ticker.h new file mode 100644 index 00000000..6b51ddec --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/ticker.h @@ -0,0 +1,175 @@ +#ifndef JEMALLOC_INTERNAL_TICKER_H +#define JEMALLOC_INTERNAL_TICKER_H + +#include "jemalloc/internal/prng.h" +#include "jemalloc/internal/util.h" + +/** + * A ticker makes it easy to count-down events until some limit. You + * ticker_init the ticker to trigger every nticks events. You then notify it + * that an event has occurred with calls to ticker_tick (or that nticks events + * have occurred with a call to ticker_ticks), which will return true (and reset + * the counter) if the countdown hit zero. + */ +typedef struct ticker_s ticker_t; +struct ticker_s { + int32_t tick; + int32_t nticks; +}; + +static inline void +ticker_init(ticker_t *ticker, int32_t nticks) { + ticker->tick = nticks; + ticker->nticks = nticks; +} + +static inline void +ticker_copy(ticker_t *ticker, const ticker_t *other) { + *ticker = *other; +} + +static inline int32_t +ticker_read(const ticker_t *ticker) { + return ticker->tick; +} + +/* + * Not intended to be a public API. Unfortunately, on x86, neither gcc nor + * clang seems smart enough to turn + * ticker->tick -= nticks; + * if (unlikely(ticker->tick < 0)) { + * fixup ticker + * return true; + * } + * return false; + * into + * subq %nticks_reg, (%ticker_reg) + * js fixup ticker + * + * unless we force "fixup ticker" out of line. In that case, gcc gets it right, + * but clang now does worse than before. So, on x86 with gcc, we force it out + * of line, but otherwise let the inlining occur. Ordinarily this wouldn't be + * worth the hassle, but this is on the fast path of both malloc and free (via + * tcache_event). + */ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__x86_64__) || defined(__i386__)) +JEMALLOC_NOINLINE +#endif +static bool +ticker_fixup(ticker_t *ticker) { + ticker->tick = ticker->nticks; + return true; +} + +static inline bool +ticker_ticks(ticker_t *ticker, int32_t nticks) { + ticker->tick -= nticks; + if (unlikely(ticker->tick < 0)) { + return ticker_fixup(ticker); + } + return false; +} + +static inline bool +ticker_tick(ticker_t *ticker) { + return ticker_ticks(ticker, 1); +} + +/* + * Try to tick. If ticker would fire, return true, but rely on + * slowpath to reset ticker. + */ +static inline bool +ticker_trytick(ticker_t *ticker) { + --ticker->tick; + if (unlikely(ticker->tick < 0)) { + return true; + } + return false; +} + +/* + * The ticker_geom_t is much like the ticker_t, except that instead of ticker + * having a constant countdown, it has an approximate one; each tick has + * approximately a 1/nticks chance of triggering the count. + * + * The motivation is in triggering arena decay. With a naive strategy, each + * thread would maintain a ticker per arena, and check if decay is necessary + * each time that the arena's ticker fires. This has two costs: + * - Since under reasonable assumptions both threads and arenas can scale + * linearly with the number of CPUs, maintaining per-arena data in each thread + * scales quadratically with the number of CPUs. + * - These tickers are often a cache miss down tcache flush pathways. + * + * By giving each tick a 1/nticks chance of firing, we still maintain the same + * average number of ticks-until-firing per arena, with only a single ticker's + * worth of metadata. + */ + +/* See ticker.c for an explanation of these constants. */ +#define TICKER_GEOM_NBITS 6 +#define TICKER_GEOM_MUL 61 +extern const uint8_t ticker_geom_table[1 << TICKER_GEOM_NBITS]; + +/* Not actually any different from ticker_t; just for type safety. */ +typedef struct ticker_geom_s ticker_geom_t; +struct ticker_geom_s { + int32_t tick; + int32_t nticks; +}; + +/* + * Just pick the average delay for the first counter. We're more concerned with + * the behavior over long periods of time rather than the exact timing of the + * initial ticks. + */ +#define TICKER_GEOM_INIT(nticks) {nticks, nticks} + +static inline void +ticker_geom_init(ticker_geom_t *ticker, int32_t nticks) { + /* + * Make sure there's no overflow possible. This shouldn't really be a + * problem for reasonable nticks choices, which are all static and + * relatively small. + */ + assert((uint64_t)nticks * (uint64_t)255 / (uint64_t)TICKER_GEOM_MUL + <= (uint64_t)INT32_MAX); + ticker->tick = nticks; + ticker->nticks = nticks; +} + +static inline int32_t +ticker_geom_read(const ticker_geom_t *ticker) { + return ticker->tick; +} + +/* Same deal as above. */ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__x86_64__) || defined(__i386__)) +JEMALLOC_NOINLINE +#endif +static bool +ticker_geom_fixup(ticker_geom_t *ticker, uint64_t *prng_state) { + uint64_t idx = prng_lg_range_u64(prng_state, TICKER_GEOM_NBITS); + ticker->tick = (uint32_t)( + (uint64_t)ticker->nticks * (uint64_t)ticker_geom_table[idx] + / (uint64_t)TICKER_GEOM_MUL); + return true; +} + +static inline bool +ticker_geom_ticks(ticker_geom_t *ticker, uint64_t *prng_state, int32_t nticks) { + ticker->tick -= nticks; + if (unlikely(ticker->tick < 0)) { + return ticker_geom_fixup(ticker, prng_state); + } + return false; +} + +static inline bool +ticker_geom_tick(ticker_geom_t *ticker, uint64_t *prng_state) { + return ticker_geom_ticks(ticker, prng_state, 1); +} + +#endif /* JEMALLOC_INTERNAL_TICKER_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/tsd.h b/BeefRT/JEMalloc/include/jemalloc/internal/tsd.h new file mode 100644 index 00000000..66d68822 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/tsd.h @@ -0,0 +1,518 @@ +#ifndef JEMALLOC_INTERNAL_TSD_H +#define JEMALLOC_INTERNAL_TSD_H + +#include "jemalloc/internal/activity_callback.h" +#include "jemalloc/internal/arena_types.h" +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/bin_types.h" +#include "jemalloc/internal/jemalloc_internal_externs.h" +#include "jemalloc/internal/peak.h" +#include "jemalloc/internal/prof_types.h" +#include "jemalloc/internal/ql.h" +#include "jemalloc/internal/rtree_tsd.h" +#include "jemalloc/internal/tcache_types.h" +#include "jemalloc/internal/tcache_structs.h" +#include "jemalloc/internal/util.h" +#include "jemalloc/internal/witness.h" + +/* + * Thread-Specific-Data layout + * + * At least some thread-local data gets touched on the fast-path of almost all + * malloc operations. But much of it is only necessary down slow-paths, or + * testing. We want to colocate the fast-path data so that it can live on the + * same cacheline if possible. So we define three tiers of hotness: + * TSD_DATA_FAST: Touched on the alloc/dalloc fast paths. + * TSD_DATA_SLOW: Touched down slow paths. "Slow" here is sort of general; + * there are "semi-slow" paths like "not a sized deallocation, but can still + * live in the tcache". We'll want to keep these closer to the fast-path + * data. + * TSD_DATA_SLOWER: Only touched in test or debug modes, or not touched at all. + * + * An additional concern is that the larger tcache bins won't be used (we have a + * bin per size class, but by default only cache relatively small objects). So + * the earlier bins are in the TSD_DATA_FAST tier, but the later ones are in the + * TSD_DATA_SLOWER tier. + * + * As a result of all this, we put the slow data first, then the fast data, then + * the slower data, while keeping the tcache as the last element of the fast + * data (so that the fast -> slower transition happens midway through the + * tcache). While we don't yet play alignment tricks to guarantee it, this + * increases our odds of getting some cache/page locality on fast paths. + */ + +#ifdef JEMALLOC_JET +typedef void (*test_callback_t)(int *); +# define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10 +# define MALLOC_TEST_TSD \ + O(test_data, int, int) \ + O(test_callback, test_callback_t, int) +# define MALLOC_TEST_TSD_INITIALIZER , MALLOC_TSD_TEST_DATA_INIT, NULL +#else +# define MALLOC_TEST_TSD +# define MALLOC_TEST_TSD_INITIALIZER +#endif + +typedef ql_elm(tsd_t) tsd_link_t; + +/* O(name, type, nullable type) */ +#define TSD_DATA_SLOW \ + O(tcache_enabled, bool, bool) \ + O(reentrancy_level, int8_t, int8_t) \ + O(thread_allocated_last_event, uint64_t, uint64_t) \ + O(thread_allocated_next_event, uint64_t, uint64_t) \ + O(thread_deallocated_last_event, uint64_t, uint64_t) \ + O(thread_deallocated_next_event, uint64_t, uint64_t) \ + O(tcache_gc_event_wait, uint64_t, uint64_t) \ + O(tcache_gc_dalloc_event_wait, uint64_t, uint64_t) \ + O(prof_sample_event_wait, uint64_t, uint64_t) \ + O(prof_sample_last_event, uint64_t, uint64_t) \ + O(stats_interval_event_wait, uint64_t, uint64_t) \ + O(stats_interval_last_event, uint64_t, uint64_t) \ + O(peak_alloc_event_wait, uint64_t, uint64_t) \ + O(peak_dalloc_event_wait, uint64_t, uint64_t) \ + O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \ + O(prng_state, uint64_t, uint64_t) \ + O(san_extents_until_guard_small, uint64_t, uint64_t) \ + O(san_extents_until_guard_large, uint64_t, uint64_t) \ + O(iarena, arena_t *, arena_t *) \ + O(arena, arena_t *, arena_t *) \ + O(arena_decay_ticker, ticker_geom_t, ticker_geom_t) \ + O(sec_shard, uint8_t, uint8_t) \ + O(binshards, tsd_binshards_t, tsd_binshards_t)\ + O(tsd_link, tsd_link_t, tsd_link_t) \ + O(in_hook, bool, bool) \ + O(peak, peak_t, peak_t) \ + O(activity_callback_thunk, activity_callback_thunk_t, \ + activity_callback_thunk_t) \ + O(tcache_slow, tcache_slow_t, tcache_slow_t) \ + O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) + +#define TSD_DATA_SLOW_INITIALIZER \ + /* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \ + /* reentrancy_level */ 0, \ + /* thread_allocated_last_event */ 0, \ + /* thread_allocated_next_event */ 0, \ + /* thread_deallocated_last_event */ 0, \ + /* thread_deallocated_next_event */ 0, \ + /* tcache_gc_event_wait */ 0, \ + /* tcache_gc_dalloc_event_wait */ 0, \ + /* prof_sample_event_wait */ 0, \ + /* prof_sample_last_event */ 0, \ + /* stats_interval_event_wait */ 0, \ + /* stats_interval_last_event */ 0, \ + /* peak_alloc_event_wait */ 0, \ + /* peak_dalloc_event_wait */ 0, \ + /* prof_tdata */ NULL, \ + /* prng_state */ 0, \ + /* san_extents_until_guard_small */ 0, \ + /* san_extents_until_guard_large */ 0, \ + /* iarena */ NULL, \ + /* arena */ NULL, \ + /* arena_decay_ticker */ \ + TICKER_GEOM_INIT(ARENA_DECAY_NTICKS_PER_UPDATE), \ + /* sec_shard */ (uint8_t)-1, \ + /* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER, \ + /* tsd_link */ {NULL}, \ + /* in_hook */ false, \ + /* peak */ PEAK_INITIALIZER, \ + /* activity_callback_thunk */ \ + ACTIVITY_CALLBACK_THUNK_INITIALIZER, \ + /* tcache_slow */ TCACHE_SLOW_ZERO_INITIALIZER, \ + /* rtree_ctx */ RTREE_CTX_INITIALIZER, + +/* O(name, type, nullable type) */ +#define TSD_DATA_FAST \ + O(thread_allocated, uint64_t, uint64_t) \ + O(thread_allocated_next_event_fast, uint64_t, uint64_t) \ + O(thread_deallocated, uint64_t, uint64_t) \ + O(thread_deallocated_next_event_fast, uint64_t, uint64_t) \ + O(tcache, tcache_t, tcache_t) + +#define TSD_DATA_FAST_INITIALIZER \ + /* thread_allocated */ 0, \ + /* thread_allocated_next_event_fast */ 0, \ + /* thread_deallocated */ 0, \ + /* thread_deallocated_next_event_fast */ 0, \ + /* tcache */ TCACHE_ZERO_INITIALIZER, + +/* O(name, type, nullable type) */ +#define TSD_DATA_SLOWER \ + O(witness_tsd, witness_tsd_t, witness_tsdn_t) \ + MALLOC_TEST_TSD + +#define TSD_DATA_SLOWER_INITIALIZER \ + /* witness */ WITNESS_TSD_INITIALIZER \ + /* test data */ MALLOC_TEST_TSD_INITIALIZER + + +#define TSD_INITIALIZER { \ + TSD_DATA_SLOW_INITIALIZER \ + /* state */ ATOMIC_INIT(tsd_state_uninitialized), \ + TSD_DATA_FAST_INITIALIZER \ + TSD_DATA_SLOWER_INITIALIZER \ +} + +#if defined(JEMALLOC_MALLOC_THREAD_CLEANUP) || defined(_WIN32) +void _malloc_tsd_cleanup_register(bool (*f)(void)); +#endif + +void *malloc_tsd_malloc(size_t size); +void malloc_tsd_dalloc(void *wrapper); +tsd_t *malloc_tsd_boot0(void); +void malloc_tsd_boot1(void); +void tsd_cleanup(void *arg); +tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal); +void tsd_state_set(tsd_t *tsd, uint8_t new_state); +void tsd_slow_update(tsd_t *tsd); +void tsd_prefork(tsd_t *tsd); +void tsd_postfork_parent(tsd_t *tsd); +void tsd_postfork_child(tsd_t *tsd); + +/* + * Call ..._inc when your module wants to take all threads down the slow paths, + * and ..._dec when it no longer needs to. + */ +void tsd_global_slow_inc(tsdn_t *tsdn); +void tsd_global_slow_dec(tsdn_t *tsdn); +bool tsd_global_slow(); + +enum { + /* Common case --> jnz. */ + tsd_state_nominal = 0, + /* Initialized but on slow path. */ + tsd_state_nominal_slow = 1, + /* + * Some thread has changed global state in such a way that all nominal + * threads need to recompute their fast / slow status the next time they + * get a chance. + * + * Any thread can change another thread's status *to* recompute, but + * threads are the only ones who can change their status *from* + * recompute. + */ + tsd_state_nominal_recompute = 2, + /* + * The above nominal states should be lower values. We use + * tsd_nominal_max to separate nominal states from threads in the + * process of being born / dying. + */ + tsd_state_nominal_max = 2, + + /* + * A thread might free() during its death as its only allocator action; + * in such scenarios, we need tsd, but set up in such a way that no + * cleanup is necessary. + */ + tsd_state_minimal_initialized = 3, + /* States during which we know we're in thread death. */ + tsd_state_purgatory = 4, + tsd_state_reincarnated = 5, + /* + * What it says on the tin; tsd that hasn't been initialized. Note + * that even when the tsd struct lives in TLS, when need to keep track + * of stuff like whether or not our pthread destructors have been + * scheduled, so this really truly is different than the nominal state. + */ + tsd_state_uninitialized = 6 +}; + +/* + * Some TSD accesses can only be done in a nominal state. To enforce this, we + * wrap TSD member access in a function that asserts on TSD state, and mangle + * field names to prevent touching them accidentally. + */ +#define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n + +#ifdef JEMALLOC_U8_ATOMICS +# define tsd_state_t atomic_u8_t +# define tsd_atomic_load atomic_load_u8 +# define tsd_atomic_store atomic_store_u8 +# define tsd_atomic_exchange atomic_exchange_u8 +#else +# define tsd_state_t atomic_u32_t +# define tsd_atomic_load atomic_load_u32 +# define tsd_atomic_store atomic_store_u32 +# define tsd_atomic_exchange atomic_exchange_u32 +#endif + +/* The actual tsd. */ +struct tsd_s { + /* + * The contents should be treated as totally opaque outside the tsd + * module. Access any thread-local state through the getters and + * setters below. + */ + +#define O(n, t, nt) \ + t TSD_MANGLE(n); + + TSD_DATA_SLOW + /* + * We manually limit the state to just a single byte. Unless the 8-bit + * atomics are unavailable (which is rare). + */ + tsd_state_t state; + TSD_DATA_FAST + TSD_DATA_SLOWER +#undef O +}; + +JEMALLOC_ALWAYS_INLINE uint8_t +tsd_state_get(tsd_t *tsd) { + /* + * This should be atomic. Unfortunately, compilers right now can't tell + * that this can be done as a memory comparison, and forces a load into + * a register that hurts fast-path performance. + */ + /* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */ + return *(uint8_t *)&tsd->state; +} + +/* + * Wrapper around tsd_t that makes it possible to avoid implicit conversion + * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be + * explicitly converted to tsd_t, which is non-nullable. + */ +struct tsdn_s { + tsd_t tsd; +}; +#define TSDN_NULL ((tsdn_t *)0) +JEMALLOC_ALWAYS_INLINE tsdn_t * +tsd_tsdn(tsd_t *tsd) { + return (tsdn_t *)tsd; +} + +JEMALLOC_ALWAYS_INLINE bool +tsdn_null(const tsdn_t *tsdn) { + return tsdn == NULL; +} + +JEMALLOC_ALWAYS_INLINE tsd_t * +tsdn_tsd(tsdn_t *tsdn) { + assert(!tsdn_null(tsdn)); + + return &tsdn->tsd; +} + +/* + * We put the platform-specific data declarations and inlines into their own + * header files to avoid cluttering this file. They define tsd_boot0, + * tsd_boot1, tsd_boot, tsd_booted_get, tsd_get_allocates, tsd_get, and tsd_set. + */ +#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP +#include "jemalloc/internal/tsd_malloc_thread_cleanup.h" +#elif (defined(JEMALLOC_TLS)) +#include "jemalloc/internal/tsd_tls.h" +#elif (defined(_WIN32)) +#include "jemalloc/internal/tsd_win.h" +#else +#include "jemalloc/internal/tsd_generic.h" +#endif + +/* + * tsd_foop_get_unsafe(tsd) returns a pointer to the thread-local instance of + * foo. This omits some safety checks, and so can be used during tsd + * initialization and cleanup. + */ +#define O(n, t, nt) \ +JEMALLOC_ALWAYS_INLINE t * \ +tsd_##n##p_get_unsafe(tsd_t *tsd) { \ + return &tsd->TSD_MANGLE(n); \ +} +TSD_DATA_SLOW +TSD_DATA_FAST +TSD_DATA_SLOWER +#undef O + +/* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */ +#define O(n, t, nt) \ +JEMALLOC_ALWAYS_INLINE t * \ +tsd_##n##p_get(tsd_t *tsd) { \ + /* \ + * Because the state might change asynchronously if it's \ + * nominal, we need to make sure that we only read it once. \ + */ \ + uint8_t state = tsd_state_get(tsd); \ + assert(state == tsd_state_nominal || \ + state == tsd_state_nominal_slow || \ + state == tsd_state_nominal_recompute || \ + state == tsd_state_reincarnated || \ + state == tsd_state_minimal_initialized); \ + return tsd_##n##p_get_unsafe(tsd); \ +} +TSD_DATA_SLOW +TSD_DATA_FAST +TSD_DATA_SLOWER +#undef O + +/* + * tsdn_foop_get(tsdn) returns either the thread-local instance of foo (if tsdn + * isn't NULL), or NULL (if tsdn is NULL), cast to the nullable pointer type. + */ +#define O(n, t, nt) \ +JEMALLOC_ALWAYS_INLINE nt * \ +tsdn_##n##p_get(tsdn_t *tsdn) { \ + if (tsdn_null(tsdn)) { \ + return NULL; \ + } \ + tsd_t *tsd = tsdn_tsd(tsdn); \ + return (nt *)tsd_##n##p_get(tsd); \ +} +TSD_DATA_SLOW +TSD_DATA_FAST +TSD_DATA_SLOWER +#undef O + +/* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */ +#define O(n, t, nt) \ +JEMALLOC_ALWAYS_INLINE t \ +tsd_##n##_get(tsd_t *tsd) { \ + return *tsd_##n##p_get(tsd); \ +} +TSD_DATA_SLOW +TSD_DATA_FAST +TSD_DATA_SLOWER +#undef O + +/* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */ +#define O(n, t, nt) \ +JEMALLOC_ALWAYS_INLINE void \ +tsd_##n##_set(tsd_t *tsd, t val) { \ + assert(tsd_state_get(tsd) != tsd_state_reincarnated && \ + tsd_state_get(tsd) != tsd_state_minimal_initialized); \ + *tsd_##n##p_get(tsd) = val; \ +} +TSD_DATA_SLOW +TSD_DATA_FAST +TSD_DATA_SLOWER +#undef O + +JEMALLOC_ALWAYS_INLINE void +tsd_assert_fast(tsd_t *tsd) { + /* + * Note that our fastness assertion does *not* include global slowness + * counters; it's not in general possible to ensure that they won't + * change asynchronously from underneath us. + */ + assert(!malloc_slow && tsd_tcache_enabled_get(tsd) && + tsd_reentrancy_level_get(tsd) == 0); +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_fast(tsd_t *tsd) { + bool fast = (tsd_state_get(tsd) == tsd_state_nominal); + if (fast) { + tsd_assert_fast(tsd); + } + + return fast; +} + +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_fetch_impl(bool init, bool minimal) { + tsd_t *tsd = tsd_get(init); + + if (!init && tsd_get_allocates() && tsd == NULL) { + return NULL; + } + assert(tsd != NULL); + + if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) { + return tsd_fetch_slow(tsd, minimal); + } + assert(tsd_fast(tsd)); + tsd_assert_fast(tsd); + + return tsd; +} + +/* Get a minimal TSD that requires no cleanup. See comments in free(). */ +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_fetch_min(void) { + return tsd_fetch_impl(true, true); +} + +/* For internal background threads use only. */ +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_internal_fetch(void) { + tsd_t *tsd = tsd_fetch_min(); + /* Use reincarnated state to prevent full initialization. */ + tsd_state_set(tsd, tsd_state_reincarnated); + + return tsd; +} + +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_fetch(void) { + return tsd_fetch_impl(true, false); +} + +static inline bool +tsd_nominal(tsd_t *tsd) { + bool nominal = tsd_state_get(tsd) <= tsd_state_nominal_max; + assert(nominal || tsd_reentrancy_level_get(tsd) > 0); + + return nominal; +} + +JEMALLOC_ALWAYS_INLINE tsdn_t * +tsdn_fetch(void) { + if (!tsd_booted_get()) { + return NULL; + } + + return tsd_tsdn(tsd_fetch_impl(false, false)); +} + +JEMALLOC_ALWAYS_INLINE rtree_ctx_t * +tsd_rtree_ctx(tsd_t *tsd) { + return tsd_rtree_ctxp_get(tsd); +} + +JEMALLOC_ALWAYS_INLINE rtree_ctx_t * +tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) { + /* + * If tsd cannot be accessed, initialize the fallback rtree_ctx and + * return a pointer to it. + */ + if (unlikely(tsdn_null(tsdn))) { + rtree_ctx_data_init(fallback); + return fallback; + } + return tsd_rtree_ctx(tsdn_tsd(tsdn)); +} + +static inline bool +tsd_state_nocleanup(tsd_t *tsd) { + return tsd_state_get(tsd) == tsd_state_reincarnated || + tsd_state_get(tsd) == tsd_state_minimal_initialized; +} + +/* + * These "raw" tsd reentrancy functions don't have any debug checking to make + * sure that we're not touching arena 0. Better is to call pre_reentrancy and + * post_reentrancy if this is possible. + */ +static inline void +tsd_pre_reentrancy_raw(tsd_t *tsd) { + bool fast = tsd_fast(tsd); + assert(tsd_reentrancy_level_get(tsd) < INT8_MAX); + ++*tsd_reentrancy_levelp_get(tsd); + if (fast) { + /* Prepare slow path for reentrancy. */ + tsd_slow_update(tsd); + assert(tsd_state_get(tsd) == tsd_state_nominal_slow); + } +} + +static inline void +tsd_post_reentrancy_raw(tsd_t *tsd) { + int8_t *reentrancy_level = tsd_reentrancy_levelp_get(tsd); + assert(*reentrancy_level > 0); + if (--*reentrancy_level == 0) { + tsd_slow_update(tsd); + } +} + +#endif /* JEMALLOC_INTERNAL_TSD_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/tsd_generic.h b/BeefRT/JEMalloc/include/jemalloc/internal/tsd_generic.h new file mode 100644 index 00000000..a718472f --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/tsd_generic.h @@ -0,0 +1,182 @@ +#ifdef JEMALLOC_INTERNAL_TSD_GENERIC_H +#error This file should be included only once, by tsd.h. +#endif +#define JEMALLOC_INTERNAL_TSD_GENERIC_H + +typedef struct tsd_init_block_s tsd_init_block_t; +struct tsd_init_block_s { + ql_elm(tsd_init_block_t) link; + pthread_t thread; + void *data; +}; + +/* Defined in tsd.c, to allow the mutex headers to have tsd dependencies. */ +typedef struct tsd_init_head_s tsd_init_head_t; + +typedef struct { + bool initialized; + tsd_t val; +} tsd_wrapper_t; + +void *tsd_init_check_recursion(tsd_init_head_t *head, + tsd_init_block_t *block); +void tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block); + +extern pthread_key_t tsd_tsd; +extern tsd_init_head_t tsd_init_head; +extern tsd_wrapper_t tsd_boot_wrapper; +extern bool tsd_booted; + +/* Initialization/cleanup. */ +JEMALLOC_ALWAYS_INLINE void +tsd_cleanup_wrapper(void *arg) { + tsd_wrapper_t *wrapper = (tsd_wrapper_t *)arg; + + if (wrapper->initialized) { + wrapper->initialized = false; + tsd_cleanup(&wrapper->val); + if (wrapper->initialized) { + /* Trigger another cleanup round. */ + if (pthread_setspecific(tsd_tsd, (void *)wrapper) != 0) + { + malloc_write(": Error setting TSD\n"); + if (opt_abort) { + abort(); + } + } + return; + } + } + malloc_tsd_dalloc(wrapper); +} + +JEMALLOC_ALWAYS_INLINE void +tsd_wrapper_set(tsd_wrapper_t *wrapper) { + if (unlikely(!tsd_booted)) { + return; + } + if (pthread_setspecific(tsd_tsd, (void *)wrapper) != 0) { + malloc_write(": Error setting TSD\n"); + abort(); + } +} + +JEMALLOC_ALWAYS_INLINE tsd_wrapper_t * +tsd_wrapper_get(bool init) { + tsd_wrapper_t *wrapper; + + if (unlikely(!tsd_booted)) { + return &tsd_boot_wrapper; + } + + wrapper = (tsd_wrapper_t *)pthread_getspecific(tsd_tsd); + + if (init && unlikely(wrapper == NULL)) { + tsd_init_block_t block; + wrapper = (tsd_wrapper_t *) + tsd_init_check_recursion(&tsd_init_head, &block); + if (wrapper) { + return wrapper; + } + wrapper = (tsd_wrapper_t *) + malloc_tsd_malloc(sizeof(tsd_wrapper_t)); + block.data = (void *)wrapper; + if (wrapper == NULL) { + malloc_write(": Error allocating TSD\n"); + abort(); + } else { + wrapper->initialized = false; + JEMALLOC_DIAGNOSTIC_PUSH + JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS + tsd_t initializer = TSD_INITIALIZER; + JEMALLOC_DIAGNOSTIC_POP + wrapper->val = initializer; + } + tsd_wrapper_set(wrapper); + tsd_init_finish(&tsd_init_head, &block); + } + return wrapper; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_boot0(void) { + tsd_wrapper_t *wrapper; + tsd_init_block_t block; + + wrapper = (tsd_wrapper_t *) + tsd_init_check_recursion(&tsd_init_head, &block); + if (wrapper) { + return false; + } + block.data = &tsd_boot_wrapper; + if (pthread_key_create(&tsd_tsd, tsd_cleanup_wrapper) != 0) { + return true; + } + tsd_booted = true; + tsd_wrapper_set(&tsd_boot_wrapper); + tsd_init_finish(&tsd_init_head, &block); + return false; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_boot1(void) { + tsd_wrapper_t *wrapper; + wrapper = (tsd_wrapper_t *)malloc_tsd_malloc(sizeof(tsd_wrapper_t)); + if (wrapper == NULL) { + malloc_write(": Error allocating TSD\n"); + abort(); + } + tsd_boot_wrapper.initialized = false; + tsd_cleanup(&tsd_boot_wrapper.val); + wrapper->initialized = false; + JEMALLOC_DIAGNOSTIC_PUSH + JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS + tsd_t initializer = TSD_INITIALIZER; + JEMALLOC_DIAGNOSTIC_POP + wrapper->val = initializer; + tsd_wrapper_set(wrapper); +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_boot(void) { + if (tsd_boot0()) { + return true; + } + tsd_boot1(); + return false; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_booted_get(void) { + return tsd_booted; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_get_allocates(void) { + return true; +} + +/* Get/set. */ +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_get(bool init) { + tsd_wrapper_t *wrapper; + + assert(tsd_booted); + wrapper = tsd_wrapper_get(init); + if (tsd_get_allocates() && !init && wrapper == NULL) { + return NULL; + } + return &wrapper->val; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_set(tsd_t *val) { + tsd_wrapper_t *wrapper; + + assert(tsd_booted); + wrapper = tsd_wrapper_get(true); + if (likely(&wrapper->val != val)) { + wrapper->val = *(val); + } + wrapper->initialized = true; +} diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/tsd_malloc_thread_cleanup.h b/BeefRT/JEMalloc/include/jemalloc/internal/tsd_malloc_thread_cleanup.h new file mode 100644 index 00000000..d8f3ef13 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/tsd_malloc_thread_cleanup.h @@ -0,0 +1,61 @@ +#ifdef JEMALLOC_INTERNAL_TSD_MALLOC_THREAD_CLEANUP_H +#error This file should be included only once, by tsd.h. +#endif +#define JEMALLOC_INTERNAL_TSD_MALLOC_THREAD_CLEANUP_H + +#define JEMALLOC_TSD_TYPE_ATTR(type) __thread type JEMALLOC_TLS_MODEL + +extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls; +extern JEMALLOC_TSD_TYPE_ATTR(bool) tsd_initialized; +extern bool tsd_booted; + +/* Initialization/cleanup. */ +JEMALLOC_ALWAYS_INLINE bool +tsd_cleanup_wrapper(void) { + if (tsd_initialized) { + tsd_initialized = false; + tsd_cleanup(&tsd_tls); + } + return tsd_initialized; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_boot0(void) { + _malloc_tsd_cleanup_register(&tsd_cleanup_wrapper); + tsd_booted = true; + return false; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_boot1(void) { + /* Do nothing. */ +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_boot(void) { + return tsd_boot0(); +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_booted_get(void) { + return tsd_booted; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_get_allocates(void) { + return false; +} + +/* Get/set. */ +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_get(bool init) { + return &tsd_tls; +} +JEMALLOC_ALWAYS_INLINE void +tsd_set(tsd_t *val) { + assert(tsd_booted); + if (likely(&tsd_tls != val)) { + tsd_tls = (*val); + } + tsd_initialized = true; +} diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/tsd_tls.h b/BeefRT/JEMalloc/include/jemalloc/internal/tsd_tls.h new file mode 100644 index 00000000..7d6c805b --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/tsd_tls.h @@ -0,0 +1,60 @@ +#ifdef JEMALLOC_INTERNAL_TSD_TLS_H +#error This file should be included only once, by tsd.h. +#endif +#define JEMALLOC_INTERNAL_TSD_TLS_H + +#define JEMALLOC_TSD_TYPE_ATTR(type) __thread type JEMALLOC_TLS_MODEL + +extern JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls; +extern pthread_key_t tsd_tsd; +extern bool tsd_booted; + +/* Initialization/cleanup. */ +JEMALLOC_ALWAYS_INLINE bool +tsd_boot0(void) { + if (pthread_key_create(&tsd_tsd, &tsd_cleanup) != 0) { + return true; + } + tsd_booted = true; + return false; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_boot1(void) { + /* Do nothing. */ +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_boot(void) { + return tsd_boot0(); +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_booted_get(void) { + return tsd_booted; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_get_allocates(void) { + return false; +} + +/* Get/set. */ +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_get(bool init) { + return &tsd_tls; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_set(tsd_t *val) { + assert(tsd_booted); + if (likely(&tsd_tls != val)) { + tsd_tls = (*val); + } + if (pthread_setspecific(tsd_tsd, (void *)(&tsd_tls)) != 0) { + malloc_write(": Error setting tsd.\n"); + if (opt_abort) { + abort(); + } + } +} diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/tsd_types.h b/BeefRT/JEMalloc/include/jemalloc/internal/tsd_types.h new file mode 100644 index 00000000..a6ae37da --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/tsd_types.h @@ -0,0 +1,10 @@ +#ifndef JEMALLOC_INTERNAL_TSD_TYPES_H +#define JEMALLOC_INTERNAL_TSD_TYPES_H + +#define MALLOC_TSD_CLEANUPS_MAX 4 + +typedef struct tsd_s tsd_t; +typedef struct tsdn_s tsdn_t; +typedef bool (*malloc_tsd_cleanup_t)(void); + +#endif /* JEMALLOC_INTERNAL_TSD_TYPES_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/tsd_win.h b/BeefRT/JEMalloc/include/jemalloc/internal/tsd_win.h new file mode 100644 index 00000000..a91dac88 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/tsd_win.h @@ -0,0 +1,139 @@ +#ifdef JEMALLOC_INTERNAL_TSD_WIN_H +#error This file should be included only once, by tsd.h. +#endif +#define JEMALLOC_INTERNAL_TSD_WIN_H + +typedef struct { + bool initialized; + tsd_t val; +} tsd_wrapper_t; + +extern DWORD tsd_tsd; +extern tsd_wrapper_t tsd_boot_wrapper; +extern bool tsd_booted; + +/* Initialization/cleanup. */ +JEMALLOC_ALWAYS_INLINE bool +tsd_cleanup_wrapper(void) { + DWORD error = GetLastError(); + tsd_wrapper_t *wrapper = (tsd_wrapper_t *)TlsGetValue(tsd_tsd); + SetLastError(error); + + if (wrapper == NULL) { + return false; + } + + if (wrapper->initialized) { + wrapper->initialized = false; + tsd_cleanup(&wrapper->val); + if (wrapper->initialized) { + /* Trigger another cleanup round. */ + return true; + } + } + malloc_tsd_dalloc(wrapper); + return false; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_wrapper_set(tsd_wrapper_t *wrapper) { + if (!TlsSetValue(tsd_tsd, (void *)wrapper)) { + malloc_write(": Error setting TSD\n"); + abort(); + } +} + +JEMALLOC_ALWAYS_INLINE tsd_wrapper_t * +tsd_wrapper_get(bool init) { + DWORD error = GetLastError(); + tsd_wrapper_t *wrapper = (tsd_wrapper_t *) TlsGetValue(tsd_tsd); + SetLastError(error); + + if (init && unlikely(wrapper == NULL)) { + wrapper = (tsd_wrapper_t *) + malloc_tsd_malloc(sizeof(tsd_wrapper_t)); + if (wrapper == NULL) { + malloc_write(": Error allocating TSD\n"); + abort(); + } else { + wrapper->initialized = false; + /* MSVC is finicky about aggregate initialization. */ + tsd_t tsd_initializer = TSD_INITIALIZER; + wrapper->val = tsd_initializer; + } + tsd_wrapper_set(wrapper); + } + return wrapper; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_boot0(void) { + tsd_tsd = TlsAlloc(); + if (tsd_tsd == TLS_OUT_OF_INDEXES) { + return true; + } + _malloc_tsd_cleanup_register(&tsd_cleanup_wrapper); + tsd_wrapper_set(&tsd_boot_wrapper); + tsd_booted = true; + return false; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_boot1(void) { + tsd_wrapper_t *wrapper; + wrapper = (tsd_wrapper_t *) + malloc_tsd_malloc(sizeof(tsd_wrapper_t)); + if (wrapper == NULL) { + malloc_write(": Error allocating TSD\n"); + abort(); + } + tsd_boot_wrapper.initialized = false; + tsd_cleanup(&tsd_boot_wrapper.val); + wrapper->initialized = false; + tsd_t initializer = TSD_INITIALIZER; + wrapper->val = initializer; + tsd_wrapper_set(wrapper); +} +JEMALLOC_ALWAYS_INLINE bool +tsd_boot(void) { + if (tsd_boot0()) { + return true; + } + tsd_boot1(); + return false; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_booted_get(void) { + return tsd_booted; +} + +JEMALLOC_ALWAYS_INLINE bool +tsd_get_allocates(void) { + return true; +} + +/* Get/set. */ +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_get(bool init) { + tsd_wrapper_t *wrapper; + + assert(tsd_booted); + wrapper = tsd_wrapper_get(init); + if (tsd_get_allocates() && !init && wrapper == NULL) { + return NULL; + } + return &wrapper->val; +} + +JEMALLOC_ALWAYS_INLINE void +tsd_set(tsd_t *val) { + tsd_wrapper_t *wrapper; + + assert(tsd_booted); + wrapper = tsd_wrapper_get(true); + if (likely(&wrapper->val != val)) { + wrapper->val = *(val); + } + wrapper->initialized = true; +} diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/typed_list.h b/BeefRT/JEMalloc/include/jemalloc/internal/typed_list.h new file mode 100644 index 00000000..6535055a --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/typed_list.h @@ -0,0 +1,55 @@ +#ifndef JEMALLOC_INTERNAL_TYPED_LIST_H +#define JEMALLOC_INTERNAL_TYPED_LIST_H + +/* + * This wraps the ql module to implement a list class in a way that's a little + * bit easier to use; it handles ql_elm_new calls and provides type safety. + */ + +#define TYPED_LIST(list_type, el_type, linkage) \ +typedef struct { \ + ql_head(el_type) head; \ +} list_type##_t; \ +static inline void \ +list_type##_init(list_type##_t *list) { \ + ql_new(&list->head); \ +} \ +static inline el_type * \ +list_type##_first(const list_type##_t *list) { \ + return ql_first(&list->head); \ +} \ +static inline el_type * \ +list_type##_last(const list_type##_t *list) { \ + return ql_last(&list->head, linkage); \ +} \ +static inline void \ +list_type##_append(list_type##_t *list, el_type *item) { \ + ql_elm_new(item, linkage); \ + ql_tail_insert(&list->head, item, linkage); \ +} \ +static inline void \ +list_type##_prepend(list_type##_t *list, el_type *item) { \ + ql_elm_new(item, linkage); \ + ql_head_insert(&list->head, item, linkage); \ +} \ +static inline void \ +list_type##_replace(list_type##_t *list, el_type *to_remove, \ + el_type *to_insert) { \ + ql_elm_new(to_insert, linkage); \ + ql_after_insert(to_remove, to_insert, linkage); \ + ql_remove(&list->head, to_remove, linkage); \ +} \ +static inline void \ +list_type##_remove(list_type##_t *list, el_type *item) { \ + ql_remove(&list->head, item, linkage); \ +} \ +static inline bool \ +list_type##_empty(list_type##_t *list) { \ + return ql_empty(&list->head); \ +} \ +static inline void \ +list_type##_concat(list_type##_t *list_a, list_type##_t *list_b) { \ + ql_concat(&list_a->head, &list_b->head, linkage); \ +} + +#endif /* JEMALLOC_INTERNAL_TYPED_LIST_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/util.h b/BeefRT/JEMalloc/include/jemalloc/internal/util.h new file mode 100644 index 00000000..dcb1c0a5 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/util.h @@ -0,0 +1,123 @@ +#ifndef JEMALLOC_INTERNAL_UTIL_H +#define JEMALLOC_INTERNAL_UTIL_H + +#define UTIL_INLINE static inline + +/* Junk fill patterns. */ +#ifndef JEMALLOC_ALLOC_JUNK +# define JEMALLOC_ALLOC_JUNK ((uint8_t)0xa5) +#endif +#ifndef JEMALLOC_FREE_JUNK +# define JEMALLOC_FREE_JUNK ((uint8_t)0x5a) +#endif + +/* + * Wrap a cpp argument that contains commas such that it isn't broken up into + * multiple arguments. + */ +#define JEMALLOC_ARG_CONCAT(...) __VA_ARGS__ + +/* cpp macro definition stringification. */ +#define STRINGIFY_HELPER(x) #x +#define STRINGIFY(x) STRINGIFY_HELPER(x) + +/* + * Silence compiler warnings due to uninitialized values. This is used + * wherever the compiler fails to recognize that the variable is never used + * uninitialized. + */ +#define JEMALLOC_CC_SILENCE_INIT(v) = v + +#ifdef __GNUC__ +# define likely(x) __builtin_expect(!!(x), 1) +# define unlikely(x) __builtin_expect(!!(x), 0) +#else +# define likely(x) !!(x) +# define unlikely(x) !!(x) +#endif + +#if !defined(JEMALLOC_INTERNAL_UNREACHABLE) +# error JEMALLOC_INTERNAL_UNREACHABLE should have been defined by configure +#endif + +#define unreachable() JEMALLOC_INTERNAL_UNREACHABLE() + +/* Set error code. */ +UTIL_INLINE void +set_errno(int errnum) { +#ifdef _WIN32 + SetLastError(errnum); +#else + errno = errnum; +#endif +} + +/* Get last error code. */ +UTIL_INLINE int +get_errno(void) { +#ifdef _WIN32 + return GetLastError(); +#else + return errno; +#endif +} + +JEMALLOC_ALWAYS_INLINE void +util_assume(bool b) { + if (!b) { + unreachable(); + } +} + +/* ptr should be valid. */ +JEMALLOC_ALWAYS_INLINE void +util_prefetch_read(void *ptr) { + /* + * This should arguably be a config check; but any version of GCC so old + * that it doesn't support __builtin_prefetch is also too old to build + * jemalloc. + */ +#ifdef __GNUC__ + if (config_debug) { + /* Enforce the "valid ptr" requirement. */ + *(volatile char *)ptr; + } + __builtin_prefetch(ptr, /* read or write */ 0, /* locality hint */ 3); +#else + *(volatile char *)ptr; +#endif +} + +JEMALLOC_ALWAYS_INLINE void +util_prefetch_write(void *ptr) { +#ifdef __GNUC__ + if (config_debug) { + *(volatile char *)ptr; + } + /* + * The only difference from the read variant is that this has a 1 as the + * second argument (the write hint). + */ + __builtin_prefetch(ptr, 1, 3); +#else + *(volatile char *)ptr; +#endif +} + +JEMALLOC_ALWAYS_INLINE void +util_prefetch_read_range(void *ptr, size_t sz) { + for (size_t i = 0; i < sz; i += CACHELINE) { + util_prefetch_read((void *)((uintptr_t)ptr + i)); + } +} + +JEMALLOC_ALWAYS_INLINE void +util_prefetch_write_range(void *ptr, size_t sz) { + for (size_t i = 0; i < sz; i += CACHELINE) { + util_prefetch_write((void *)((uintptr_t)ptr + i)); + } +} + +#undef UTIL_INLINE + +#endif /* JEMALLOC_INTERNAL_UTIL_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/internal/witness.h b/BeefRT/JEMalloc/include/jemalloc/internal/witness.h new file mode 100644 index 00000000..e81b9a00 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/internal/witness.h @@ -0,0 +1,378 @@ +#ifndef JEMALLOC_INTERNAL_WITNESS_H +#define JEMALLOC_INTERNAL_WITNESS_H + +#include "jemalloc/internal/ql.h" + +/******************************************************************************/ +/* LOCK RANKS */ +/******************************************************************************/ + +enum witness_rank_e { + /* + * Order matters within this enum listing -- higher valued locks can + * only be acquired after lower-valued ones. We use the + * auto-incrementing-ness of enum values to enforce this. + */ + + /* + * Witnesses with rank WITNESS_RANK_OMIT are completely ignored by the + * witness machinery. + */ + WITNESS_RANK_OMIT, + WITNESS_RANK_MIN, + WITNESS_RANK_INIT = WITNESS_RANK_MIN, + WITNESS_RANK_CTL, + WITNESS_RANK_TCACHES, + WITNESS_RANK_ARENAS, + WITNESS_RANK_BACKGROUND_THREAD_GLOBAL, + WITNESS_RANK_PROF_DUMP, + WITNESS_RANK_PROF_BT2GCTX, + WITNESS_RANK_PROF_TDATAS, + WITNESS_RANK_PROF_TDATA, + WITNESS_RANK_PROF_LOG, + WITNESS_RANK_PROF_GCTX, + WITNESS_RANK_PROF_RECENT_DUMP, + WITNESS_RANK_BACKGROUND_THREAD, + /* + * Used as an argument to witness_assert_depth_to_rank() in order to + * validate depth excluding non-core locks with lower ranks. Since the + * rank argument to witness_assert_depth_to_rank() is inclusive rather + * than exclusive, this definition can have the same value as the + * minimally ranked core lock. + */ + WITNESS_RANK_CORE, + WITNESS_RANK_DECAY = WITNESS_RANK_CORE, + WITNESS_RANK_TCACHE_QL, + + WITNESS_RANK_SEC_SHARD, + + WITNESS_RANK_EXTENT_GROW, + WITNESS_RANK_HPA_SHARD_GROW = WITNESS_RANK_EXTENT_GROW, + WITNESS_RANK_SAN_BUMP_ALLOC = WITNESS_RANK_EXTENT_GROW, + + WITNESS_RANK_EXTENTS, + WITNESS_RANK_HPA_SHARD = WITNESS_RANK_EXTENTS, + + WITNESS_RANK_HPA_CENTRAL_GROW, + WITNESS_RANK_HPA_CENTRAL, + + WITNESS_RANK_EDATA_CACHE, + + WITNESS_RANK_RTREE, + WITNESS_RANK_BASE, + WITNESS_RANK_ARENA_LARGE, + WITNESS_RANK_HOOK, + + WITNESS_RANK_LEAF=0x1000, + WITNESS_RANK_BIN = WITNESS_RANK_LEAF, + WITNESS_RANK_ARENA_STATS = WITNESS_RANK_LEAF, + WITNESS_RANK_COUNTER_ACCUM = WITNESS_RANK_LEAF, + WITNESS_RANK_DSS = WITNESS_RANK_LEAF, + WITNESS_RANK_PROF_ACTIVE = WITNESS_RANK_LEAF, + WITNESS_RANK_PROF_DUMP_FILENAME = WITNESS_RANK_LEAF, + WITNESS_RANK_PROF_GDUMP = WITNESS_RANK_LEAF, + WITNESS_RANK_PROF_NEXT_THR_UID = WITNESS_RANK_LEAF, + WITNESS_RANK_PROF_RECENT_ALLOC = WITNESS_RANK_LEAF, + WITNESS_RANK_PROF_STATS = WITNESS_RANK_LEAF, + WITNESS_RANK_PROF_THREAD_ACTIVE_INIT = WITNESS_RANK_LEAF, +}; +typedef enum witness_rank_e witness_rank_t; + +/******************************************************************************/ +/* PER-WITNESS DATA */ +/******************************************************************************/ +#if defined(JEMALLOC_DEBUG) +# define WITNESS_INITIALIZER(name, rank) {name, rank, NULL, NULL, {NULL, NULL}} +#else +# define WITNESS_INITIALIZER(name, rank) +#endif + +typedef struct witness_s witness_t; +typedef ql_head(witness_t) witness_list_t; +typedef int witness_comp_t (const witness_t *, void *, const witness_t *, + void *); + +struct witness_s { + /* Name, used for printing lock order reversal messages. */ + const char *name; + + /* + * Witness rank, where 0 is lowest and WITNESS_RANK_LEAF is highest. + * Witnesses must be acquired in order of increasing rank. + */ + witness_rank_t rank; + + /* + * If two witnesses are of equal rank and they have the samp comp + * function pointer, it is called as a last attempt to differentiate + * between witnesses of equal rank. + */ + witness_comp_t *comp; + + /* Opaque data, passed to comp(). */ + void *opaque; + + /* Linkage for thread's currently owned locks. */ + ql_elm(witness_t) link; +}; + +/******************************************************************************/ +/* PER-THREAD DATA */ +/******************************************************************************/ +typedef struct witness_tsd_s witness_tsd_t; +struct witness_tsd_s { + witness_list_t witnesses; + bool forking; +}; + +#define WITNESS_TSD_INITIALIZER { ql_head_initializer(witnesses), false } +#define WITNESS_TSDN_NULL ((witness_tsdn_t *)0) + +/******************************************************************************/ +/* (PER-THREAD) NULLABILITY HELPERS */ +/******************************************************************************/ +typedef struct witness_tsdn_s witness_tsdn_t; +struct witness_tsdn_s { + witness_tsd_t witness_tsd; +}; + +JEMALLOC_ALWAYS_INLINE witness_tsdn_t * +witness_tsd_tsdn(witness_tsd_t *witness_tsd) { + return (witness_tsdn_t *)witness_tsd; +} + +JEMALLOC_ALWAYS_INLINE bool +witness_tsdn_null(witness_tsdn_t *witness_tsdn) { + return witness_tsdn == NULL; +} + +JEMALLOC_ALWAYS_INLINE witness_tsd_t * +witness_tsdn_tsd(witness_tsdn_t *witness_tsdn) { + assert(!witness_tsdn_null(witness_tsdn)); + return &witness_tsdn->witness_tsd; +} + +/******************************************************************************/ +/* API */ +/******************************************************************************/ +void witness_init(witness_t *witness, const char *name, witness_rank_t rank, + witness_comp_t *comp, void *opaque); + +typedef void (witness_lock_error_t)(const witness_list_t *, const witness_t *); +extern witness_lock_error_t *JET_MUTABLE witness_lock_error; + +typedef void (witness_owner_error_t)(const witness_t *); +extern witness_owner_error_t *JET_MUTABLE witness_owner_error; + +typedef void (witness_not_owner_error_t)(const witness_t *); +extern witness_not_owner_error_t *JET_MUTABLE witness_not_owner_error; + +typedef void (witness_depth_error_t)(const witness_list_t *, + witness_rank_t rank_inclusive, unsigned depth); +extern witness_depth_error_t *JET_MUTABLE witness_depth_error; + +void witnesses_cleanup(witness_tsd_t *witness_tsd); +void witness_prefork(witness_tsd_t *witness_tsd); +void witness_postfork_parent(witness_tsd_t *witness_tsd); +void witness_postfork_child(witness_tsd_t *witness_tsd); + +/* Helper, not intended for direct use. */ +static inline bool +witness_owner(witness_tsd_t *witness_tsd, const witness_t *witness) { + witness_list_t *witnesses; + witness_t *w; + + cassert(config_debug); + + witnesses = &witness_tsd->witnesses; + ql_foreach(w, witnesses, link) { + if (w == witness) { + return true; + } + } + + return false; +} + +static inline void +witness_assert_owner(witness_tsdn_t *witness_tsdn, const witness_t *witness) { + witness_tsd_t *witness_tsd; + + if (!config_debug) { + return; + } + + if (witness_tsdn_null(witness_tsdn)) { + return; + } + witness_tsd = witness_tsdn_tsd(witness_tsdn); + if (witness->rank == WITNESS_RANK_OMIT) { + return; + } + + if (witness_owner(witness_tsd, witness)) { + return; + } + witness_owner_error(witness); +} + +static inline void +witness_assert_not_owner(witness_tsdn_t *witness_tsdn, + const witness_t *witness) { + witness_tsd_t *witness_tsd; + witness_list_t *witnesses; + witness_t *w; + + if (!config_debug) { + return; + } + + if (witness_tsdn_null(witness_tsdn)) { + return; + } + witness_tsd = witness_tsdn_tsd(witness_tsdn); + if (witness->rank == WITNESS_RANK_OMIT) { + return; + } + + witnesses = &witness_tsd->witnesses; + ql_foreach(w, witnesses, link) { + if (w == witness) { + witness_not_owner_error(witness); + } + } +} + +/* Returns depth. Not intended for direct use. */ +static inline unsigned +witness_depth_to_rank(witness_list_t *witnesses, witness_rank_t rank_inclusive) +{ + unsigned d = 0; + witness_t *w = ql_last(witnesses, link); + + if (w != NULL) { + ql_reverse_foreach(w, witnesses, link) { + if (w->rank < rank_inclusive) { + break; + } + d++; + } + } + + return d; +} + +static inline void +witness_assert_depth_to_rank(witness_tsdn_t *witness_tsdn, + witness_rank_t rank_inclusive, unsigned depth) { + if (!config_debug || witness_tsdn_null(witness_tsdn)) { + return; + } + + witness_list_t *witnesses = &witness_tsdn_tsd(witness_tsdn)->witnesses; + unsigned d = witness_depth_to_rank(witnesses, rank_inclusive); + + if (d != depth) { + witness_depth_error(witnesses, rank_inclusive, depth); + } +} + +static inline void +witness_assert_depth(witness_tsdn_t *witness_tsdn, unsigned depth) { + witness_assert_depth_to_rank(witness_tsdn, WITNESS_RANK_MIN, depth); +} + +static inline void +witness_assert_lockless(witness_tsdn_t *witness_tsdn) { + witness_assert_depth(witness_tsdn, 0); +} + +static inline void +witness_assert_positive_depth_to_rank(witness_tsdn_t *witness_tsdn, + witness_rank_t rank_inclusive) { + if (!config_debug || witness_tsdn_null(witness_tsdn)) { + return; + } + + witness_list_t *witnesses = &witness_tsdn_tsd(witness_tsdn)->witnesses; + unsigned d = witness_depth_to_rank(witnesses, rank_inclusive); + + if (d == 0) { + witness_depth_error(witnesses, rank_inclusive, 1); + } +} + +static inline void +witness_lock(witness_tsdn_t *witness_tsdn, witness_t *witness) { + witness_tsd_t *witness_tsd; + witness_list_t *witnesses; + witness_t *w; + + if (!config_debug) { + return; + } + + if (witness_tsdn_null(witness_tsdn)) { + return; + } + witness_tsd = witness_tsdn_tsd(witness_tsdn); + if (witness->rank == WITNESS_RANK_OMIT) { + return; + } + + witness_assert_not_owner(witness_tsdn, witness); + + witnesses = &witness_tsd->witnesses; + w = ql_last(witnesses, link); + if (w == NULL) { + /* No other locks; do nothing. */ + } else if (witness_tsd->forking && w->rank <= witness->rank) { + /* Forking, and relaxed ranking satisfied. */ + } else if (w->rank > witness->rank) { + /* Not forking, rank order reversal. */ + witness_lock_error(witnesses, witness); + } else if (w->rank == witness->rank && (w->comp == NULL || w->comp != + witness->comp || w->comp(w, w->opaque, witness, witness->opaque) > + 0)) { + /* + * Missing/incompatible comparison function, or comparison + * function indicates rank order reversal. + */ + witness_lock_error(witnesses, witness); + } + + ql_elm_new(witness, link); + ql_tail_insert(witnesses, witness, link); +} + +static inline void +witness_unlock(witness_tsdn_t *witness_tsdn, witness_t *witness) { + witness_tsd_t *witness_tsd; + witness_list_t *witnesses; + + if (!config_debug) { + return; + } + + if (witness_tsdn_null(witness_tsdn)) { + return; + } + witness_tsd = witness_tsdn_tsd(witness_tsdn); + if (witness->rank == WITNESS_RANK_OMIT) { + return; + } + + /* + * Check whether owner before removal, rather than relying on + * witness_assert_owner() to abort, so that unit tests can test this + * function's failure mode without causing undefined behavior. + */ + if (witness_owner(witness_tsd, witness)) { + witnesses = &witness_tsd->witnesses; + ql_remove(witnesses, witness, link); + } else { + witness_assert_owner(witness_tsdn, witness); + } +} + +#endif /* JEMALLOC_INTERNAL_WITNESS_H */ diff --git a/BeefRT/JEMalloc/include/jemalloc/jemalloc.h b/BeefRT/JEMalloc/include/jemalloc/jemalloc.h new file mode 100644 index 00000000..da5ac06f --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/jemalloc.h @@ -0,0 +1,462 @@ +#ifndef JEMALLOC_H_ +#define JEMALLOC_H_ +#ifdef __cplusplus +extern "C" { +#endif + +/* Defined if __attribute__((...)) syntax is supported. */ +/* #undef JEMALLOC_HAVE_ATTR */ + +/* Defined if alloc_size attribute is supported. */ +/* #undef JEMALLOC_HAVE_ATTR_ALLOC_SIZE */ + +/* Defined if format_arg(...) attribute is supported. */ +/* #undef JEMALLOC_HAVE_ATTR_FORMAT_ARG */ + +/* Defined if format(gnu_printf, ...) attribute is supported. */ +/* #undef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF */ + +/* Defined if format(printf, ...) attribute is supported. */ +/* #undef JEMALLOC_HAVE_ATTR_FORMAT_PRINTF */ + +/* Defined if fallthrough attribute is supported. */ +/* #undef JEMALLOC_HAVE_ATTR_FALLTHROUGH */ + +/* Defined if cold attribute is supported. */ +/* #undef JEMALLOC_HAVE_ATTR_COLD */ + +/* + * Define overrides for non-standard allocator-related functions if they are + * present on the system. + */ +/* #undef JEMALLOC_OVERRIDE_MEMALIGN */ +/* #undef JEMALLOC_OVERRIDE_VALLOC */ + +/* + * At least Linux omits the "const" in: + * + * size_t malloc_usable_size(const void *ptr); + * + * Match the operating system's prototype. + */ +#define JEMALLOC_USABLE_SIZE_CONST const + +/* + * If defined, specify throw() for the public function prototypes when compiling + * with C++. The only justification for this is to match the prototypes that + * glibc defines. + */ +/* #undef JEMALLOC_USE_CXX_THROW */ + +#ifdef _MSC_VER +# ifdef _WIN64 +# define LG_SIZEOF_PTR_WIN 3 +# else +# define LG_SIZEOF_PTR_WIN 2 +# endif +#endif + +/* sizeof(void *) == 2^LG_SIZEOF_PTR. */ +#define LG_SIZEOF_PTR LG_SIZEOF_PTR_WIN + +/* + * Name mangling for public symbols is controlled by --with-mangling and + * --with-jemalloc-prefix. With default settings the je_ prefix is stripped by + * these macro definitions. + */ +#ifndef JEMALLOC_NO_RENAME +# define je_aligned_alloc je_aligned_alloc +# define je_calloc je_calloc +# define je_dallocx je_dallocx +# define je_free je_free +# define je_mallctl je_mallctl +# define je_mallctlbymib je_mallctlbymib +# define je_mallctlnametomib je_mallctlnametomib +# define je_malloc je_malloc +# define je_malloc_conf je_malloc_conf +# define je_malloc_conf_2_conf_harder je_malloc_conf_2_conf_harder +# define je_malloc_message je_malloc_message +# define je_malloc_stats_print je_malloc_stats_print +# define je_malloc_usable_size je_malloc_usable_size +# define je_mallocx je_mallocx +# define je_smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c je_smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c +# define je_nallocx je_nallocx +# define je_posix_memalign je_posix_memalign +# define je_rallocx je_rallocx +# define je_realloc je_realloc +# define je_sallocx je_sallocx +# define je_sdallocx je_sdallocx +# define je_xallocx je_xallocx +#endif + +#include +#include +#include +#include +#include + +#define JEMALLOC_VERSION "5.3.0-0-g54eaed1d8b56b1aa528be3bdd1877e59c56fa90c" +#define JEMALLOC_VERSION_MAJOR 5 +#define JEMALLOC_VERSION_MINOR 3 +#define JEMALLOC_VERSION_BUGFIX 0 +#define JEMALLOC_VERSION_NREV 0 +#define JEMALLOC_VERSION_GID "54eaed1d8b56b1aa528be3bdd1877e59c56fa90c" +#define JEMALLOC_VERSION_GID_IDENT 54eaed1d8b56b1aa528be3bdd1877e59c56fa90c + +#define MALLOCX_LG_ALIGN(la) ((int)(la)) +#if LG_SIZEOF_PTR == 2 +# define MALLOCX_ALIGN(a) ((int)(ffs((int)(a))-1)) +#else +# define MALLOCX_ALIGN(a) \ + ((int)(((size_t)(a) < (size_t)INT_MAX) ? ffs((int)(a))-1 : \ + ffs((int)(((size_t)(a))>>32))+31)) +#endif +#define MALLOCX_ZERO ((int)0x40) +/* + * Bias tcache index bits so that 0 encodes "automatic tcache management", and 1 + * encodes MALLOCX_TCACHE_NONE. + */ +#define MALLOCX_TCACHE(tc) ((int)(((tc)+2) << 8)) +#define MALLOCX_TCACHE_NONE MALLOCX_TCACHE(-1) +/* + * Bias arena index bits so that 0 encodes "use an automatically chosen arena". + */ +#define MALLOCX_ARENA(a) ((((int)(a))+1) << 20) + +/* + * Use as arena index in "arena..{purge,decay,dss}" and + * "stats.arenas..*" mallctl interfaces to select all arenas. This + * definition is intentionally specified in raw decimal format to support + * cpp-based string concatenation, e.g. + * + * #define STRINGIFY_HELPER(x) #x + * #define STRINGIFY(x) STRINGIFY_HELPER(x) + * + * mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, + * 0); + */ +#define MALLCTL_ARENAS_ALL 4096 +/* + * Use as arena index in "stats.arenas..*" mallctl interfaces to select + * destroyed arenas. + */ +#define MALLCTL_ARENAS_DESTROYED 4097 + +#if defined(__cplusplus) && defined(JEMALLOC_USE_CXX_THROW) +# define JEMALLOC_CXX_THROW throw() +#else +# define JEMALLOC_CXX_THROW +#endif + +#if defined(_MSC_VER) +# define JEMALLOC_ATTR(s) +# define JEMALLOC_ALIGNED(s) __declspec(align(s)) +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# ifndef JEMALLOC_EXPORT +# ifdef DLLEXPORT +# define JEMALLOC_EXPORT __declspec(dllexport) +# else +# define JEMALLOC_EXPORT __declspec(dllimport) +# endif +# endif +# define JEMALLOC_FORMAT_ARG(i) +# define JEMALLOC_FORMAT_PRINTF(s, i) +# define JEMALLOC_FALLTHROUGH +# define JEMALLOC_NOINLINE __declspec(noinline) +# ifdef __cplusplus +# define JEMALLOC_NOTHROW __declspec(nothrow) +# else +# define JEMALLOC_NOTHROW +# endif +# define JEMALLOC_SECTION(s) __declspec(allocate(s)) +# define JEMALLOC_RESTRICT_RETURN __declspec(restrict) +# if _MSC_VER >= 1900 && !defined(__EDG__) +# define JEMALLOC_ALLOCATOR __declspec(allocator) +# else +# define JEMALLOC_ALLOCATOR +# endif +# define JEMALLOC_COLD +#elif defined(JEMALLOC_HAVE_ATTR) +# define JEMALLOC_ATTR(s) __attribute__((s)) +# define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) +# ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +# define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2)) +# else +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# endif +# ifndef JEMALLOC_EXPORT +# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FORMAT_ARG +# define JEMALLOC_FORMAT_ARG(i) JEMALLOC_ATTR(__format_arg__(3)) +# else +# define JEMALLOC_FORMAT_ARG(i) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF +# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(gnu_printf, s, i)) +# elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF) +# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(printf, s, i)) +# else +# define JEMALLOC_FORMAT_PRINTF(s, i) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FALLTHROUGH +# define JEMALLOC_FALLTHROUGH JEMALLOC_ATTR(fallthrough) +# else +# define JEMALLOC_FALLTHROUGH +# endif +# define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) +# define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) +# define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) +# define JEMALLOC_RESTRICT_RETURN +# define JEMALLOC_ALLOCATOR +# ifdef JEMALLOC_HAVE_ATTR_COLD +# define JEMALLOC_COLD JEMALLOC_ATTR(__cold__) +# else +# define JEMALLOC_COLD +# endif +#else +# define JEMALLOC_ATTR(s) +# define JEMALLOC_ALIGNED(s) +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# define JEMALLOC_EXPORT +# define JEMALLOC_FORMAT_PRINTF(s, i) +# define JEMALLOC_FALLTHROUGH +# define JEMALLOC_NOINLINE +# define JEMALLOC_NOTHROW +# define JEMALLOC_SECTION(s) +# define JEMALLOC_RESTRICT_RETURN +# define JEMALLOC_ALLOCATOR +# define JEMALLOC_COLD +#endif + +#if (defined(__APPLE__) || defined(__FreeBSD__)) && !defined(JEMALLOC_NO_RENAME) +# define JEMALLOC_SYS_NOTHROW +#else +# define JEMALLOC_SYS_NOTHROW JEMALLOC_NOTHROW +#endif + +/* + * The je_ prefix on the following public symbol declarations is an artifact + * of namespace management, and should be omitted in application code unless + * JEMALLOC_NO_DEMANGLE is defined (see jemalloc_mangle.h). + */ +extern JEMALLOC_EXPORT const char *je_malloc_conf; +extern JEMALLOC_EXPORT void (*je_malloc_message)(void *cbopaque, + const char *s); + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_malloc(size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_calloc(size_t num, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2); +JEMALLOC_EXPORT int JEMALLOC_SYS_NOTHROW je_posix_memalign( + void **memptr, size_t alignment, size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(nonnull(1)); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_aligned_alloc(size_t alignment, + size_t size) JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc) + JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_realloc(void *ptr, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT void JEMALLOC_SYS_NOTHROW je_free(void *ptr) + JEMALLOC_CXX_THROW; + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *je_mallocx(size_t size, int flags) + JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1); +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_NOTHROW *je_rallocx(void *ptr, size_t size, + int flags) JEMALLOC_ALLOC_SIZE(2); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_xallocx(void *ptr, size_t size, + size_t extra, int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_sallocx(const void *ptr, + int flags) JEMALLOC_ATTR(pure); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_dallocx(void *ptr, int flags); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_sdallocx(void *ptr, size_t size, + int flags); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_nallocx(size_t size, int flags) + JEMALLOC_ATTR(pure); + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctl(const char *name, + void *oldp, size_t *oldlenp, void *newp, size_t newlen); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlnametomib(const char *name, + size_t *mibp, size_t *miblenp); +JEMALLOC_EXPORT int JEMALLOC_NOTHROW je_mallctlbymib(const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen); +JEMALLOC_EXPORT void JEMALLOC_NOTHROW je_malloc_stats_print( + void (*write_cb)(void *, const char *), void *je_cbopaque, + const char *opts); +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_usable_size( + JEMALLOC_USABLE_SIZE_CONST void *ptr) JEMALLOC_CXX_THROW; +#ifdef JEMALLOC_HAVE_MALLOC_SIZE +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW je_malloc_size( + const void *ptr); +#endif + +#ifdef JEMALLOC_OVERRIDE_MEMALIGN +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_memalign(size_t alignment, size_t size) + JEMALLOC_CXX_THROW JEMALLOC_ATTR(malloc); +#endif + +#ifdef JEMALLOC_OVERRIDE_VALLOC +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN + void JEMALLOC_SYS_NOTHROW *je_valloc(size_t size) JEMALLOC_CXX_THROW + JEMALLOC_ATTR(malloc); +#endif + +typedef struct extent_hooks_s extent_hooks_t; + +/* + * void * + * extent_alloc(extent_hooks_t *extent_hooks, void *new_addr, size_t size, + * size_t alignment, bool *zero, bool *commit, unsigned arena_ind); + */ +typedef void *(extent_alloc_t)(extent_hooks_t *, void *, size_t, size_t, bool *, + bool *, unsigned); + +/* + * bool + * extent_dalloc(extent_hooks_t *extent_hooks, void *addr, size_t size, + * bool committed, unsigned arena_ind); + */ +typedef bool (extent_dalloc_t)(extent_hooks_t *, void *, size_t, bool, + unsigned); + +/* + * void + * extent_destroy(extent_hooks_t *extent_hooks, void *addr, size_t size, + * bool committed, unsigned arena_ind); + */ +typedef void (extent_destroy_t)(extent_hooks_t *, void *, size_t, bool, + unsigned); + +/* + * bool + * extent_commit(extent_hooks_t *extent_hooks, void *addr, size_t size, + * size_t offset, size_t length, unsigned arena_ind); + */ +typedef bool (extent_commit_t)(extent_hooks_t *, void *, size_t, size_t, size_t, + unsigned); + +/* + * bool + * extent_decommit(extent_hooks_t *extent_hooks, void *addr, size_t size, + * size_t offset, size_t length, unsigned arena_ind); + */ +typedef bool (extent_decommit_t)(extent_hooks_t *, void *, size_t, size_t, + size_t, unsigned); + +/* + * bool + * extent_purge(extent_hooks_t *extent_hooks, void *addr, size_t size, + * size_t offset, size_t length, unsigned arena_ind); + */ +typedef bool (extent_purge_t)(extent_hooks_t *, void *, size_t, size_t, size_t, + unsigned); + +/* + * bool + * extent_split(extent_hooks_t *extent_hooks, void *addr, size_t size, + * size_t size_a, size_t size_b, bool committed, unsigned arena_ind); + */ +typedef bool (extent_split_t)(extent_hooks_t *, void *, size_t, size_t, size_t, + bool, unsigned); + +/* + * bool + * extent_merge(extent_hooks_t *extent_hooks, void *addr_a, size_t size_a, + * void *addr_b, size_t size_b, bool committed, unsigned arena_ind); + */ +typedef bool (extent_merge_t)(extent_hooks_t *, void *, size_t, void *, size_t, + bool, unsigned); + +struct extent_hooks_s { + extent_alloc_t *alloc; + extent_dalloc_t *dalloc; + extent_destroy_t *destroy; + extent_commit_t *commit; + extent_decommit_t *decommit; + extent_purge_t *purge_lazy; + extent_purge_t *purge_forced; + extent_split_t *split; + extent_merge_t *merge; +}; + +/* + * By default application code must explicitly refer to mangled symbol names, + * so that it is possible to use jemalloc in conjunction with another allocator + * in the same application. Define JEMALLOC_MANGLE in order to cause automatic + * name mangling that matches the API prefixing that happened as a result of + * --with-mangling and/or --with-jemalloc-prefix configuration settings. + */ +#ifdef JEMALLOC_MANGLE +# ifndef JEMALLOC_NO_DEMANGLE +# define JEMALLOC_NO_DEMANGLE +# endif +# define aligned_alloc je_aligned_alloc +# define calloc je_calloc +# define dallocx je_dallocx +# define free je_free +# define mallctl je_mallctl +# define mallctlbymib je_mallctlbymib +# define mallctlnametomib je_mallctlnametomib +# define malloc je_malloc +# define malloc_conf je_malloc_conf +# define malloc_conf_2_conf_harder je_malloc_conf_2_conf_harder +# define malloc_message je_malloc_message +# define malloc_stats_print je_malloc_stats_print +# define malloc_usable_size je_malloc_usable_size +# define mallocx je_mallocx +# define smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c je_smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c +# define nallocx je_nallocx +# define posix_memalign je_posix_memalign +# define rallocx je_rallocx +# define realloc je_realloc +# define sallocx je_sallocx +# define sdallocx je_sdallocx +# define xallocx je_xallocx +#endif + +/* + * The je_* macros can be used as stable alternative names for the + * public jemalloc API if JEMALLOC_NO_DEMANGLE is defined. This is primarily + * meant for use in jemalloc itself, but it can be used by application code to + * provide isolation from the name mangling specified via --with-mangling + * and/or --with-jemalloc-prefix. + */ +#ifndef JEMALLOC_NO_DEMANGLE +# undef je_aligned_alloc +# undef je_calloc +# undef je_dallocx +# undef je_free +# undef je_mallctl +# undef je_mallctlbymib +# undef je_mallctlnametomib +# undef je_malloc +# undef je_malloc_conf +# undef je_malloc_conf_2_conf_harder +# undef je_malloc_message +# undef je_malloc_stats_print +# undef je_malloc_usable_size +# undef je_mallocx +# undef je_smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c +# undef je_nallocx +# undef je_posix_memalign +# undef je_rallocx +# undef je_realloc +# undef je_sallocx +# undef je_sdallocx +# undef je_xallocx +#endif + +#ifdef __cplusplus +} +#endif +#endif /* JEMALLOC_H_ */ diff --git a/BeefRT/JEMalloc/include/jemalloc/jemalloc.sh b/BeefRT/JEMalloc/include/jemalloc/jemalloc.sh new file mode 100644 index 00000000..b19b1548 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/jemalloc.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +objroot=$1 + +cat < +#include +#include +#include +#include + +#define JEMALLOC_VERSION "5.3.0-0-g54eaed1d8b56b1aa528be3bdd1877e59c56fa90c" +#define JEMALLOC_VERSION_MAJOR 5 +#define JEMALLOC_VERSION_MINOR 3 +#define JEMALLOC_VERSION_BUGFIX 0 +#define JEMALLOC_VERSION_NREV 0 +#define JEMALLOC_VERSION_GID "54eaed1d8b56b1aa528be3bdd1877e59c56fa90c" +#define JEMALLOC_VERSION_GID_IDENT 54eaed1d8b56b1aa528be3bdd1877e59c56fa90c + +#define MALLOCX_LG_ALIGN(la) ((int)(la)) +#if LG_SIZEOF_PTR == 2 +# define MALLOCX_ALIGN(a) ((int)(ffs((int)(a))-1)) +#else +# define MALLOCX_ALIGN(a) \ + ((int)(((size_t)(a) < (size_t)INT_MAX) ? ffs((int)(a))-1 : \ + ffs((int)(((size_t)(a))>>32))+31)) +#endif +#define MALLOCX_ZERO ((int)0x40) +/* + * Bias tcache index bits so that 0 encodes "automatic tcache management", and 1 + * encodes MALLOCX_TCACHE_NONE. + */ +#define MALLOCX_TCACHE(tc) ((int)(((tc)+2) << 8)) +#define MALLOCX_TCACHE_NONE MALLOCX_TCACHE(-1) +/* + * Bias arena index bits so that 0 encodes "use an automatically chosen arena". + */ +#define MALLOCX_ARENA(a) ((((int)(a))+1) << 20) + +/* + * Use as arena index in "arena..{purge,decay,dss}" and + * "stats.arenas..*" mallctl interfaces to select all arenas. This + * definition is intentionally specified in raw decimal format to support + * cpp-based string concatenation, e.g. + * + * #define STRINGIFY_HELPER(x) #x + * #define STRINGIFY(x) STRINGIFY_HELPER(x) + * + * mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, + * 0); + */ +#define MALLCTL_ARENAS_ALL 4096 +/* + * Use as arena index in "stats.arenas..*" mallctl interfaces to select + * destroyed arenas. + */ +#define MALLCTL_ARENAS_DESTROYED 4097 + +#if defined(__cplusplus) && defined(JEMALLOC_USE_CXX_THROW) +# define JEMALLOC_CXX_THROW throw() +#else +# define JEMALLOC_CXX_THROW +#endif + +#if defined(_MSC_VER) +# define JEMALLOC_ATTR(s) +# define JEMALLOC_ALIGNED(s) __declspec(align(s)) +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# ifndef JEMALLOC_EXPORT +# ifdef DLLEXPORT +# define JEMALLOC_EXPORT __declspec(dllexport) +# else +# define JEMALLOC_EXPORT __declspec(dllimport) +# endif +# endif +# define JEMALLOC_FORMAT_ARG(i) +# define JEMALLOC_FORMAT_PRINTF(s, i) +# define JEMALLOC_FALLTHROUGH +# define JEMALLOC_NOINLINE __declspec(noinline) +# ifdef __cplusplus +# define JEMALLOC_NOTHROW __declspec(nothrow) +# else +# define JEMALLOC_NOTHROW +# endif +# define JEMALLOC_SECTION(s) __declspec(allocate(s)) +# define JEMALLOC_RESTRICT_RETURN __declspec(restrict) +# if _MSC_VER >= 1900 && !defined(__EDG__) +# define JEMALLOC_ALLOCATOR __declspec(allocator) +# else +# define JEMALLOC_ALLOCATOR +# endif +# define JEMALLOC_COLD +#elif defined(JEMALLOC_HAVE_ATTR) +# define JEMALLOC_ATTR(s) __attribute__((s)) +# define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) +# ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +# define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2)) +# else +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# endif +# ifndef JEMALLOC_EXPORT +# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FORMAT_ARG +# define JEMALLOC_FORMAT_ARG(i) JEMALLOC_ATTR(__format_arg__(3)) +# else +# define JEMALLOC_FORMAT_ARG(i) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF +# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(gnu_printf, s, i)) +# elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF) +# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(printf, s, i)) +# else +# define JEMALLOC_FORMAT_PRINTF(s, i) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FALLTHROUGH +# define JEMALLOC_FALLTHROUGH JEMALLOC_ATTR(fallthrough) +# else +# define JEMALLOC_FALLTHROUGH +# endif +# define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) +# define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) +# define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) +# define JEMALLOC_RESTRICT_RETURN +# define JEMALLOC_ALLOCATOR +# ifdef JEMALLOC_HAVE_ATTR_COLD +# define JEMALLOC_COLD JEMALLOC_ATTR(__cold__) +# else +# define JEMALLOC_COLD +# endif +#else +# define JEMALLOC_ATTR(s) +# define JEMALLOC_ALIGNED(s) +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# define JEMALLOC_EXPORT +# define JEMALLOC_FORMAT_PRINTF(s, i) +# define JEMALLOC_FALLTHROUGH +# define JEMALLOC_NOINLINE +# define JEMALLOC_NOTHROW +# define JEMALLOC_SECTION(s) +# define JEMALLOC_RESTRICT_RETURN +# define JEMALLOC_ALLOCATOR +# define JEMALLOC_COLD +#endif + +#if (defined(__APPLE__) || defined(__FreeBSD__)) && !defined(JEMALLOC_NO_RENAME) +# define JEMALLOC_SYS_NOTHROW +#else +# define JEMALLOC_SYS_NOTHROW JEMALLOC_NOTHROW +#endif diff --git a/BeefRT/JEMalloc/include/jemalloc/jemalloc_macros.h.in b/BeefRT/JEMalloc/include/jemalloc/jemalloc_macros.h.in new file mode 100644 index 00000000..ebb3137e --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/jemalloc_macros.h.in @@ -0,0 +1,149 @@ +#include +#include +#include +#include +#include + +#define JEMALLOC_VERSION "@jemalloc_version@" +#define JEMALLOC_VERSION_MAJOR @jemalloc_version_major@ +#define JEMALLOC_VERSION_MINOR @jemalloc_version_minor@ +#define JEMALLOC_VERSION_BUGFIX @jemalloc_version_bugfix@ +#define JEMALLOC_VERSION_NREV @jemalloc_version_nrev@ +#define JEMALLOC_VERSION_GID "@jemalloc_version_gid@" +#define JEMALLOC_VERSION_GID_IDENT @jemalloc_version_gid@ + +#define MALLOCX_LG_ALIGN(la) ((int)(la)) +#if LG_SIZEOF_PTR == 2 +# define MALLOCX_ALIGN(a) ((int)(ffs((int)(a))-1)) +#else +# define MALLOCX_ALIGN(a) \ + ((int)(((size_t)(a) < (size_t)INT_MAX) ? ffs((int)(a))-1 : \ + ffs((int)(((size_t)(a))>>32))+31)) +#endif +#define MALLOCX_ZERO ((int)0x40) +/* + * Bias tcache index bits so that 0 encodes "automatic tcache management", and 1 + * encodes MALLOCX_TCACHE_NONE. + */ +#define MALLOCX_TCACHE(tc) ((int)(((tc)+2) << 8)) +#define MALLOCX_TCACHE_NONE MALLOCX_TCACHE(-1) +/* + * Bias arena index bits so that 0 encodes "use an automatically chosen arena". + */ +#define MALLOCX_ARENA(a) ((((int)(a))+1) << 20) + +/* + * Use as arena index in "arena..{purge,decay,dss}" and + * "stats.arenas..*" mallctl interfaces to select all arenas. This + * definition is intentionally specified in raw decimal format to support + * cpp-based string concatenation, e.g. + * + * #define STRINGIFY_HELPER(x) #x + * #define STRINGIFY(x) STRINGIFY_HELPER(x) + * + * mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, + * 0); + */ +#define MALLCTL_ARENAS_ALL 4096 +/* + * Use as arena index in "stats.arenas..*" mallctl interfaces to select + * destroyed arenas. + */ +#define MALLCTL_ARENAS_DESTROYED 4097 + +#if defined(__cplusplus) && defined(JEMALLOC_USE_CXX_THROW) +# define JEMALLOC_CXX_THROW throw() +#else +# define JEMALLOC_CXX_THROW +#endif + +#if defined(_MSC_VER) +# define JEMALLOC_ATTR(s) +# define JEMALLOC_ALIGNED(s) __declspec(align(s)) +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# ifndef JEMALLOC_EXPORT +# ifdef DLLEXPORT +# define JEMALLOC_EXPORT __declspec(dllexport) +# else +# define JEMALLOC_EXPORT __declspec(dllimport) +# endif +# endif +# define JEMALLOC_FORMAT_ARG(i) +# define JEMALLOC_FORMAT_PRINTF(s, i) +# define JEMALLOC_FALLTHROUGH +# define JEMALLOC_NOINLINE __declspec(noinline) +# ifdef __cplusplus +# define JEMALLOC_NOTHROW __declspec(nothrow) +# else +# define JEMALLOC_NOTHROW +# endif +# define JEMALLOC_SECTION(s) __declspec(allocate(s)) +# define JEMALLOC_RESTRICT_RETURN __declspec(restrict) +# if _MSC_VER >= 1900 && !defined(__EDG__) +# define JEMALLOC_ALLOCATOR __declspec(allocator) +# else +# define JEMALLOC_ALLOCATOR +# endif +# define JEMALLOC_COLD +#elif defined(JEMALLOC_HAVE_ATTR) +# define JEMALLOC_ATTR(s) __attribute__((s)) +# define JEMALLOC_ALIGNED(s) JEMALLOC_ATTR(aligned(s)) +# ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +# define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) JEMALLOC_ATTR(alloc_size(s1, s2)) +# else +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# endif +# ifndef JEMALLOC_EXPORT +# define JEMALLOC_EXPORT JEMALLOC_ATTR(visibility("default")) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FORMAT_ARG +# define JEMALLOC_FORMAT_ARG(i) JEMALLOC_ATTR(__format_arg__(3)) +# else +# define JEMALLOC_FORMAT_ARG(i) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FORMAT_GNU_PRINTF +# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(gnu_printf, s, i)) +# elif defined(JEMALLOC_HAVE_ATTR_FORMAT_PRINTF) +# define JEMALLOC_FORMAT_PRINTF(s, i) JEMALLOC_ATTR(format(printf, s, i)) +# else +# define JEMALLOC_FORMAT_PRINTF(s, i) +# endif +# ifdef JEMALLOC_HAVE_ATTR_FALLTHROUGH +# define JEMALLOC_FALLTHROUGH JEMALLOC_ATTR(fallthrough) +# else +# define JEMALLOC_FALLTHROUGH +# endif +# define JEMALLOC_NOINLINE JEMALLOC_ATTR(noinline) +# define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) +# define JEMALLOC_SECTION(s) JEMALLOC_ATTR(section(s)) +# define JEMALLOC_RESTRICT_RETURN +# define JEMALLOC_ALLOCATOR +# ifdef JEMALLOC_HAVE_ATTR_COLD +# define JEMALLOC_COLD JEMALLOC_ATTR(__cold__) +# else +# define JEMALLOC_COLD +# endif +#else +# define JEMALLOC_ATTR(s) +# define JEMALLOC_ALIGNED(s) +# define JEMALLOC_ALLOC_SIZE(s) +# define JEMALLOC_ALLOC_SIZE2(s1, s2) +# define JEMALLOC_EXPORT +# define JEMALLOC_FORMAT_PRINTF(s, i) +# define JEMALLOC_FALLTHROUGH +# define JEMALLOC_NOINLINE +# define JEMALLOC_NOTHROW +# define JEMALLOC_SECTION(s) +# define JEMALLOC_RESTRICT_RETURN +# define JEMALLOC_ALLOCATOR +# define JEMALLOC_COLD +#endif + +#if (defined(__APPLE__) || defined(__FreeBSD__)) && !defined(JEMALLOC_NO_RENAME) +# define JEMALLOC_SYS_NOTHROW +#else +# define JEMALLOC_SYS_NOTHROW JEMALLOC_NOTHROW +#endif diff --git a/BeefRT/JEMalloc/include/jemalloc/jemalloc_mangle.h b/BeefRT/JEMalloc/include/jemalloc/jemalloc_mangle.h new file mode 100644 index 00000000..007cb280 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/jemalloc_mangle.h @@ -0,0 +1,66 @@ +/* + * By default application code must explicitly refer to mangled symbol names, + * so that it is possible to use jemalloc in conjunction with another allocator + * in the same application. Define JEMALLOC_MANGLE in order to cause automatic + * name mangling that matches the API prefixing that happened as a result of + * --with-mangling and/or --with-jemalloc-prefix configuration settings. + */ +#ifdef JEMALLOC_MANGLE +# ifndef JEMALLOC_NO_DEMANGLE +# define JEMALLOC_NO_DEMANGLE +# endif +# define aligned_alloc je_aligned_alloc +# define calloc je_calloc +# define dallocx je_dallocx +# define free je_free +# define mallctl je_mallctl +# define mallctlbymib je_mallctlbymib +# define mallctlnametomib je_mallctlnametomib +# define malloc je_malloc +# define malloc_conf je_malloc_conf +# define malloc_conf_2_conf_harder je_malloc_conf_2_conf_harder +# define malloc_message je_malloc_message +# define malloc_stats_print je_malloc_stats_print +# define malloc_usable_size je_malloc_usable_size +# define mallocx je_mallocx +# define smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c je_smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c +# define nallocx je_nallocx +# define posix_memalign je_posix_memalign +# define rallocx je_rallocx +# define realloc je_realloc +# define sallocx je_sallocx +# define sdallocx je_sdallocx +# define xallocx je_xallocx +#endif + +/* + * The je_* macros can be used as stable alternative names for the + * public jemalloc API if JEMALLOC_NO_DEMANGLE is defined. This is primarily + * meant for use in jemalloc itself, but it can be used by application code to + * provide isolation from the name mangling specified via --with-mangling + * and/or --with-jemalloc-prefix. + */ +#ifndef JEMALLOC_NO_DEMANGLE +# undef je_aligned_alloc +# undef je_calloc +# undef je_dallocx +# undef je_free +# undef je_mallctl +# undef je_mallctlbymib +# undef je_mallctlnametomib +# undef je_malloc +# undef je_malloc_conf +# undef je_malloc_conf_2_conf_harder +# undef je_malloc_message +# undef je_malloc_stats_print +# undef je_malloc_usable_size +# undef je_mallocx +# undef je_smallocx_54eaed1d8b56b1aa528be3bdd1877e59c56fa90c +# undef je_nallocx +# undef je_posix_memalign +# undef je_rallocx +# undef je_realloc +# undef je_sallocx +# undef je_sdallocx +# undef je_xallocx +#endif diff --git a/BeefRT/JEMalloc/include/jemalloc/jemalloc_mangle.sh b/BeefRT/JEMalloc/include/jemalloc/jemalloc_mangle.sh new file mode 100644 index 00000000..c675bb46 --- /dev/null +++ b/BeefRT/JEMalloc/include/jemalloc/jemalloc_mangle.sh @@ -0,0 +1,45 @@ +#!/bin/sh -eu + +public_symbols_txt=$1 +symbol_prefix=$2 + +cat < + +/* MSVC doesn't define _Bool or bool in C, but does have BOOL */ +/* Note this doesn't pass autoconf's test because (bool) 0.5 != true */ +/* Clang-cl uses MSVC headers, so needs msvc_compat, but has _Bool as + * a built-in type. */ +#ifndef __clang__ +typedef BOOL _Bool; +#endif + +#define bool _Bool +#define true 1 +#define false 0 + +#define __bool_true_false_are_defined 1 + +#endif /* stdbool_h */ diff --git a/BeefRT/JEMalloc/include/msvc_compat/C99/stdint.h b/BeefRT/JEMalloc/include/msvc_compat/C99/stdint.h new file mode 100644 index 00000000..d02608a5 --- /dev/null +++ b/BeefRT/JEMalloc/include/msvc_compat/C99/stdint.h @@ -0,0 +1,247 @@ +// ISO C9x compliant stdint.h for Microsoft Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// +// Copyright (c) 2006-2008 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The name of the author may be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" +#endif // _MSC_VER ] + +#ifndef _MSC_STDINT_H_ // [ +#define _MSC_STDINT_H_ + +#if _MSC_VER > 1000 +#pragma once +#endif + +#include + +// For Visual Studio 6 in C++ mode and for many Visual Studio versions when +// compiling for ARM we should wrap include with 'extern "C++" {}' +// or compiler give many errors like this: +// error C2733: second C linkage of overloaded function 'wmemchr' not allowed +#ifdef __cplusplus +extern "C" { +#endif +# include +#ifdef __cplusplus +} +#endif + +// Define _W64 macros to mark types changing their size, like intptr_t. +#ifndef _W64 +# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 +# define _W64 __w64 +# else +# define _W64 +# endif +#endif + + +// 7.18.1 Integer types + +// 7.18.1.1 Exact-width integer types + +// Visual Studio 6 and Embedded Visual C++ 4 doesn't +// realize that, e.g. char has the same size as __int8 +// so we give up on __intX for them. +#if (_MSC_VER < 1300) + typedef signed char int8_t; + typedef signed short int16_t; + typedef signed int int32_t; + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; +#else + typedef signed __int8 int8_t; + typedef signed __int16 int16_t; + typedef signed __int32 int32_t; + typedef unsigned __int8 uint8_t; + typedef unsigned __int16 uint16_t; + typedef unsigned __int32 uint32_t; +#endif +typedef signed __int64 int64_t; +typedef unsigned __int64 uint64_t; + + +// 7.18.1.2 Minimum-width integer types +typedef int8_t int_least8_t; +typedef int16_t int_least16_t; +typedef int32_t int_least32_t; +typedef int64_t int_least64_t; +typedef uint8_t uint_least8_t; +typedef uint16_t uint_least16_t; +typedef uint32_t uint_least32_t; +typedef uint64_t uint_least64_t; + +// 7.18.1.3 Fastest minimum-width integer types +typedef int8_t int_fast8_t; +typedef int16_t int_fast16_t; +typedef int32_t int_fast32_t; +typedef int64_t int_fast64_t; +typedef uint8_t uint_fast8_t; +typedef uint16_t uint_fast16_t; +typedef uint32_t uint_fast32_t; +typedef uint64_t uint_fast64_t; + +// 7.18.1.4 Integer types capable of holding object pointers +#ifdef _WIN64 // [ + typedef signed __int64 intptr_t; + typedef unsigned __int64 uintptr_t; +#else // _WIN64 ][ + typedef _W64 signed int intptr_t; + typedef _W64 unsigned int uintptr_t; +#endif // _WIN64 ] + +// 7.18.1.5 Greatest-width integer types +typedef int64_t intmax_t; +typedef uint64_t uintmax_t; + + +// 7.18.2 Limits of specified-width integer types + +#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 + +// 7.18.2.1 Limits of exact-width integer types +#define INT8_MIN ((int8_t)_I8_MIN) +#define INT8_MAX _I8_MAX +#define INT16_MIN ((int16_t)_I16_MIN) +#define INT16_MAX _I16_MAX +#define INT32_MIN ((int32_t)_I32_MIN) +#define INT32_MAX _I32_MAX +#define INT64_MIN ((int64_t)_I64_MIN) +#define INT64_MAX _I64_MAX +#define UINT8_MAX _UI8_MAX +#define UINT16_MAX _UI16_MAX +#define UINT32_MAX _UI32_MAX +#define UINT64_MAX _UI64_MAX + +// 7.18.2.2 Limits of minimum-width integer types +#define INT_LEAST8_MIN INT8_MIN +#define INT_LEAST8_MAX INT8_MAX +#define INT_LEAST16_MIN INT16_MIN +#define INT_LEAST16_MAX INT16_MAX +#define INT_LEAST32_MIN INT32_MIN +#define INT_LEAST32_MAX INT32_MAX +#define INT_LEAST64_MIN INT64_MIN +#define INT_LEAST64_MAX INT64_MAX +#define UINT_LEAST8_MAX UINT8_MAX +#define UINT_LEAST16_MAX UINT16_MAX +#define UINT_LEAST32_MAX UINT32_MAX +#define UINT_LEAST64_MAX UINT64_MAX + +// 7.18.2.3 Limits of fastest minimum-width integer types +#define INT_FAST8_MIN INT8_MIN +#define INT_FAST8_MAX INT8_MAX +#define INT_FAST16_MIN INT16_MIN +#define INT_FAST16_MAX INT16_MAX +#define INT_FAST32_MIN INT32_MIN +#define INT_FAST32_MAX INT32_MAX +#define INT_FAST64_MIN INT64_MIN +#define INT_FAST64_MAX INT64_MAX +#define UINT_FAST8_MAX UINT8_MAX +#define UINT_FAST16_MAX UINT16_MAX +#define UINT_FAST32_MAX UINT32_MAX +#define UINT_FAST64_MAX UINT64_MAX + +// 7.18.2.4 Limits of integer types capable of holding object pointers +#ifdef _WIN64 // [ +# define INTPTR_MIN INT64_MIN +# define INTPTR_MAX INT64_MAX +# define UINTPTR_MAX UINT64_MAX +#else // _WIN64 ][ +# define INTPTR_MIN INT32_MIN +# define INTPTR_MAX INT32_MAX +# define UINTPTR_MAX UINT32_MAX +#endif // _WIN64 ] + +// 7.18.2.5 Limits of greatest-width integer types +#define INTMAX_MIN INT64_MIN +#define INTMAX_MAX INT64_MAX +#define UINTMAX_MAX UINT64_MAX + +// 7.18.3 Limits of other integer types + +#ifdef _WIN64 // [ +# define PTRDIFF_MIN _I64_MIN +# define PTRDIFF_MAX _I64_MAX +#else // _WIN64 ][ +# define PTRDIFF_MIN _I32_MIN +# define PTRDIFF_MAX _I32_MAX +#endif // _WIN64 ] + +#define SIG_ATOMIC_MIN INT_MIN +#define SIG_ATOMIC_MAX INT_MAX + +#ifndef SIZE_MAX // [ +# ifdef _WIN64 // [ +# define SIZE_MAX _UI64_MAX +# else // _WIN64 ][ +# define SIZE_MAX _UI32_MAX +# endif // _WIN64 ] +#endif // SIZE_MAX ] + +// WCHAR_MIN and WCHAR_MAX are also defined in +#ifndef WCHAR_MIN // [ +# define WCHAR_MIN 0 +#endif // WCHAR_MIN ] +#ifndef WCHAR_MAX // [ +# define WCHAR_MAX _UI16_MAX +#endif // WCHAR_MAX ] + +#define WINT_MIN 0 +#define WINT_MAX _UI16_MAX + +#endif // __STDC_LIMIT_MACROS ] + + +// 7.18.4 Limits of other integer types + +#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 + +// 7.18.4.1 Macros for minimum-width integer constants + +#define INT8_C(val) val##i8 +#define INT16_C(val) val##i16 +#define INT32_C(val) val##i32 +#define INT64_C(val) val##i64 + +#define UINT8_C(val) val##ui8 +#define UINT16_C(val) val##ui16 +#define UINT32_C(val) val##ui32 +#define UINT64_C(val) val##ui64 + +// 7.18.4.2 Macros for greatest-width integer constants +#define INTMAX_C INT64_C +#define UINTMAX_C UINT64_C + +#endif // __STDC_CONSTANT_MACROS ] + + +#endif // _MSC_STDINT_H_ ] diff --git a/BeefRT/JEMalloc/include/msvc_compat/strings.h b/BeefRT/JEMalloc/include/msvc_compat/strings.h new file mode 100644 index 00000000..996f256c --- /dev/null +++ b/BeefRT/JEMalloc/include/msvc_compat/strings.h @@ -0,0 +1,58 @@ +#ifndef strings_h +#define strings_h + +/* MSVC doesn't define ffs/ffsl. This dummy strings.h header is provided + * for both */ +#ifdef _MSC_VER +# include +# pragma intrinsic(_BitScanForward) +static __forceinline int ffsl(long x) { + unsigned long i; + + if (_BitScanForward(&i, x)) { + return i + 1; + } + return 0; +} + +static __forceinline int ffs(int x) { + return ffsl(x); +} + +# ifdef _M_X64 +# pragma intrinsic(_BitScanForward64) +# endif + +static __forceinline int ffsll(unsigned __int64 x) { + unsigned long i; +#ifdef _M_X64 + if (_BitScanForward64(&i, x)) { + return i + 1; + } + return 0; +#else +// Fallback for 32-bit build where 64-bit version not available +// assuming little endian + union { + unsigned __int64 ll; + unsigned long l[2]; + } s; + + s.ll = x; + + if (_BitScanForward(&i, s.l[0])) { + return i + 1; + } else if(_BitScanForward(&i, s.l[1])) { + return i + 33; + } + return 0; +#endif +} + +#else +# define ffsll(x) __builtin_ffsll(x) +# define ffsl(x) __builtin_ffsl(x) +# define ffs(x) __builtin_ffs(x) +#endif + +#endif /* strings_h */ diff --git a/BeefRT/JEMalloc/include/msvc_compat/windows_extra.h b/BeefRT/JEMalloc/include/msvc_compat/windows_extra.h new file mode 100644 index 00000000..a6ebb930 --- /dev/null +++ b/BeefRT/JEMalloc/include/msvc_compat/windows_extra.h @@ -0,0 +1,6 @@ +#ifndef MSVC_COMPAT_WINDOWS_EXTRA_H +#define MSVC_COMPAT_WINDOWS_EXTRA_H + +#include + +#endif /* MSVC_COMPAT_WINDOWS_EXTRA_H */ diff --git a/BeefRT/JEMalloc/jemalloc.pc.in b/BeefRT/JEMalloc/jemalloc.pc.in new file mode 100644 index 00000000..c428a86d --- /dev/null +++ b/BeefRT/JEMalloc/jemalloc.pc.in @@ -0,0 +1,12 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ +install_suffix=@install_suffix@ + +Name: jemalloc +Description: A general purpose malloc(3) implementation that emphasizes fragmentation avoidance and scalable concurrency support. +URL: http://jemalloc.net/ +Version: @jemalloc_version_major@.@jemalloc_version_minor@.@jemalloc_version_bugfix@_@jemalloc_version_nrev@ +Cflags: -I${includedir} +Libs: -L${libdir} -ljemalloc${install_suffix} diff --git a/BeefRT/JEMalloc/jemalloc.sln b/BeefRT/JEMalloc/jemalloc.sln new file mode 100644 index 00000000..c9977546 --- /dev/null +++ b/BeefRT/JEMalloc/jemalloc.sln @@ -0,0 +1,58 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.32413.511 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{70A99006-6DE9-472B-8F83-4CEE6C616DF3}" + ProjectSection(SolutionItems) = preProject + ReadMe.txt = ReadMe.txt + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "jemalloc", "jemalloc.vcxproj", "{8D6BB292-9E1C-413D-9F98-4864BDC1514A}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug Static CStatic|x64 = Debug Static CStatic|x64 + Debug Static CStatic|x86 = Debug Static CStatic|x86 + Debug Static|x64 = Debug Static|x64 + Debug Static|x86 = Debug Static|x86 + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release Static CStatic|x64 = Release Static CStatic|x64 + Release Static CStatic|x86 = Release Static CStatic|x86 + Release Static|x64 = Release Static|x64 + Release Static|x86 = Release Static|x86 + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Debug Static CStatic|x64.ActiveCfg = Debug Static CStatic|x64 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Debug Static CStatic|x64.Build.0 = Debug Static CStatic|x64 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Debug Static CStatic|x86.ActiveCfg = Debug Static CStatic|x64 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Debug Static|x64.ActiveCfg = Debug Static|x64 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Debug Static|x64.Build.0 = Debug Static|x64 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Debug Static|x86.ActiveCfg = Debug Static|Win32 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Debug Static|x86.Build.0 = Debug Static|Win32 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Debug|x64.ActiveCfg = Debug|x64 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Debug|x64.Build.0 = Debug|x64 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Debug|x86.ActiveCfg = Debug|Win32 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Debug|x86.Build.0 = Debug|Win32 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Release Static CStatic|x64.ActiveCfg = Release Static CStatic|x64 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Release Static CStatic|x64.Build.0 = Release Static CStatic|x64 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Release Static CStatic|x86.ActiveCfg = Release Static CStatic|x64 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Release Static|x64.ActiveCfg = Release Static|x64 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Release Static|x64.Build.0 = Release Static|x64 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Release Static|x86.ActiveCfg = Release Static|Win32 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Release Static|x86.Build.0 = Release Static|Win32 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Release|x64.ActiveCfg = Release|x64 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Release|x64.Build.0 = Release|x64 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Release|x86.ActiveCfg = Release|Win32 + {8D6BB292-9E1C-413D-9F98-4864BDC1514A}.Release|x86.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {8A66A875-4597-49CF-9487-1B1C10B9D5FB} + EndGlobalSection +EndGlobal diff --git a/BeefRT/JEMalloc/jemalloc.vcxproj b/BeefRT/JEMalloc/jemalloc.vcxproj new file mode 100644 index 00000000..561f2802 --- /dev/null +++ b/BeefRT/JEMalloc/jemalloc.vcxproj @@ -0,0 +1,464 @@ + + + + + Debug Static + Win32 + + + Debug Static + x64 + + + Debug Static CStatic + x64 + + + Debug + Win32 + + + Release Static + Win32 + + + Release Static + x64 + + + Release Static CStatic + x64 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + {8D6BB292-9E1C-413D-9F98-4864BDC1514A} + Win32Proj + jemalloc + 10.0 + + + + DynamicLibrary + true + v142 + MultiByte + + + StaticLibrary + true + v142 + MultiByte + + + DynamicLibrary + false + v142 + true + MultiByte + + + StaticLibrary + false + v142 + true + MultiByte + + + DynamicLibrary + true + v142 + MultiByte + + + StaticLibrary + true + v142 + MultiByte + + + StaticLibrary + true + v142 + MultiByte + + + DynamicLibrary + false + v142 + true + MultiByte + + + StaticLibrary + false + v142 + true + MultiByte + + + StaticLibrary + false + v142 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + JEMalloc64_d + $(SolutionDir)\..\..\ide\dist\ + + + JEMalloc64_ssd + $(SolutionDir)\..\..\ide\dist\ + + + JEMalloc64_sd + $(SolutionDir)\..\..\ide\dist\ + + + JEMalloc64 + $(SolutionDir)\..\..\ide\dist\ + false + + + JEMalloc64_ss + $(SolutionDir)\..\..\ide\dist\ + false + + + JEMalloc64_s + $(SolutionDir)\..\..\ide\dist\ + false + + + $(SolutionDir)\..\..\ide\dist\ + JEMalloc32_d + + + $(SolutionDir)\..\..\ide\dist\ + JEMalloc32_ssd + + + $(SolutionDir)\..\..\ide\dist\ + JEMalloc32_sd + + + $(SolutionDir)\..\..\ide\dist\ + JEMalloc32 + + + $(SolutionDir)\..\..\ide\dist\ + JEMalloc32_ss + + + $(SolutionDir)\..\..\ide\dist\ + JEMalloc32_s + + + + + + Level3 + Disabled + _REENTRANT;_WINDLL;DLLEXPORT;JEMALLOC_DEBUG;_DEBUG;%(PreprocessorDefinitions) + include;include\msvc_compat;%(AdditionalIncludeDirectories) + 4090;4146;4267;4334 + $(OutputPath)$(TargetName).pdb + + + Windows + true + + + + + + + Level3 + Disabled + JEMALLOC_DEBUG;_REENTRANT;JEMALLOC_EXPORT=;_DEBUG;_LIB;%(PreprocessorDefinitions) + include;include\msvc_compat;%(AdditionalIncludeDirectories) + MultiThreadedDebug + 4090;4146;4267;4334 + $(OutputPath)$(TargetName).pdb + + + Windows + true + + + + + + + Level3 + Disabled + JEMALLOC_NO_PRIVATE_NAMESPACE;_REENTRANT;_WINDLL;DLLEXPORT;JEMALLOC_DEBUG;_DEBUG;%(PreprocessorDefinitions) + include;include\msvc_compat;%(AdditionalIncludeDirectories) + 4090;4146;4267;4334 + $(OutputPath)$(TargetName).pdb + false + MultiThreadedDebug + + + Windows + true + + + + + + + Level3 + Disabled + JEMALLOC_NO_PRIVATE_NAMESPACE;JEMALLOC_DEBUG;_REENTRANT;JEMALLOC_EXPORT=;_DEBUG;_LIB;%(PreprocessorDefinitions) + include;include\msvc_compat;%(AdditionalIncludeDirectories) + MultiThreadedDebugDLL + 4090;4146;4267;4334 + OldStyle + false + false + + + Windows + true + + + + + + + Level3 + Disabled + JEMALLOC_NO_PRIVATE_NAMESPACE;JEMALLOC_DEBUG;_REENTRANT;JEMALLOC_EXPORT=;_DEBUG;_LIB;%(PreprocessorDefinitions) + include;include\msvc_compat;%(AdditionalIncludeDirectories) + MultiThreadedDebug + 4090;4146;4267;4334 + OldStyle + false + false + + + Windows + true + + + + + Level3 + + + MaxSpeed + true + true + _REENTRANT;_WINDLL;DLLEXPORT;NDEBUG;%(PreprocessorDefinitions) + include;include\msvc_compat;%(AdditionalIncludeDirectories) + 4090;4146;4267;4334 + $(OutputPath)$(TargetName).pdb + + + Windows + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + _REENTRANT;JEMALLOC_EXPORT=;NDEBUG;_LIB;%(PreprocessorDefinitions) + include;include\msvc_compat;%(AdditionalIncludeDirectories) + MultiThreaded + 4090;4146;4267;4334 + $(OutputPath)$(TargetName).pdb + + + Windows + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + include;include\msvc_compat;%(AdditionalIncludeDirectories) + JEMALLOC_NO_PRIVATE_NAMESPACE;_REENTRANT;_WINDLL;DLLEXPORT;NDEBUG;%(PreprocessorDefinitions) + 4090;4146;4267;4334 + $(OutputPath)$(TargetName).pdb + MultiThreaded + + + Windows + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + JEMALLOC_NO_PRIVATE_NAMESPACE;_REENTRANT;JEMALLOC_EXPORT=;NDEBUG;_LIB;%(PreprocessorDefinitions) + include;include\msvc_compat;%(AdditionalIncludeDirectories) + MultiThreadedDLL + 4090;4146;4267;4334 + OldStyle + + + Windows + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + JEMALLOC_NO_PRIVATE_NAMESPACE;_REENTRANT;JEMALLOC_EXPORT=;NDEBUG;_LIB;%(PreprocessorDefinitions) + include;include\msvc_compat;%(AdditionalIncludeDirectories) + MultiThreaded + 4090;4146;4267;4334 + OldStyle + + + Windows + true + true + true + + + + + + \ No newline at end of file diff --git a/BeefRT/JEMalloc/jemalloc.vcxproj.filters b/BeefRT/JEMalloc/jemalloc.vcxproj.filters new file mode 100644 index 00000000..f040f9f2 --- /dev/null +++ b/BeefRT/JEMalloc/jemalloc.vcxproj.filters @@ -0,0 +1,197 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + \ No newline at end of file diff --git a/BeefRT/JEMalloc/src/arena.c b/BeefRT/JEMalloc/src/arena.c new file mode 100644 index 00000000..857b27c5 --- /dev/null +++ b/BeefRT/JEMalloc/src/arena.c @@ -0,0 +1,1891 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/decay.h" +#include "jemalloc/internal/ehooks.h" +#include "jemalloc/internal/extent_dss.h" +#include "jemalloc/internal/extent_mmap.h" +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/safety_check.h" +#include "jemalloc/internal/util.h" + +JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS + +/******************************************************************************/ +/* Data. */ + +/* + * Define names for both unininitialized and initialized phases, so that + * options and mallctl processing are straightforward. + */ +const char *percpu_arena_mode_names[] = { + "percpu", + "phycpu", + "disabled", + "percpu", + "phycpu" +}; +percpu_arena_mode_t opt_percpu_arena = PERCPU_ARENA_DEFAULT; + +ssize_t opt_dirty_decay_ms = DIRTY_DECAY_MS_DEFAULT; +ssize_t opt_muzzy_decay_ms = MUZZY_DECAY_MS_DEFAULT; + +static atomic_zd_t dirty_decay_ms_default; +static atomic_zd_t muzzy_decay_ms_default; + +emap_t arena_emap_global; +pa_central_t arena_pa_central_global; + +div_info_t arena_binind_div_info[SC_NBINS]; + +size_t opt_oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT; +size_t oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT; + +uint32_t arena_bin_offsets[SC_NBINS]; +static unsigned nbins_total; + +static unsigned huge_arena_ind; + +const arena_config_t arena_config_default = { + /* .extent_hooks = */ (extent_hooks_t *)&ehooks_default_extent_hooks, + /* .metadata_use_hooks = */ true, +}; + +/******************************************************************************/ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + +static bool arena_decay_dirty(tsdn_t *tsdn, arena_t *arena, + bool is_background_thread, bool all); +static void arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, edata_t *slab, + bin_t *bin); +static void +arena_maybe_do_deferred_work(tsdn_t *tsdn, arena_t *arena, decay_t *decay, + size_t npages_new); + +/******************************************************************************/ + +void +arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, + const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms, + size_t *nactive, size_t *ndirty, size_t *nmuzzy) { + *nthreads += arena_nthreads_get(arena, false); + *dss = dss_prec_names[arena_dss_prec_get(arena)]; + *dirty_decay_ms = arena_decay_ms_get(arena, extent_state_dirty); + *muzzy_decay_ms = arena_decay_ms_get(arena, extent_state_muzzy); + pa_shard_basic_stats_merge(&arena->pa_shard, nactive, ndirty, nmuzzy); +} + +void +arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, + const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms, + size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats, + bin_stats_data_t *bstats, arena_stats_large_t *lstats, + pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats) { + cassert(config_stats); + + arena_basic_stats_merge(tsdn, arena, nthreads, dss, dirty_decay_ms, + muzzy_decay_ms, nactive, ndirty, nmuzzy); + + size_t base_allocated, base_resident, base_mapped, metadata_thp; + base_stats_get(tsdn, arena->base, &base_allocated, &base_resident, + &base_mapped, &metadata_thp); + size_t pac_mapped_sz = pac_mapped(&arena->pa_shard.pac); + astats->mapped += base_mapped + pac_mapped_sz; + astats->resident += base_resident; + + LOCKEDINT_MTX_LOCK(tsdn, arena->stats.mtx); + + astats->base += base_allocated; + atomic_load_add_store_zu(&astats->internal, arena_internal_get(arena)); + astats->metadata_thp += metadata_thp; + + for (szind_t i = 0; i < SC_NSIZES - SC_NBINS; i++) { + uint64_t nmalloc = locked_read_u64(tsdn, + LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[i].nmalloc); + locked_inc_u64_unsynchronized(&lstats[i].nmalloc, nmalloc); + astats->nmalloc_large += nmalloc; + + uint64_t ndalloc = locked_read_u64(tsdn, + LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[i].ndalloc); + locked_inc_u64_unsynchronized(&lstats[i].ndalloc, ndalloc); + astats->ndalloc_large += ndalloc; + + uint64_t nrequests = locked_read_u64(tsdn, + LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[i].nrequests); + locked_inc_u64_unsynchronized(&lstats[i].nrequests, + nmalloc + nrequests); + astats->nrequests_large += nmalloc + nrequests; + + /* nfill == nmalloc for large currently. */ + locked_inc_u64_unsynchronized(&lstats[i].nfills, nmalloc); + astats->nfills_large += nmalloc; + + uint64_t nflush = locked_read_u64(tsdn, + LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[i].nflushes); + locked_inc_u64_unsynchronized(&lstats[i].nflushes, nflush); + astats->nflushes_large += nflush; + + assert(nmalloc >= ndalloc); + assert(nmalloc - ndalloc <= SIZE_T_MAX); + size_t curlextents = (size_t)(nmalloc - ndalloc); + lstats[i].curlextents += curlextents; + astats->allocated_large += + curlextents * sz_index2size(SC_NBINS + i); + } + + pa_shard_stats_merge(tsdn, &arena->pa_shard, &astats->pa_shard_stats, + estats, hpastats, secstats, &astats->resident); + + LOCKEDINT_MTX_UNLOCK(tsdn, arena->stats.mtx); + + /* Currently cached bytes and sanitizer-stashed bytes in tcache. */ + astats->tcache_bytes = 0; + astats->tcache_stashed_bytes = 0; + malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx); + cache_bin_array_descriptor_t *descriptor; + ql_foreach(descriptor, &arena->cache_bin_array_descriptor_ql, link) { + for (szind_t i = 0; i < nhbins; i++) { + cache_bin_t *cache_bin = &descriptor->bins[i]; + cache_bin_sz_t ncached, nstashed; + cache_bin_nitems_get_remote(cache_bin, + &tcache_bin_info[i], &ncached, &nstashed); + + astats->tcache_bytes += ncached * sz_index2size(i); + astats->tcache_stashed_bytes += nstashed * + sz_index2size(i); + } + } + malloc_mutex_prof_read(tsdn, + &astats->mutex_prof_data[arena_prof_mutex_tcache_list], + &arena->tcache_ql_mtx); + malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx); + +#define READ_ARENA_MUTEX_PROF_DATA(mtx, ind) \ + malloc_mutex_lock(tsdn, &arena->mtx); \ + malloc_mutex_prof_read(tsdn, &astats->mutex_prof_data[ind], \ + &arena->mtx); \ + malloc_mutex_unlock(tsdn, &arena->mtx); + + /* Gather per arena mutex profiling data. */ + READ_ARENA_MUTEX_PROF_DATA(large_mtx, arena_prof_mutex_large); + READ_ARENA_MUTEX_PROF_DATA(base->mtx, + arena_prof_mutex_base); +#undef READ_ARENA_MUTEX_PROF_DATA + pa_shard_mtx_stats_read(tsdn, &arena->pa_shard, + astats->mutex_prof_data); + + nstime_copy(&astats->uptime, &arena->create_time); + nstime_update(&astats->uptime); + nstime_subtract(&astats->uptime, &arena->create_time); + + for (szind_t i = 0; i < SC_NBINS; i++) { + for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { + bin_stats_merge(tsdn, &bstats[i], + arena_get_bin(arena, i, j)); + } + } +} + +static void +arena_background_thread_inactivity_check(tsdn_t *tsdn, arena_t *arena, + bool is_background_thread) { + if (!background_thread_enabled() || is_background_thread) { + return; + } + background_thread_info_t *info = + arena_background_thread_info_get(arena); + if (background_thread_indefinite_sleep(info)) { + arena_maybe_do_deferred_work(tsdn, arena, + &arena->pa_shard.pac.decay_dirty, 0); + } +} + +/* + * React to deferred work generated by a PAI function. + */ +void arena_handle_deferred_work(tsdn_t *tsdn, arena_t *arena) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + if (decay_immediately(&arena->pa_shard.pac.decay_dirty)) { + arena_decay_dirty(tsdn, arena, false, true); + } + arena_background_thread_inactivity_check(tsdn, arena, false); +} + +static void * +arena_slab_reg_alloc(edata_t *slab, const bin_info_t *bin_info) { + void *ret; + slab_data_t *slab_data = edata_slab_data_get(slab); + size_t regind; + + assert(edata_nfree_get(slab) > 0); + assert(!bitmap_full(slab_data->bitmap, &bin_info->bitmap_info)); + + regind = bitmap_sfu(slab_data->bitmap, &bin_info->bitmap_info); + ret = (void *)((uintptr_t)edata_addr_get(slab) + + (uintptr_t)(bin_info->reg_size * regind)); + edata_nfree_dec(slab); + return ret; +} + +static void +arena_slab_reg_alloc_batch(edata_t *slab, const bin_info_t *bin_info, + unsigned cnt, void** ptrs) { + slab_data_t *slab_data = edata_slab_data_get(slab); + + assert(edata_nfree_get(slab) >= cnt); + assert(!bitmap_full(slab_data->bitmap, &bin_info->bitmap_info)); + +#if (! defined JEMALLOC_INTERNAL_POPCOUNTL) || (defined BITMAP_USE_TREE) + for (unsigned i = 0; i < cnt; i++) { + size_t regind = bitmap_sfu(slab_data->bitmap, + &bin_info->bitmap_info); + *(ptrs + i) = (void *)((uintptr_t)edata_addr_get(slab) + + (uintptr_t)(bin_info->reg_size * regind)); + } +#else + unsigned group = 0; + bitmap_t g = slab_data->bitmap[group]; + unsigned i = 0; + while (i < cnt) { + while (g == 0) { + g = slab_data->bitmap[++group]; + } + size_t shift = group << LG_BITMAP_GROUP_NBITS; + size_t pop = popcount_lu(g); + if (pop > (cnt - i)) { + pop = cnt - i; + } + + /* + * Load from memory locations only once, outside the + * hot loop below. + */ + uintptr_t base = (uintptr_t)edata_addr_get(slab); + uintptr_t regsize = (uintptr_t)bin_info->reg_size; + while (pop--) { + size_t bit = cfs_lu(&g); + size_t regind = shift + bit; + *(ptrs + i) = (void *)(base + regsize * regind); + + i++; + } + slab_data->bitmap[group] = g; + } +#endif + edata_nfree_sub(slab, cnt); +} + +static void +arena_large_malloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) { + szind_t index, hindex; + + cassert(config_stats); + + if (usize < SC_LARGE_MINCLASS) { + usize = SC_LARGE_MINCLASS; + } + index = sz_size2index(usize); + hindex = (index >= SC_NBINS) ? index - SC_NBINS : 0; + + locked_inc_u64(tsdn, LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[hindex].nmalloc, 1); +} + +static void +arena_large_dalloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) { + szind_t index, hindex; + + cassert(config_stats); + + if (usize < SC_LARGE_MINCLASS) { + usize = SC_LARGE_MINCLASS; + } + index = sz_size2index(usize); + hindex = (index >= SC_NBINS) ? index - SC_NBINS : 0; + + locked_inc_u64(tsdn, LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[hindex].ndalloc, 1); +} + +static void +arena_large_ralloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t oldusize, + size_t usize) { + arena_large_malloc_stats_update(tsdn, arena, usize); + arena_large_dalloc_stats_update(tsdn, arena, oldusize); +} + +edata_t * +arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena, size_t usize, + size_t alignment, bool zero) { + bool deferred_work_generated = false; + szind_t szind = sz_size2index(usize); + size_t esize = usize + sz_large_pad; + + bool guarded = san_large_extent_decide_guard(tsdn, + arena_get_ehooks(arena), esize, alignment); + edata_t *edata = pa_alloc(tsdn, &arena->pa_shard, esize, alignment, + /* slab */ false, szind, zero, guarded, &deferred_work_generated); + assert(deferred_work_generated == false); + + if (edata != NULL) { + if (config_stats) { + LOCKEDINT_MTX_LOCK(tsdn, arena->stats.mtx); + arena_large_malloc_stats_update(tsdn, arena, usize); + LOCKEDINT_MTX_UNLOCK(tsdn, arena->stats.mtx); + } + } + + if (edata != NULL && sz_large_pad != 0) { + arena_cache_oblivious_randomize(tsdn, arena, edata, alignment); + } + + return edata; +} + +void +arena_extent_dalloc_large_prep(tsdn_t *tsdn, arena_t *arena, edata_t *edata) { + if (config_stats) { + LOCKEDINT_MTX_LOCK(tsdn, arena->stats.mtx); + arena_large_dalloc_stats_update(tsdn, arena, + edata_usize_get(edata)); + LOCKEDINT_MTX_UNLOCK(tsdn, arena->stats.mtx); + } +} + +void +arena_extent_ralloc_large_shrink(tsdn_t *tsdn, arena_t *arena, edata_t *edata, + size_t oldusize) { + size_t usize = edata_usize_get(edata); + + if (config_stats) { + LOCKEDINT_MTX_LOCK(tsdn, arena->stats.mtx); + arena_large_ralloc_stats_update(tsdn, arena, oldusize, usize); + LOCKEDINT_MTX_UNLOCK(tsdn, arena->stats.mtx); + } +} + +void +arena_extent_ralloc_large_expand(tsdn_t *tsdn, arena_t *arena, edata_t *edata, + size_t oldusize) { + size_t usize = edata_usize_get(edata); + + if (config_stats) { + LOCKEDINT_MTX_LOCK(tsdn, arena->stats.mtx); + arena_large_ralloc_stats_update(tsdn, arena, oldusize, usize); + LOCKEDINT_MTX_UNLOCK(tsdn, arena->stats.mtx); + } +} + +/* + * In situations where we're not forcing a decay (i.e. because the user + * specifically requested it), should we purge ourselves, or wait for the + * background thread to get to it. + */ +static pac_purge_eagerness_t +arena_decide_unforced_purge_eagerness(bool is_background_thread) { + if (is_background_thread) { + return PAC_PURGE_ALWAYS; + } else if (!is_background_thread && background_thread_enabled()) { + return PAC_PURGE_NEVER; + } else { + return PAC_PURGE_ON_EPOCH_ADVANCE; + } +} + +bool +arena_decay_ms_set(tsdn_t *tsdn, arena_t *arena, extent_state_t state, + ssize_t decay_ms) { + pac_purge_eagerness_t eagerness = arena_decide_unforced_purge_eagerness( + /* is_background_thread */ false); + return pa_decay_ms_set(tsdn, &arena->pa_shard, state, decay_ms, + eagerness); +} + +ssize_t +arena_decay_ms_get(arena_t *arena, extent_state_t state) { + return pa_decay_ms_get(&arena->pa_shard, state); +} + +static bool +arena_decay_impl(tsdn_t *tsdn, arena_t *arena, decay_t *decay, + pac_decay_stats_t *decay_stats, ecache_t *ecache, + bool is_background_thread, bool all) { + if (all) { + malloc_mutex_lock(tsdn, &decay->mtx); + pac_decay_all(tsdn, &arena->pa_shard.pac, decay, decay_stats, + ecache, /* fully_decay */ all); + malloc_mutex_unlock(tsdn, &decay->mtx); + return false; + } + + if (malloc_mutex_trylock(tsdn, &decay->mtx)) { + /* No need to wait if another thread is in progress. */ + return true; + } + pac_purge_eagerness_t eagerness = + arena_decide_unforced_purge_eagerness(is_background_thread); + bool epoch_advanced = pac_maybe_decay_purge(tsdn, &arena->pa_shard.pac, + decay, decay_stats, ecache, eagerness); + size_t npages_new; + if (epoch_advanced) { + /* Backlog is updated on epoch advance. */ + npages_new = decay_epoch_npages_delta(decay); + } + malloc_mutex_unlock(tsdn, &decay->mtx); + + if (have_background_thread && background_thread_enabled() && + epoch_advanced && !is_background_thread) { + arena_maybe_do_deferred_work(tsdn, arena, decay, npages_new); + } + + return false; +} + +static bool +arena_decay_dirty(tsdn_t *tsdn, arena_t *arena, bool is_background_thread, + bool all) { + return arena_decay_impl(tsdn, arena, &arena->pa_shard.pac.decay_dirty, + &arena->pa_shard.pac.stats->decay_dirty, + &arena->pa_shard.pac.ecache_dirty, is_background_thread, all); +} + +static bool +arena_decay_muzzy(tsdn_t *tsdn, arena_t *arena, bool is_background_thread, + bool all) { + if (pa_shard_dont_decay_muzzy(&arena->pa_shard)) { + return false; + } + return arena_decay_impl(tsdn, arena, &arena->pa_shard.pac.decay_muzzy, + &arena->pa_shard.pac.stats->decay_muzzy, + &arena->pa_shard.pac.ecache_muzzy, is_background_thread, all); +} + +void +arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread, bool all) { + if (all) { + /* + * We should take a purge of "all" to mean "save as much memory + * as possible", including flushing any caches (for situations + * like thread death, or manual purge calls). + */ + sec_flush(tsdn, &arena->pa_shard.hpa_sec); + } + if (arena_decay_dirty(tsdn, arena, is_background_thread, all)) { + return; + } + arena_decay_muzzy(tsdn, arena, is_background_thread, all); +} + +static bool +arena_should_decay_early(tsdn_t *tsdn, arena_t *arena, decay_t *decay, + background_thread_info_t *info, nstime_t *remaining_sleep, + size_t npages_new) { + malloc_mutex_assert_owner(tsdn, &info->mtx); + + if (malloc_mutex_trylock(tsdn, &decay->mtx)) { + return false; + } + + if (!decay_gradually(decay)) { + malloc_mutex_unlock(tsdn, &decay->mtx); + return false; + } + + nstime_init(remaining_sleep, background_thread_wakeup_time_get(info)); + if (nstime_compare(remaining_sleep, &decay->epoch) <= 0) { + malloc_mutex_unlock(tsdn, &decay->mtx); + return false; + } + nstime_subtract(remaining_sleep, &decay->epoch); + if (npages_new > 0) { + uint64_t npurge_new = decay_npages_purge_in(decay, + remaining_sleep, npages_new); + info->npages_to_purge_new += npurge_new; + } + malloc_mutex_unlock(tsdn, &decay->mtx); + return info->npages_to_purge_new > + ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD; +} + +/* + * Check if deferred work needs to be done sooner than planned. + * For decay we might want to wake up earlier because of an influx of dirty + * pages. Rather than waiting for previously estimated time, we proactively + * purge those pages. + * If background thread sleeps indefinitely, always wake up because some + * deferred work has been generated. + */ +static void +arena_maybe_do_deferred_work(tsdn_t *tsdn, arena_t *arena, decay_t *decay, + size_t npages_new) { + background_thread_info_t *info = arena_background_thread_info_get( + arena); + if (malloc_mutex_trylock(tsdn, &info->mtx)) { + /* + * Background thread may hold the mutex for a long period of + * time. We'd like to avoid the variance on application + * threads. So keep this non-blocking, and leave the work to a + * future epoch. + */ + return; + } + if (!background_thread_is_started(info)) { + goto label_done; + } + + nstime_t remaining_sleep; + if (background_thread_indefinite_sleep(info)) { + background_thread_wakeup_early(info, NULL); + } else if (arena_should_decay_early(tsdn, arena, decay, info, + &remaining_sleep, npages_new)) { + info->npages_to_purge_new = 0; + background_thread_wakeup_early(info, &remaining_sleep); + } +label_done: + malloc_mutex_unlock(tsdn, &info->mtx); +} + +/* Called from background threads. */ +void +arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena) { + arena_decay(tsdn, arena, true, false); + pa_shard_do_deferred_work(tsdn, &arena->pa_shard); +} + +void +arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, edata_t *slab) { + bool deferred_work_generated = false; + pa_dalloc(tsdn, &arena->pa_shard, slab, &deferred_work_generated); + if (deferred_work_generated) { + arena_handle_deferred_work(tsdn, arena); + } +} + +static void +arena_bin_slabs_nonfull_insert(bin_t *bin, edata_t *slab) { + assert(edata_nfree_get(slab) > 0); + edata_heap_insert(&bin->slabs_nonfull, slab); + if (config_stats) { + bin->stats.nonfull_slabs++; + } +} + +static void +arena_bin_slabs_nonfull_remove(bin_t *bin, edata_t *slab) { + edata_heap_remove(&bin->slabs_nonfull, slab); + if (config_stats) { + bin->stats.nonfull_slabs--; + } +} + +static edata_t * +arena_bin_slabs_nonfull_tryget(bin_t *bin) { + edata_t *slab = edata_heap_remove_first(&bin->slabs_nonfull); + if (slab == NULL) { + return NULL; + } + if (config_stats) { + bin->stats.reslabs++; + bin->stats.nonfull_slabs--; + } + return slab; +} + +static void +arena_bin_slabs_full_insert(arena_t *arena, bin_t *bin, edata_t *slab) { + assert(edata_nfree_get(slab) == 0); + /* + * Tracking extents is required by arena_reset, which is not allowed + * for auto arenas. Bypass this step to avoid touching the edata + * linkage (often results in cache misses) for auto arenas. + */ + if (arena_is_auto(arena)) { + return; + } + edata_list_active_append(&bin->slabs_full, slab); +} + +static void +arena_bin_slabs_full_remove(arena_t *arena, bin_t *bin, edata_t *slab) { + if (arena_is_auto(arena)) { + return; + } + edata_list_active_remove(&bin->slabs_full, slab); +} + +static void +arena_bin_reset(tsd_t *tsd, arena_t *arena, bin_t *bin) { + edata_t *slab; + + malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); + if (bin->slabcur != NULL) { + slab = bin->slabcur; + bin->slabcur = NULL; + malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); + arena_slab_dalloc(tsd_tsdn(tsd), arena, slab); + malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); + } + while ((slab = edata_heap_remove_first(&bin->slabs_nonfull)) != NULL) { + malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); + arena_slab_dalloc(tsd_tsdn(tsd), arena, slab); + malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); + } + for (slab = edata_list_active_first(&bin->slabs_full); slab != NULL; + slab = edata_list_active_first(&bin->slabs_full)) { + arena_bin_slabs_full_remove(arena, bin, slab); + malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); + arena_slab_dalloc(tsd_tsdn(tsd), arena, slab); + malloc_mutex_lock(tsd_tsdn(tsd), &bin->lock); + } + if (config_stats) { + bin->stats.curregs = 0; + bin->stats.curslabs = 0; + } + malloc_mutex_unlock(tsd_tsdn(tsd), &bin->lock); +} + +void +arena_reset(tsd_t *tsd, arena_t *arena) { + /* + * Locking in this function is unintuitive. The caller guarantees that + * no concurrent operations are happening in this arena, but there are + * still reasons that some locking is necessary: + * + * - Some of the functions in the transitive closure of calls assume + * appropriate locks are held, and in some cases these locks are + * temporarily dropped to avoid lock order reversal or deadlock due to + * reentry. + * - mallctl("epoch", ...) may concurrently refresh stats. While + * strictly speaking this is a "concurrent operation", disallowing + * stats refreshes would impose an inconvenient burden. + */ + + /* Large allocations. */ + malloc_mutex_lock(tsd_tsdn(tsd), &arena->large_mtx); + + for (edata_t *edata = edata_list_active_first(&arena->large); + edata != NULL; edata = edata_list_active_first(&arena->large)) { + void *ptr = edata_base_get(edata); + size_t usize; + + malloc_mutex_unlock(tsd_tsdn(tsd), &arena->large_mtx); + emap_alloc_ctx_t alloc_ctx; + emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, + &alloc_ctx); + assert(alloc_ctx.szind != SC_NSIZES); + + if (config_stats || (config_prof && opt_prof)) { + usize = sz_index2size(alloc_ctx.szind); + assert(usize == isalloc(tsd_tsdn(tsd), ptr)); + } + /* Remove large allocation from prof sample set. */ + if (config_prof && opt_prof) { + prof_free(tsd, ptr, usize, &alloc_ctx); + } + large_dalloc(tsd_tsdn(tsd), edata); + malloc_mutex_lock(tsd_tsdn(tsd), &arena->large_mtx); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &arena->large_mtx); + + /* Bins. */ + for (unsigned i = 0; i < SC_NBINS; i++) { + for (unsigned j = 0; j < bin_infos[i].n_shards; j++) { + arena_bin_reset(tsd, arena, arena_get_bin(arena, i, j)); + } + } + pa_shard_reset(tsd_tsdn(tsd), &arena->pa_shard); +} + +static void +arena_prepare_base_deletion_sync_finish(tsd_t *tsd, malloc_mutex_t **mutexes, + unsigned n_mtx) { + for (unsigned i = 0; i < n_mtx; i++) { + malloc_mutex_lock(tsd_tsdn(tsd), mutexes[i]); + malloc_mutex_unlock(tsd_tsdn(tsd), mutexes[i]); + } +} + +#define ARENA_DESTROY_MAX_DELAYED_MTX 32 +static void +arena_prepare_base_deletion_sync(tsd_t *tsd, malloc_mutex_t *mtx, + malloc_mutex_t **delayed_mtx, unsigned *n_delayed) { + if (!malloc_mutex_trylock(tsd_tsdn(tsd), mtx)) { + /* No contention. */ + malloc_mutex_unlock(tsd_tsdn(tsd), mtx); + return; + } + unsigned n = *n_delayed; + assert(n < ARENA_DESTROY_MAX_DELAYED_MTX); + /* Add another to the batch. */ + delayed_mtx[n++] = mtx; + + if (n == ARENA_DESTROY_MAX_DELAYED_MTX) { + arena_prepare_base_deletion_sync_finish(tsd, delayed_mtx, n); + n = 0; + } + *n_delayed = n; +} + +static void +arena_prepare_base_deletion(tsd_t *tsd, base_t *base_to_destroy) { + /* + * In order to coalesce, emap_try_acquire_edata_neighbor will attempt to + * check neighbor edata's state to determine eligibility. This means + * under certain conditions, the metadata from an arena can be accessed + * w/o holding any locks from that arena. In order to guarantee safe + * memory access, the metadata and the underlying base allocator needs + * to be kept alive, until all pending accesses are done. + * + * 1) with opt_retain, the arena boundary implies the is_head state + * (tracked in the rtree leaf), and the coalesce flow will stop at the + * head state branch. Therefore no cross arena metadata access + * possible. + * + * 2) w/o opt_retain, the arena id needs to be read from the edata_t, + * meaning read only cross-arena metadata access is possible. The + * coalesce attempt will stop at the arena_id mismatch, and is always + * under one of the ecache locks. To allow safe passthrough of such + * metadata accesses, the loop below will iterate through all manual + * arenas' ecache locks. As all the metadata from this base allocator + * have been unlinked from the rtree, after going through all the + * relevant ecache locks, it's safe to say that a) pending accesses are + * all finished, and b) no new access will be generated. + */ + if (opt_retain) { + return; + } + unsigned destroy_ind = base_ind_get(base_to_destroy); + assert(destroy_ind >= manual_arena_base); + + tsdn_t *tsdn = tsd_tsdn(tsd); + malloc_mutex_t *delayed_mtx[ARENA_DESTROY_MAX_DELAYED_MTX]; + unsigned n_delayed = 0, total = narenas_total_get(); + for (unsigned i = 0; i < total; i++) { + if (i == destroy_ind) { + continue; + } + arena_t *arena = arena_get(tsdn, i, false); + if (arena == NULL) { + continue; + } + pac_t *pac = &arena->pa_shard.pac; + arena_prepare_base_deletion_sync(tsd, &pac->ecache_dirty.mtx, + delayed_mtx, &n_delayed); + arena_prepare_base_deletion_sync(tsd, &pac->ecache_muzzy.mtx, + delayed_mtx, &n_delayed); + arena_prepare_base_deletion_sync(tsd, &pac->ecache_retained.mtx, + delayed_mtx, &n_delayed); + } + arena_prepare_base_deletion_sync_finish(tsd, delayed_mtx, n_delayed); +} +#undef ARENA_DESTROY_MAX_DELAYED_MTX + +void +arena_destroy(tsd_t *tsd, arena_t *arena) { + assert(base_ind_get(arena->base) >= narenas_auto); + assert(arena_nthreads_get(arena, false) == 0); + assert(arena_nthreads_get(arena, true) == 0); + + /* + * No allocations have occurred since arena_reset() was called. + * Furthermore, the caller (arena_i_destroy_ctl()) purged all cached + * extents, so only retained extents may remain and it's safe to call + * pa_shard_destroy_retained. + */ + pa_shard_destroy(tsd_tsdn(tsd), &arena->pa_shard); + + /* + * Remove the arena pointer from the arenas array. We rely on the fact + * that there is no way for the application to get a dirty read from the + * arenas array unless there is an inherent race in the application + * involving access of an arena being concurrently destroyed. The + * application must synchronize knowledge of the arena's validity, so as + * long as we use an atomic write to update the arenas array, the + * application will get a clean read any time after it synchronizes + * knowledge that the arena is no longer valid. + */ + arena_set(base_ind_get(arena->base), NULL); + + /* + * Destroy the base allocator, which manages all metadata ever mapped by + * this arena. The prepare function will make sure no pending access to + * the metadata in this base anymore. + */ + arena_prepare_base_deletion(tsd, arena->base); + base_delete(tsd_tsdn(tsd), arena->base); +} + +static edata_t * +arena_slab_alloc(tsdn_t *tsdn, arena_t *arena, szind_t binind, unsigned binshard, + const bin_info_t *bin_info) { + bool deferred_work_generated = false; + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + bool guarded = san_slab_extent_decide_guard(tsdn, + arena_get_ehooks(arena)); + edata_t *slab = pa_alloc(tsdn, &arena->pa_shard, bin_info->slab_size, + /* alignment */ PAGE, /* slab */ true, /* szind */ binind, + /* zero */ false, guarded, &deferred_work_generated); + + if (deferred_work_generated) { + arena_handle_deferred_work(tsdn, arena); + } + + if (slab == NULL) { + return NULL; + } + assert(edata_slab_get(slab)); + + /* Initialize slab internals. */ + slab_data_t *slab_data = edata_slab_data_get(slab); + edata_nfree_binshard_set(slab, bin_info->nregs, binshard); + bitmap_init(slab_data->bitmap, &bin_info->bitmap_info, false); + + return slab; +} + +/* + * Before attempting the _with_fresh_slab approaches below, the _no_fresh_slab + * variants (i.e. through slabcur and nonfull) must be tried first. + */ +static void +arena_bin_refill_slabcur_with_fresh_slab(tsdn_t *tsdn, arena_t *arena, + bin_t *bin, szind_t binind, edata_t *fresh_slab) { + malloc_mutex_assert_owner(tsdn, &bin->lock); + /* Only called after slabcur and nonfull both failed. */ + assert(bin->slabcur == NULL); + assert(edata_heap_first(&bin->slabs_nonfull) == NULL); + assert(fresh_slab != NULL); + + /* A new slab from arena_slab_alloc() */ + assert(edata_nfree_get(fresh_slab) == bin_infos[binind].nregs); + if (config_stats) { + bin->stats.nslabs++; + bin->stats.curslabs++; + } + bin->slabcur = fresh_slab; +} + +/* Refill slabcur and then alloc using the fresh slab */ +static void * +arena_bin_malloc_with_fresh_slab(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + szind_t binind, edata_t *fresh_slab) { + malloc_mutex_assert_owner(tsdn, &bin->lock); + arena_bin_refill_slabcur_with_fresh_slab(tsdn, arena, bin, binind, + fresh_slab); + + return arena_slab_reg_alloc(bin->slabcur, &bin_infos[binind]); +} + +static bool +arena_bin_refill_slabcur_no_fresh_slab(tsdn_t *tsdn, arena_t *arena, + bin_t *bin) { + malloc_mutex_assert_owner(tsdn, &bin->lock); + /* Only called after arena_slab_reg_alloc[_batch] failed. */ + assert(bin->slabcur == NULL || edata_nfree_get(bin->slabcur) == 0); + + if (bin->slabcur != NULL) { + arena_bin_slabs_full_insert(arena, bin, bin->slabcur); + } + + /* Look for a usable slab. */ + bin->slabcur = arena_bin_slabs_nonfull_tryget(bin); + assert(bin->slabcur == NULL || edata_nfree_get(bin->slabcur) > 0); + + return (bin->slabcur == NULL); +} + +bin_t * +arena_bin_choose(tsdn_t *tsdn, arena_t *arena, szind_t binind, + unsigned *binshard_p) { + unsigned binshard; + if (tsdn_null(tsdn) || tsd_arena_get(tsdn_tsd(tsdn)) == NULL) { + binshard = 0; + } else { + binshard = tsd_binshardsp_get(tsdn_tsd(tsdn))->binshard[binind]; + } + assert(binshard < bin_infos[binind].n_shards); + if (binshard_p != NULL) { + *binshard_p = binshard; + } + return arena_get_bin(arena, binind, binshard); +} + +void +arena_cache_bin_fill_small(tsdn_t *tsdn, arena_t *arena, + cache_bin_t *cache_bin, cache_bin_info_t *cache_bin_info, szind_t binind, + const unsigned nfill) { + assert(cache_bin_ncached_get_local(cache_bin, cache_bin_info) == 0); + + const bin_info_t *bin_info = &bin_infos[binind]; + + CACHE_BIN_PTR_ARRAY_DECLARE(ptrs, nfill); + cache_bin_init_ptr_array_for_fill(cache_bin, cache_bin_info, &ptrs, + nfill); + /* + * Bin-local resources are used first: 1) bin->slabcur, and 2) nonfull + * slabs. After both are exhausted, new slabs will be allocated through + * arena_slab_alloc(). + * + * Bin lock is only taken / released right before / after the while(...) + * refill loop, with new slab allocation (which has its own locking) + * kept outside of the loop. This setup facilitates flat combining, at + * the cost of the nested loop (through goto label_refill). + * + * To optimize for cases with contention and limited resources + * (e.g. hugepage-backed or non-overcommit arenas), each fill-iteration + * gets one chance of slab_alloc, and a retry of bin local resources + * after the slab allocation (regardless if slab_alloc failed, because + * the bin lock is dropped during the slab allocation). + * + * In other words, new slab allocation is allowed, as long as there was + * progress since the previous slab_alloc. This is tracked with + * made_progress below, initialized to true to jump start the first + * iteration. + * + * In other words (again), the loop will only terminate early (i.e. stop + * with filled < nfill) after going through the three steps: a) bin + * local exhausted, b) unlock and slab_alloc returns null, c) re-lock + * and bin local fails again. + */ + bool made_progress = true; + edata_t *fresh_slab = NULL; + bool alloc_and_retry = false; + unsigned filled = 0; + unsigned binshard; + bin_t *bin = arena_bin_choose(tsdn, arena, binind, &binshard); + +label_refill: + malloc_mutex_lock(tsdn, &bin->lock); + + while (filled < nfill) { + /* Try batch-fill from slabcur first. */ + edata_t *slabcur = bin->slabcur; + if (slabcur != NULL && edata_nfree_get(slabcur) > 0) { + unsigned tofill = nfill - filled; + unsigned nfree = edata_nfree_get(slabcur); + unsigned cnt = tofill < nfree ? tofill : nfree; + + arena_slab_reg_alloc_batch(slabcur, bin_info, cnt, + &ptrs.ptr[filled]); + made_progress = true; + filled += cnt; + continue; + } + /* Next try refilling slabcur from nonfull slabs. */ + if (!arena_bin_refill_slabcur_no_fresh_slab(tsdn, arena, bin)) { + assert(bin->slabcur != NULL); + continue; + } + + /* Then see if a new slab was reserved already. */ + if (fresh_slab != NULL) { + arena_bin_refill_slabcur_with_fresh_slab(tsdn, arena, + bin, binind, fresh_slab); + assert(bin->slabcur != NULL); + fresh_slab = NULL; + continue; + } + + /* Try slab_alloc if made progress (or never did slab_alloc). */ + if (made_progress) { + assert(bin->slabcur == NULL); + assert(fresh_slab == NULL); + alloc_and_retry = true; + /* Alloc a new slab then come back. */ + break; + } + + /* OOM. */ + + assert(fresh_slab == NULL); + assert(!alloc_and_retry); + break; + } /* while (filled < nfill) loop. */ + + if (config_stats && !alloc_and_retry) { + bin->stats.nmalloc += filled; + bin->stats.nrequests += cache_bin->tstats.nrequests; + bin->stats.curregs += filled; + bin->stats.nfills++; + cache_bin->tstats.nrequests = 0; + } + + malloc_mutex_unlock(tsdn, &bin->lock); + + if (alloc_and_retry) { + assert(fresh_slab == NULL); + assert(filled < nfill); + assert(made_progress); + + fresh_slab = arena_slab_alloc(tsdn, arena, binind, binshard, + bin_info); + /* fresh_slab NULL case handled in the for loop. */ + + alloc_and_retry = false; + made_progress = false; + goto label_refill; + } + assert(filled == nfill || (fresh_slab == NULL && !made_progress)); + + /* Release if allocated but not used. */ + if (fresh_slab != NULL) { + assert(edata_nfree_get(fresh_slab) == bin_info->nregs); + arena_slab_dalloc(tsdn, arena, fresh_slab); + fresh_slab = NULL; + } + + cache_bin_finish_fill(cache_bin, cache_bin_info, &ptrs, filled); + arena_decay_tick(tsdn, arena); +} + +size_t +arena_fill_small_fresh(tsdn_t *tsdn, arena_t *arena, szind_t binind, + void **ptrs, size_t nfill, bool zero) { + assert(binind < SC_NBINS); + const bin_info_t *bin_info = &bin_infos[binind]; + const size_t nregs = bin_info->nregs; + assert(nregs > 0); + const size_t usize = bin_info->reg_size; + + const bool manual_arena = !arena_is_auto(arena); + unsigned binshard; + bin_t *bin = arena_bin_choose(tsdn, arena, binind, &binshard); + + size_t nslab = 0; + size_t filled = 0; + edata_t *slab = NULL; + edata_list_active_t fulls; + edata_list_active_init(&fulls); + + while (filled < nfill && (slab = arena_slab_alloc(tsdn, arena, binind, + binshard, bin_info)) != NULL) { + assert((size_t)edata_nfree_get(slab) == nregs); + ++nslab; + size_t batch = nfill - filled; + if (batch > nregs) { + batch = nregs; + } + assert(batch > 0); + arena_slab_reg_alloc_batch(slab, bin_info, (unsigned)batch, + &ptrs[filled]); + assert(edata_addr_get(slab) == ptrs[filled]); + if (zero) { + memset(ptrs[filled], 0, batch * usize); + } + filled += batch; + if (batch == nregs) { + if (manual_arena) { + edata_list_active_append(&fulls, slab); + } + slab = NULL; + } + } + + malloc_mutex_lock(tsdn, &bin->lock); + /* + * Only the last slab can be non-empty, and the last slab is non-empty + * iff slab != NULL. + */ + if (slab != NULL) { + arena_bin_lower_slab(tsdn, arena, slab, bin); + } + if (manual_arena) { + edata_list_active_concat(&bin->slabs_full, &fulls); + } + assert(edata_list_active_empty(&fulls)); + if (config_stats) { + bin->stats.nslabs += nslab; + bin->stats.curslabs += nslab; + bin->stats.nmalloc += filled; + bin->stats.nrequests += filled; + bin->stats.curregs += filled; + } + malloc_mutex_unlock(tsdn, &bin->lock); + + arena_decay_tick(tsdn, arena); + return filled; +} + +/* + * Without allocating a new slab, try arena_slab_reg_alloc() and re-fill + * bin->slabcur if necessary. + */ +static void * +arena_bin_malloc_no_fresh_slab(tsdn_t *tsdn, arena_t *arena, bin_t *bin, + szind_t binind) { + malloc_mutex_assert_owner(tsdn, &bin->lock); + if (bin->slabcur == NULL || edata_nfree_get(bin->slabcur) == 0) { + if (arena_bin_refill_slabcur_no_fresh_slab(tsdn, arena, bin)) { + return NULL; + } + } + + assert(bin->slabcur != NULL && edata_nfree_get(bin->slabcur) > 0); + return arena_slab_reg_alloc(bin->slabcur, &bin_infos[binind]); +} + +static void * +arena_malloc_small(tsdn_t *tsdn, arena_t *arena, szind_t binind, bool zero) { + assert(binind < SC_NBINS); + const bin_info_t *bin_info = &bin_infos[binind]; + size_t usize = sz_index2size(binind); + unsigned binshard; + bin_t *bin = arena_bin_choose(tsdn, arena, binind, &binshard); + + malloc_mutex_lock(tsdn, &bin->lock); + edata_t *fresh_slab = NULL; + void *ret = arena_bin_malloc_no_fresh_slab(tsdn, arena, bin, binind); + if (ret == NULL) { + malloc_mutex_unlock(tsdn, &bin->lock); + /******************************/ + fresh_slab = arena_slab_alloc(tsdn, arena, binind, binshard, + bin_info); + /********************************/ + malloc_mutex_lock(tsdn, &bin->lock); + /* Retry since the lock was dropped. */ + ret = arena_bin_malloc_no_fresh_slab(tsdn, arena, bin, binind); + if (ret == NULL) { + if (fresh_slab == NULL) { + /* OOM */ + malloc_mutex_unlock(tsdn, &bin->lock); + return NULL; + } + ret = arena_bin_malloc_with_fresh_slab(tsdn, arena, bin, + binind, fresh_slab); + fresh_slab = NULL; + } + } + if (config_stats) { + bin->stats.nmalloc++; + bin->stats.nrequests++; + bin->stats.curregs++; + } + malloc_mutex_unlock(tsdn, &bin->lock); + + if (fresh_slab != NULL) { + arena_slab_dalloc(tsdn, arena, fresh_slab); + } + if (zero) { + memset(ret, 0, usize); + } + arena_decay_tick(tsdn, arena); + + return ret; +} + +void * +arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, + bool zero) { + assert(!tsdn_null(tsdn) || arena != NULL); + + if (likely(!tsdn_null(tsdn))) { + arena = arena_choose_maybe_huge(tsdn_tsd(tsdn), arena, size); + } + if (unlikely(arena == NULL)) { + return NULL; + } + + if (likely(size <= SC_SMALL_MAXCLASS)) { + return arena_malloc_small(tsdn, arena, ind, zero); + } + return large_malloc(tsdn, arena, sz_index2size(ind), zero); +} + +void * +arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, + bool zero, tcache_t *tcache) { + void *ret; + + if (usize <= SC_SMALL_MAXCLASS) { + /* Small; alignment doesn't require special slab placement. */ + + /* usize should be a result of sz_sa2u() */ + assert((usize & (alignment - 1)) == 0); + + /* + * Small usize can't come from an alignment larger than a page. + */ + assert(alignment <= PAGE); + + ret = arena_malloc(tsdn, arena, usize, sz_size2index(usize), + zero, tcache, true); + } else { + if (likely(alignment <= CACHELINE)) { + ret = large_malloc(tsdn, arena, usize, zero); + } else { + ret = large_palloc(tsdn, arena, usize, alignment, zero); + } + } + return ret; +} + +void +arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize) { + cassert(config_prof); + assert(ptr != NULL); + assert(isalloc(tsdn, ptr) == SC_LARGE_MINCLASS); + assert(usize <= SC_SMALL_MAXCLASS); + + if (config_opt_safety_checks) { + safety_check_set_redzone(ptr, usize, SC_LARGE_MINCLASS); + } + + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + + szind_t szind = sz_size2index(usize); + edata_szind_set(edata, szind); + emap_remap(tsdn, &arena_emap_global, edata, szind, /* slab */ false); + + assert(isalloc(tsdn, ptr) == usize); +} + +static size_t +arena_prof_demote(tsdn_t *tsdn, edata_t *edata, const void *ptr) { + cassert(config_prof); + assert(ptr != NULL); + + edata_szind_set(edata, SC_NBINS); + emap_remap(tsdn, &arena_emap_global, edata, SC_NBINS, /* slab */ false); + + assert(isalloc(tsdn, ptr) == SC_LARGE_MINCLASS); + + return SC_LARGE_MINCLASS; +} + +void +arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache, + bool slow_path) { + cassert(config_prof); + assert(opt_prof); + + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + size_t usize = edata_usize_get(edata); + size_t bumped_usize = arena_prof_demote(tsdn, edata, ptr); + if (config_opt_safety_checks && usize < SC_LARGE_MINCLASS) { + /* + * Currently, we only do redzoning for small sampled + * allocations. + */ + assert(bumped_usize == SC_LARGE_MINCLASS); + safety_check_verify_redzone(ptr, usize, bumped_usize); + } + if (bumped_usize <= tcache_maxclass && tcache != NULL) { + tcache_dalloc_large(tsdn_tsd(tsdn), tcache, ptr, + sz_size2index(bumped_usize), slow_path); + } else { + large_dalloc(tsdn, edata); + } +} + +static void +arena_dissociate_bin_slab(arena_t *arena, edata_t *slab, bin_t *bin) { + /* Dissociate slab from bin. */ + if (slab == bin->slabcur) { + bin->slabcur = NULL; + } else { + szind_t binind = edata_szind_get(slab); + const bin_info_t *bin_info = &bin_infos[binind]; + + /* + * The following block's conditional is necessary because if the + * slab only contains one region, then it never gets inserted + * into the non-full slabs heap. + */ + if (bin_info->nregs == 1) { + arena_bin_slabs_full_remove(arena, bin, slab); + } else { + arena_bin_slabs_nonfull_remove(bin, slab); + } + } +} + +static void +arena_bin_lower_slab(tsdn_t *tsdn, arena_t *arena, edata_t *slab, + bin_t *bin) { + assert(edata_nfree_get(slab) > 0); + + /* + * Make sure that if bin->slabcur is non-NULL, it refers to the + * oldest/lowest non-full slab. It is okay to NULL slabcur out rather + * than proactively keeping it pointing at the oldest/lowest non-full + * slab. + */ + if (bin->slabcur != NULL && edata_snad_comp(bin->slabcur, slab) > 0) { + /* Switch slabcur. */ + if (edata_nfree_get(bin->slabcur) > 0) { + arena_bin_slabs_nonfull_insert(bin, bin->slabcur); + } else { + arena_bin_slabs_full_insert(arena, bin, bin->slabcur); + } + bin->slabcur = slab; + if (config_stats) { + bin->stats.reslabs++; + } + } else { + arena_bin_slabs_nonfull_insert(bin, slab); + } +} + +static void +arena_dalloc_bin_slab_prepare(tsdn_t *tsdn, edata_t *slab, bin_t *bin) { + malloc_mutex_assert_owner(tsdn, &bin->lock); + + assert(slab != bin->slabcur); + if (config_stats) { + bin->stats.curslabs--; + } +} + +void +arena_dalloc_bin_locked_handle_newly_empty(tsdn_t *tsdn, arena_t *arena, + edata_t *slab, bin_t *bin) { + arena_dissociate_bin_slab(arena, slab, bin); + arena_dalloc_bin_slab_prepare(tsdn, slab, bin); +} + +void +arena_dalloc_bin_locked_handle_newly_nonempty(tsdn_t *tsdn, arena_t *arena, + edata_t *slab, bin_t *bin) { + arena_bin_slabs_full_remove(arena, bin, slab); + arena_bin_lower_slab(tsdn, arena, slab, bin); +} + +static void +arena_dalloc_bin(tsdn_t *tsdn, arena_t *arena, edata_t *edata, void *ptr) { + szind_t binind = edata_szind_get(edata); + unsigned binshard = edata_binshard_get(edata); + bin_t *bin = arena_get_bin(arena, binind, binshard); + + malloc_mutex_lock(tsdn, &bin->lock); + arena_dalloc_bin_locked_info_t info; + arena_dalloc_bin_locked_begin(&info, binind); + bool ret = arena_dalloc_bin_locked_step(tsdn, arena, bin, + &info, binind, edata, ptr); + arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info); + malloc_mutex_unlock(tsdn, &bin->lock); + + if (ret) { + arena_slab_dalloc(tsdn, arena, edata); + } +} + +void +arena_dalloc_small(tsdn_t *tsdn, void *ptr) { + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + arena_t *arena = arena_get_from_edata(edata); + + arena_dalloc_bin(tsdn, arena, edata, ptr); + arena_decay_tick(tsdn, arena); +} + +bool +arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, + size_t extra, bool zero, size_t *newsize) { + bool ret; + /* Calls with non-zero extra had to clamp extra. */ + assert(extra == 0 || size + extra <= SC_LARGE_MAXCLASS); + + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + if (unlikely(size > SC_LARGE_MAXCLASS)) { + ret = true; + goto done; + } + + size_t usize_min = sz_s2u(size); + size_t usize_max = sz_s2u(size + extra); + if (likely(oldsize <= SC_SMALL_MAXCLASS && usize_min + <= SC_SMALL_MAXCLASS)) { + /* + * Avoid moving the allocation if the size class can be left the + * same. + */ + assert(bin_infos[sz_size2index(oldsize)].reg_size == + oldsize); + if ((usize_max > SC_SMALL_MAXCLASS + || sz_size2index(usize_max) != sz_size2index(oldsize)) + && (size > oldsize || usize_max < oldsize)) { + ret = true; + goto done; + } + + arena_t *arena = arena_get_from_edata(edata); + arena_decay_tick(tsdn, arena); + ret = false; + } else if (oldsize >= SC_LARGE_MINCLASS + && usize_max >= SC_LARGE_MINCLASS) { + ret = large_ralloc_no_move(tsdn, edata, usize_min, usize_max, + zero); + } else { + ret = true; + } +done: + assert(edata == emap_edata_lookup(tsdn, &arena_emap_global, ptr)); + *newsize = edata_usize_get(edata); + + return ret; +} + +static void * +arena_ralloc_move_helper(tsdn_t *tsdn, arena_t *arena, size_t usize, + size_t alignment, bool zero, tcache_t *tcache) { + if (alignment == 0) { + return arena_malloc(tsdn, arena, usize, sz_size2index(usize), + zero, tcache, true); + } + usize = sz_sa2u(usize, alignment); + if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { + return NULL; + } + return ipalloct(tsdn, usize, alignment, zero, tcache, arena); +} + +void * +arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize, + size_t size, size_t alignment, bool zero, tcache_t *tcache, + hook_ralloc_args_t *hook_args) { + size_t usize = alignment == 0 ? sz_s2u(size) : sz_sa2u(size, alignment); + if (unlikely(usize == 0 || size > SC_LARGE_MAXCLASS)) { + return NULL; + } + + if (likely(usize <= SC_SMALL_MAXCLASS)) { + /* Try to avoid moving the allocation. */ + UNUSED size_t newsize; + if (!arena_ralloc_no_move(tsdn, ptr, oldsize, usize, 0, zero, + &newsize)) { + hook_invoke_expand(hook_args->is_realloc + ? hook_expand_realloc : hook_expand_rallocx, + ptr, oldsize, usize, (uintptr_t)ptr, + hook_args->args); + return ptr; + } + } + + if (oldsize >= SC_LARGE_MINCLASS + && usize >= SC_LARGE_MINCLASS) { + return large_ralloc(tsdn, arena, ptr, usize, + alignment, zero, tcache, hook_args); + } + + /* + * size and oldsize are different enough that we need to move the + * object. In that case, fall back to allocating new space and copying. + */ + void *ret = arena_ralloc_move_helper(tsdn, arena, usize, alignment, + zero, tcache); + if (ret == NULL) { + return NULL; + } + + hook_invoke_alloc(hook_args->is_realloc + ? hook_alloc_realloc : hook_alloc_rallocx, ret, (uintptr_t)ret, + hook_args->args); + hook_invoke_dalloc(hook_args->is_realloc + ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args); + + /* + * Junk/zero-filling were already done by + * ipalloc()/arena_malloc(). + */ + size_t copysize = (usize < oldsize) ? usize : oldsize; + memcpy(ret, ptr, copysize); + isdalloct(tsdn, ptr, oldsize, tcache, NULL, true); + return ret; +} + +ehooks_t * +arena_get_ehooks(arena_t *arena) { + return base_ehooks_get(arena->base); +} + +extent_hooks_t * +arena_set_extent_hooks(tsd_t *tsd, arena_t *arena, + extent_hooks_t *extent_hooks) { + background_thread_info_t *info; + if (have_background_thread) { + info = arena_background_thread_info_get(arena); + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + } + /* No using the HPA now that we have the custom hooks. */ + pa_shard_disable_hpa(tsd_tsdn(tsd), &arena->pa_shard); + extent_hooks_t *ret = base_extent_hooks_set(arena->base, extent_hooks); + if (have_background_thread) { + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + } + + return ret; +} + +dss_prec_t +arena_dss_prec_get(arena_t *arena) { + return (dss_prec_t)atomic_load_u(&arena->dss_prec, ATOMIC_ACQUIRE); +} + +bool +arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec) { + if (!have_dss) { + return (dss_prec != dss_prec_disabled); + } + atomic_store_u(&arena->dss_prec, (unsigned)dss_prec, ATOMIC_RELEASE); + return false; +} + +ssize_t +arena_dirty_decay_ms_default_get(void) { + return atomic_load_zd(&dirty_decay_ms_default, ATOMIC_RELAXED); +} + +bool +arena_dirty_decay_ms_default_set(ssize_t decay_ms) { + if (!decay_ms_valid(decay_ms)) { + return true; + } + atomic_store_zd(&dirty_decay_ms_default, decay_ms, ATOMIC_RELAXED); + return false; +} + +ssize_t +arena_muzzy_decay_ms_default_get(void) { + return atomic_load_zd(&muzzy_decay_ms_default, ATOMIC_RELAXED); +} + +bool +arena_muzzy_decay_ms_default_set(ssize_t decay_ms) { + if (!decay_ms_valid(decay_ms)) { + return true; + } + atomic_store_zd(&muzzy_decay_ms_default, decay_ms, ATOMIC_RELAXED); + return false; +} + +bool +arena_retain_grow_limit_get_set(tsd_t *tsd, arena_t *arena, size_t *old_limit, + size_t *new_limit) { + assert(opt_retain); + return pac_retain_grow_limit_get_set(tsd_tsdn(tsd), + &arena->pa_shard.pac, old_limit, new_limit); +} + +unsigned +arena_nthreads_get(arena_t *arena, bool internal) { + return atomic_load_u(&arena->nthreads[internal], ATOMIC_RELAXED); +} + +void +arena_nthreads_inc(arena_t *arena, bool internal) { + atomic_fetch_add_u(&arena->nthreads[internal], 1, ATOMIC_RELAXED); +} + +void +arena_nthreads_dec(arena_t *arena, bool internal) { + atomic_fetch_sub_u(&arena->nthreads[internal], 1, ATOMIC_RELAXED); +} + +arena_t * +arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) { + arena_t *arena; + base_t *base; + unsigned i; + + if (ind == 0) { + base = b0get(); + } else { + base = base_new(tsdn, ind, config->extent_hooks, + config->metadata_use_hooks); + if (base == NULL) { + return NULL; + } + } + + size_t arena_size = sizeof(arena_t) + sizeof(bin_t) * nbins_total; + arena = (arena_t *)base_alloc(tsdn, base, arena_size, CACHELINE); + if (arena == NULL) { + goto label_error; + } + + atomic_store_u(&arena->nthreads[0], 0, ATOMIC_RELAXED); + atomic_store_u(&arena->nthreads[1], 0, ATOMIC_RELAXED); + arena->last_thd = NULL; + + if (config_stats) { + if (arena_stats_init(tsdn, &arena->stats)) { + goto label_error; + } + + ql_new(&arena->tcache_ql); + ql_new(&arena->cache_bin_array_descriptor_ql); + if (malloc_mutex_init(&arena->tcache_ql_mtx, "tcache_ql", + WITNESS_RANK_TCACHE_QL, malloc_mutex_rank_exclusive)) { + goto label_error; + } + } + + atomic_store_u(&arena->dss_prec, (unsigned)extent_dss_prec_get(), + ATOMIC_RELAXED); + + edata_list_active_init(&arena->large); + if (malloc_mutex_init(&arena->large_mtx, "arena_large", + WITNESS_RANK_ARENA_LARGE, malloc_mutex_rank_exclusive)) { + goto label_error; + } + + nstime_t cur_time; + nstime_init_update(&cur_time); + if (pa_shard_init(tsdn, &arena->pa_shard, &arena_pa_central_global, + &arena_emap_global, base, ind, &arena->stats.pa_shard_stats, + LOCKEDINT_MTX(arena->stats.mtx), &cur_time, oversize_threshold, + arena_dirty_decay_ms_default_get(), + arena_muzzy_decay_ms_default_get())) { + goto label_error; + } + + /* Initialize bins. */ + atomic_store_u(&arena->binshard_next, 0, ATOMIC_RELEASE); + for (i = 0; i < nbins_total; i++) { + bool err = bin_init(&arena->bins[i]); + if (err) { + goto label_error; + } + } + + arena->base = base; + /* Set arena before creating background threads. */ + arena_set(ind, arena); + arena->ind = ind; + + nstime_init_update(&arena->create_time); + + /* + * We turn on the HPA if set to. There are two exceptions: + * - Custom extent hooks (we should only return memory allocated from + * them in that case). + * - Arena 0 initialization. In this case, we're mid-bootstrapping, and + * so arena_hpa_global is not yet initialized. + */ + if (opt_hpa && ehooks_are_default(base_ehooks_get(base)) && ind != 0) { + hpa_shard_opts_t hpa_shard_opts = opt_hpa_opts; + hpa_shard_opts.deferral_allowed = background_thread_enabled(); + if (pa_shard_enable_hpa(tsdn, &arena->pa_shard, + &hpa_shard_opts, &opt_hpa_sec_opts)) { + goto label_error; + } + } + + /* We don't support reentrancy for arena 0 bootstrapping. */ + if (ind != 0) { + /* + * If we're here, then arena 0 already exists, so bootstrapping + * is done enough that we should have tsd. + */ + assert(!tsdn_null(tsdn)); + pre_reentrancy(tsdn_tsd(tsdn), arena); + if (test_hooks_arena_new_hook) { + test_hooks_arena_new_hook(); + } + post_reentrancy(tsdn_tsd(tsdn)); + } + + return arena; +label_error: + if (ind != 0) { + base_delete(tsdn, base); + } + return NULL; +} + +arena_t * +arena_choose_huge(tsd_t *tsd) { + /* huge_arena_ind can be 0 during init (will use a0). */ + if (huge_arena_ind == 0) { + assert(!malloc_initialized()); + } + + arena_t *huge_arena = arena_get(tsd_tsdn(tsd), huge_arena_ind, false); + if (huge_arena == NULL) { + /* Create the huge arena on demand. */ + assert(huge_arena_ind != 0); + huge_arena = arena_get(tsd_tsdn(tsd), huge_arena_ind, true); + if (huge_arena == NULL) { + return NULL; + } + /* + * Purge eagerly for huge allocations, because: 1) number of + * huge allocations is usually small, which means ticker based + * decay is not reliable; and 2) less immediate reuse is + * expected for huge allocations. + */ + if (arena_dirty_decay_ms_default_get() > 0) { + arena_decay_ms_set(tsd_tsdn(tsd), huge_arena, + extent_state_dirty, 0); + } + if (arena_muzzy_decay_ms_default_get() > 0) { + arena_decay_ms_set(tsd_tsdn(tsd), huge_arena, + extent_state_muzzy, 0); + } + } + + return huge_arena; +} + +bool +arena_init_huge(void) { + bool huge_enabled; + + /* The threshold should be large size class. */ + if (opt_oversize_threshold > SC_LARGE_MAXCLASS || + opt_oversize_threshold < SC_LARGE_MINCLASS) { + opt_oversize_threshold = 0; + oversize_threshold = SC_LARGE_MAXCLASS + PAGE; + huge_enabled = false; + } else { + /* Reserve the index for the huge arena. */ + huge_arena_ind = narenas_total_get(); + oversize_threshold = opt_oversize_threshold; + huge_enabled = true; + } + + return huge_enabled; +} + +bool +arena_is_huge(unsigned arena_ind) { + if (huge_arena_ind == 0) { + return false; + } + return (arena_ind == huge_arena_ind); +} + +bool +arena_boot(sc_data_t *sc_data, base_t *base, bool hpa) { + arena_dirty_decay_ms_default_set(opt_dirty_decay_ms); + arena_muzzy_decay_ms_default_set(opt_muzzy_decay_ms); + for (unsigned i = 0; i < SC_NBINS; i++) { + sc_t *sc = &sc_data->sc[i]; + div_init(&arena_binind_div_info[i], + (1U << sc->lg_base) + (sc->ndelta << sc->lg_delta)); + } + + uint32_t cur_offset = (uint32_t)offsetof(arena_t, bins); + for (szind_t i = 0; i < SC_NBINS; i++) { + arena_bin_offsets[i] = cur_offset; + nbins_total += bin_infos[i].n_shards; + cur_offset += (uint32_t)(bin_infos[i].n_shards * sizeof(bin_t)); + } + return pa_central_init(&arena_pa_central_global, base, hpa, + &hpa_hooks_default); +} + +void +arena_prefork0(tsdn_t *tsdn, arena_t *arena) { + pa_shard_prefork0(tsdn, &arena->pa_shard); +} + +void +arena_prefork1(tsdn_t *tsdn, arena_t *arena) { + if (config_stats) { + malloc_mutex_prefork(tsdn, &arena->tcache_ql_mtx); + } +} + +void +arena_prefork2(tsdn_t *tsdn, arena_t *arena) { + pa_shard_prefork2(tsdn, &arena->pa_shard); +} + +void +arena_prefork3(tsdn_t *tsdn, arena_t *arena) { + pa_shard_prefork3(tsdn, &arena->pa_shard); +} + +void +arena_prefork4(tsdn_t *tsdn, arena_t *arena) { + pa_shard_prefork4(tsdn, &arena->pa_shard); +} + +void +arena_prefork5(tsdn_t *tsdn, arena_t *arena) { + pa_shard_prefork5(tsdn, &arena->pa_shard); +} + +void +arena_prefork6(tsdn_t *tsdn, arena_t *arena) { + base_prefork(tsdn, arena->base); +} + +void +arena_prefork7(tsdn_t *tsdn, arena_t *arena) { + malloc_mutex_prefork(tsdn, &arena->large_mtx); +} + +void +arena_prefork8(tsdn_t *tsdn, arena_t *arena) { + for (unsigned i = 0; i < nbins_total; i++) { + bin_prefork(tsdn, &arena->bins[i]); + } +} + +void +arena_postfork_parent(tsdn_t *tsdn, arena_t *arena) { + for (unsigned i = 0; i < nbins_total; i++) { + bin_postfork_parent(tsdn, &arena->bins[i]); + } + + malloc_mutex_postfork_parent(tsdn, &arena->large_mtx); + base_postfork_parent(tsdn, arena->base); + pa_shard_postfork_parent(tsdn, &arena->pa_shard); + if (config_stats) { + malloc_mutex_postfork_parent(tsdn, &arena->tcache_ql_mtx); + } +} + +void +arena_postfork_child(tsdn_t *tsdn, arena_t *arena) { + atomic_store_u(&arena->nthreads[0], 0, ATOMIC_RELAXED); + atomic_store_u(&arena->nthreads[1], 0, ATOMIC_RELAXED); + if (tsd_arena_get(tsdn_tsd(tsdn)) == arena) { + arena_nthreads_inc(arena, false); + } + if (tsd_iarena_get(tsdn_tsd(tsdn)) == arena) { + arena_nthreads_inc(arena, true); + } + if (config_stats) { + ql_new(&arena->tcache_ql); + ql_new(&arena->cache_bin_array_descriptor_ql); + tcache_slow_t *tcache_slow = tcache_slow_get(tsdn_tsd(tsdn)); + if (tcache_slow != NULL && tcache_slow->arena == arena) { + tcache_t *tcache = tcache_slow->tcache; + ql_elm_new(tcache_slow, link); + ql_tail_insert(&arena->tcache_ql, tcache_slow, link); + cache_bin_array_descriptor_init( + &tcache_slow->cache_bin_array_descriptor, + tcache->bins); + ql_tail_insert(&arena->cache_bin_array_descriptor_ql, + &tcache_slow->cache_bin_array_descriptor, link); + } + } + + for (unsigned i = 0; i < nbins_total; i++) { + bin_postfork_child(tsdn, &arena->bins[i]); + } + + malloc_mutex_postfork_child(tsdn, &arena->large_mtx); + base_postfork_child(tsdn, arena->base); + pa_shard_postfork_child(tsdn, &arena->pa_shard); + if (config_stats) { + malloc_mutex_postfork_child(tsdn, &arena->tcache_ql_mtx); + } +} diff --git a/BeefRT/JEMalloc/src/background_thread.c b/BeefRT/JEMalloc/src/background_thread.c new file mode 100644 index 00000000..3bb8d26c --- /dev/null +++ b/BeefRT/JEMalloc/src/background_thread.c @@ -0,0 +1,820 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" + +JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS + +/******************************************************************************/ +/* Data. */ + +/* This option should be opt-in only. */ +#define BACKGROUND_THREAD_DEFAULT false +/* Read-only after initialization. */ +bool opt_background_thread = BACKGROUND_THREAD_DEFAULT; +size_t opt_max_background_threads = MAX_BACKGROUND_THREAD_LIMIT + 1; + +/* Used for thread creation, termination and stats. */ +malloc_mutex_t background_thread_lock; +/* Indicates global state. Atomic because decay reads this w/o locking. */ +atomic_b_t background_thread_enabled_state; +size_t n_background_threads; +size_t max_background_threads; +/* Thread info per-index. */ +background_thread_info_t *background_thread_info; + +/******************************************************************************/ + +#ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER + +static int (*pthread_create_fptr)(pthread_t *__restrict, const pthread_attr_t *, + void *(*)(void *), void *__restrict); + +static void +pthread_create_wrapper_init(void) { +#ifdef JEMALLOC_LAZY_LOCK + if (!isthreaded) { + isthreaded = true; + } +#endif +} + +int +pthread_create_wrapper(pthread_t *__restrict thread, const pthread_attr_t *attr, + void *(*start_routine)(void *), void *__restrict arg) { + pthread_create_wrapper_init(); + + return pthread_create_fptr(thread, attr, start_routine, arg); +} +#endif /* JEMALLOC_PTHREAD_CREATE_WRAPPER */ + +#ifndef JEMALLOC_BACKGROUND_THREAD +#define NOT_REACHED { not_reached(); } +bool background_thread_create(tsd_t *tsd, unsigned arena_ind) NOT_REACHED +bool background_threads_enable(tsd_t *tsd) NOT_REACHED +bool background_threads_disable(tsd_t *tsd) NOT_REACHED +bool background_thread_is_started(background_thread_info_t *info) NOT_REACHED +void background_thread_wakeup_early(background_thread_info_t *info, + nstime_t *remaining_sleep) NOT_REACHED +void background_thread_prefork0(tsdn_t *tsdn) NOT_REACHED +void background_thread_prefork1(tsdn_t *tsdn) NOT_REACHED +void background_thread_postfork_parent(tsdn_t *tsdn) NOT_REACHED +void background_thread_postfork_child(tsdn_t *tsdn) NOT_REACHED +bool background_thread_stats_read(tsdn_t *tsdn, + background_thread_stats_t *stats) NOT_REACHED +void background_thread_ctl_init(tsdn_t *tsdn) NOT_REACHED +#undef NOT_REACHED +#else + +static bool background_thread_enabled_at_fork; + +static void +background_thread_info_init(tsdn_t *tsdn, background_thread_info_t *info) { + background_thread_wakeup_time_set(tsdn, info, 0); + info->npages_to_purge_new = 0; + if (config_stats) { + info->tot_n_runs = 0; + nstime_init_zero(&info->tot_sleep_time); + } +} + +static inline bool +set_current_thread_affinity(int cpu) { +#if defined(JEMALLOC_HAVE_SCHED_SETAFFINITY) + cpu_set_t cpuset; +#else +# ifndef __NetBSD__ + cpuset_t cpuset; +# else + cpuset_t *cpuset; +# endif +#endif + +#ifndef __NetBSD__ + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); +#else + cpuset = cpuset_create(); +#endif + +#if defined(JEMALLOC_HAVE_SCHED_SETAFFINITY) + return (sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) != 0); +#else +# ifndef __NetBSD__ + int ret = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), + &cpuset); +# else + int ret = pthread_setaffinity_np(pthread_self(), cpuset_size(cpuset), + cpuset); + cpuset_destroy(cpuset); +# endif + return ret != 0; +#endif +} + +#define BILLION UINT64_C(1000000000) +/* Minimal sleep interval 100 ms. */ +#define BACKGROUND_THREAD_MIN_INTERVAL_NS (BILLION / 10) + +static void +background_thread_sleep(tsdn_t *tsdn, background_thread_info_t *info, + uint64_t interval) { + if (config_stats) { + info->tot_n_runs++; + } + info->npages_to_purge_new = 0; + + struct timeval tv; + /* Specific clock required by timedwait. */ + gettimeofday(&tv, NULL); + nstime_t before_sleep; + nstime_init2(&before_sleep, tv.tv_sec, tv.tv_usec * 1000); + + int ret; + if (interval == BACKGROUND_THREAD_INDEFINITE_SLEEP) { + background_thread_wakeup_time_set(tsdn, info, + BACKGROUND_THREAD_INDEFINITE_SLEEP); + ret = pthread_cond_wait(&info->cond, &info->mtx.lock); + assert(ret == 0); + } else { + assert(interval >= BACKGROUND_THREAD_MIN_INTERVAL_NS && + interval <= BACKGROUND_THREAD_INDEFINITE_SLEEP); + /* We need malloc clock (can be different from tv). */ + nstime_t next_wakeup; + nstime_init_update(&next_wakeup); + nstime_iadd(&next_wakeup, interval); + assert(nstime_ns(&next_wakeup) < + BACKGROUND_THREAD_INDEFINITE_SLEEP); + background_thread_wakeup_time_set(tsdn, info, + nstime_ns(&next_wakeup)); + + nstime_t ts_wakeup; + nstime_copy(&ts_wakeup, &before_sleep); + nstime_iadd(&ts_wakeup, interval); + struct timespec ts; + ts.tv_sec = (size_t)nstime_sec(&ts_wakeup); + ts.tv_nsec = (size_t)nstime_nsec(&ts_wakeup); + + assert(!background_thread_indefinite_sleep(info)); + ret = pthread_cond_timedwait(&info->cond, &info->mtx.lock, &ts); + assert(ret == ETIMEDOUT || ret == 0); + } + if (config_stats) { + gettimeofday(&tv, NULL); + nstime_t after_sleep; + nstime_init2(&after_sleep, tv.tv_sec, tv.tv_usec * 1000); + if (nstime_compare(&after_sleep, &before_sleep) > 0) { + nstime_subtract(&after_sleep, &before_sleep); + nstime_add(&info->tot_sleep_time, &after_sleep); + } + } +} + +static bool +background_thread_pause_check(tsdn_t *tsdn, background_thread_info_t *info) { + if (unlikely(info->state == background_thread_paused)) { + malloc_mutex_unlock(tsdn, &info->mtx); + /* Wait on global lock to update status. */ + malloc_mutex_lock(tsdn, &background_thread_lock); + malloc_mutex_unlock(tsdn, &background_thread_lock); + malloc_mutex_lock(tsdn, &info->mtx); + return true; + } + + return false; +} + +static inline void +background_work_sleep_once(tsdn_t *tsdn, background_thread_info_t *info, + unsigned ind) { + uint64_t ns_until_deferred = BACKGROUND_THREAD_DEFERRED_MAX; + unsigned narenas = narenas_total_get(); + bool slept_indefinitely = background_thread_indefinite_sleep(info); + + for (unsigned i = ind; i < narenas; i += max_background_threads) { + arena_t *arena = arena_get(tsdn, i, false); + if (!arena) { + continue; + } + /* + * If thread was woken up from the indefinite sleep, don't + * do the work instantly, but rather check when the deferred + * work that caused this thread to wake up is scheduled for. + */ + if (!slept_indefinitely) { + arena_do_deferred_work(tsdn, arena); + } + if (ns_until_deferred <= BACKGROUND_THREAD_MIN_INTERVAL_NS) { + /* Min interval will be used. */ + continue; + } + uint64_t ns_arena_deferred = pa_shard_time_until_deferred_work( + tsdn, &arena->pa_shard); + if (ns_arena_deferred < ns_until_deferred) { + ns_until_deferred = ns_arena_deferred; + } + } + + uint64_t sleep_ns; + if (ns_until_deferred == BACKGROUND_THREAD_DEFERRED_MAX) { + sleep_ns = BACKGROUND_THREAD_INDEFINITE_SLEEP; + } else { + sleep_ns = + (ns_until_deferred < BACKGROUND_THREAD_MIN_INTERVAL_NS) + ? BACKGROUND_THREAD_MIN_INTERVAL_NS + : ns_until_deferred; + + } + + background_thread_sleep(tsdn, info, sleep_ns); +} + +static bool +background_threads_disable_single(tsd_t *tsd, background_thread_info_t *info) { + if (info == &background_thread_info[0]) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), + &background_thread_lock); + } else { + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), + &background_thread_lock); + } + + pre_reentrancy(tsd, NULL); + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + bool has_thread; + assert(info->state != background_thread_paused); + if (info->state == background_thread_started) { + has_thread = true; + info->state = background_thread_stopped; + pthread_cond_signal(&info->cond); + } else { + has_thread = false; + } + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + + if (!has_thread) { + post_reentrancy(tsd); + return false; + } + void *ret; + if (pthread_join(info->thread, &ret)) { + post_reentrancy(tsd); + return true; + } + assert(ret == NULL); + n_background_threads--; + post_reentrancy(tsd); + + return false; +} + +static void *background_thread_entry(void *ind_arg); + +static int +background_thread_create_signals_masked(pthread_t *thread, + const pthread_attr_t *attr, void *(*start_routine)(void *), void *arg) { + /* + * Mask signals during thread creation so that the thread inherits + * an empty signal set. + */ + sigset_t set; + sigfillset(&set); + sigset_t oldset; + int mask_err = pthread_sigmask(SIG_SETMASK, &set, &oldset); + if (mask_err != 0) { + return mask_err; + } + int create_err = pthread_create_wrapper(thread, attr, start_routine, + arg); + /* + * Restore the signal mask. Failure to restore the signal mask here + * changes program behavior. + */ + int restore_err = pthread_sigmask(SIG_SETMASK, &oldset, NULL); + if (restore_err != 0) { + malloc_printf(": background thread creation " + "failed (%d), and signal mask restoration failed " + "(%d)\n", create_err, restore_err); + if (opt_abort) { + abort(); + } + } + return create_err; +} + +static bool +check_background_thread_creation(tsd_t *tsd, unsigned *n_created, + bool *created_threads) { + bool ret = false; + if (likely(*n_created == n_background_threads)) { + return ret; + } + + tsdn_t *tsdn = tsd_tsdn(tsd); + malloc_mutex_unlock(tsdn, &background_thread_info[0].mtx); + for (unsigned i = 1; i < max_background_threads; i++) { + if (created_threads[i]) { + continue; + } + background_thread_info_t *info = &background_thread_info[i]; + malloc_mutex_lock(tsdn, &info->mtx); + /* + * In case of the background_thread_paused state because of + * arena reset, delay the creation. + */ + bool create = (info->state == background_thread_started); + malloc_mutex_unlock(tsdn, &info->mtx); + if (!create) { + continue; + } + + pre_reentrancy(tsd, NULL); + int err = background_thread_create_signals_masked(&info->thread, + NULL, background_thread_entry, (void *)(uintptr_t)i); + post_reentrancy(tsd); + + if (err == 0) { + (*n_created)++; + created_threads[i] = true; + } else { + malloc_printf(": background thread " + "creation failed (%d)\n", err); + if (opt_abort) { + abort(); + } + } + /* Return to restart the loop since we unlocked. */ + ret = true; + break; + } + malloc_mutex_lock(tsdn, &background_thread_info[0].mtx); + + return ret; +} + +static void +background_thread0_work(tsd_t *tsd) { + /* Thread0 is also responsible for launching / terminating threads. */ + VARIABLE_ARRAY(bool, created_threads, max_background_threads); + unsigned i; + for (i = 1; i < max_background_threads; i++) { + created_threads[i] = false; + } + /* Start working, and create more threads when asked. */ + unsigned n_created = 1; + while (background_thread_info[0].state != background_thread_stopped) { + if (background_thread_pause_check(tsd_tsdn(tsd), + &background_thread_info[0])) { + continue; + } + if (check_background_thread_creation(tsd, &n_created, + (bool *)&created_threads)) { + continue; + } + background_work_sleep_once(tsd_tsdn(tsd), + &background_thread_info[0], 0); + } + + /* + * Shut down other threads at exit. Note that the ctl thread is holding + * the global background_thread mutex (and is waiting) for us. + */ + assert(!background_thread_enabled()); + for (i = 1; i < max_background_threads; i++) { + background_thread_info_t *info = &background_thread_info[i]; + assert(info->state != background_thread_paused); + if (created_threads[i]) { + background_threads_disable_single(tsd, info); + } else { + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + if (info->state != background_thread_stopped) { + /* The thread was not created. */ + assert(info->state == + background_thread_started); + n_background_threads--; + info->state = background_thread_stopped; + } + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + } + } + background_thread_info[0].state = background_thread_stopped; + assert(n_background_threads == 1); +} + +static void +background_work(tsd_t *tsd, unsigned ind) { + background_thread_info_t *info = &background_thread_info[ind]; + + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + background_thread_wakeup_time_set(tsd_tsdn(tsd), info, + BACKGROUND_THREAD_INDEFINITE_SLEEP); + if (ind == 0) { + background_thread0_work(tsd); + } else { + while (info->state != background_thread_stopped) { + if (background_thread_pause_check(tsd_tsdn(tsd), + info)) { + continue; + } + background_work_sleep_once(tsd_tsdn(tsd), info, ind); + } + } + assert(info->state == background_thread_stopped); + background_thread_wakeup_time_set(tsd_tsdn(tsd), info, 0); + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); +} + +static void * +background_thread_entry(void *ind_arg) { + unsigned thread_ind = (unsigned)(uintptr_t)ind_arg; + assert(thread_ind < max_background_threads); +#ifdef JEMALLOC_HAVE_PTHREAD_SETNAME_NP + pthread_setname_np(pthread_self(), "jemalloc_bg_thd"); +#elif defined(__FreeBSD__) || defined(__DragonFly__) + pthread_set_name_np(pthread_self(), "jemalloc_bg_thd"); +#endif + if (opt_percpu_arena != percpu_arena_disabled) { + set_current_thread_affinity((int)thread_ind); + } + /* + * Start periodic background work. We use internal tsd which avoids + * side effects, for example triggering new arena creation (which in + * turn triggers another background thread creation). + */ + background_work(tsd_internal_fetch(), thread_ind); + assert(pthread_equal(pthread_self(), + background_thread_info[thread_ind].thread)); + + return NULL; +} + +static void +background_thread_init(tsd_t *tsd, background_thread_info_t *info) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock); + info->state = background_thread_started; + background_thread_info_init(tsd_tsdn(tsd), info); + n_background_threads++; +} + +static bool +background_thread_create_locked(tsd_t *tsd, unsigned arena_ind) { + assert(have_background_thread); + malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock); + + /* We create at most NCPUs threads. */ + size_t thread_ind = arena_ind % max_background_threads; + background_thread_info_t *info = &background_thread_info[thread_ind]; + + bool need_new_thread; + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + need_new_thread = background_thread_enabled() && + (info->state == background_thread_stopped); + if (need_new_thread) { + background_thread_init(tsd, info); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + if (!need_new_thread) { + return false; + } + if (arena_ind != 0) { + /* Threads are created asynchronously by Thread 0. */ + background_thread_info_t *t0 = &background_thread_info[0]; + malloc_mutex_lock(tsd_tsdn(tsd), &t0->mtx); + assert(t0->state == background_thread_started); + pthread_cond_signal(&t0->cond); + malloc_mutex_unlock(tsd_tsdn(tsd), &t0->mtx); + + return false; + } + + pre_reentrancy(tsd, NULL); + /* + * To avoid complications (besides reentrancy), create internal + * background threads with the underlying pthread_create. + */ + int err = background_thread_create_signals_masked(&info->thread, NULL, + background_thread_entry, (void *)thread_ind); + post_reentrancy(tsd); + + if (err != 0) { + malloc_printf(": arena 0 background thread creation " + "failed (%d)\n", err); + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + info->state = background_thread_stopped; + n_background_threads--; + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + + return true; + } + + return false; +} + +/* Create a new background thread if needed. */ +bool +background_thread_create(tsd_t *tsd, unsigned arena_ind) { + assert(have_background_thread); + + bool ret; + malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock); + ret = background_thread_create_locked(tsd, arena_ind); + malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock); + + return ret; +} + +bool +background_threads_enable(tsd_t *tsd) { + assert(n_background_threads == 0); + assert(background_thread_enabled()); + malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock); + + VARIABLE_ARRAY(bool, marked, max_background_threads); + unsigned nmarked; + for (unsigned i = 0; i < max_background_threads; i++) { + marked[i] = false; + } + nmarked = 0; + /* Thread 0 is required and created at the end. */ + marked[0] = true; + /* Mark the threads we need to create for thread 0. */ + unsigned narenas = narenas_total_get(); + for (unsigned i = 1; i < narenas; i++) { + if (marked[i % max_background_threads] || + arena_get(tsd_tsdn(tsd), i, false) == NULL) { + continue; + } + background_thread_info_t *info = &background_thread_info[ + i % max_background_threads]; + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + assert(info->state == background_thread_stopped); + background_thread_init(tsd, info); + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + marked[i % max_background_threads] = true; + if (++nmarked == max_background_threads) { + break; + } + } + + bool err = background_thread_create_locked(tsd, 0); + if (err) { + return true; + } + for (unsigned i = 0; i < narenas; i++) { + arena_t *arena = arena_get(tsd_tsdn(tsd), i, false); + if (arena != NULL) { + pa_shard_set_deferral_allowed(tsd_tsdn(tsd), + &arena->pa_shard, true); + } + } + return false; +} + +bool +background_threads_disable(tsd_t *tsd) { + assert(!background_thread_enabled()); + malloc_mutex_assert_owner(tsd_tsdn(tsd), &background_thread_lock); + + /* Thread 0 will be responsible for terminating other threads. */ + if (background_threads_disable_single(tsd, + &background_thread_info[0])) { + return true; + } + assert(n_background_threads == 0); + unsigned narenas = narenas_total_get(); + for (unsigned i = 0; i < narenas; i++) { + arena_t *arena = arena_get(tsd_tsdn(tsd), i, false); + if (arena != NULL) { + pa_shard_set_deferral_allowed(tsd_tsdn(tsd), + &arena->pa_shard, false); + } + } + + return false; +} + +bool +background_thread_is_started(background_thread_info_t *info) { + return info->state == background_thread_started; +} + +void +background_thread_wakeup_early(background_thread_info_t *info, + nstime_t *remaining_sleep) { + /* + * This is an optimization to increase batching. At this point + * we know that background thread wakes up soon, so the time to cache + * the just freed memory is bounded and low. + */ + if (remaining_sleep != NULL && nstime_ns(remaining_sleep) < + BACKGROUND_THREAD_MIN_INTERVAL_NS) { + return; + } + pthread_cond_signal(&info->cond); +} + +void +background_thread_prefork0(tsdn_t *tsdn) { + malloc_mutex_prefork(tsdn, &background_thread_lock); + background_thread_enabled_at_fork = background_thread_enabled(); +} + +void +background_thread_prefork1(tsdn_t *tsdn) { + for (unsigned i = 0; i < max_background_threads; i++) { + malloc_mutex_prefork(tsdn, &background_thread_info[i].mtx); + } +} + +void +background_thread_postfork_parent(tsdn_t *tsdn) { + for (unsigned i = 0; i < max_background_threads; i++) { + malloc_mutex_postfork_parent(tsdn, + &background_thread_info[i].mtx); + } + malloc_mutex_postfork_parent(tsdn, &background_thread_lock); +} + +void +background_thread_postfork_child(tsdn_t *tsdn) { + for (unsigned i = 0; i < max_background_threads; i++) { + malloc_mutex_postfork_child(tsdn, + &background_thread_info[i].mtx); + } + malloc_mutex_postfork_child(tsdn, &background_thread_lock); + if (!background_thread_enabled_at_fork) { + return; + } + + /* Clear background_thread state (reset to disabled for child). */ + malloc_mutex_lock(tsdn, &background_thread_lock); + n_background_threads = 0; + background_thread_enabled_set(tsdn, false); + for (unsigned i = 0; i < max_background_threads; i++) { + background_thread_info_t *info = &background_thread_info[i]; + malloc_mutex_lock(tsdn, &info->mtx); + info->state = background_thread_stopped; + int ret = pthread_cond_init(&info->cond, NULL); + assert(ret == 0); + background_thread_info_init(tsdn, info); + malloc_mutex_unlock(tsdn, &info->mtx); + } + malloc_mutex_unlock(tsdn, &background_thread_lock); +} + +bool +background_thread_stats_read(tsdn_t *tsdn, background_thread_stats_t *stats) { + assert(config_stats); + malloc_mutex_lock(tsdn, &background_thread_lock); + if (!background_thread_enabled()) { + malloc_mutex_unlock(tsdn, &background_thread_lock); + return true; + } + + nstime_init_zero(&stats->run_interval); + memset(&stats->max_counter_per_bg_thd, 0, sizeof(mutex_prof_data_t)); + + uint64_t num_runs = 0; + stats->num_threads = n_background_threads; + for (unsigned i = 0; i < max_background_threads; i++) { + background_thread_info_t *info = &background_thread_info[i]; + if (malloc_mutex_trylock(tsdn, &info->mtx)) { + /* + * Each background thread run may take a long time; + * avoid waiting on the stats if the thread is active. + */ + continue; + } + if (info->state != background_thread_stopped) { + num_runs += info->tot_n_runs; + nstime_add(&stats->run_interval, &info->tot_sleep_time); + malloc_mutex_prof_max_update(tsdn, + &stats->max_counter_per_bg_thd, &info->mtx); + } + malloc_mutex_unlock(tsdn, &info->mtx); + } + stats->num_runs = num_runs; + if (num_runs > 0) { + nstime_idivide(&stats->run_interval, num_runs); + } + malloc_mutex_unlock(tsdn, &background_thread_lock); + + return false; +} + +#undef BACKGROUND_THREAD_NPAGES_THRESHOLD +#undef BILLION +#undef BACKGROUND_THREAD_MIN_INTERVAL_NS + +#ifdef JEMALLOC_HAVE_DLSYM +#include +#endif + +static bool +pthread_create_fptr_init(void) { + if (pthread_create_fptr != NULL) { + return false; + } + /* + * Try the next symbol first, because 1) when use lazy_lock we have a + * wrapper for pthread_create; and 2) application may define its own + * wrapper as well (and can call malloc within the wrapper). + */ +#ifdef JEMALLOC_HAVE_DLSYM + pthread_create_fptr = dlsym(RTLD_NEXT, "pthread_create"); +#else + pthread_create_fptr = NULL; +#endif + if (pthread_create_fptr == NULL) { + if (config_lazy_lock) { + malloc_write(": Error in dlsym(RTLD_NEXT, " + "\"pthread_create\")\n"); + abort(); + } else { + /* Fall back to the default symbol. */ + pthread_create_fptr = pthread_create; + } + } + + return false; +} + +/* + * When lazy lock is enabled, we need to make sure setting isthreaded before + * taking any background_thread locks. This is called early in ctl (instead of + * wait for the pthread_create calls to trigger) because the mutex is required + * before creating background threads. + */ +void +background_thread_ctl_init(tsdn_t *tsdn) { + malloc_mutex_assert_not_owner(tsdn, &background_thread_lock); +#ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER + pthread_create_fptr_init(); + pthread_create_wrapper_init(); +#endif +} + +#endif /* defined(JEMALLOC_BACKGROUND_THREAD) */ + +bool +background_thread_boot0(void) { + if (!have_background_thread && opt_background_thread) { + malloc_printf(": option background_thread currently " + "supports pthread only\n"); + return true; + } +#ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER + if ((config_lazy_lock || opt_background_thread) && + pthread_create_fptr_init()) { + return true; + } +#endif + return false; +} + +bool +background_thread_boot1(tsdn_t *tsdn, base_t *base) { +#ifdef JEMALLOC_BACKGROUND_THREAD + assert(have_background_thread); + assert(narenas_total_get() > 0); + + if (opt_max_background_threads > MAX_BACKGROUND_THREAD_LIMIT) { + opt_max_background_threads = DEFAULT_NUM_BACKGROUND_THREAD; + } + max_background_threads = opt_max_background_threads; + + background_thread_enabled_set(tsdn, opt_background_thread); + if (malloc_mutex_init(&background_thread_lock, + "background_thread_global", + WITNESS_RANK_BACKGROUND_THREAD_GLOBAL, + malloc_mutex_rank_exclusive)) { + return true; + } + + background_thread_info = (background_thread_info_t *)base_alloc(tsdn, + base, opt_max_background_threads * + sizeof(background_thread_info_t), CACHELINE); + if (background_thread_info == NULL) { + return true; + } + + for (unsigned i = 0; i < max_background_threads; i++) { + background_thread_info_t *info = &background_thread_info[i]; + /* Thread mutex is rank_inclusive because of thread0. */ + if (malloc_mutex_init(&info->mtx, "background_thread", + WITNESS_RANK_BACKGROUND_THREAD, + malloc_mutex_address_ordered)) { + return true; + } + if (pthread_cond_init(&info->cond, NULL)) { + return true; + } + malloc_mutex_lock(tsdn, &info->mtx); + info->state = background_thread_stopped; + background_thread_info_init(tsdn, info); + malloc_mutex_unlock(tsdn, &info->mtx); + } +#endif + + return false; +} diff --git a/BeefRT/JEMalloc/src/base.c b/BeefRT/JEMalloc/src/base.c new file mode 100644 index 00000000..7f4d6756 --- /dev/null +++ b/BeefRT/JEMalloc/src/base.c @@ -0,0 +1,529 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/extent_mmap.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/sz.h" + +/* + * In auto mode, arenas switch to huge pages for the base allocator on the + * second base block. a0 switches to thp on the 5th block (after 20 megabytes + * of metadata), since more metadata (e.g. rtree nodes) come from a0's base. + */ + +#define BASE_AUTO_THP_THRESHOLD 2 +#define BASE_AUTO_THP_THRESHOLD_A0 5 + +/******************************************************************************/ +/* Data. */ + +static base_t *b0; + +metadata_thp_mode_t opt_metadata_thp = METADATA_THP_DEFAULT; + +const char *metadata_thp_mode_names[] = { + "disabled", + "auto", + "always" +}; + +/******************************************************************************/ + +static inline bool +metadata_thp_madvise(void) { + return (metadata_thp_enabled() && + (init_system_thp_mode == thp_mode_default)); +} + +static void * +base_map(tsdn_t *tsdn, ehooks_t *ehooks, unsigned ind, size_t size) { + void *addr; + bool zero = true; + bool commit = true; + + /* Use huge page sizes and alignment regardless of opt_metadata_thp. */ + assert(size == HUGEPAGE_CEILING(size)); + size_t alignment = HUGEPAGE; + if (ehooks_are_default(ehooks)) { + addr = extent_alloc_mmap(NULL, size, alignment, &zero, &commit); + if (have_madvise_huge && addr) { + pages_set_thp_state(addr, size); + } + } else { + addr = ehooks_alloc(tsdn, ehooks, NULL, size, alignment, &zero, + &commit); + } + + return addr; +} + +static void +base_unmap(tsdn_t *tsdn, ehooks_t *ehooks, unsigned ind, void *addr, + size_t size) { + /* + * Cascade through dalloc, decommit, purge_forced, and purge_lazy, + * stopping at first success. This cascade is performed for consistency + * with the cascade in extent_dalloc_wrapper() because an application's + * custom hooks may not support e.g. dalloc. This function is only ever + * called as a side effect of arena destruction, so although it might + * seem pointless to do anything besides dalloc here, the application + * may in fact want the end state of all associated virtual memory to be + * in some consistent-but-allocated state. + */ + if (ehooks_are_default(ehooks)) { + if (!extent_dalloc_mmap(addr, size)) { + goto label_done; + } + if (!pages_decommit(addr, size)) { + goto label_done; + } + if (!pages_purge_forced(addr, size)) { + goto label_done; + } + if (!pages_purge_lazy(addr, size)) { + goto label_done; + } + /* Nothing worked. This should never happen. */ + not_reached(); + } else { + if (!ehooks_dalloc(tsdn, ehooks, addr, size, true)) { + goto label_done; + } + if (!ehooks_decommit(tsdn, ehooks, addr, size, 0, size)) { + goto label_done; + } + if (!ehooks_purge_forced(tsdn, ehooks, addr, size, 0, size)) { + goto label_done; + } + if (!ehooks_purge_lazy(tsdn, ehooks, addr, size, 0, size)) { + goto label_done; + } + /* Nothing worked. That's the application's problem. */ + } +label_done: + if (metadata_thp_madvise()) { + /* Set NOHUGEPAGE after unmap to avoid kernel defrag. */ + assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 && + (size & HUGEPAGE_MASK) == 0); + pages_nohuge(addr, size); + } +} + +static void +base_edata_init(size_t *extent_sn_next, edata_t *edata, void *addr, + size_t size) { + size_t sn; + + sn = *extent_sn_next; + (*extent_sn_next)++; + + edata_binit(edata, addr, size, sn); +} + +static size_t +base_get_num_blocks(base_t *base, bool with_new_block) { + base_block_t *b = base->blocks; + assert(b != NULL); + + size_t n_blocks = with_new_block ? 2 : 1; + while (b->next != NULL) { + n_blocks++; + b = b->next; + } + + return n_blocks; +} + +static void +base_auto_thp_switch(tsdn_t *tsdn, base_t *base) { + assert(opt_metadata_thp == metadata_thp_auto); + malloc_mutex_assert_owner(tsdn, &base->mtx); + if (base->auto_thp_switched) { + return; + } + /* Called when adding a new block. */ + bool should_switch; + if (base_ind_get(base) != 0) { + should_switch = (base_get_num_blocks(base, true) == + BASE_AUTO_THP_THRESHOLD); + } else { + should_switch = (base_get_num_blocks(base, true) == + BASE_AUTO_THP_THRESHOLD_A0); + } + if (!should_switch) { + return; + } + + base->auto_thp_switched = true; + assert(!config_stats || base->n_thp == 0); + /* Make the initial blocks THP lazily. */ + base_block_t *block = base->blocks; + while (block != NULL) { + assert((block->size & HUGEPAGE_MASK) == 0); + pages_huge(block, block->size); + if (config_stats) { + base->n_thp += HUGEPAGE_CEILING(block->size - + edata_bsize_get(&block->edata)) >> LG_HUGEPAGE; + } + block = block->next; + assert(block == NULL || (base_ind_get(base) == 0)); + } +} + +static void * +base_extent_bump_alloc_helper(edata_t *edata, size_t *gap_size, size_t size, + size_t alignment) { + void *ret; + + assert(alignment == ALIGNMENT_CEILING(alignment, QUANTUM)); + assert(size == ALIGNMENT_CEILING(size, alignment)); + + *gap_size = ALIGNMENT_CEILING((uintptr_t)edata_addr_get(edata), + alignment) - (uintptr_t)edata_addr_get(edata); + ret = (void *)((uintptr_t)edata_addr_get(edata) + *gap_size); + assert(edata_bsize_get(edata) >= *gap_size + size); + edata_binit(edata, (void *)((uintptr_t)edata_addr_get(edata) + + *gap_size + size), edata_bsize_get(edata) - *gap_size - size, + edata_sn_get(edata)); + return ret; +} + +static void +base_extent_bump_alloc_post(base_t *base, edata_t *edata, size_t gap_size, + void *addr, size_t size) { + if (edata_bsize_get(edata) > 0) { + /* + * Compute the index for the largest size class that does not + * exceed extent's size. + */ + szind_t index_floor = + sz_size2index(edata_bsize_get(edata) + 1) - 1; + edata_heap_insert(&base->avail[index_floor], edata); + } + + if (config_stats) { + base->allocated += size; + /* + * Add one PAGE to base_resident for every page boundary that is + * crossed by the new allocation. Adjust n_thp similarly when + * metadata_thp is enabled. + */ + base->resident += PAGE_CEILING((uintptr_t)addr + size) - + PAGE_CEILING((uintptr_t)addr - gap_size); + assert(base->allocated <= base->resident); + assert(base->resident <= base->mapped); + if (metadata_thp_madvise() && (opt_metadata_thp == + metadata_thp_always || base->auto_thp_switched)) { + base->n_thp += (HUGEPAGE_CEILING((uintptr_t)addr + size) + - HUGEPAGE_CEILING((uintptr_t)addr - gap_size)) >> + LG_HUGEPAGE; + assert(base->mapped >= base->n_thp << LG_HUGEPAGE); + } + } +} + +static void * +base_extent_bump_alloc(base_t *base, edata_t *edata, size_t size, + size_t alignment) { + void *ret; + size_t gap_size; + + ret = base_extent_bump_alloc_helper(edata, &gap_size, size, alignment); + base_extent_bump_alloc_post(base, edata, gap_size, ret, size); + return ret; +} + +/* + * Allocate a block of virtual memory that is large enough to start with a + * base_block_t header, followed by an object of specified size and alignment. + * On success a pointer to the initialized base_block_t header is returned. + */ +static base_block_t * +base_block_alloc(tsdn_t *tsdn, base_t *base, ehooks_t *ehooks, unsigned ind, + pszind_t *pind_last, size_t *extent_sn_next, size_t size, + size_t alignment) { + alignment = ALIGNMENT_CEILING(alignment, QUANTUM); + size_t usize = ALIGNMENT_CEILING(size, alignment); + size_t header_size = sizeof(base_block_t); + size_t gap_size = ALIGNMENT_CEILING(header_size, alignment) - + header_size; + /* + * Create increasingly larger blocks in order to limit the total number + * of disjoint virtual memory ranges. Choose the next size in the page + * size class series (skipping size classes that are not a multiple of + * HUGEPAGE), or a size large enough to satisfy the requested size and + * alignment, whichever is larger. + */ + size_t min_block_size = HUGEPAGE_CEILING(sz_psz2u(header_size + gap_size + + usize)); + pszind_t pind_next = (*pind_last + 1 < sz_psz2ind(SC_LARGE_MAXCLASS)) ? + *pind_last + 1 : *pind_last; + size_t next_block_size = HUGEPAGE_CEILING(sz_pind2sz(pind_next)); + size_t block_size = (min_block_size > next_block_size) ? min_block_size + : next_block_size; + base_block_t *block = (base_block_t *)base_map(tsdn, ehooks, ind, + block_size); + if (block == NULL) { + return NULL; + } + + if (metadata_thp_madvise()) { + void *addr = (void *)block; + assert(((uintptr_t)addr & HUGEPAGE_MASK) == 0 && + (block_size & HUGEPAGE_MASK) == 0); + if (opt_metadata_thp == metadata_thp_always) { + pages_huge(addr, block_size); + } else if (opt_metadata_thp == metadata_thp_auto && + base != NULL) { + /* base != NULL indicates this is not a new base. */ + malloc_mutex_lock(tsdn, &base->mtx); + base_auto_thp_switch(tsdn, base); + if (base->auto_thp_switched) { + pages_huge(addr, block_size); + } + malloc_mutex_unlock(tsdn, &base->mtx); + } + } + + *pind_last = sz_psz2ind(block_size); + block->size = block_size; + block->next = NULL; + assert(block_size >= header_size); + base_edata_init(extent_sn_next, &block->edata, + (void *)((uintptr_t)block + header_size), block_size - header_size); + return block; +} + +/* + * Allocate an extent that is at least as large as specified size, with + * specified alignment. + */ +static edata_t * +base_extent_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment) { + malloc_mutex_assert_owner(tsdn, &base->mtx); + + ehooks_t *ehooks = base_ehooks_get_for_metadata(base); + /* + * Drop mutex during base_block_alloc(), because an extent hook will be + * called. + */ + malloc_mutex_unlock(tsdn, &base->mtx); + base_block_t *block = base_block_alloc(tsdn, base, ehooks, + base_ind_get(base), &base->pind_last, &base->extent_sn_next, size, + alignment); + malloc_mutex_lock(tsdn, &base->mtx); + if (block == NULL) { + return NULL; + } + block->next = base->blocks; + base->blocks = block; + if (config_stats) { + base->allocated += sizeof(base_block_t); + base->resident += PAGE_CEILING(sizeof(base_block_t)); + base->mapped += block->size; + if (metadata_thp_madvise() && + !(opt_metadata_thp == metadata_thp_auto + && !base->auto_thp_switched)) { + assert(base->n_thp > 0); + base->n_thp += HUGEPAGE_CEILING(sizeof(base_block_t)) >> + LG_HUGEPAGE; + } + assert(base->allocated <= base->resident); + assert(base->resident <= base->mapped); + assert(base->n_thp << LG_HUGEPAGE <= base->mapped); + } + return &block->edata; +} + +base_t * +b0get(void) { + return b0; +} + +base_t * +base_new(tsdn_t *tsdn, unsigned ind, const extent_hooks_t *extent_hooks, + bool metadata_use_hooks) { + pszind_t pind_last = 0; + size_t extent_sn_next = 0; + + /* + * The base will contain the ehooks eventually, but it itself is + * allocated using them. So we use some stack ehooks to bootstrap its + * memory, and then initialize the ehooks within the base_t. + */ + ehooks_t fake_ehooks; + ehooks_init(&fake_ehooks, metadata_use_hooks ? + (extent_hooks_t *)extent_hooks : + (extent_hooks_t *)&ehooks_default_extent_hooks, ind); + + base_block_t *block = base_block_alloc(tsdn, NULL, &fake_ehooks, ind, + &pind_last, &extent_sn_next, sizeof(base_t), QUANTUM); + if (block == NULL) { + return NULL; + } + + size_t gap_size; + size_t base_alignment = CACHELINE; + size_t base_size = ALIGNMENT_CEILING(sizeof(base_t), base_alignment); + base_t *base = (base_t *)base_extent_bump_alloc_helper(&block->edata, + &gap_size, base_size, base_alignment); + ehooks_init(&base->ehooks, (extent_hooks_t *)extent_hooks, ind); + ehooks_init(&base->ehooks_base, metadata_use_hooks ? + (extent_hooks_t *)extent_hooks : + (extent_hooks_t *)&ehooks_default_extent_hooks, ind); + if (malloc_mutex_init(&base->mtx, "base", WITNESS_RANK_BASE, + malloc_mutex_rank_exclusive)) { + base_unmap(tsdn, &fake_ehooks, ind, block, block->size); + return NULL; + } + base->pind_last = pind_last; + base->extent_sn_next = extent_sn_next; + base->blocks = block; + base->auto_thp_switched = false; + for (szind_t i = 0; i < SC_NSIZES; i++) { + edata_heap_new(&base->avail[i]); + } + if (config_stats) { + base->allocated = sizeof(base_block_t); + base->resident = PAGE_CEILING(sizeof(base_block_t)); + base->mapped = block->size; + base->n_thp = (opt_metadata_thp == metadata_thp_always) && + metadata_thp_madvise() ? HUGEPAGE_CEILING(sizeof(base_block_t)) + >> LG_HUGEPAGE : 0; + assert(base->allocated <= base->resident); + assert(base->resident <= base->mapped); + assert(base->n_thp << LG_HUGEPAGE <= base->mapped); + } + base_extent_bump_alloc_post(base, &block->edata, gap_size, base, + base_size); + + return base; +} + +void +base_delete(tsdn_t *tsdn, base_t *base) { + ehooks_t *ehooks = base_ehooks_get_for_metadata(base); + base_block_t *next = base->blocks; + do { + base_block_t *block = next; + next = block->next; + base_unmap(tsdn, ehooks, base_ind_get(base), block, + block->size); + } while (next != NULL); +} + +ehooks_t * +base_ehooks_get(base_t *base) { + return &base->ehooks; +} + +ehooks_t * +base_ehooks_get_for_metadata(base_t *base) { + return &base->ehooks_base; +} + +extent_hooks_t * +base_extent_hooks_set(base_t *base, extent_hooks_t *extent_hooks) { + extent_hooks_t *old_extent_hooks = + ehooks_get_extent_hooks_ptr(&base->ehooks); + ehooks_init(&base->ehooks, extent_hooks, ehooks_ind_get(&base->ehooks)); + return old_extent_hooks; +} + +static void * +base_alloc_impl(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment, + size_t *esn) { + alignment = QUANTUM_CEILING(alignment); + size_t usize = ALIGNMENT_CEILING(size, alignment); + size_t asize = usize + alignment - QUANTUM; + + edata_t *edata = NULL; + malloc_mutex_lock(tsdn, &base->mtx); + for (szind_t i = sz_size2index(asize); i < SC_NSIZES; i++) { + edata = edata_heap_remove_first(&base->avail[i]); + if (edata != NULL) { + /* Use existing space. */ + break; + } + } + if (edata == NULL) { + /* Try to allocate more space. */ + edata = base_extent_alloc(tsdn, base, usize, alignment); + } + void *ret; + if (edata == NULL) { + ret = NULL; + goto label_return; + } + + ret = base_extent_bump_alloc(base, edata, usize, alignment); + if (esn != NULL) { + *esn = (size_t)edata_sn_get(edata); + } +label_return: + malloc_mutex_unlock(tsdn, &base->mtx); + return ret; +} + +/* + * base_alloc() returns zeroed memory, which is always demand-zeroed for the + * auto arenas, in order to make multi-page sparse data structures such as radix + * tree nodes efficient with respect to physical memory usage. Upon success a + * pointer to at least size bytes with specified alignment is returned. Note + * that size is rounded up to the nearest multiple of alignment to avoid false + * sharing. + */ +void * +base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment) { + return base_alloc_impl(tsdn, base, size, alignment, NULL); +} + +edata_t * +base_alloc_edata(tsdn_t *tsdn, base_t *base) { + size_t esn; + edata_t *edata = base_alloc_impl(tsdn, base, sizeof(edata_t), + EDATA_ALIGNMENT, &esn); + if (edata == NULL) { + return NULL; + } + edata_esn_set(edata, esn); + return edata; +} + +void +base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated, size_t *resident, + size_t *mapped, size_t *n_thp) { + cassert(config_stats); + + malloc_mutex_lock(tsdn, &base->mtx); + assert(base->allocated <= base->resident); + assert(base->resident <= base->mapped); + *allocated = base->allocated; + *resident = base->resident; + *mapped = base->mapped; + *n_thp = base->n_thp; + malloc_mutex_unlock(tsdn, &base->mtx); +} + +void +base_prefork(tsdn_t *tsdn, base_t *base) { + malloc_mutex_prefork(tsdn, &base->mtx); +} + +void +base_postfork_parent(tsdn_t *tsdn, base_t *base) { + malloc_mutex_postfork_parent(tsdn, &base->mtx); +} + +void +base_postfork_child(tsdn_t *tsdn, base_t *base) { + malloc_mutex_postfork_child(tsdn, &base->mtx); +} + +bool +base_boot(tsdn_t *tsdn) { + b0 = base_new(tsdn, 0, (extent_hooks_t *)&ehooks_default_extent_hooks, + /* metadata_use_hooks */ true); + return (b0 == NULL); +} diff --git a/BeefRT/JEMalloc/src/bin.c b/BeefRT/JEMalloc/src/bin.c new file mode 100644 index 00000000..fa204587 --- /dev/null +++ b/BeefRT/JEMalloc/src/bin.c @@ -0,0 +1,69 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/bin.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/witness.h" + +bool +bin_update_shard_size(unsigned bin_shard_sizes[SC_NBINS], size_t start_size, + size_t end_size, size_t nshards) { + if (nshards > BIN_SHARDS_MAX || nshards == 0) { + return true; + } + + if (start_size > SC_SMALL_MAXCLASS) { + return false; + } + if (end_size > SC_SMALL_MAXCLASS) { + end_size = SC_SMALL_MAXCLASS; + } + + /* Compute the index since this may happen before sz init. */ + szind_t ind1 = sz_size2index_compute(start_size); + szind_t ind2 = sz_size2index_compute(end_size); + for (unsigned i = ind1; i <= ind2; i++) { + bin_shard_sizes[i] = (unsigned)nshards; + } + + return false; +} + +void +bin_shard_sizes_boot(unsigned bin_shard_sizes[SC_NBINS]) { + /* Load the default number of shards. */ + for (unsigned i = 0; i < SC_NBINS; i++) { + bin_shard_sizes[i] = N_BIN_SHARDS_DEFAULT; + } +} + +bool +bin_init(bin_t *bin) { + if (malloc_mutex_init(&bin->lock, "bin", WITNESS_RANK_BIN, + malloc_mutex_rank_exclusive)) { + return true; + } + bin->slabcur = NULL; + edata_heap_new(&bin->slabs_nonfull); + edata_list_active_init(&bin->slabs_full); + if (config_stats) { + memset(&bin->stats, 0, sizeof(bin_stats_t)); + } + return false; +} + +void +bin_prefork(tsdn_t *tsdn, bin_t *bin) { + malloc_mutex_prefork(tsdn, &bin->lock); +} + +void +bin_postfork_parent(tsdn_t *tsdn, bin_t *bin) { + malloc_mutex_postfork_parent(tsdn, &bin->lock); +} + +void +bin_postfork_child(tsdn_t *tsdn, bin_t *bin) { + malloc_mutex_postfork_child(tsdn, &bin->lock); +} diff --git a/BeefRT/JEMalloc/src/bin_info.c b/BeefRT/JEMalloc/src/bin_info.c new file mode 100644 index 00000000..8629ef88 --- /dev/null +++ b/BeefRT/JEMalloc/src/bin_info.c @@ -0,0 +1,30 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/bin_info.h" + +bin_info_t bin_infos[SC_NBINS]; + +static void +bin_infos_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], + bin_info_t infos[SC_NBINS]) { + for (unsigned i = 0; i < SC_NBINS; i++) { + bin_info_t *bin_info = &infos[i]; + sc_t *sc = &sc_data->sc[i]; + bin_info->reg_size = ((size_t)1U << sc->lg_base) + + ((size_t)sc->ndelta << sc->lg_delta); + bin_info->slab_size = (sc->pgs << LG_PAGE); + bin_info->nregs = + (uint32_t)(bin_info->slab_size / bin_info->reg_size); + bin_info->n_shards = bin_shard_sizes[i]; + bitmap_info_t bitmap_info = BITMAP_INFO_INITIALIZER( + bin_info->nregs); + bin_info->bitmap_info = bitmap_info; + } +} + +void +bin_info_boot(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) { + assert(sc_data->initialized); + bin_infos_init(sc_data, bin_shard_sizes, bin_infos); +} diff --git a/BeefRT/JEMalloc/src/bitmap.c b/BeefRT/JEMalloc/src/bitmap.c new file mode 100644 index 00000000..0ccedc5d --- /dev/null +++ b/BeefRT/JEMalloc/src/bitmap.c @@ -0,0 +1,120 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" + +/******************************************************************************/ + +#ifdef BITMAP_USE_TREE + +void +bitmap_info_init(bitmap_info_t *binfo, size_t nbits) { + unsigned i; + size_t group_count; + + assert(nbits > 0); + assert(nbits <= (ZU(1) << LG_BITMAP_MAXBITS)); + + /* + * Compute the number of groups necessary to store nbits bits, and + * progressively work upward through the levels until reaching a level + * that requires only one group. + */ + binfo->levels[0].group_offset = 0; + group_count = BITMAP_BITS2GROUPS(nbits); + for (i = 1; group_count > 1; i++) { + assert(i < BITMAP_MAX_LEVELS); + binfo->levels[i].group_offset = binfo->levels[i-1].group_offset + + group_count; + group_count = BITMAP_BITS2GROUPS(group_count); + } + binfo->levels[i].group_offset = binfo->levels[i-1].group_offset + + group_count; + assert(binfo->levels[i].group_offset <= BITMAP_GROUPS_MAX); + binfo->nlevels = i; + binfo->nbits = nbits; +} + +static size_t +bitmap_info_ngroups(const bitmap_info_t *binfo) { + return binfo->levels[binfo->nlevels].group_offset; +} + +void +bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo, bool fill) { + size_t extra; + unsigned i; + + /* + * Bits are actually inverted with regard to the external bitmap + * interface. + */ + + if (fill) { + /* The "filled" bitmap starts out with all 0 bits. */ + memset(bitmap, 0, bitmap_size(binfo)); + return; + } + + /* + * The "empty" bitmap starts out with all 1 bits, except for trailing + * unused bits (if any). Note that each group uses bit 0 to correspond + * to the first logical bit in the group, so extra bits are the most + * significant bits of the last group. + */ + memset(bitmap, 0xffU, bitmap_size(binfo)); + extra = (BITMAP_GROUP_NBITS - (binfo->nbits & BITMAP_GROUP_NBITS_MASK)) + & BITMAP_GROUP_NBITS_MASK; + if (extra != 0) { + bitmap[binfo->levels[1].group_offset - 1] >>= extra; + } + for (i = 1; i < binfo->nlevels; i++) { + size_t group_count = binfo->levels[i].group_offset - + binfo->levels[i-1].group_offset; + extra = (BITMAP_GROUP_NBITS - (group_count & + BITMAP_GROUP_NBITS_MASK)) & BITMAP_GROUP_NBITS_MASK; + if (extra != 0) { + bitmap[binfo->levels[i+1].group_offset - 1] >>= extra; + } + } +} + +#else /* BITMAP_USE_TREE */ + +void +bitmap_info_init(bitmap_info_t *binfo, size_t nbits) { + assert(nbits > 0); + assert(nbits <= (ZU(1) << LG_BITMAP_MAXBITS)); + + binfo->ngroups = BITMAP_BITS2GROUPS(nbits); + binfo->nbits = nbits; +} + +static size_t +bitmap_info_ngroups(const bitmap_info_t *binfo) { + return binfo->ngroups; +} + +void +bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo, bool fill) { + size_t extra; + + if (fill) { + memset(bitmap, 0, bitmap_size(binfo)); + return; + } + + memset(bitmap, 0xffU, bitmap_size(binfo)); + extra = (BITMAP_GROUP_NBITS - (binfo->nbits & BITMAP_GROUP_NBITS_MASK)) + & BITMAP_GROUP_NBITS_MASK; + if (extra != 0) { + bitmap[binfo->ngroups - 1] >>= extra; + } +} + +#endif /* BITMAP_USE_TREE */ + +size_t +bitmap_size(const bitmap_info_t *binfo) { + return (bitmap_info_ngroups(binfo) << LG_SIZEOF_BITMAP); +} diff --git a/BeefRT/JEMalloc/src/buf_writer.c b/BeefRT/JEMalloc/src/buf_writer.c new file mode 100644 index 00000000..7c6f7940 --- /dev/null +++ b/BeefRT/JEMalloc/src/buf_writer.c @@ -0,0 +1,144 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/buf_writer.h" +#include "jemalloc/internal/malloc_io.h" + +static void * +buf_writer_allocate_internal_buf(tsdn_t *tsdn, size_t buf_len) { +#ifdef JEMALLOC_JET + if (buf_len > SC_LARGE_MAXCLASS) { + return NULL; + } +#else + assert(buf_len <= SC_LARGE_MAXCLASS); +#endif + return iallocztm(tsdn, buf_len, sz_size2index(buf_len), false, NULL, + true, arena_get(tsdn, 0, false), true); +} + +static void +buf_writer_free_internal_buf(tsdn_t *tsdn, void *buf) { + if (buf != NULL) { + idalloctm(tsdn, buf, NULL, NULL, true, true); + } +} + +static void +buf_writer_assert(buf_writer_t *buf_writer) { + assert(buf_writer != NULL); + assert(buf_writer->write_cb != NULL); + if (buf_writer->buf != NULL) { + assert(buf_writer->buf_size > 0); + } else { + assert(buf_writer->buf_size == 0); + assert(buf_writer->internal_buf); + } + assert(buf_writer->buf_end <= buf_writer->buf_size); +} + +bool +buf_writer_init(tsdn_t *tsdn, buf_writer_t *buf_writer, write_cb_t *write_cb, + void *cbopaque, char *buf, size_t buf_len) { + if (write_cb != NULL) { + buf_writer->write_cb = write_cb; + } else { + buf_writer->write_cb = je_malloc_message != NULL ? + je_malloc_message : wrtmessage; + } + buf_writer->cbopaque = cbopaque; + assert(buf_len >= 2); + if (buf != NULL) { + buf_writer->buf = buf; + buf_writer->internal_buf = false; + } else { + buf_writer->buf = buf_writer_allocate_internal_buf(tsdn, + buf_len); + buf_writer->internal_buf = true; + } + if (buf_writer->buf != NULL) { + buf_writer->buf_size = buf_len - 1; /* Allowing for '\0'. */ + } else { + buf_writer->buf_size = 0; + } + buf_writer->buf_end = 0; + buf_writer_assert(buf_writer); + return buf_writer->buf == NULL; +} + +void +buf_writer_flush(buf_writer_t *buf_writer) { + buf_writer_assert(buf_writer); + if (buf_writer->buf == NULL) { + return; + } + buf_writer->buf[buf_writer->buf_end] = '\0'; + buf_writer->write_cb(buf_writer->cbopaque, buf_writer->buf); + buf_writer->buf_end = 0; + buf_writer_assert(buf_writer); +} + +void +buf_writer_cb(void *buf_writer_arg, const char *s) { + buf_writer_t *buf_writer = (buf_writer_t *)buf_writer_arg; + buf_writer_assert(buf_writer); + if (buf_writer->buf == NULL) { + buf_writer->write_cb(buf_writer->cbopaque, s); + return; + } + size_t i, slen, n; + for (i = 0, slen = strlen(s); i < slen; i += n) { + if (buf_writer->buf_end == buf_writer->buf_size) { + buf_writer_flush(buf_writer); + } + size_t s_remain = slen - i; + size_t buf_remain = buf_writer->buf_size - buf_writer->buf_end; + n = s_remain < buf_remain ? s_remain : buf_remain; + memcpy(buf_writer->buf + buf_writer->buf_end, s + i, n); + buf_writer->buf_end += n; + buf_writer_assert(buf_writer); + } + assert(i == slen); +} + +void +buf_writer_terminate(tsdn_t *tsdn, buf_writer_t *buf_writer) { + buf_writer_assert(buf_writer); + buf_writer_flush(buf_writer); + if (buf_writer->internal_buf) { + buf_writer_free_internal_buf(tsdn, buf_writer->buf); + } +} + +void +buf_writer_pipe(buf_writer_t *buf_writer, read_cb_t *read_cb, + void *read_cbopaque) { + /* + * A tiny local buffer in case the buffered writer failed to allocate + * at init. + */ + static char backup_buf[16]; + static buf_writer_t backup_buf_writer; + + buf_writer_assert(buf_writer); + assert(read_cb != NULL); + if (buf_writer->buf == NULL) { + buf_writer_init(TSDN_NULL, &backup_buf_writer, + buf_writer->write_cb, buf_writer->cbopaque, backup_buf, + sizeof(backup_buf)); + buf_writer = &backup_buf_writer; + } + assert(buf_writer->buf != NULL); + ssize_t nread = 0; + do { + buf_writer->buf_end += nread; + buf_writer_assert(buf_writer); + if (buf_writer->buf_end == buf_writer->buf_size) { + buf_writer_flush(buf_writer); + } + nread = read_cb(read_cbopaque, + buf_writer->buf + buf_writer->buf_end, + buf_writer->buf_size - buf_writer->buf_end); + } while (nread > 0); + buf_writer_flush(buf_writer); +} diff --git a/BeefRT/JEMalloc/src/cache_bin.c b/BeefRT/JEMalloc/src/cache_bin.c new file mode 100644 index 00000000..9ae072a0 --- /dev/null +++ b/BeefRT/JEMalloc/src/cache_bin.c @@ -0,0 +1,99 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/bit_util.h" +#include "jemalloc/internal/cache_bin.h" +#include "jemalloc/internal/safety_check.h" + +void +cache_bin_info_init(cache_bin_info_t *info, + cache_bin_sz_t ncached_max) { + assert(ncached_max <= CACHE_BIN_NCACHED_MAX); + size_t stack_size = (size_t)ncached_max * sizeof(void *); + assert(stack_size < ((size_t)1 << (sizeof(cache_bin_sz_t) * 8))); + info->ncached_max = (cache_bin_sz_t)ncached_max; +} + +void +cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos, + size_t *size, size_t *alignment) { + /* For the total bin stack region (per tcache), reserve 2 more slots so + * that + * 1) the empty position can be safely read on the fast path before + * checking "is_empty"; and + * 2) the cur_ptr can go beyond the empty position by 1 step safely on + * the fast path (i.e. no overflow). + */ + *size = sizeof(void *) * 2; + for (szind_t i = 0; i < ninfos; i++) { + assert(infos[i].ncached_max > 0); + *size += infos[i].ncached_max * sizeof(void *); + } + + /* + * Align to at least PAGE, to minimize the # of TLBs needed by the + * smaller sizes; also helps if the larger sizes don't get used at all. + */ + *alignment = PAGE; +} + +void +cache_bin_preincrement(cache_bin_info_t *infos, szind_t ninfos, void *alloc, + size_t *cur_offset) { + if (config_debug) { + size_t computed_size; + size_t computed_alignment; + + /* Pointer should be as aligned as we asked for. */ + cache_bin_info_compute_alloc(infos, ninfos, &computed_size, + &computed_alignment); + assert(((uintptr_t)alloc & (computed_alignment - 1)) == 0); + } + + *(uintptr_t *)((uintptr_t)alloc + *cur_offset) = + cache_bin_preceding_junk; + *cur_offset += sizeof(void *); +} + +void +cache_bin_postincrement(cache_bin_info_t *infos, szind_t ninfos, void *alloc, + size_t *cur_offset) { + *(uintptr_t *)((uintptr_t)alloc + *cur_offset) = + cache_bin_trailing_junk; + *cur_offset += sizeof(void *); +} + +void +cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc, + size_t *cur_offset) { + /* + * The full_position points to the lowest available space. Allocations + * will access the slots toward higher addresses (for the benefit of + * adjacent prefetch). + */ + void *stack_cur = (void *)((uintptr_t)alloc + *cur_offset); + void *full_position = stack_cur; + uint16_t bin_stack_size = info->ncached_max * sizeof(void *); + + *cur_offset += bin_stack_size; + void *empty_position = (void *)((uintptr_t)alloc + *cur_offset); + + /* Init to the empty position. */ + bin->stack_head = (void **)empty_position; + bin->low_bits_low_water = (uint16_t)(uintptr_t)bin->stack_head; + bin->low_bits_full = (uint16_t)(uintptr_t)full_position; + bin->low_bits_empty = (uint16_t)(uintptr_t)empty_position; + cache_bin_sz_t free_spots = cache_bin_diff(bin, + bin->low_bits_full, (uint16_t)(uintptr_t)bin->stack_head, + /* racy */ false); + assert(free_spots == bin_stack_size); + assert(cache_bin_ncached_get_local(bin, info) == 0); + assert(cache_bin_empty_position_get(bin) == empty_position); + + assert(bin_stack_size > 0 || empty_position == full_position); +} + +bool +cache_bin_still_zero_initialized(cache_bin_t *bin) { + return bin->stack_head == NULL; +} diff --git a/BeefRT/JEMalloc/src/ckh.c b/BeefRT/JEMalloc/src/ckh.c new file mode 100644 index 00000000..8db4319c --- /dev/null +++ b/BeefRT/JEMalloc/src/ckh.c @@ -0,0 +1,569 @@ +/* + ******************************************************************************* + * Implementation of (2^1+,2) cuckoo hashing, where 2^1+ indicates that each + * hash bucket contains 2^n cells, for n >= 1, and 2 indicates that two hash + * functions are employed. The original cuckoo hashing algorithm was described + * in: + * + * Pagh, R., F.F. Rodler (2004) Cuckoo Hashing. Journal of Algorithms + * 51(2):122-144. + * + * Generalization of cuckoo hashing was discussed in: + * + * Erlingsson, U., M. Manasse, F. McSherry (2006) A cool and practical + * alternative to traditional hash tables. In Proceedings of the 7th + * Workshop on Distributed Data and Structures (WDAS'06), Santa Clara, CA, + * January 2006. + * + * This implementation uses precisely two hash functions because that is the + * fewest that can work, and supporting multiple hashes is an implementation + * burden. Here is a reproduction of Figure 1 from Erlingsson et al. (2006) + * that shows approximate expected maximum load factors for various + * configurations: + * + * | #cells/bucket | + * #hashes | 1 | 2 | 4 | 8 | + * --------+-------+-------+-------+-------+ + * 1 | 0.006 | 0.006 | 0.03 | 0.12 | + * 2 | 0.49 | 0.86 |>0.93< |>0.96< | + * 3 | 0.91 | 0.97 | 0.98 | 0.999 | + * 4 | 0.97 | 0.99 | 0.999 | | + * + * The number of cells per bucket is chosen such that a bucket fits in one cache + * line. So, on 32- and 64-bit systems, we use (8,2) and (4,2) cuckoo hashing, + * respectively. + * + ******************************************************************************/ +#include "jemalloc/internal/jemalloc_preamble.h" + +#include "jemalloc/internal/ckh.h" + +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/hash.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/prng.h" +#include "jemalloc/internal/util.h" + +/******************************************************************************/ +/* Function prototypes for non-inline static functions. */ + +static bool ckh_grow(tsd_t *tsd, ckh_t *ckh); +static void ckh_shrink(tsd_t *tsd, ckh_t *ckh); + +/******************************************************************************/ + +/* + * Search bucket for key and return the cell number if found; SIZE_T_MAX + * otherwise. + */ +static size_t +ckh_bucket_search(ckh_t *ckh, size_t bucket, const void *key) { + ckhc_t *cell; + unsigned i; + + for (i = 0; i < (ZU(1) << LG_CKH_BUCKET_CELLS); i++) { + cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + i]; + if (cell->key != NULL && ckh->keycomp(key, cell->key)) { + return (bucket << LG_CKH_BUCKET_CELLS) + i; + } + } + + return SIZE_T_MAX; +} + +/* + * Search table for key and return cell number if found; SIZE_T_MAX otherwise. + */ +static size_t +ckh_isearch(ckh_t *ckh, const void *key) { + size_t hashes[2], bucket, cell; + + assert(ckh != NULL); + + ckh->hash(key, hashes); + + /* Search primary bucket. */ + bucket = hashes[0] & ((ZU(1) << ckh->lg_curbuckets) - 1); + cell = ckh_bucket_search(ckh, bucket, key); + if (cell != SIZE_T_MAX) { + return cell; + } + + /* Search secondary bucket. */ + bucket = hashes[1] & ((ZU(1) << ckh->lg_curbuckets) - 1); + cell = ckh_bucket_search(ckh, bucket, key); + return cell; +} + +static bool +ckh_try_bucket_insert(ckh_t *ckh, size_t bucket, const void *key, + const void *data) { + ckhc_t *cell; + unsigned offset, i; + + /* + * Cycle through the cells in the bucket, starting at a random position. + * The randomness avoids worst-case search overhead as buckets fill up. + */ + offset = (unsigned)prng_lg_range_u64(&ckh->prng_state, + LG_CKH_BUCKET_CELLS); + for (i = 0; i < (ZU(1) << LG_CKH_BUCKET_CELLS); i++) { + cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + + ((i + offset) & ((ZU(1) << LG_CKH_BUCKET_CELLS) - 1))]; + if (cell->key == NULL) { + cell->key = key; + cell->data = data; + ckh->count++; + return false; + } + } + + return true; +} + +/* + * No space is available in bucket. Randomly evict an item, then try to find an + * alternate location for that item. Iteratively repeat this + * eviction/relocation procedure until either success or detection of an + * eviction/relocation bucket cycle. + */ +static bool +ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey, + void const **argdata) { + const void *key, *data, *tkey, *tdata; + ckhc_t *cell; + size_t hashes[2], bucket, tbucket; + unsigned i; + + bucket = argbucket; + key = *argkey; + data = *argdata; + while (true) { + /* + * Choose a random item within the bucket to evict. This is + * critical to correct function, because without (eventually) + * evicting all items within a bucket during iteration, it + * would be possible to get stuck in an infinite loop if there + * were an item for which both hashes indicated the same + * bucket. + */ + i = (unsigned)prng_lg_range_u64(&ckh->prng_state, + LG_CKH_BUCKET_CELLS); + cell = &ckh->tab[(bucket << LG_CKH_BUCKET_CELLS) + i]; + assert(cell->key != NULL); + + /* Swap cell->{key,data} and {key,data} (evict). */ + tkey = cell->key; tdata = cell->data; + cell->key = key; cell->data = data; + key = tkey; data = tdata; + +#ifdef CKH_COUNT + ckh->nrelocs++; +#endif + + /* Find the alternate bucket for the evicted item. */ + ckh->hash(key, hashes); + tbucket = hashes[1] & ((ZU(1) << ckh->lg_curbuckets) - 1); + if (tbucket == bucket) { + tbucket = hashes[0] & ((ZU(1) << ckh->lg_curbuckets) + - 1); + /* + * It may be that (tbucket == bucket) still, if the + * item's hashes both indicate this bucket. However, + * we are guaranteed to eventually escape this bucket + * during iteration, assuming pseudo-random item + * selection (true randomness would make infinite + * looping a remote possibility). The reason we can + * never get trapped forever is that there are two + * cases: + * + * 1) This bucket == argbucket, so we will quickly + * detect an eviction cycle and terminate. + * 2) An item was evicted to this bucket from another, + * which means that at least one item in this bucket + * has hashes that indicate distinct buckets. + */ + } + /* Check for a cycle. */ + if (tbucket == argbucket) { + *argkey = key; + *argdata = data; + return true; + } + + bucket = tbucket; + if (!ckh_try_bucket_insert(ckh, bucket, key, data)) { + return false; + } + } +} + +static bool +ckh_try_insert(ckh_t *ckh, void const**argkey, void const**argdata) { + size_t hashes[2], bucket; + const void *key = *argkey; + const void *data = *argdata; + + ckh->hash(key, hashes); + + /* Try to insert in primary bucket. */ + bucket = hashes[0] & ((ZU(1) << ckh->lg_curbuckets) - 1); + if (!ckh_try_bucket_insert(ckh, bucket, key, data)) { + return false; + } + + /* Try to insert in secondary bucket. */ + bucket = hashes[1] & ((ZU(1) << ckh->lg_curbuckets) - 1); + if (!ckh_try_bucket_insert(ckh, bucket, key, data)) { + return false; + } + + /* + * Try to find a place for this item via iterative eviction/relocation. + */ + return ckh_evict_reloc_insert(ckh, bucket, argkey, argdata); +} + +/* + * Try to rebuild the hash table from scratch by inserting all items from the + * old table into the new. + */ +static bool +ckh_rebuild(ckh_t *ckh, ckhc_t *aTab) { + size_t count, i, nins; + const void *key, *data; + + count = ckh->count; + ckh->count = 0; + for (i = nins = 0; nins < count; i++) { + if (aTab[i].key != NULL) { + key = aTab[i].key; + data = aTab[i].data; + if (ckh_try_insert(ckh, &key, &data)) { + ckh->count = count; + return true; + } + nins++; + } + } + + return false; +} + +static bool +ckh_grow(tsd_t *tsd, ckh_t *ckh) { + bool ret; + ckhc_t *tab, *ttab; + unsigned lg_prevbuckets, lg_curcells; + +#ifdef CKH_COUNT + ckh->ngrows++; +#endif + + /* + * It is possible (though unlikely, given well behaved hashes) that the + * table will have to be doubled more than once in order to create a + * usable table. + */ + lg_prevbuckets = ckh->lg_curbuckets; + lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS; + while (true) { + size_t usize; + + lg_curcells++; + usize = sz_sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE); + if (unlikely(usize == 0 + || usize > SC_LARGE_MAXCLASS)) { + ret = true; + goto label_return; + } + tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, + true, NULL, true, arena_ichoose(tsd, NULL)); + if (tab == NULL) { + ret = true; + goto label_return; + } + /* Swap in new table. */ + ttab = ckh->tab; + ckh->tab = tab; + tab = ttab; + ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; + + if (!ckh_rebuild(ckh, tab)) { + idalloctm(tsd_tsdn(tsd), tab, NULL, NULL, true, true); + break; + } + + /* Rebuilding failed, so back out partially rebuilt table. */ + idalloctm(tsd_tsdn(tsd), ckh->tab, NULL, NULL, true, true); + ckh->tab = tab; + ckh->lg_curbuckets = lg_prevbuckets; + } + + ret = false; +label_return: + return ret; +} + +static void +ckh_shrink(tsd_t *tsd, ckh_t *ckh) { + ckhc_t *tab, *ttab; + size_t usize; + unsigned lg_prevbuckets, lg_curcells; + + /* + * It is possible (though unlikely, given well behaved hashes) that the + * table rebuild will fail. + */ + lg_prevbuckets = ckh->lg_curbuckets; + lg_curcells = ckh->lg_curbuckets + LG_CKH_BUCKET_CELLS - 1; + usize = sz_sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE); + if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { + return; + } + tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, true, NULL, + true, arena_ichoose(tsd, NULL)); + if (tab == NULL) { + /* + * An OOM error isn't worth propagating, since it doesn't + * prevent this or future operations from proceeding. + */ + return; + } + /* Swap in new table. */ + ttab = ckh->tab; + ckh->tab = tab; + tab = ttab; + ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; + + if (!ckh_rebuild(ckh, tab)) { + idalloctm(tsd_tsdn(tsd), tab, NULL, NULL, true, true); +#ifdef CKH_COUNT + ckh->nshrinks++; +#endif + return; + } + + /* Rebuilding failed, so back out partially rebuilt table. */ + idalloctm(tsd_tsdn(tsd), ckh->tab, NULL, NULL, true, true); + ckh->tab = tab; + ckh->lg_curbuckets = lg_prevbuckets; +#ifdef CKH_COUNT + ckh->nshrinkfails++; +#endif +} + +bool +ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *ckh_hash, + ckh_keycomp_t *keycomp) { + bool ret; + size_t mincells, usize; + unsigned lg_mincells; + + assert(minitems > 0); + assert(ckh_hash != NULL); + assert(keycomp != NULL); + +#ifdef CKH_COUNT + ckh->ngrows = 0; + ckh->nshrinks = 0; + ckh->nshrinkfails = 0; + ckh->ninserts = 0; + ckh->nrelocs = 0; +#endif + ckh->prng_state = 42; /* Value doesn't really matter. */ + ckh->count = 0; + + /* + * Find the minimum power of 2 that is large enough to fit minitems + * entries. We are using (2+,2) cuckoo hashing, which has an expected + * maximum load factor of at least ~0.86, so 0.75 is a conservative load + * factor that will typically allow mincells items to fit without ever + * growing the table. + */ + assert(LG_CKH_BUCKET_CELLS > 0); + mincells = ((minitems + (3 - (minitems % 3))) / 3) << 2; + for (lg_mincells = LG_CKH_BUCKET_CELLS; + (ZU(1) << lg_mincells) < mincells; + lg_mincells++) { + /* Do nothing. */ + } + ckh->lg_minbuckets = lg_mincells - LG_CKH_BUCKET_CELLS; + ckh->lg_curbuckets = lg_mincells - LG_CKH_BUCKET_CELLS; + ckh->hash = ckh_hash; + ckh->keycomp = keycomp; + + usize = sz_sa2u(sizeof(ckhc_t) << lg_mincells, CACHELINE); + if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) { + ret = true; + goto label_return; + } + ckh->tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, true, + NULL, true, arena_ichoose(tsd, NULL)); + if (ckh->tab == NULL) { + ret = true; + goto label_return; + } + + ret = false; +label_return: + return ret; +} + +void +ckh_delete(tsd_t *tsd, ckh_t *ckh) { + assert(ckh != NULL); + +#ifdef CKH_VERBOSE + malloc_printf( + "%s(%p): ngrows: %"FMTu64", nshrinks: %"FMTu64"," + " nshrinkfails: %"FMTu64", ninserts: %"FMTu64"," + " nrelocs: %"FMTu64"\n", __func__, ckh, + (unsigned long long)ckh->ngrows, + (unsigned long long)ckh->nshrinks, + (unsigned long long)ckh->nshrinkfails, + (unsigned long long)ckh->ninserts, + (unsigned long long)ckh->nrelocs); +#endif + + idalloctm(tsd_tsdn(tsd), ckh->tab, NULL, NULL, true, true); + if (config_debug) { + memset(ckh, JEMALLOC_FREE_JUNK, sizeof(ckh_t)); + } +} + +size_t +ckh_count(ckh_t *ckh) { + assert(ckh != NULL); + + return ckh->count; +} + +bool +ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data) { + size_t i, ncells; + + for (i = *tabind, ncells = (ZU(1) << (ckh->lg_curbuckets + + LG_CKH_BUCKET_CELLS)); i < ncells; i++) { + if (ckh->tab[i].key != NULL) { + if (key != NULL) { + *key = (void *)ckh->tab[i].key; + } + if (data != NULL) { + *data = (void *)ckh->tab[i].data; + } + *tabind = i + 1; + return false; + } + } + + return true; +} + +bool +ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data) { + bool ret; + + assert(ckh != NULL); + assert(ckh_search(ckh, key, NULL, NULL)); + +#ifdef CKH_COUNT + ckh->ninserts++; +#endif + + while (ckh_try_insert(ckh, &key, &data)) { + if (ckh_grow(tsd, ckh)) { + ret = true; + goto label_return; + } + } + + ret = false; +label_return: + return ret; +} + +bool +ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key, + void **data) { + size_t cell; + + assert(ckh != NULL); + + cell = ckh_isearch(ckh, searchkey); + if (cell != SIZE_T_MAX) { + if (key != NULL) { + *key = (void *)ckh->tab[cell].key; + } + if (data != NULL) { + *data = (void *)ckh->tab[cell].data; + } + ckh->tab[cell].key = NULL; + ckh->tab[cell].data = NULL; /* Not necessary. */ + + ckh->count--; + /* Try to halve the table if it is less than 1/4 full. */ + if (ckh->count < (ZU(1) << (ckh->lg_curbuckets + + LG_CKH_BUCKET_CELLS - 2)) && ckh->lg_curbuckets + > ckh->lg_minbuckets) { + /* Ignore error due to OOM. */ + ckh_shrink(tsd, ckh); + } + + return false; + } + + return true; +} + +bool +ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data) { + size_t cell; + + assert(ckh != NULL); + + cell = ckh_isearch(ckh, searchkey); + if (cell != SIZE_T_MAX) { + if (key != NULL) { + *key = (void *)ckh->tab[cell].key; + } + if (data != NULL) { + *data = (void *)ckh->tab[cell].data; + } + return false; + } + + return true; +} + +void +ckh_string_hash(const void *key, size_t r_hash[2]) { + hash(key, strlen((const char *)key), 0x94122f33U, r_hash); +} + +bool +ckh_string_keycomp(const void *k1, const void *k2) { + assert(k1 != NULL); + assert(k2 != NULL); + + return !strcmp((char *)k1, (char *)k2); +} + +void +ckh_pointer_hash(const void *key, size_t r_hash[2]) { + union { + const void *v; + size_t i; + } u; + + assert(sizeof(u.v) == sizeof(u.i)); + u.v = key; + hash(&u.i, sizeof(u.i), 0xd983396eU, r_hash); +} + +bool +ckh_pointer_keycomp(const void *k1, const void *k2) { + return (k1 == k2); +} diff --git a/BeefRT/JEMalloc/src/counter.c b/BeefRT/JEMalloc/src/counter.c new file mode 100644 index 00000000..8f1ae3af --- /dev/null +++ b/BeefRT/JEMalloc/src/counter.c @@ -0,0 +1,30 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/counter.h" + +bool +counter_accum_init(counter_accum_t *counter, uint64_t interval) { + if (LOCKEDINT_MTX_INIT(counter->mtx, "counter_accum", + WITNESS_RANK_COUNTER_ACCUM, malloc_mutex_rank_exclusive)) { + return true; + } + locked_init_u64_unsynchronized(&counter->accumbytes, 0); + counter->interval = interval; + return false; +} + +void +counter_prefork(tsdn_t *tsdn, counter_accum_t *counter) { + LOCKEDINT_MTX_PREFORK(tsdn, counter->mtx); +} + +void +counter_postfork_parent(tsdn_t *tsdn, counter_accum_t *counter) { + LOCKEDINT_MTX_POSTFORK_PARENT(tsdn, counter->mtx); +} + +void +counter_postfork_child(tsdn_t *tsdn, counter_accum_t *counter) { + LOCKEDINT_MTX_POSTFORK_CHILD(tsdn, counter->mtx); +} diff --git a/BeefRT/JEMalloc/src/ctl.c b/BeefRT/JEMalloc/src/ctl.c new file mode 100644 index 00000000..135271ba --- /dev/null +++ b/BeefRT/JEMalloc/src/ctl.c @@ -0,0 +1,4414 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/ctl.h" +#include "jemalloc/internal/extent_dss.h" +#include "jemalloc/internal/extent_mmap.h" +#include "jemalloc/internal/inspect.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/nstime.h" +#include "jemalloc/internal/peak_event.h" +#include "jemalloc/internal/prof_data.h" +#include "jemalloc/internal/prof_log.h" +#include "jemalloc/internal/prof_recent.h" +#include "jemalloc/internal/prof_stats.h" +#include "jemalloc/internal/prof_sys.h" +#include "jemalloc/internal/safety_check.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/util.h" + +/******************************************************************************/ +/* Data. */ + +/* + * ctl_mtx protects the following: + * - ctl_stats->* + */ +static malloc_mutex_t ctl_mtx; +static bool ctl_initialized; +static ctl_stats_t *ctl_stats; +static ctl_arenas_t *ctl_arenas; + +/******************************************************************************/ +/* Helpers for named and indexed nodes. */ + +static const ctl_named_node_t * +ctl_named_node(const ctl_node_t *node) { + return ((node->named) ? (const ctl_named_node_t *)node : NULL); +} + +static const ctl_named_node_t * +ctl_named_children(const ctl_named_node_t *node, size_t index) { + const ctl_named_node_t *children = ctl_named_node(node->children); + + return (children ? &children[index] : NULL); +} + +static const ctl_indexed_node_t * +ctl_indexed_node(const ctl_node_t *node) { + return (!node->named ? (const ctl_indexed_node_t *)node : NULL); +} + +/******************************************************************************/ +/* Function prototypes for non-inline static functions. */ + +#define CTL_PROTO(n) \ +static int n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \ + void *oldp, size_t *oldlenp, void *newp, size_t newlen); + +#define INDEX_PROTO(n) \ +static const ctl_named_node_t *n##_index(tsdn_t *tsdn, \ + const size_t *mib, size_t miblen, size_t i); + +CTL_PROTO(version) +CTL_PROTO(epoch) +CTL_PROTO(background_thread) +CTL_PROTO(max_background_threads) +CTL_PROTO(thread_tcache_enabled) +CTL_PROTO(thread_tcache_flush) +CTL_PROTO(thread_peak_read) +CTL_PROTO(thread_peak_reset) +CTL_PROTO(thread_prof_name) +CTL_PROTO(thread_prof_active) +CTL_PROTO(thread_arena) +CTL_PROTO(thread_allocated) +CTL_PROTO(thread_allocatedp) +CTL_PROTO(thread_deallocated) +CTL_PROTO(thread_deallocatedp) +CTL_PROTO(thread_idle) +CTL_PROTO(config_cache_oblivious) +CTL_PROTO(config_debug) +CTL_PROTO(config_fill) +CTL_PROTO(config_lazy_lock) +CTL_PROTO(config_malloc_conf) +CTL_PROTO(config_opt_safety_checks) +CTL_PROTO(config_prof) +CTL_PROTO(config_prof_libgcc) +CTL_PROTO(config_prof_libunwind) +CTL_PROTO(config_stats) +CTL_PROTO(config_utrace) +CTL_PROTO(config_xmalloc) +CTL_PROTO(opt_abort) +CTL_PROTO(opt_abort_conf) +CTL_PROTO(opt_cache_oblivious) +CTL_PROTO(opt_trust_madvise) +CTL_PROTO(opt_confirm_conf) +CTL_PROTO(opt_hpa) +CTL_PROTO(opt_hpa_slab_max_alloc) +CTL_PROTO(opt_hpa_hugification_threshold) +CTL_PROTO(opt_hpa_hugify_delay_ms) +CTL_PROTO(opt_hpa_min_purge_interval_ms) +CTL_PROTO(opt_hpa_dirty_mult) +CTL_PROTO(opt_hpa_sec_nshards) +CTL_PROTO(opt_hpa_sec_max_alloc) +CTL_PROTO(opt_hpa_sec_max_bytes) +CTL_PROTO(opt_hpa_sec_bytes_after_flush) +CTL_PROTO(opt_hpa_sec_batch_fill_extra) +CTL_PROTO(opt_metadata_thp) +CTL_PROTO(opt_retain) +CTL_PROTO(opt_dss) +CTL_PROTO(opt_narenas) +CTL_PROTO(opt_percpu_arena) +CTL_PROTO(opt_oversize_threshold) +CTL_PROTO(opt_background_thread) +CTL_PROTO(opt_mutex_max_spin) +CTL_PROTO(opt_max_background_threads) +CTL_PROTO(opt_dirty_decay_ms) +CTL_PROTO(opt_muzzy_decay_ms) +CTL_PROTO(opt_stats_print) +CTL_PROTO(opt_stats_print_opts) +CTL_PROTO(opt_stats_interval) +CTL_PROTO(opt_stats_interval_opts) +CTL_PROTO(opt_junk) +CTL_PROTO(opt_zero) +CTL_PROTO(opt_utrace) +CTL_PROTO(opt_xmalloc) +CTL_PROTO(opt_experimental_infallible_new) +CTL_PROTO(opt_tcache) +CTL_PROTO(opt_tcache_max) +CTL_PROTO(opt_tcache_nslots_small_min) +CTL_PROTO(opt_tcache_nslots_small_max) +CTL_PROTO(opt_tcache_nslots_large) +CTL_PROTO(opt_lg_tcache_nslots_mul) +CTL_PROTO(opt_tcache_gc_incr_bytes) +CTL_PROTO(opt_tcache_gc_delay_bytes) +CTL_PROTO(opt_lg_tcache_flush_small_div) +CTL_PROTO(opt_lg_tcache_flush_large_div) +CTL_PROTO(opt_thp) +CTL_PROTO(opt_lg_extent_max_active_fit) +CTL_PROTO(opt_prof) +CTL_PROTO(opt_prof_prefix) +CTL_PROTO(opt_prof_active) +CTL_PROTO(opt_prof_thread_active_init) +CTL_PROTO(opt_lg_prof_sample) +CTL_PROTO(opt_lg_prof_interval) +CTL_PROTO(opt_prof_gdump) +CTL_PROTO(opt_prof_final) +CTL_PROTO(opt_prof_leak) +CTL_PROTO(opt_prof_leak_error) +CTL_PROTO(opt_prof_accum) +CTL_PROTO(opt_prof_recent_alloc_max) +CTL_PROTO(opt_prof_stats) +CTL_PROTO(opt_prof_sys_thread_name) +CTL_PROTO(opt_prof_time_res) +CTL_PROTO(opt_lg_san_uaf_align) +CTL_PROTO(opt_zero_realloc) +CTL_PROTO(tcache_create) +CTL_PROTO(tcache_flush) +CTL_PROTO(tcache_destroy) +CTL_PROTO(arena_i_initialized) +CTL_PROTO(arena_i_decay) +CTL_PROTO(arena_i_purge) +CTL_PROTO(arena_i_reset) +CTL_PROTO(arena_i_destroy) +CTL_PROTO(arena_i_dss) +CTL_PROTO(arena_i_oversize_threshold) +CTL_PROTO(arena_i_dirty_decay_ms) +CTL_PROTO(arena_i_muzzy_decay_ms) +CTL_PROTO(arena_i_extent_hooks) +CTL_PROTO(arena_i_retain_grow_limit) +INDEX_PROTO(arena_i) +CTL_PROTO(arenas_bin_i_size) +CTL_PROTO(arenas_bin_i_nregs) +CTL_PROTO(arenas_bin_i_slab_size) +CTL_PROTO(arenas_bin_i_nshards) +INDEX_PROTO(arenas_bin_i) +CTL_PROTO(arenas_lextent_i_size) +INDEX_PROTO(arenas_lextent_i) +CTL_PROTO(arenas_narenas) +CTL_PROTO(arenas_dirty_decay_ms) +CTL_PROTO(arenas_muzzy_decay_ms) +CTL_PROTO(arenas_quantum) +CTL_PROTO(arenas_page) +CTL_PROTO(arenas_tcache_max) +CTL_PROTO(arenas_nbins) +CTL_PROTO(arenas_nhbins) +CTL_PROTO(arenas_nlextents) +CTL_PROTO(arenas_create) +CTL_PROTO(arenas_lookup) +CTL_PROTO(prof_thread_active_init) +CTL_PROTO(prof_active) +CTL_PROTO(prof_dump) +CTL_PROTO(prof_gdump) +CTL_PROTO(prof_prefix) +CTL_PROTO(prof_reset) +CTL_PROTO(prof_interval) +CTL_PROTO(lg_prof_sample) +CTL_PROTO(prof_log_start) +CTL_PROTO(prof_log_stop) +CTL_PROTO(prof_stats_bins_i_live) +CTL_PROTO(prof_stats_bins_i_accum) +INDEX_PROTO(prof_stats_bins_i) +CTL_PROTO(prof_stats_lextents_i_live) +CTL_PROTO(prof_stats_lextents_i_accum) +INDEX_PROTO(prof_stats_lextents_i) +CTL_PROTO(stats_arenas_i_small_allocated) +CTL_PROTO(stats_arenas_i_small_nmalloc) +CTL_PROTO(stats_arenas_i_small_ndalloc) +CTL_PROTO(stats_arenas_i_small_nrequests) +CTL_PROTO(stats_arenas_i_small_nfills) +CTL_PROTO(stats_arenas_i_small_nflushes) +CTL_PROTO(stats_arenas_i_large_allocated) +CTL_PROTO(stats_arenas_i_large_nmalloc) +CTL_PROTO(stats_arenas_i_large_ndalloc) +CTL_PROTO(stats_arenas_i_large_nrequests) +CTL_PROTO(stats_arenas_i_large_nfills) +CTL_PROTO(stats_arenas_i_large_nflushes) +CTL_PROTO(stats_arenas_i_bins_j_nmalloc) +CTL_PROTO(stats_arenas_i_bins_j_ndalloc) +CTL_PROTO(stats_arenas_i_bins_j_nrequests) +CTL_PROTO(stats_arenas_i_bins_j_curregs) +CTL_PROTO(stats_arenas_i_bins_j_nfills) +CTL_PROTO(stats_arenas_i_bins_j_nflushes) +CTL_PROTO(stats_arenas_i_bins_j_nslabs) +CTL_PROTO(stats_arenas_i_bins_j_nreslabs) +CTL_PROTO(stats_arenas_i_bins_j_curslabs) +CTL_PROTO(stats_arenas_i_bins_j_nonfull_slabs) +INDEX_PROTO(stats_arenas_i_bins_j) +CTL_PROTO(stats_arenas_i_lextents_j_nmalloc) +CTL_PROTO(stats_arenas_i_lextents_j_ndalloc) +CTL_PROTO(stats_arenas_i_lextents_j_nrequests) +CTL_PROTO(stats_arenas_i_lextents_j_curlextents) +INDEX_PROTO(stats_arenas_i_lextents_j) +CTL_PROTO(stats_arenas_i_extents_j_ndirty) +CTL_PROTO(stats_arenas_i_extents_j_nmuzzy) +CTL_PROTO(stats_arenas_i_extents_j_nretained) +CTL_PROTO(stats_arenas_i_extents_j_dirty_bytes) +CTL_PROTO(stats_arenas_i_extents_j_muzzy_bytes) +CTL_PROTO(stats_arenas_i_extents_j_retained_bytes) +INDEX_PROTO(stats_arenas_i_extents_j) +CTL_PROTO(stats_arenas_i_hpa_shard_npurge_passes) +CTL_PROTO(stats_arenas_i_hpa_shard_npurges) +CTL_PROTO(stats_arenas_i_hpa_shard_nhugifies) +CTL_PROTO(stats_arenas_i_hpa_shard_ndehugifies) + +/* We have a set of stats for full slabs. */ +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_npageslabs_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_npageslabs_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_nactive_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_nactive_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_ndirty_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_full_slabs_ndirty_huge) + +/* A parallel set for the empty slabs. */ +CTL_PROTO(stats_arenas_i_hpa_shard_empty_slabs_npageslabs_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_empty_slabs_npageslabs_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_empty_slabs_nactive_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_empty_slabs_nactive_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_empty_slabs_ndirty_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_empty_slabs_ndirty_huge) + +/* + * And one for the slabs that are neither empty nor full, but indexed by how + * full they are. + */ +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_huge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_ndirty_nonhuge) +CTL_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j_ndirty_huge) + +INDEX_PROTO(stats_arenas_i_hpa_shard_nonfull_slabs_j) +CTL_PROTO(stats_arenas_i_nthreads) +CTL_PROTO(stats_arenas_i_uptime) +CTL_PROTO(stats_arenas_i_dss) +CTL_PROTO(stats_arenas_i_dirty_decay_ms) +CTL_PROTO(stats_arenas_i_muzzy_decay_ms) +CTL_PROTO(stats_arenas_i_pactive) +CTL_PROTO(stats_arenas_i_pdirty) +CTL_PROTO(stats_arenas_i_pmuzzy) +CTL_PROTO(stats_arenas_i_mapped) +CTL_PROTO(stats_arenas_i_retained) +CTL_PROTO(stats_arenas_i_extent_avail) +CTL_PROTO(stats_arenas_i_dirty_npurge) +CTL_PROTO(stats_arenas_i_dirty_nmadvise) +CTL_PROTO(stats_arenas_i_dirty_purged) +CTL_PROTO(stats_arenas_i_muzzy_npurge) +CTL_PROTO(stats_arenas_i_muzzy_nmadvise) +CTL_PROTO(stats_arenas_i_muzzy_purged) +CTL_PROTO(stats_arenas_i_base) +CTL_PROTO(stats_arenas_i_internal) +CTL_PROTO(stats_arenas_i_metadata_thp) +CTL_PROTO(stats_arenas_i_tcache_bytes) +CTL_PROTO(stats_arenas_i_tcache_stashed_bytes) +CTL_PROTO(stats_arenas_i_resident) +CTL_PROTO(stats_arenas_i_abandoned_vm) +CTL_PROTO(stats_arenas_i_hpa_sec_bytes) +INDEX_PROTO(stats_arenas_i) +CTL_PROTO(stats_allocated) +CTL_PROTO(stats_active) +CTL_PROTO(stats_background_thread_num_threads) +CTL_PROTO(stats_background_thread_num_runs) +CTL_PROTO(stats_background_thread_run_interval) +CTL_PROTO(stats_metadata) +CTL_PROTO(stats_metadata_thp) +CTL_PROTO(stats_resident) +CTL_PROTO(stats_mapped) +CTL_PROTO(stats_retained) +CTL_PROTO(stats_zero_reallocs) +CTL_PROTO(experimental_hooks_install) +CTL_PROTO(experimental_hooks_remove) +CTL_PROTO(experimental_hooks_prof_backtrace) +CTL_PROTO(experimental_hooks_prof_dump) +CTL_PROTO(experimental_hooks_safety_check_abort) +CTL_PROTO(experimental_thread_activity_callback) +CTL_PROTO(experimental_utilization_query) +CTL_PROTO(experimental_utilization_batch_query) +CTL_PROTO(experimental_arenas_i_pactivep) +INDEX_PROTO(experimental_arenas_i) +CTL_PROTO(experimental_prof_recent_alloc_max) +CTL_PROTO(experimental_prof_recent_alloc_dump) +CTL_PROTO(experimental_batch_alloc) +CTL_PROTO(experimental_arenas_create_ext) + +#define MUTEX_STATS_CTL_PROTO_GEN(n) \ +CTL_PROTO(stats_##n##_num_ops) \ +CTL_PROTO(stats_##n##_num_wait) \ +CTL_PROTO(stats_##n##_num_spin_acq) \ +CTL_PROTO(stats_##n##_num_owner_switch) \ +CTL_PROTO(stats_##n##_total_wait_time) \ +CTL_PROTO(stats_##n##_max_wait_time) \ +CTL_PROTO(stats_##n##_max_num_thds) + +/* Global mutexes. */ +#define OP(mtx) MUTEX_STATS_CTL_PROTO_GEN(mutexes_##mtx) +MUTEX_PROF_GLOBAL_MUTEXES +#undef OP + +/* Per arena mutexes. */ +#define OP(mtx) MUTEX_STATS_CTL_PROTO_GEN(arenas_i_mutexes_##mtx) +MUTEX_PROF_ARENA_MUTEXES +#undef OP + +/* Arena bin mutexes. */ +MUTEX_STATS_CTL_PROTO_GEN(arenas_i_bins_j_mutex) +#undef MUTEX_STATS_CTL_PROTO_GEN + +CTL_PROTO(stats_mutexes_reset) + +/******************************************************************************/ +/* mallctl tree. */ + +#define NAME(n) {true}, n +#define CHILD(t, c) \ + sizeof(c##_node) / sizeof(ctl_##t##_node_t), \ + (ctl_node_t *)c##_node, \ + NULL +#define CTL(c) 0, NULL, c##_ctl + +/* + * Only handles internal indexed nodes, since there are currently no external + * ones. + */ +#define INDEX(i) {false}, i##_index + +static const ctl_named_node_t thread_tcache_node[] = { + {NAME("enabled"), CTL(thread_tcache_enabled)}, + {NAME("flush"), CTL(thread_tcache_flush)} +}; + +static const ctl_named_node_t thread_peak_node[] = { + {NAME("read"), CTL(thread_peak_read)}, + {NAME("reset"), CTL(thread_peak_reset)}, +}; + +static const ctl_named_node_t thread_prof_node[] = { + {NAME("name"), CTL(thread_prof_name)}, + {NAME("active"), CTL(thread_prof_active)} +}; + +static const ctl_named_node_t thread_node[] = { + {NAME("arena"), CTL(thread_arena)}, + {NAME("allocated"), CTL(thread_allocated)}, + {NAME("allocatedp"), CTL(thread_allocatedp)}, + {NAME("deallocated"), CTL(thread_deallocated)}, + {NAME("deallocatedp"), CTL(thread_deallocatedp)}, + {NAME("tcache"), CHILD(named, thread_tcache)}, + {NAME("peak"), CHILD(named, thread_peak)}, + {NAME("prof"), CHILD(named, thread_prof)}, + {NAME("idle"), CTL(thread_idle)} +}; + +static const ctl_named_node_t config_node[] = { + {NAME("cache_oblivious"), CTL(config_cache_oblivious)}, + {NAME("debug"), CTL(config_debug)}, + {NAME("fill"), CTL(config_fill)}, + {NAME("lazy_lock"), CTL(config_lazy_lock)}, + {NAME("malloc_conf"), CTL(config_malloc_conf)}, + {NAME("opt_safety_checks"), CTL(config_opt_safety_checks)}, + {NAME("prof"), CTL(config_prof)}, + {NAME("prof_libgcc"), CTL(config_prof_libgcc)}, + {NAME("prof_libunwind"), CTL(config_prof_libunwind)}, + {NAME("stats"), CTL(config_stats)}, + {NAME("utrace"), CTL(config_utrace)}, + {NAME("xmalloc"), CTL(config_xmalloc)} +}; + +static const ctl_named_node_t opt_node[] = { + {NAME("abort"), CTL(opt_abort)}, + {NAME("abort_conf"), CTL(opt_abort_conf)}, + {NAME("cache_oblivious"), CTL(opt_cache_oblivious)}, + {NAME("trust_madvise"), CTL(opt_trust_madvise)}, + {NAME("confirm_conf"), CTL(opt_confirm_conf)}, + {NAME("hpa"), CTL(opt_hpa)}, + {NAME("hpa_slab_max_alloc"), CTL(opt_hpa_slab_max_alloc)}, + {NAME("hpa_hugification_threshold"), + CTL(opt_hpa_hugification_threshold)}, + {NAME("hpa_hugify_delay_ms"), CTL(opt_hpa_hugify_delay_ms)}, + {NAME("hpa_min_purge_interval_ms"), CTL(opt_hpa_min_purge_interval_ms)}, + {NAME("hpa_dirty_mult"), CTL(opt_hpa_dirty_mult)}, + {NAME("hpa_sec_nshards"), CTL(opt_hpa_sec_nshards)}, + {NAME("hpa_sec_max_alloc"), CTL(opt_hpa_sec_max_alloc)}, + {NAME("hpa_sec_max_bytes"), CTL(opt_hpa_sec_max_bytes)}, + {NAME("hpa_sec_bytes_after_flush"), + CTL(opt_hpa_sec_bytes_after_flush)}, + {NAME("hpa_sec_batch_fill_extra"), + CTL(opt_hpa_sec_batch_fill_extra)}, + {NAME("metadata_thp"), CTL(opt_metadata_thp)}, + {NAME("retain"), CTL(opt_retain)}, + {NAME("dss"), CTL(opt_dss)}, + {NAME("narenas"), CTL(opt_narenas)}, + {NAME("percpu_arena"), CTL(opt_percpu_arena)}, + {NAME("oversize_threshold"), CTL(opt_oversize_threshold)}, + {NAME("mutex_max_spin"), CTL(opt_mutex_max_spin)}, + {NAME("background_thread"), CTL(opt_background_thread)}, + {NAME("max_background_threads"), CTL(opt_max_background_threads)}, + {NAME("dirty_decay_ms"), CTL(opt_dirty_decay_ms)}, + {NAME("muzzy_decay_ms"), CTL(opt_muzzy_decay_ms)}, + {NAME("stats_print"), CTL(opt_stats_print)}, + {NAME("stats_print_opts"), CTL(opt_stats_print_opts)}, + {NAME("stats_interval"), CTL(opt_stats_interval)}, + {NAME("stats_interval_opts"), CTL(opt_stats_interval_opts)}, + {NAME("junk"), CTL(opt_junk)}, + {NAME("zero"), CTL(opt_zero)}, + {NAME("utrace"), CTL(opt_utrace)}, + {NAME("xmalloc"), CTL(opt_xmalloc)}, + {NAME("experimental_infallible_new"), + CTL(opt_experimental_infallible_new)}, + {NAME("tcache"), CTL(opt_tcache)}, + {NAME("tcache_max"), CTL(opt_tcache_max)}, + {NAME("tcache_nslots_small_min"), + CTL(opt_tcache_nslots_small_min)}, + {NAME("tcache_nslots_small_max"), + CTL(opt_tcache_nslots_small_max)}, + {NAME("tcache_nslots_large"), CTL(opt_tcache_nslots_large)}, + {NAME("lg_tcache_nslots_mul"), CTL(opt_lg_tcache_nslots_mul)}, + {NAME("tcache_gc_incr_bytes"), CTL(opt_tcache_gc_incr_bytes)}, + {NAME("tcache_gc_delay_bytes"), CTL(opt_tcache_gc_delay_bytes)}, + {NAME("lg_tcache_flush_small_div"), + CTL(opt_lg_tcache_flush_small_div)}, + {NAME("lg_tcache_flush_large_div"), + CTL(opt_lg_tcache_flush_large_div)}, + {NAME("thp"), CTL(opt_thp)}, + {NAME("lg_extent_max_active_fit"), CTL(opt_lg_extent_max_active_fit)}, + {NAME("prof"), CTL(opt_prof)}, + {NAME("prof_prefix"), CTL(opt_prof_prefix)}, + {NAME("prof_active"), CTL(opt_prof_active)}, + {NAME("prof_thread_active_init"), CTL(opt_prof_thread_active_init)}, + {NAME("lg_prof_sample"), CTL(opt_lg_prof_sample)}, + {NAME("lg_prof_interval"), CTL(opt_lg_prof_interval)}, + {NAME("prof_gdump"), CTL(opt_prof_gdump)}, + {NAME("prof_final"), CTL(opt_prof_final)}, + {NAME("prof_leak"), CTL(opt_prof_leak)}, + {NAME("prof_leak_error"), CTL(opt_prof_leak_error)}, + {NAME("prof_accum"), CTL(opt_prof_accum)}, + {NAME("prof_recent_alloc_max"), CTL(opt_prof_recent_alloc_max)}, + {NAME("prof_stats"), CTL(opt_prof_stats)}, + {NAME("prof_sys_thread_name"), CTL(opt_prof_sys_thread_name)}, + {NAME("prof_time_resolution"), CTL(opt_prof_time_res)}, + {NAME("lg_san_uaf_align"), CTL(opt_lg_san_uaf_align)}, + {NAME("zero_realloc"), CTL(opt_zero_realloc)} +}; + +static const ctl_named_node_t tcache_node[] = { + {NAME("create"), CTL(tcache_create)}, + {NAME("flush"), CTL(tcache_flush)}, + {NAME("destroy"), CTL(tcache_destroy)} +}; + +static const ctl_named_node_t arena_i_node[] = { + {NAME("initialized"), CTL(arena_i_initialized)}, + {NAME("decay"), CTL(arena_i_decay)}, + {NAME("purge"), CTL(arena_i_purge)}, + {NAME("reset"), CTL(arena_i_reset)}, + {NAME("destroy"), CTL(arena_i_destroy)}, + {NAME("dss"), CTL(arena_i_dss)}, + /* + * Undocumented for now, since we anticipate an arena API in flux after + * we cut the last 5-series release. + */ + {NAME("oversize_threshold"), CTL(arena_i_oversize_threshold)}, + {NAME("dirty_decay_ms"), CTL(arena_i_dirty_decay_ms)}, + {NAME("muzzy_decay_ms"), CTL(arena_i_muzzy_decay_ms)}, + {NAME("extent_hooks"), CTL(arena_i_extent_hooks)}, + {NAME("retain_grow_limit"), CTL(arena_i_retain_grow_limit)} +}; +static const ctl_named_node_t super_arena_i_node[] = { + {NAME(""), CHILD(named, arena_i)} +}; + +static const ctl_indexed_node_t arena_node[] = { + {INDEX(arena_i)} +}; + +static const ctl_named_node_t arenas_bin_i_node[] = { + {NAME("size"), CTL(arenas_bin_i_size)}, + {NAME("nregs"), CTL(arenas_bin_i_nregs)}, + {NAME("slab_size"), CTL(arenas_bin_i_slab_size)}, + {NAME("nshards"), CTL(arenas_bin_i_nshards)} +}; +static const ctl_named_node_t super_arenas_bin_i_node[] = { + {NAME(""), CHILD(named, arenas_bin_i)} +}; + +static const ctl_indexed_node_t arenas_bin_node[] = { + {INDEX(arenas_bin_i)} +}; + +static const ctl_named_node_t arenas_lextent_i_node[] = { + {NAME("size"), CTL(arenas_lextent_i_size)} +}; +static const ctl_named_node_t super_arenas_lextent_i_node[] = { + {NAME(""), CHILD(named, arenas_lextent_i)} +}; + +static const ctl_indexed_node_t arenas_lextent_node[] = { + {INDEX(arenas_lextent_i)} +}; + +static const ctl_named_node_t arenas_node[] = { + {NAME("narenas"), CTL(arenas_narenas)}, + {NAME("dirty_decay_ms"), CTL(arenas_dirty_decay_ms)}, + {NAME("muzzy_decay_ms"), CTL(arenas_muzzy_decay_ms)}, + {NAME("quantum"), CTL(arenas_quantum)}, + {NAME("page"), CTL(arenas_page)}, + {NAME("tcache_max"), CTL(arenas_tcache_max)}, + {NAME("nbins"), CTL(arenas_nbins)}, + {NAME("nhbins"), CTL(arenas_nhbins)}, + {NAME("bin"), CHILD(indexed, arenas_bin)}, + {NAME("nlextents"), CTL(arenas_nlextents)}, + {NAME("lextent"), CHILD(indexed, arenas_lextent)}, + {NAME("create"), CTL(arenas_create)}, + {NAME("lookup"), CTL(arenas_lookup)} +}; + +static const ctl_named_node_t prof_stats_bins_i_node[] = { + {NAME("live"), CTL(prof_stats_bins_i_live)}, + {NAME("accum"), CTL(prof_stats_bins_i_accum)} +}; + +static const ctl_named_node_t super_prof_stats_bins_i_node[] = { + {NAME(""), CHILD(named, prof_stats_bins_i)} +}; + +static const ctl_indexed_node_t prof_stats_bins_node[] = { + {INDEX(prof_stats_bins_i)} +}; + +static const ctl_named_node_t prof_stats_lextents_i_node[] = { + {NAME("live"), CTL(prof_stats_lextents_i_live)}, + {NAME("accum"), CTL(prof_stats_lextents_i_accum)} +}; + +static const ctl_named_node_t super_prof_stats_lextents_i_node[] = { + {NAME(""), CHILD(named, prof_stats_lextents_i)} +}; + +static const ctl_indexed_node_t prof_stats_lextents_node[] = { + {INDEX(prof_stats_lextents_i)} +}; + +static const ctl_named_node_t prof_stats_node[] = { + {NAME("bins"), CHILD(indexed, prof_stats_bins)}, + {NAME("lextents"), CHILD(indexed, prof_stats_lextents)}, +}; + +static const ctl_named_node_t prof_node[] = { + {NAME("thread_active_init"), CTL(prof_thread_active_init)}, + {NAME("active"), CTL(prof_active)}, + {NAME("dump"), CTL(prof_dump)}, + {NAME("gdump"), CTL(prof_gdump)}, + {NAME("prefix"), CTL(prof_prefix)}, + {NAME("reset"), CTL(prof_reset)}, + {NAME("interval"), CTL(prof_interval)}, + {NAME("lg_sample"), CTL(lg_prof_sample)}, + {NAME("log_start"), CTL(prof_log_start)}, + {NAME("log_stop"), CTL(prof_log_stop)}, + {NAME("stats"), CHILD(named, prof_stats)} +}; + +static const ctl_named_node_t stats_arenas_i_small_node[] = { + {NAME("allocated"), CTL(stats_arenas_i_small_allocated)}, + {NAME("nmalloc"), CTL(stats_arenas_i_small_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_small_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_small_nrequests)}, + {NAME("nfills"), CTL(stats_arenas_i_small_nfills)}, + {NAME("nflushes"), CTL(stats_arenas_i_small_nflushes)} +}; + +static const ctl_named_node_t stats_arenas_i_large_node[] = { + {NAME("allocated"), CTL(stats_arenas_i_large_allocated)}, + {NAME("nmalloc"), CTL(stats_arenas_i_large_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_large_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_large_nrequests)}, + {NAME("nfills"), CTL(stats_arenas_i_large_nfills)}, + {NAME("nflushes"), CTL(stats_arenas_i_large_nflushes)} +}; + +#define MUTEX_PROF_DATA_NODE(prefix) \ +static const ctl_named_node_t stats_##prefix##_node[] = { \ + {NAME("num_ops"), \ + CTL(stats_##prefix##_num_ops)}, \ + {NAME("num_wait"), \ + CTL(stats_##prefix##_num_wait)}, \ + {NAME("num_spin_acq"), \ + CTL(stats_##prefix##_num_spin_acq)}, \ + {NAME("num_owner_switch"), \ + CTL(stats_##prefix##_num_owner_switch)}, \ + {NAME("total_wait_time"), \ + CTL(stats_##prefix##_total_wait_time)}, \ + {NAME("max_wait_time"), \ + CTL(stats_##prefix##_max_wait_time)}, \ + {NAME("max_num_thds"), \ + CTL(stats_##prefix##_max_num_thds)} \ + /* Note that # of current waiting thread not provided. */ \ +}; + +MUTEX_PROF_DATA_NODE(arenas_i_bins_j_mutex) + +static const ctl_named_node_t stats_arenas_i_bins_j_node[] = { + {NAME("nmalloc"), CTL(stats_arenas_i_bins_j_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_bins_j_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_bins_j_nrequests)}, + {NAME("curregs"), CTL(stats_arenas_i_bins_j_curregs)}, + {NAME("nfills"), CTL(stats_arenas_i_bins_j_nfills)}, + {NAME("nflushes"), CTL(stats_arenas_i_bins_j_nflushes)}, + {NAME("nslabs"), CTL(stats_arenas_i_bins_j_nslabs)}, + {NAME("nreslabs"), CTL(stats_arenas_i_bins_j_nreslabs)}, + {NAME("curslabs"), CTL(stats_arenas_i_bins_j_curslabs)}, + {NAME("nonfull_slabs"), CTL(stats_arenas_i_bins_j_nonfull_slabs)}, + {NAME("mutex"), CHILD(named, stats_arenas_i_bins_j_mutex)} +}; + +static const ctl_named_node_t super_stats_arenas_i_bins_j_node[] = { + {NAME(""), CHILD(named, stats_arenas_i_bins_j)} +}; + +static const ctl_indexed_node_t stats_arenas_i_bins_node[] = { + {INDEX(stats_arenas_i_bins_j)} +}; + +static const ctl_named_node_t stats_arenas_i_lextents_j_node[] = { + {NAME("nmalloc"), CTL(stats_arenas_i_lextents_j_nmalloc)}, + {NAME("ndalloc"), CTL(stats_arenas_i_lextents_j_ndalloc)}, + {NAME("nrequests"), CTL(stats_arenas_i_lextents_j_nrequests)}, + {NAME("curlextents"), CTL(stats_arenas_i_lextents_j_curlextents)} +}; +static const ctl_named_node_t super_stats_arenas_i_lextents_j_node[] = { + {NAME(""), CHILD(named, stats_arenas_i_lextents_j)} +}; + +static const ctl_indexed_node_t stats_arenas_i_lextents_node[] = { + {INDEX(stats_arenas_i_lextents_j)} +}; + +static const ctl_named_node_t stats_arenas_i_extents_j_node[] = { + {NAME("ndirty"), CTL(stats_arenas_i_extents_j_ndirty)}, + {NAME("nmuzzy"), CTL(stats_arenas_i_extents_j_nmuzzy)}, + {NAME("nretained"), CTL(stats_arenas_i_extents_j_nretained)}, + {NAME("dirty_bytes"), CTL(stats_arenas_i_extents_j_dirty_bytes)}, + {NAME("muzzy_bytes"), CTL(stats_arenas_i_extents_j_muzzy_bytes)}, + {NAME("retained_bytes"), CTL(stats_arenas_i_extents_j_retained_bytes)} +}; + +static const ctl_named_node_t super_stats_arenas_i_extents_j_node[] = { + {NAME(""), CHILD(named, stats_arenas_i_extents_j)} +}; + +static const ctl_indexed_node_t stats_arenas_i_extents_node[] = { + {INDEX(stats_arenas_i_extents_j)} +}; + +#define OP(mtx) MUTEX_PROF_DATA_NODE(arenas_i_mutexes_##mtx) +MUTEX_PROF_ARENA_MUTEXES +#undef OP + +static const ctl_named_node_t stats_arenas_i_mutexes_node[] = { +#define OP(mtx) {NAME(#mtx), CHILD(named, stats_arenas_i_mutexes_##mtx)}, +MUTEX_PROF_ARENA_MUTEXES +#undef OP +}; + +static const ctl_named_node_t stats_arenas_i_hpa_shard_full_slabs_node[] = { + {NAME("npageslabs_nonhuge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_npageslabs_nonhuge)}, + {NAME("npageslabs_huge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_npageslabs_huge)}, + {NAME("nactive_nonhuge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_nactive_nonhuge)}, + {NAME("nactive_huge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_nactive_huge)}, + {NAME("ndirty_nonhuge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_ndirty_nonhuge)}, + {NAME("ndirty_huge"), + CTL(stats_arenas_i_hpa_shard_full_slabs_ndirty_huge)} +}; + +static const ctl_named_node_t stats_arenas_i_hpa_shard_empty_slabs_node[] = { + {NAME("npageslabs_nonhuge"), + CTL(stats_arenas_i_hpa_shard_empty_slabs_npageslabs_nonhuge)}, + {NAME("npageslabs_huge"), + CTL(stats_arenas_i_hpa_shard_empty_slabs_npageslabs_huge)}, + {NAME("nactive_nonhuge"), + CTL(stats_arenas_i_hpa_shard_empty_slabs_nactive_nonhuge)}, + {NAME("nactive_huge"), + CTL(stats_arenas_i_hpa_shard_empty_slabs_nactive_huge)}, + {NAME("ndirty_nonhuge"), + CTL(stats_arenas_i_hpa_shard_empty_slabs_ndirty_nonhuge)}, + {NAME("ndirty_huge"), + CTL(stats_arenas_i_hpa_shard_empty_slabs_ndirty_huge)} +}; + +static const ctl_named_node_t stats_arenas_i_hpa_shard_nonfull_slabs_j_node[] = { + {NAME("npageslabs_nonhuge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_nonhuge)}, + {NAME("npageslabs_huge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_huge)}, + {NAME("nactive_nonhuge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_nonhuge)}, + {NAME("nactive_huge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_huge)}, + {NAME("ndirty_nonhuge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_ndirty_nonhuge)}, + {NAME("ndirty_huge"), + CTL(stats_arenas_i_hpa_shard_nonfull_slabs_j_ndirty_huge)} +}; + +static const ctl_named_node_t super_stats_arenas_i_hpa_shard_nonfull_slabs_j_node[] = { + {NAME(""), + CHILD(named, stats_arenas_i_hpa_shard_nonfull_slabs_j)} +}; + +static const ctl_indexed_node_t stats_arenas_i_hpa_shard_nonfull_slabs_node[] = +{ + {INDEX(stats_arenas_i_hpa_shard_nonfull_slabs_j)} +}; + +static const ctl_named_node_t stats_arenas_i_hpa_shard_node[] = { + {NAME("full_slabs"), CHILD(named, + stats_arenas_i_hpa_shard_full_slabs)}, + {NAME("empty_slabs"), CHILD(named, + stats_arenas_i_hpa_shard_empty_slabs)}, + {NAME("nonfull_slabs"), CHILD(indexed, + stats_arenas_i_hpa_shard_nonfull_slabs)}, + + {NAME("npurge_passes"), CTL(stats_arenas_i_hpa_shard_npurge_passes)}, + {NAME("npurges"), CTL(stats_arenas_i_hpa_shard_npurges)}, + {NAME("nhugifies"), CTL(stats_arenas_i_hpa_shard_nhugifies)}, + {NAME("ndehugifies"), CTL(stats_arenas_i_hpa_shard_ndehugifies)} +}; + +static const ctl_named_node_t stats_arenas_i_node[] = { + {NAME("nthreads"), CTL(stats_arenas_i_nthreads)}, + {NAME("uptime"), CTL(stats_arenas_i_uptime)}, + {NAME("dss"), CTL(stats_arenas_i_dss)}, + {NAME("dirty_decay_ms"), CTL(stats_arenas_i_dirty_decay_ms)}, + {NAME("muzzy_decay_ms"), CTL(stats_arenas_i_muzzy_decay_ms)}, + {NAME("pactive"), CTL(stats_arenas_i_pactive)}, + {NAME("pdirty"), CTL(stats_arenas_i_pdirty)}, + {NAME("pmuzzy"), CTL(stats_arenas_i_pmuzzy)}, + {NAME("mapped"), CTL(stats_arenas_i_mapped)}, + {NAME("retained"), CTL(stats_arenas_i_retained)}, + {NAME("extent_avail"), CTL(stats_arenas_i_extent_avail)}, + {NAME("dirty_npurge"), CTL(stats_arenas_i_dirty_npurge)}, + {NAME("dirty_nmadvise"), CTL(stats_arenas_i_dirty_nmadvise)}, + {NAME("dirty_purged"), CTL(stats_arenas_i_dirty_purged)}, + {NAME("muzzy_npurge"), CTL(stats_arenas_i_muzzy_npurge)}, + {NAME("muzzy_nmadvise"), CTL(stats_arenas_i_muzzy_nmadvise)}, + {NAME("muzzy_purged"), CTL(stats_arenas_i_muzzy_purged)}, + {NAME("base"), CTL(stats_arenas_i_base)}, + {NAME("internal"), CTL(stats_arenas_i_internal)}, + {NAME("metadata_thp"), CTL(stats_arenas_i_metadata_thp)}, + {NAME("tcache_bytes"), CTL(stats_arenas_i_tcache_bytes)}, + {NAME("tcache_stashed_bytes"), + CTL(stats_arenas_i_tcache_stashed_bytes)}, + {NAME("resident"), CTL(stats_arenas_i_resident)}, + {NAME("abandoned_vm"), CTL(stats_arenas_i_abandoned_vm)}, + {NAME("hpa_sec_bytes"), CTL(stats_arenas_i_hpa_sec_bytes)}, + {NAME("small"), CHILD(named, stats_arenas_i_small)}, + {NAME("large"), CHILD(named, stats_arenas_i_large)}, + {NAME("bins"), CHILD(indexed, stats_arenas_i_bins)}, + {NAME("lextents"), CHILD(indexed, stats_arenas_i_lextents)}, + {NAME("extents"), CHILD(indexed, stats_arenas_i_extents)}, + {NAME("mutexes"), CHILD(named, stats_arenas_i_mutexes)}, + {NAME("hpa_shard"), CHILD(named, stats_arenas_i_hpa_shard)} +}; +static const ctl_named_node_t super_stats_arenas_i_node[] = { + {NAME(""), CHILD(named, stats_arenas_i)} +}; + +static const ctl_indexed_node_t stats_arenas_node[] = { + {INDEX(stats_arenas_i)} +}; + +static const ctl_named_node_t stats_background_thread_node[] = { + {NAME("num_threads"), CTL(stats_background_thread_num_threads)}, + {NAME("num_runs"), CTL(stats_background_thread_num_runs)}, + {NAME("run_interval"), CTL(stats_background_thread_run_interval)} +}; + +#define OP(mtx) MUTEX_PROF_DATA_NODE(mutexes_##mtx) +MUTEX_PROF_GLOBAL_MUTEXES +#undef OP + +static const ctl_named_node_t stats_mutexes_node[] = { +#define OP(mtx) {NAME(#mtx), CHILD(named, stats_mutexes_##mtx)}, +MUTEX_PROF_GLOBAL_MUTEXES +#undef OP + {NAME("reset"), CTL(stats_mutexes_reset)} +}; +#undef MUTEX_PROF_DATA_NODE + +static const ctl_named_node_t stats_node[] = { + {NAME("allocated"), CTL(stats_allocated)}, + {NAME("active"), CTL(stats_active)}, + {NAME("metadata"), CTL(stats_metadata)}, + {NAME("metadata_thp"), CTL(stats_metadata_thp)}, + {NAME("resident"), CTL(stats_resident)}, + {NAME("mapped"), CTL(stats_mapped)}, + {NAME("retained"), CTL(stats_retained)}, + {NAME("background_thread"), + CHILD(named, stats_background_thread)}, + {NAME("mutexes"), CHILD(named, stats_mutexes)}, + {NAME("arenas"), CHILD(indexed, stats_arenas)}, + {NAME("zero_reallocs"), CTL(stats_zero_reallocs)}, +}; + +static const ctl_named_node_t experimental_hooks_node[] = { + {NAME("install"), CTL(experimental_hooks_install)}, + {NAME("remove"), CTL(experimental_hooks_remove)}, + {NAME("prof_backtrace"), CTL(experimental_hooks_prof_backtrace)}, + {NAME("prof_dump"), CTL(experimental_hooks_prof_dump)}, + {NAME("safety_check_abort"), CTL(experimental_hooks_safety_check_abort)}, +}; + +static const ctl_named_node_t experimental_thread_node[] = { + {NAME("activity_callback"), + CTL(experimental_thread_activity_callback)} +}; + +static const ctl_named_node_t experimental_utilization_node[] = { + {NAME("query"), CTL(experimental_utilization_query)}, + {NAME("batch_query"), CTL(experimental_utilization_batch_query)} +}; + +static const ctl_named_node_t experimental_arenas_i_node[] = { + {NAME("pactivep"), CTL(experimental_arenas_i_pactivep)} +}; +static const ctl_named_node_t super_experimental_arenas_i_node[] = { + {NAME(""), CHILD(named, experimental_arenas_i)} +}; + +static const ctl_indexed_node_t experimental_arenas_node[] = { + {INDEX(experimental_arenas_i)} +}; + +static const ctl_named_node_t experimental_prof_recent_node[] = { + {NAME("alloc_max"), CTL(experimental_prof_recent_alloc_max)}, + {NAME("alloc_dump"), CTL(experimental_prof_recent_alloc_dump)}, +}; + +static const ctl_named_node_t experimental_node[] = { + {NAME("hooks"), CHILD(named, experimental_hooks)}, + {NAME("utilization"), CHILD(named, experimental_utilization)}, + {NAME("arenas"), CHILD(indexed, experimental_arenas)}, + {NAME("arenas_create_ext"), CTL(experimental_arenas_create_ext)}, + {NAME("prof_recent"), CHILD(named, experimental_prof_recent)}, + {NAME("batch_alloc"), CTL(experimental_batch_alloc)}, + {NAME("thread"), CHILD(named, experimental_thread)} +}; + +static const ctl_named_node_t root_node[] = { + {NAME("version"), CTL(version)}, + {NAME("epoch"), CTL(epoch)}, + {NAME("background_thread"), CTL(background_thread)}, + {NAME("max_background_threads"), CTL(max_background_threads)}, + {NAME("thread"), CHILD(named, thread)}, + {NAME("config"), CHILD(named, config)}, + {NAME("opt"), CHILD(named, opt)}, + {NAME("tcache"), CHILD(named, tcache)}, + {NAME("arena"), CHILD(indexed, arena)}, + {NAME("arenas"), CHILD(named, arenas)}, + {NAME("prof"), CHILD(named, prof)}, + {NAME("stats"), CHILD(named, stats)}, + {NAME("experimental"), CHILD(named, experimental)} +}; +static const ctl_named_node_t super_root_node[] = { + {NAME(""), CHILD(named, root)} +}; + +#undef NAME +#undef CHILD +#undef CTL +#undef INDEX + +/******************************************************************************/ + +/* + * Sets *dst + *src non-atomically. This is safe, since everything is + * synchronized by the ctl mutex. + */ +static void +ctl_accum_locked_u64(locked_u64_t *dst, locked_u64_t *src) { + locked_inc_u64_unsynchronized(dst, + locked_read_u64_unsynchronized(src)); +} + +static void +ctl_accum_atomic_zu(atomic_zu_t *dst, atomic_zu_t *src) { + size_t cur_dst = atomic_load_zu(dst, ATOMIC_RELAXED); + size_t cur_src = atomic_load_zu(src, ATOMIC_RELAXED); + atomic_store_zu(dst, cur_dst + cur_src, ATOMIC_RELAXED); +} + +/******************************************************************************/ + +static unsigned +arenas_i2a_impl(size_t i, bool compat, bool validate) { + unsigned a; + + switch (i) { + case MALLCTL_ARENAS_ALL: + a = 0; + break; + case MALLCTL_ARENAS_DESTROYED: + a = 1; + break; + default: + if (compat && i == ctl_arenas->narenas) { + /* + * Provide deprecated backward compatibility for + * accessing the merged stats at index narenas rather + * than via MALLCTL_ARENAS_ALL. This is scheduled for + * removal in 6.0.0. + */ + a = 0; + } else if (validate && i >= ctl_arenas->narenas) { + a = UINT_MAX; + } else { + /* + * This function should never be called for an index + * more than one past the range of indices that have + * initialized ctl data. + */ + assert(i < ctl_arenas->narenas || (!validate && i == + ctl_arenas->narenas)); + a = (unsigned)i + 2; + } + break; + } + + return a; +} + +static unsigned +arenas_i2a(size_t i) { + return arenas_i2a_impl(i, true, false); +} + +static ctl_arena_t * +arenas_i_impl(tsd_t *tsd, size_t i, bool compat, bool init) { + ctl_arena_t *ret; + + assert(!compat || !init); + + ret = ctl_arenas->arenas[arenas_i2a_impl(i, compat, false)]; + if (init && ret == NULL) { + if (config_stats) { + struct container_s { + ctl_arena_t ctl_arena; + ctl_arena_stats_t astats; + }; + struct container_s *cont = + (struct container_s *)base_alloc(tsd_tsdn(tsd), + b0get(), sizeof(struct container_s), QUANTUM); + if (cont == NULL) { + return NULL; + } + ret = &cont->ctl_arena; + ret->astats = &cont->astats; + } else { + ret = (ctl_arena_t *)base_alloc(tsd_tsdn(tsd), b0get(), + sizeof(ctl_arena_t), QUANTUM); + if (ret == NULL) { + return NULL; + } + } + ret->arena_ind = (unsigned)i; + ctl_arenas->arenas[arenas_i2a_impl(i, compat, false)] = ret; + } + + assert(ret == NULL || arenas_i2a(ret->arena_ind) == arenas_i2a(i)); + return ret; +} + +static ctl_arena_t * +arenas_i(size_t i) { + ctl_arena_t *ret = arenas_i_impl(tsd_fetch(), i, true, false); + assert(ret != NULL); + return ret; +} + +static void +ctl_arena_clear(ctl_arena_t *ctl_arena) { + ctl_arena->nthreads = 0; + ctl_arena->dss = dss_prec_names[dss_prec_limit]; + ctl_arena->dirty_decay_ms = -1; + ctl_arena->muzzy_decay_ms = -1; + ctl_arena->pactive = 0; + ctl_arena->pdirty = 0; + ctl_arena->pmuzzy = 0; + if (config_stats) { + memset(&ctl_arena->astats->astats, 0, sizeof(arena_stats_t)); + ctl_arena->astats->allocated_small = 0; + ctl_arena->astats->nmalloc_small = 0; + ctl_arena->astats->ndalloc_small = 0; + ctl_arena->astats->nrequests_small = 0; + ctl_arena->astats->nfills_small = 0; + ctl_arena->astats->nflushes_small = 0; + memset(ctl_arena->astats->bstats, 0, SC_NBINS * + sizeof(bin_stats_data_t)); + memset(ctl_arena->astats->lstats, 0, (SC_NSIZES - SC_NBINS) * + sizeof(arena_stats_large_t)); + memset(ctl_arena->astats->estats, 0, SC_NPSIZES * + sizeof(pac_estats_t)); + memset(&ctl_arena->astats->hpastats, 0, + sizeof(hpa_shard_stats_t)); + memset(&ctl_arena->astats->secstats, 0, + sizeof(sec_stats_t)); + } +} + +static void +ctl_arena_stats_amerge(tsdn_t *tsdn, ctl_arena_t *ctl_arena, arena_t *arena) { + unsigned i; + + if (config_stats) { + arena_stats_merge(tsdn, arena, &ctl_arena->nthreads, + &ctl_arena->dss, &ctl_arena->dirty_decay_ms, + &ctl_arena->muzzy_decay_ms, &ctl_arena->pactive, + &ctl_arena->pdirty, &ctl_arena->pmuzzy, + &ctl_arena->astats->astats, ctl_arena->astats->bstats, + ctl_arena->astats->lstats, ctl_arena->astats->estats, + &ctl_arena->astats->hpastats, &ctl_arena->astats->secstats); + + for (i = 0; i < SC_NBINS; i++) { + bin_stats_t *bstats = + &ctl_arena->astats->bstats[i].stats_data; + ctl_arena->astats->allocated_small += bstats->curregs * + sz_index2size(i); + ctl_arena->astats->nmalloc_small += bstats->nmalloc; + ctl_arena->astats->ndalloc_small += bstats->ndalloc; + ctl_arena->astats->nrequests_small += bstats->nrequests; + ctl_arena->astats->nfills_small += bstats->nfills; + ctl_arena->astats->nflushes_small += bstats->nflushes; + } + } else { + arena_basic_stats_merge(tsdn, arena, &ctl_arena->nthreads, + &ctl_arena->dss, &ctl_arena->dirty_decay_ms, + &ctl_arena->muzzy_decay_ms, &ctl_arena->pactive, + &ctl_arena->pdirty, &ctl_arena->pmuzzy); + } +} + +static void +ctl_arena_stats_sdmerge(ctl_arena_t *ctl_sdarena, ctl_arena_t *ctl_arena, + bool destroyed) { + unsigned i; + + if (!destroyed) { + ctl_sdarena->nthreads += ctl_arena->nthreads; + ctl_sdarena->pactive += ctl_arena->pactive; + ctl_sdarena->pdirty += ctl_arena->pdirty; + ctl_sdarena->pmuzzy += ctl_arena->pmuzzy; + } else { + assert(ctl_arena->nthreads == 0); + assert(ctl_arena->pactive == 0); + assert(ctl_arena->pdirty == 0); + assert(ctl_arena->pmuzzy == 0); + } + + if (config_stats) { + ctl_arena_stats_t *sdstats = ctl_sdarena->astats; + ctl_arena_stats_t *astats = ctl_arena->astats; + + if (!destroyed) { + sdstats->astats.mapped += astats->astats.mapped; + sdstats->astats.pa_shard_stats.pac_stats.retained + += astats->astats.pa_shard_stats.pac_stats.retained; + sdstats->astats.pa_shard_stats.edata_avail + += astats->astats.pa_shard_stats.edata_avail; + } + + ctl_accum_locked_u64( + &sdstats->astats.pa_shard_stats.pac_stats.decay_dirty.npurge, + &astats->astats.pa_shard_stats.pac_stats.decay_dirty.npurge); + ctl_accum_locked_u64( + &sdstats->astats.pa_shard_stats.pac_stats.decay_dirty.nmadvise, + &astats->astats.pa_shard_stats.pac_stats.decay_dirty.nmadvise); + ctl_accum_locked_u64( + &sdstats->astats.pa_shard_stats.pac_stats.decay_dirty.purged, + &astats->astats.pa_shard_stats.pac_stats.decay_dirty.purged); + + ctl_accum_locked_u64( + &sdstats->astats.pa_shard_stats.pac_stats.decay_muzzy.npurge, + &astats->astats.pa_shard_stats.pac_stats.decay_muzzy.npurge); + ctl_accum_locked_u64( + &sdstats->astats.pa_shard_stats.pac_stats.decay_muzzy.nmadvise, + &astats->astats.pa_shard_stats.pac_stats.decay_muzzy.nmadvise); + ctl_accum_locked_u64( + &sdstats->astats.pa_shard_stats.pac_stats.decay_muzzy.purged, + &astats->astats.pa_shard_stats.pac_stats.decay_muzzy.purged); + +#define OP(mtx) malloc_mutex_prof_merge( \ + &(sdstats->astats.mutex_prof_data[ \ + arena_prof_mutex_##mtx]), \ + &(astats->astats.mutex_prof_data[ \ + arena_prof_mutex_##mtx])); +MUTEX_PROF_ARENA_MUTEXES +#undef OP + if (!destroyed) { + sdstats->astats.base += astats->astats.base; + sdstats->astats.resident += astats->astats.resident; + sdstats->astats.metadata_thp += astats->astats.metadata_thp; + ctl_accum_atomic_zu(&sdstats->astats.internal, + &astats->astats.internal); + } else { + assert(atomic_load_zu( + &astats->astats.internal, ATOMIC_RELAXED) == 0); + } + + if (!destroyed) { + sdstats->allocated_small += astats->allocated_small; + } else { + assert(astats->allocated_small == 0); + } + sdstats->nmalloc_small += astats->nmalloc_small; + sdstats->ndalloc_small += astats->ndalloc_small; + sdstats->nrequests_small += astats->nrequests_small; + sdstats->nfills_small += astats->nfills_small; + sdstats->nflushes_small += astats->nflushes_small; + + if (!destroyed) { + sdstats->astats.allocated_large += + astats->astats.allocated_large; + } else { + assert(astats->astats.allocated_large == 0); + } + sdstats->astats.nmalloc_large += astats->astats.nmalloc_large; + sdstats->astats.ndalloc_large += astats->astats.ndalloc_large; + sdstats->astats.nrequests_large + += astats->astats.nrequests_large; + sdstats->astats.nflushes_large += astats->astats.nflushes_large; + ctl_accum_atomic_zu( + &sdstats->astats.pa_shard_stats.pac_stats.abandoned_vm, + &astats->astats.pa_shard_stats.pac_stats.abandoned_vm); + + sdstats->astats.tcache_bytes += astats->astats.tcache_bytes; + sdstats->astats.tcache_stashed_bytes += + astats->astats.tcache_stashed_bytes; + + if (ctl_arena->arena_ind == 0) { + sdstats->astats.uptime = astats->astats.uptime; + } + + /* Merge bin stats. */ + for (i = 0; i < SC_NBINS; i++) { + bin_stats_t *bstats = &astats->bstats[i].stats_data; + bin_stats_t *merged = &sdstats->bstats[i].stats_data; + merged->nmalloc += bstats->nmalloc; + merged->ndalloc += bstats->ndalloc; + merged->nrequests += bstats->nrequests; + if (!destroyed) { + merged->curregs += bstats->curregs; + } else { + assert(bstats->curregs == 0); + } + merged->nfills += bstats->nfills; + merged->nflushes += bstats->nflushes; + merged->nslabs += bstats->nslabs; + merged->reslabs += bstats->reslabs; + if (!destroyed) { + merged->curslabs += bstats->curslabs; + merged->nonfull_slabs += bstats->nonfull_slabs; + } else { + assert(bstats->curslabs == 0); + assert(bstats->nonfull_slabs == 0); + } + malloc_mutex_prof_merge(&sdstats->bstats[i].mutex_data, + &astats->bstats[i].mutex_data); + } + + /* Merge stats for large allocations. */ + for (i = 0; i < SC_NSIZES - SC_NBINS; i++) { + ctl_accum_locked_u64(&sdstats->lstats[i].nmalloc, + &astats->lstats[i].nmalloc); + ctl_accum_locked_u64(&sdstats->lstats[i].ndalloc, + &astats->lstats[i].ndalloc); + ctl_accum_locked_u64(&sdstats->lstats[i].nrequests, + &astats->lstats[i].nrequests); + if (!destroyed) { + sdstats->lstats[i].curlextents += + astats->lstats[i].curlextents; + } else { + assert(astats->lstats[i].curlextents == 0); + } + } + + /* Merge extents stats. */ + for (i = 0; i < SC_NPSIZES; i++) { + sdstats->estats[i].ndirty += astats->estats[i].ndirty; + sdstats->estats[i].nmuzzy += astats->estats[i].nmuzzy; + sdstats->estats[i].nretained + += astats->estats[i].nretained; + sdstats->estats[i].dirty_bytes + += astats->estats[i].dirty_bytes; + sdstats->estats[i].muzzy_bytes + += astats->estats[i].muzzy_bytes; + sdstats->estats[i].retained_bytes + += astats->estats[i].retained_bytes; + } + + /* Merge HPA stats. */ + hpa_shard_stats_accum(&sdstats->hpastats, &astats->hpastats); + sec_stats_accum(&sdstats->secstats, &astats->secstats); + } +} + +static void +ctl_arena_refresh(tsdn_t *tsdn, arena_t *arena, ctl_arena_t *ctl_sdarena, + unsigned i, bool destroyed) { + ctl_arena_t *ctl_arena = arenas_i(i); + + ctl_arena_clear(ctl_arena); + ctl_arena_stats_amerge(tsdn, ctl_arena, arena); + /* Merge into sum stats as well. */ + ctl_arena_stats_sdmerge(ctl_sdarena, ctl_arena, destroyed); +} + +static unsigned +ctl_arena_init(tsd_t *tsd, const arena_config_t *config) { + unsigned arena_ind; + ctl_arena_t *ctl_arena; + + if ((ctl_arena = ql_last(&ctl_arenas->destroyed, destroyed_link)) != + NULL) { + ql_remove(&ctl_arenas->destroyed, ctl_arena, destroyed_link); + arena_ind = ctl_arena->arena_ind; + } else { + arena_ind = ctl_arenas->narenas; + } + + /* Trigger stats allocation. */ + if (arenas_i_impl(tsd, arena_ind, false, true) == NULL) { + return UINT_MAX; + } + + /* Initialize new arena. */ + if (arena_init(tsd_tsdn(tsd), arena_ind, config) == NULL) { + return UINT_MAX; + } + + if (arena_ind == ctl_arenas->narenas) { + ctl_arenas->narenas++; + } + + return arena_ind; +} + +static void +ctl_background_thread_stats_read(tsdn_t *tsdn) { + background_thread_stats_t *stats = &ctl_stats->background_thread; + if (!have_background_thread || + background_thread_stats_read(tsdn, stats)) { + memset(stats, 0, sizeof(background_thread_stats_t)); + nstime_init_zero(&stats->run_interval); + } + malloc_mutex_prof_copy( + &ctl_stats->mutex_prof_data[global_prof_mutex_max_per_bg_thd], + &stats->max_counter_per_bg_thd); +} + +static void +ctl_refresh(tsdn_t *tsdn) { + unsigned i; + ctl_arena_t *ctl_sarena = arenas_i(MALLCTL_ARENAS_ALL); + VARIABLE_ARRAY(arena_t *, tarenas, ctl_arenas->narenas); + + /* + * Clear sum stats, since they will be merged into by + * ctl_arena_refresh(). + */ + ctl_arena_clear(ctl_sarena); + + for (i = 0; i < ctl_arenas->narenas; i++) { + tarenas[i] = arena_get(tsdn, i, false); + } + + for (i = 0; i < ctl_arenas->narenas; i++) { + ctl_arena_t *ctl_arena = arenas_i(i); + bool initialized = (tarenas[i] != NULL); + + ctl_arena->initialized = initialized; + if (initialized) { + ctl_arena_refresh(tsdn, tarenas[i], ctl_sarena, i, + false); + } + } + + if (config_stats) { + ctl_stats->allocated = ctl_sarena->astats->allocated_small + + ctl_sarena->astats->astats.allocated_large; + ctl_stats->active = (ctl_sarena->pactive << LG_PAGE); + ctl_stats->metadata = ctl_sarena->astats->astats.base + + atomic_load_zu(&ctl_sarena->astats->astats.internal, + ATOMIC_RELAXED); + ctl_stats->resident = ctl_sarena->astats->astats.resident; + ctl_stats->metadata_thp = + ctl_sarena->astats->astats.metadata_thp; + ctl_stats->mapped = ctl_sarena->astats->astats.mapped; + ctl_stats->retained = ctl_sarena->astats->astats + .pa_shard_stats.pac_stats.retained; + + ctl_background_thread_stats_read(tsdn); + +#define READ_GLOBAL_MUTEX_PROF_DATA(i, mtx) \ + malloc_mutex_lock(tsdn, &mtx); \ + malloc_mutex_prof_read(tsdn, &ctl_stats->mutex_prof_data[i], &mtx); \ + malloc_mutex_unlock(tsdn, &mtx); + + if (config_prof && opt_prof) { + READ_GLOBAL_MUTEX_PROF_DATA( + global_prof_mutex_prof, bt2gctx_mtx); + READ_GLOBAL_MUTEX_PROF_DATA( + global_prof_mutex_prof_thds_data, tdatas_mtx); + READ_GLOBAL_MUTEX_PROF_DATA( + global_prof_mutex_prof_dump, prof_dump_mtx); + READ_GLOBAL_MUTEX_PROF_DATA( + global_prof_mutex_prof_recent_alloc, + prof_recent_alloc_mtx); + READ_GLOBAL_MUTEX_PROF_DATA( + global_prof_mutex_prof_recent_dump, + prof_recent_dump_mtx); + READ_GLOBAL_MUTEX_PROF_DATA( + global_prof_mutex_prof_stats, prof_stats_mtx); + } + if (have_background_thread) { + READ_GLOBAL_MUTEX_PROF_DATA( + global_prof_mutex_background_thread, + background_thread_lock); + } else { + memset(&ctl_stats->mutex_prof_data[ + global_prof_mutex_background_thread], 0, + sizeof(mutex_prof_data_t)); + } + /* We own ctl mutex already. */ + malloc_mutex_prof_read(tsdn, + &ctl_stats->mutex_prof_data[global_prof_mutex_ctl], + &ctl_mtx); +#undef READ_GLOBAL_MUTEX_PROF_DATA + } + ctl_arenas->epoch++; +} + +static bool +ctl_init(tsd_t *tsd) { + bool ret; + tsdn_t *tsdn = tsd_tsdn(tsd); + + malloc_mutex_lock(tsdn, &ctl_mtx); + if (!ctl_initialized) { + ctl_arena_t *ctl_sarena, *ctl_darena; + unsigned i; + + /* + * Allocate demand-zeroed space for pointers to the full + * range of supported arena indices. + */ + if (ctl_arenas == NULL) { + ctl_arenas = (ctl_arenas_t *)base_alloc(tsdn, + b0get(), sizeof(ctl_arenas_t), QUANTUM); + if (ctl_arenas == NULL) { + ret = true; + goto label_return; + } + } + + if (config_stats && ctl_stats == NULL) { + ctl_stats = (ctl_stats_t *)base_alloc(tsdn, b0get(), + sizeof(ctl_stats_t), QUANTUM); + if (ctl_stats == NULL) { + ret = true; + goto label_return; + } + } + + /* + * Allocate space for the current full range of arenas + * here rather than doing it lazily elsewhere, in order + * to limit when OOM-caused errors can occur. + */ + if ((ctl_sarena = arenas_i_impl(tsd, MALLCTL_ARENAS_ALL, false, + true)) == NULL) { + ret = true; + goto label_return; + } + ctl_sarena->initialized = true; + + if ((ctl_darena = arenas_i_impl(tsd, MALLCTL_ARENAS_DESTROYED, + false, true)) == NULL) { + ret = true; + goto label_return; + } + ctl_arena_clear(ctl_darena); + /* + * Don't toggle ctl_darena to initialized until an arena is + * actually destroyed, so that arena..initialized can be used + * to query whether the stats are relevant. + */ + + ctl_arenas->narenas = narenas_total_get(); + for (i = 0; i < ctl_arenas->narenas; i++) { + if (arenas_i_impl(tsd, i, false, true) == NULL) { + ret = true; + goto label_return; + } + } + + ql_new(&ctl_arenas->destroyed); + ctl_refresh(tsdn); + + ctl_initialized = true; + } + + ret = false; +label_return: + malloc_mutex_unlock(tsdn, &ctl_mtx); + return ret; +} + +static int +ctl_lookup(tsdn_t *tsdn, const ctl_named_node_t *starting_node, + const char *name, const ctl_named_node_t **ending_nodep, size_t *mibp, + size_t *depthp) { + int ret; + const char *elm, *tdot, *dot; + size_t elen, i, j; + const ctl_named_node_t *node; + + elm = name; + /* Equivalent to strchrnul(). */ + dot = ((tdot = strchr(elm, '.')) != NULL) ? tdot : strchr(elm, '\0'); + elen = (size_t)((uintptr_t)dot - (uintptr_t)elm); + if (elen == 0) { + ret = ENOENT; + goto label_return; + } + node = starting_node; + for (i = 0; i < *depthp; i++) { + assert(node); + assert(node->nchildren > 0); + if (ctl_named_node(node->children) != NULL) { + const ctl_named_node_t *pnode = node; + + /* Children are named. */ + for (j = 0; j < node->nchildren; j++) { + const ctl_named_node_t *child = + ctl_named_children(node, j); + if (strlen(child->name) == elen && + strncmp(elm, child->name, elen) == 0) { + node = child; + mibp[i] = j; + break; + } + } + if (node == pnode) { + ret = ENOENT; + goto label_return; + } + } else { + uintmax_t index; + const ctl_indexed_node_t *inode; + + /* Children are indexed. */ + index = malloc_strtoumax(elm, NULL, 10); + if (index == UINTMAX_MAX || index > SIZE_T_MAX) { + ret = ENOENT; + goto label_return; + } + + inode = ctl_indexed_node(node->children); + node = inode->index(tsdn, mibp, *depthp, (size_t)index); + if (node == NULL) { + ret = ENOENT; + goto label_return; + } + + mibp[i] = (size_t)index; + } + + /* Reached the end? */ + if (node->ctl != NULL || *dot == '\0') { + /* Terminal node. */ + if (*dot != '\0') { + /* + * The name contains more elements than are + * in this path through the tree. + */ + ret = ENOENT; + goto label_return; + } + /* Complete lookup successful. */ + *depthp = i + 1; + break; + } + + /* Update elm. */ + elm = &dot[1]; + dot = ((tdot = strchr(elm, '.')) != NULL) ? tdot : + strchr(elm, '\0'); + elen = (size_t)((uintptr_t)dot - (uintptr_t)elm); + } + if (ending_nodep != NULL) { + *ending_nodep = node; + } + + ret = 0; +label_return: + return ret; +} + +int +ctl_byname(tsd_t *tsd, const char *name, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) { + int ret; + size_t depth; + size_t mib[CTL_MAX_DEPTH]; + const ctl_named_node_t *node; + + if (!ctl_initialized && ctl_init(tsd)) { + ret = EAGAIN; + goto label_return; + } + + depth = CTL_MAX_DEPTH; + ret = ctl_lookup(tsd_tsdn(tsd), super_root_node, name, &node, mib, + &depth); + if (ret != 0) { + goto label_return; + } + + if (node != NULL && node->ctl) { + ret = node->ctl(tsd, mib, depth, oldp, oldlenp, newp, newlen); + } else { + /* The name refers to a partial path through the ctl tree. */ + ret = ENOENT; + } + +label_return: + return(ret); +} + +int +ctl_nametomib(tsd_t *tsd, const char *name, size_t *mibp, size_t *miblenp) { + int ret; + + if (!ctl_initialized && ctl_init(tsd)) { + ret = EAGAIN; + goto label_return; + } + + ret = ctl_lookup(tsd_tsdn(tsd), super_root_node, name, NULL, mibp, + miblenp); +label_return: + return(ret); +} + +static int +ctl_lookupbymib(tsdn_t *tsdn, const ctl_named_node_t **ending_nodep, + const size_t *mib, size_t miblen) { + int ret; + + const ctl_named_node_t *node = super_root_node; + for (size_t i = 0; i < miblen; i++) { + assert(node); + assert(node->nchildren > 0); + if (ctl_named_node(node->children) != NULL) { + /* Children are named. */ + if (node->nchildren <= mib[i]) { + ret = ENOENT; + goto label_return; + } + node = ctl_named_children(node, mib[i]); + } else { + const ctl_indexed_node_t *inode; + + /* Indexed element. */ + inode = ctl_indexed_node(node->children); + node = inode->index(tsdn, mib, miblen, mib[i]); + if (node == NULL) { + ret = ENOENT; + goto label_return; + } + } + } + assert(ending_nodep != NULL); + *ending_nodep = node; + ret = 0; + +label_return: + return(ret); +} + +int +ctl_bymib(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + const ctl_named_node_t *node; + + if (!ctl_initialized && ctl_init(tsd)) { + ret = EAGAIN; + goto label_return; + } + + ret = ctl_lookupbymib(tsd_tsdn(tsd), &node, mib, miblen); + if (ret != 0) { + goto label_return; + } + + /* Call the ctl function. */ + if (node && node->ctl) { + ret = node->ctl(tsd, mib, miblen, oldp, oldlenp, newp, newlen); + } else { + /* Partial MIB. */ + ret = ENOENT; + } + +label_return: + return(ret); +} + +int +ctl_mibnametomib(tsd_t *tsd, size_t *mib, size_t miblen, const char *name, + size_t *miblenp) { + int ret; + const ctl_named_node_t *node; + + if (!ctl_initialized && ctl_init(tsd)) { + ret = EAGAIN; + goto label_return; + } + + ret = ctl_lookupbymib(tsd_tsdn(tsd), &node, mib, miblen); + if (ret != 0) { + goto label_return; + } + if (node == NULL || node->ctl != NULL) { + ret = ENOENT; + goto label_return; + } + + assert(miblenp != NULL); + assert(*miblenp >= miblen); + *miblenp -= miblen; + ret = ctl_lookup(tsd_tsdn(tsd), node, name, NULL, mib + miblen, + miblenp); + *miblenp += miblen; +label_return: + return(ret); +} + +int +ctl_bymibname(tsd_t *tsd, size_t *mib, size_t miblen, const char *name, + size_t *miblenp, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + const ctl_named_node_t *node; + + if (!ctl_initialized && ctl_init(tsd)) { + ret = EAGAIN; + goto label_return; + } + + ret = ctl_lookupbymib(tsd_tsdn(tsd), &node, mib, miblen); + if (ret != 0) { + goto label_return; + } + if (node == NULL || node->ctl != NULL) { + ret = ENOENT; + goto label_return; + } + + assert(miblenp != NULL); + assert(*miblenp >= miblen); + *miblenp -= miblen; + /* + * The same node supplies the starting node and stores the ending node. + */ + ret = ctl_lookup(tsd_tsdn(tsd), node, name, &node, mib + miblen, + miblenp); + *miblenp += miblen; + if (ret != 0) { + goto label_return; + } + + if (node != NULL && node->ctl) { + ret = node->ctl(tsd, mib, *miblenp, oldp, oldlenp, newp, + newlen); + } else { + /* The name refers to a partial path through the ctl tree. */ + ret = ENOENT; + } + +label_return: + return(ret); +} + +bool +ctl_boot(void) { + if (malloc_mutex_init(&ctl_mtx, "ctl", WITNESS_RANK_CTL, + malloc_mutex_rank_exclusive)) { + return true; + } + + ctl_initialized = false; + + return false; +} + +void +ctl_prefork(tsdn_t *tsdn) { + malloc_mutex_prefork(tsdn, &ctl_mtx); +} + +void +ctl_postfork_parent(tsdn_t *tsdn) { + malloc_mutex_postfork_parent(tsdn, &ctl_mtx); +} + +void +ctl_postfork_child(tsdn_t *tsdn) { + malloc_mutex_postfork_child(tsdn, &ctl_mtx); +} + +void +ctl_mtx_assert_held(tsdn_t *tsdn) { + malloc_mutex_assert_owner(tsdn, &ctl_mtx); +} + +/******************************************************************************/ +/* *_ctl() functions. */ + +#define READONLY() do { \ + if (newp != NULL || newlen != 0) { \ + ret = EPERM; \ + goto label_return; \ + } \ +} while (0) + +#define WRITEONLY() do { \ + if (oldp != NULL || oldlenp != NULL) { \ + ret = EPERM; \ + goto label_return; \ + } \ +} while (0) + +/* Can read or write, but not both. */ +#define READ_XOR_WRITE() do { \ + if ((oldp != NULL && oldlenp != NULL) && (newp != NULL || \ + newlen != 0)) { \ + ret = EPERM; \ + goto label_return; \ + } \ +} while (0) + +/* Can neither read nor write. */ +#define NEITHER_READ_NOR_WRITE() do { \ + if (oldp != NULL || oldlenp != NULL || newp != NULL || \ + newlen != 0) { \ + ret = EPERM; \ + goto label_return; \ + } \ +} while (0) + +/* Verify that the space provided is enough. */ +#define VERIFY_READ(t) do { \ + if (oldp == NULL || oldlenp == NULL || *oldlenp != sizeof(t)) { \ + *oldlenp = 0; \ + ret = EINVAL; \ + goto label_return; \ + } \ +} while (0) + +#define READ(v, t) do { \ + if (oldp != NULL && oldlenp != NULL) { \ + if (*oldlenp != sizeof(t)) { \ + size_t copylen = (sizeof(t) <= *oldlenp) \ + ? sizeof(t) : *oldlenp; \ + memcpy(oldp, (void *)&(v), copylen); \ + *oldlenp = copylen; \ + ret = EINVAL; \ + goto label_return; \ + } \ + *(t *)oldp = (v); \ + } \ +} while (0) + +#define WRITE(v, t) do { \ + if (newp != NULL) { \ + if (newlen != sizeof(t)) { \ + ret = EINVAL; \ + goto label_return; \ + } \ + (v) = *(t *)newp; \ + } \ +} while (0) + +#define ASSURED_WRITE(v, t) do { \ + if (newp == NULL || newlen != sizeof(t)) { \ + ret = EINVAL; \ + goto label_return; \ + } \ + (v) = *(t *)newp; \ +} while (0) + +#define MIB_UNSIGNED(v, i) do { \ + if (mib[i] > UINT_MAX) { \ + ret = EFAULT; \ + goto label_return; \ + } \ + v = (unsigned)mib[i]; \ +} while (0) + +/* + * There's a lot of code duplication in the following macros due to limitations + * in how nested cpp macros are expanded. + */ +#define CTL_RO_CLGEN(c, l, n, v, t) \ +static int \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, \ + size_t *oldlenp, void *newp, size_t newlen) { \ + int ret; \ + t oldval; \ + \ + if (!(c)) { \ + return ENOENT; \ + } \ + if (l) { \ + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); \ + } \ + READONLY(); \ + oldval = (v); \ + READ(oldval, t); \ + \ + ret = 0; \ +label_return: \ + if (l) { \ + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); \ + } \ + return ret; \ +} + +#define CTL_RO_CGEN(c, n, v, t) \ +static int \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \ + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { \ + int ret; \ + t oldval; \ + \ + if (!(c)) { \ + return ENOENT; \ + } \ + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); \ + READONLY(); \ + oldval = (v); \ + READ(oldval, t); \ + \ + ret = 0; \ +label_return: \ + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); \ + return ret; \ +} + +#define CTL_RO_GEN(n, v, t) \ +static int \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, \ + size_t *oldlenp, void *newp, size_t newlen) { \ + int ret; \ + t oldval; \ + \ + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); \ + READONLY(); \ + oldval = (v); \ + READ(oldval, t); \ + \ + ret = 0; \ +label_return: \ + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); \ + return ret; \ +} + +/* + * ctl_mtx is not acquired, under the assumption that no pertinent data will + * mutate during the call. + */ +#define CTL_RO_NL_CGEN(c, n, v, t) \ +static int \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \ + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { \ + int ret; \ + t oldval; \ + \ + if (!(c)) { \ + return ENOENT; \ + } \ + READONLY(); \ + oldval = (v); \ + READ(oldval, t); \ + \ + ret = 0; \ +label_return: \ + return ret; \ +} + +#define CTL_RO_NL_GEN(n, v, t) \ +static int \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \ + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { \ + int ret; \ + t oldval; \ + \ + READONLY(); \ + oldval = (v); \ + READ(oldval, t); \ + \ + ret = 0; \ +label_return: \ + return ret; \ +} + +#define CTL_RO_CONFIG_GEN(n, t) \ +static int \ +n##_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, \ + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { \ + int ret; \ + t oldval; \ + \ + READONLY(); \ + oldval = n; \ + READ(oldval, t); \ + \ + ret = 0; \ +label_return: \ + return ret; \ +} + +/******************************************************************************/ + +CTL_RO_NL_GEN(version, JEMALLOC_VERSION, const char *) + +static int +epoch_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + UNUSED uint64_t newval; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + WRITE(newval, uint64_t); + if (newp != NULL) { + ctl_refresh(tsd_tsdn(tsd)); + } + READ(ctl_arenas->epoch, uint64_t); + + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +background_thread_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) { + int ret; + bool oldval; + + if (!have_background_thread) { + return ENOENT; + } + background_thread_ctl_init(tsd_tsdn(tsd)); + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock); + if (newp == NULL) { + oldval = background_thread_enabled(); + READ(oldval, bool); + } else { + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + oldval = background_thread_enabled(); + READ(oldval, bool); + + bool newval = *(bool *)newp; + if (newval == oldval) { + ret = 0; + goto label_return; + } + + background_thread_enabled_set(tsd_tsdn(tsd), newval); + if (newval) { + if (background_threads_enable(tsd)) { + ret = EFAULT; + goto label_return; + } + } else { + if (background_threads_disable(tsd)) { + ret = EFAULT; + goto label_return; + } + } + } + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock); + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + + return ret; +} + +static int +max_background_threads_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + size_t oldval; + + if (!have_background_thread) { + return ENOENT; + } + background_thread_ctl_init(tsd_tsdn(tsd)); + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock); + if (newp == NULL) { + oldval = max_background_threads; + READ(oldval, size_t); + } else { + if (newlen != sizeof(size_t)) { + ret = EINVAL; + goto label_return; + } + oldval = max_background_threads; + READ(oldval, size_t); + + size_t newval = *(size_t *)newp; + if (newval == oldval) { + ret = 0; + goto label_return; + } + if (newval > opt_max_background_threads) { + ret = EINVAL; + goto label_return; + } + + if (background_thread_enabled()) { + background_thread_enabled_set(tsd_tsdn(tsd), false); + if (background_threads_disable(tsd)) { + ret = EFAULT; + goto label_return; + } + max_background_threads = newval; + background_thread_enabled_set(tsd_tsdn(tsd), true); + if (background_threads_enable(tsd)) { + ret = EFAULT; + goto label_return; + } + } else { + max_background_threads = newval; + } + } + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock); + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + + return ret; +} + +/******************************************************************************/ + +CTL_RO_CONFIG_GEN(config_cache_oblivious, bool) +CTL_RO_CONFIG_GEN(config_debug, bool) +CTL_RO_CONFIG_GEN(config_fill, bool) +CTL_RO_CONFIG_GEN(config_lazy_lock, bool) +CTL_RO_CONFIG_GEN(config_malloc_conf, const char *) +CTL_RO_CONFIG_GEN(config_opt_safety_checks, bool) +CTL_RO_CONFIG_GEN(config_prof, bool) +CTL_RO_CONFIG_GEN(config_prof_libgcc, bool) +CTL_RO_CONFIG_GEN(config_prof_libunwind, bool) +CTL_RO_CONFIG_GEN(config_stats, bool) +CTL_RO_CONFIG_GEN(config_utrace, bool) +CTL_RO_CONFIG_GEN(config_xmalloc, bool) + +/******************************************************************************/ + +CTL_RO_NL_GEN(opt_abort, opt_abort, bool) +CTL_RO_NL_GEN(opt_abort_conf, opt_abort_conf, bool) +CTL_RO_NL_GEN(opt_cache_oblivious, opt_cache_oblivious, bool) +CTL_RO_NL_GEN(opt_trust_madvise, opt_trust_madvise, bool) +CTL_RO_NL_GEN(opt_confirm_conf, opt_confirm_conf, bool) + +/* HPA options. */ +CTL_RO_NL_GEN(opt_hpa, opt_hpa, bool) +CTL_RO_NL_GEN(opt_hpa_hugification_threshold, + opt_hpa_opts.hugification_threshold, size_t) +CTL_RO_NL_GEN(opt_hpa_hugify_delay_ms, opt_hpa_opts.hugify_delay_ms, uint64_t) +CTL_RO_NL_GEN(opt_hpa_min_purge_interval_ms, opt_hpa_opts.min_purge_interval_ms, + uint64_t) + +/* + * This will have to change before we publicly document this option; fxp_t and + * its representation are internal implementation details. + */ +CTL_RO_NL_GEN(opt_hpa_dirty_mult, opt_hpa_opts.dirty_mult, fxp_t) +CTL_RO_NL_GEN(opt_hpa_slab_max_alloc, opt_hpa_opts.slab_max_alloc, size_t) + +/* HPA SEC options */ +CTL_RO_NL_GEN(opt_hpa_sec_nshards, opt_hpa_sec_opts.nshards, size_t) +CTL_RO_NL_GEN(opt_hpa_sec_max_alloc, opt_hpa_sec_opts.max_alloc, size_t) +CTL_RO_NL_GEN(opt_hpa_sec_max_bytes, opt_hpa_sec_opts.max_bytes, size_t) +CTL_RO_NL_GEN(opt_hpa_sec_bytes_after_flush, opt_hpa_sec_opts.bytes_after_flush, + size_t) +CTL_RO_NL_GEN(opt_hpa_sec_batch_fill_extra, opt_hpa_sec_opts.batch_fill_extra, + size_t) + +CTL_RO_NL_GEN(opt_metadata_thp, metadata_thp_mode_names[opt_metadata_thp], + const char *) +CTL_RO_NL_GEN(opt_retain, opt_retain, bool) +CTL_RO_NL_GEN(opt_dss, opt_dss, const char *) +CTL_RO_NL_GEN(opt_narenas, opt_narenas, unsigned) +CTL_RO_NL_GEN(opt_percpu_arena, percpu_arena_mode_names[opt_percpu_arena], + const char *) +CTL_RO_NL_GEN(opt_mutex_max_spin, opt_mutex_max_spin, int64_t) +CTL_RO_NL_GEN(opt_oversize_threshold, opt_oversize_threshold, size_t) +CTL_RO_NL_GEN(opt_background_thread, opt_background_thread, bool) +CTL_RO_NL_GEN(opt_max_background_threads, opt_max_background_threads, size_t) +CTL_RO_NL_GEN(opt_dirty_decay_ms, opt_dirty_decay_ms, ssize_t) +CTL_RO_NL_GEN(opt_muzzy_decay_ms, opt_muzzy_decay_ms, ssize_t) +CTL_RO_NL_GEN(opt_stats_print, opt_stats_print, bool) +CTL_RO_NL_GEN(opt_stats_print_opts, opt_stats_print_opts, const char *) +CTL_RO_NL_GEN(opt_stats_interval, opt_stats_interval, int64_t) +CTL_RO_NL_GEN(opt_stats_interval_opts, opt_stats_interval_opts, const char *) +CTL_RO_NL_CGEN(config_fill, opt_junk, opt_junk, const char *) +CTL_RO_NL_CGEN(config_fill, opt_zero, opt_zero, bool) +CTL_RO_NL_CGEN(config_utrace, opt_utrace, opt_utrace, bool) +CTL_RO_NL_CGEN(config_xmalloc, opt_xmalloc, opt_xmalloc, bool) +CTL_RO_NL_CGEN(config_enable_cxx, opt_experimental_infallible_new, + opt_experimental_infallible_new, bool) +CTL_RO_NL_GEN(opt_tcache, opt_tcache, bool) +CTL_RO_NL_GEN(opt_tcache_max, opt_tcache_max, size_t) +CTL_RO_NL_GEN(opt_tcache_nslots_small_min, opt_tcache_nslots_small_min, + unsigned) +CTL_RO_NL_GEN(opt_tcache_nslots_small_max, opt_tcache_nslots_small_max, + unsigned) +CTL_RO_NL_GEN(opt_tcache_nslots_large, opt_tcache_nslots_large, unsigned) +CTL_RO_NL_GEN(opt_lg_tcache_nslots_mul, opt_lg_tcache_nslots_mul, ssize_t) +CTL_RO_NL_GEN(opt_tcache_gc_incr_bytes, opt_tcache_gc_incr_bytes, size_t) +CTL_RO_NL_GEN(opt_tcache_gc_delay_bytes, opt_tcache_gc_delay_bytes, size_t) +CTL_RO_NL_GEN(opt_lg_tcache_flush_small_div, opt_lg_tcache_flush_small_div, + unsigned) +CTL_RO_NL_GEN(opt_lg_tcache_flush_large_div, opt_lg_tcache_flush_large_div, + unsigned) +CTL_RO_NL_GEN(opt_thp, thp_mode_names[opt_thp], const char *) +CTL_RO_NL_GEN(opt_lg_extent_max_active_fit, opt_lg_extent_max_active_fit, + size_t) +CTL_RO_NL_CGEN(config_prof, opt_prof, opt_prof, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_prefix, opt_prof_prefix, const char *) +CTL_RO_NL_CGEN(config_prof, opt_prof_active, opt_prof_active, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_thread_active_init, + opt_prof_thread_active_init, bool) +CTL_RO_NL_CGEN(config_prof, opt_lg_prof_sample, opt_lg_prof_sample, size_t) +CTL_RO_NL_CGEN(config_prof, opt_prof_accum, opt_prof_accum, bool) +CTL_RO_NL_CGEN(config_prof, opt_lg_prof_interval, opt_lg_prof_interval, ssize_t) +CTL_RO_NL_CGEN(config_prof, opt_prof_gdump, opt_prof_gdump, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_final, opt_prof_final, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_leak, opt_prof_leak, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_leak_error, opt_prof_leak_error, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_recent_alloc_max, + opt_prof_recent_alloc_max, ssize_t) +CTL_RO_NL_CGEN(config_prof, opt_prof_stats, opt_prof_stats, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_sys_thread_name, opt_prof_sys_thread_name, + bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_time_res, + prof_time_res_mode_names[opt_prof_time_res], const char *) +CTL_RO_NL_CGEN(config_uaf_detection, opt_lg_san_uaf_align, + opt_lg_san_uaf_align, ssize_t) +CTL_RO_NL_GEN(opt_zero_realloc, + zero_realloc_mode_names[opt_zero_realloc_action], const char *) + +/******************************************************************************/ + +static int +thread_arena_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + arena_t *oldarena; + unsigned newind, oldind; + + oldarena = arena_choose(tsd, NULL); + if (oldarena == NULL) { + return EAGAIN; + } + newind = oldind = arena_ind_get(oldarena); + WRITE(newind, unsigned); + READ(oldind, unsigned); + + if (newind != oldind) { + arena_t *newarena; + + if (newind >= narenas_total_get()) { + /* New arena index is out of range. */ + ret = EFAULT; + goto label_return; + } + + if (have_percpu_arena && + PERCPU_ARENA_ENABLED(opt_percpu_arena)) { + if (newind < percpu_arena_ind_limit(opt_percpu_arena)) { + /* + * If perCPU arena is enabled, thread_arena + * control is not allowed for the auto arena + * range. + */ + ret = EPERM; + goto label_return; + } + } + + /* Initialize arena if necessary. */ + newarena = arena_get(tsd_tsdn(tsd), newind, true); + if (newarena == NULL) { + ret = EAGAIN; + goto label_return; + } + /* Set new arena/tcache associations. */ + arena_migrate(tsd, oldarena, newarena); + if (tcache_available(tsd)) { + tcache_arena_reassociate(tsd_tsdn(tsd), + tsd_tcache_slowp_get(tsd), tsd_tcachep_get(tsd), + newarena); + } + } + + ret = 0; +label_return: + return ret; +} + +CTL_RO_NL_GEN(thread_allocated, tsd_thread_allocated_get(tsd), uint64_t) +CTL_RO_NL_GEN(thread_allocatedp, tsd_thread_allocatedp_get(tsd), uint64_t *) +CTL_RO_NL_GEN(thread_deallocated, tsd_thread_deallocated_get(tsd), uint64_t) +CTL_RO_NL_GEN(thread_deallocatedp, tsd_thread_deallocatedp_get(tsd), uint64_t *) + +static int +thread_tcache_enabled_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + bool oldval; + + oldval = tcache_enabled_get(tsd); + if (newp != NULL) { + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + tcache_enabled_set(tsd, *(bool *)newp); + } + READ(oldval, bool); + + ret = 0; +label_return: + return ret; +} + +static int +thread_tcache_flush_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + + if (!tcache_available(tsd)) { + ret = EFAULT; + goto label_return; + } + + NEITHER_READ_NOR_WRITE(); + + tcache_flush(tsd); + + ret = 0; +label_return: + return ret; +} + +static int +thread_peak_read_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + if (!config_stats) { + return ENOENT; + } + READONLY(); + peak_event_update(tsd); + uint64_t result = peak_event_max(tsd); + READ(result, uint64_t); + ret = 0; +label_return: + return ret; +} + +static int +thread_peak_reset_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + if (!config_stats) { + return ENOENT; + } + NEITHER_READ_NOR_WRITE(); + peak_event_zero(tsd); + ret = 0; +label_return: + return ret; +} + +static int +thread_prof_name_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + + if (!config_prof || !opt_prof) { + return ENOENT; + } + + READ_XOR_WRITE(); + + if (newp != NULL) { + if (newlen != sizeof(const char *)) { + ret = EINVAL; + goto label_return; + } + + if ((ret = prof_thread_name_set(tsd, *(const char **)newp)) != + 0) { + goto label_return; + } + } else { + const char *oldname = prof_thread_name_get(tsd); + READ(oldname, const char *); + } + + ret = 0; +label_return: + return ret; +} + +static int +thread_prof_active_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + bool oldval; + + if (!config_prof) { + return ENOENT; + } + + oldval = opt_prof ? prof_thread_active_get(tsd) : false; + if (newp != NULL) { + if (!opt_prof) { + ret = ENOENT; + goto label_return; + } + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + if (prof_thread_active_set(tsd, *(bool *)newp)) { + ret = EAGAIN; + goto label_return; + } + } + READ(oldval, bool); + + ret = 0; +label_return: + return ret; +} + +static int +thread_idle_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + + NEITHER_READ_NOR_WRITE(); + + if (tcache_available(tsd)) { + tcache_flush(tsd); + } + /* + * This heuristic is perhaps not the most well-considered. But it + * matches the only idling policy we have experience with in the status + * quo. Over time we should investigate more principled approaches. + */ + if (opt_narenas > ncpus * 2) { + arena_t *arena = arena_choose(tsd, NULL); + if (arena != NULL) { + arena_decay(tsd_tsdn(tsd), arena, false, true); + } + /* + * The missing arena case is not actually an error; a thread + * might be idle before it associates itself to one. This is + * unusual, but not wrong. + */ + } + + ret = 0; +label_return: + return ret; +} + +/******************************************************************************/ + +static int +tcache_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned tcache_ind; + + READONLY(); + VERIFY_READ(unsigned); + if (tcaches_create(tsd, b0get(), &tcache_ind)) { + ret = EFAULT; + goto label_return; + } + READ(tcache_ind, unsigned); + + ret = 0; +label_return: + return ret; +} + +static int +tcache_flush_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned tcache_ind; + + WRITEONLY(); + ASSURED_WRITE(tcache_ind, unsigned); + tcaches_flush(tsd, tcache_ind); + + ret = 0; +label_return: + return ret; +} + +static int +tcache_destroy_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned tcache_ind; + + WRITEONLY(); + ASSURED_WRITE(tcache_ind, unsigned); + tcaches_destroy(tsd, tcache_ind); + + ret = 0; +label_return: + return ret; +} + +/******************************************************************************/ + +static int +arena_i_initialized_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + tsdn_t *tsdn = tsd_tsdn(tsd); + unsigned arena_ind; + bool initialized; + + READONLY(); + MIB_UNSIGNED(arena_ind, 1); + + malloc_mutex_lock(tsdn, &ctl_mtx); + initialized = arenas_i(arena_ind)->initialized; + malloc_mutex_unlock(tsdn, &ctl_mtx); + + READ(initialized, bool); + + ret = 0; +label_return: + return ret; +} + +static void +arena_i_decay(tsdn_t *tsdn, unsigned arena_ind, bool all) { + malloc_mutex_lock(tsdn, &ctl_mtx); + { + unsigned narenas = ctl_arenas->narenas; + + /* + * Access via index narenas is deprecated, and scheduled for + * removal in 6.0.0. + */ + if (arena_ind == MALLCTL_ARENAS_ALL || arena_ind == narenas) { + unsigned i; + VARIABLE_ARRAY(arena_t *, tarenas, narenas); + + for (i = 0; i < narenas; i++) { + tarenas[i] = arena_get(tsdn, i, false); + } + + /* + * No further need to hold ctl_mtx, since narenas and + * tarenas contain everything needed below. + */ + malloc_mutex_unlock(tsdn, &ctl_mtx); + + for (i = 0; i < narenas; i++) { + if (tarenas[i] != NULL) { + arena_decay(tsdn, tarenas[i], false, + all); + } + } + } else { + arena_t *tarena; + + assert(arena_ind < narenas); + + tarena = arena_get(tsdn, arena_ind, false); + + /* No further need to hold ctl_mtx. */ + malloc_mutex_unlock(tsdn, &ctl_mtx); + + if (tarena != NULL) { + arena_decay(tsdn, tarena, false, all); + } + } + } +} + +static int +arena_i_decay_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + + NEITHER_READ_NOR_WRITE(); + MIB_UNSIGNED(arena_ind, 1); + arena_i_decay(tsd_tsdn(tsd), arena_ind, false); + + ret = 0; +label_return: + return ret; +} + +static int +arena_i_purge_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + + NEITHER_READ_NOR_WRITE(); + MIB_UNSIGNED(arena_ind, 1); + arena_i_decay(tsd_tsdn(tsd), arena_ind, true); + + ret = 0; +label_return: + return ret; +} + +static int +arena_i_reset_destroy_helper(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen, unsigned *arena_ind, + arena_t **arena) { + int ret; + + NEITHER_READ_NOR_WRITE(); + MIB_UNSIGNED(*arena_ind, 1); + + *arena = arena_get(tsd_tsdn(tsd), *arena_ind, false); + if (*arena == NULL || arena_is_auto(*arena)) { + ret = EFAULT; + goto label_return; + } + + ret = 0; +label_return: + return ret; +} + +static void +arena_reset_prepare_background_thread(tsd_t *tsd, unsigned arena_ind) { + /* Temporarily disable the background thread during arena reset. */ + if (have_background_thread) { + malloc_mutex_lock(tsd_tsdn(tsd), &background_thread_lock); + if (background_thread_enabled()) { + background_thread_info_t *info = + background_thread_info_get(arena_ind); + assert(info->state == background_thread_started); + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + info->state = background_thread_paused; + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + } + } +} + +static void +arena_reset_finish_background_thread(tsd_t *tsd, unsigned arena_ind) { + if (have_background_thread) { + if (background_thread_enabled()) { + background_thread_info_t *info = + background_thread_info_get(arena_ind); + assert(info->state == background_thread_paused); + malloc_mutex_lock(tsd_tsdn(tsd), &info->mtx); + info->state = background_thread_started; + malloc_mutex_unlock(tsd_tsdn(tsd), &info->mtx); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &background_thread_lock); + } +} + +static int +arena_i_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + arena_t *arena; + + ret = arena_i_reset_destroy_helper(tsd, mib, miblen, oldp, oldlenp, + newp, newlen, &arena_ind, &arena); + if (ret != 0) { + return ret; + } + + arena_reset_prepare_background_thread(tsd, arena_ind); + arena_reset(tsd, arena); + arena_reset_finish_background_thread(tsd, arena_ind); + + return ret; +} + +static int +arena_i_destroy_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + arena_t *arena; + ctl_arena_t *ctl_darena, *ctl_arena; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + + ret = arena_i_reset_destroy_helper(tsd, mib, miblen, oldp, oldlenp, + newp, newlen, &arena_ind, &arena); + if (ret != 0) { + goto label_return; + } + + if (arena_nthreads_get(arena, false) != 0 || arena_nthreads_get(arena, + true) != 0) { + ret = EFAULT; + goto label_return; + } + + arena_reset_prepare_background_thread(tsd, arena_ind); + /* Merge stats after resetting and purging arena. */ + arena_reset(tsd, arena); + arena_decay(tsd_tsdn(tsd), arena, false, true); + ctl_darena = arenas_i(MALLCTL_ARENAS_DESTROYED); + ctl_darena->initialized = true; + ctl_arena_refresh(tsd_tsdn(tsd), arena, ctl_darena, arena_ind, true); + /* Destroy arena. */ + arena_destroy(tsd, arena); + ctl_arena = arenas_i(arena_ind); + ctl_arena->initialized = false; + /* Record arena index for later recycling via arenas.create. */ + ql_elm_new(ctl_arena, destroyed_link); + ql_tail_insert(&ctl_arenas->destroyed, ctl_arena, destroyed_link); + arena_reset_finish_background_thread(tsd, arena_ind); + + assert(ret == 0); +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + + return ret; +} + +static int +arena_i_dss_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + const char *dss = NULL; + unsigned arena_ind; + dss_prec_t dss_prec_old = dss_prec_limit; + dss_prec_t dss_prec = dss_prec_limit; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + WRITE(dss, const char *); + MIB_UNSIGNED(arena_ind, 1); + if (dss != NULL) { + int i; + bool match = false; + + for (i = 0; i < dss_prec_limit; i++) { + if (strcmp(dss_prec_names[i], dss) == 0) { + dss_prec = i; + match = true; + break; + } + } + + if (!match) { + ret = EINVAL; + goto label_return; + } + } + + /* + * Access via index narenas is deprecated, and scheduled for removal in + * 6.0.0. + */ + if (arena_ind == MALLCTL_ARENAS_ALL || arena_ind == + ctl_arenas->narenas) { + if (dss_prec != dss_prec_limit && + extent_dss_prec_set(dss_prec)) { + ret = EFAULT; + goto label_return; + } + dss_prec_old = extent_dss_prec_get(); + } else { + arena_t *arena = arena_get(tsd_tsdn(tsd), arena_ind, false); + if (arena == NULL || (dss_prec != dss_prec_limit && + arena_dss_prec_set(arena, dss_prec))) { + ret = EFAULT; + goto label_return; + } + dss_prec_old = arena_dss_prec_get(arena); + } + + dss = dss_prec_names[dss_prec_old]; + READ(dss, const char *); + + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +arena_i_oversize_threshold_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + unsigned arena_ind; + MIB_UNSIGNED(arena_ind, 1); + + arena_t *arena = arena_get(tsd_tsdn(tsd), arena_ind, false); + if (arena == NULL) { + ret = EFAULT; + goto label_return; + } + + if (oldp != NULL && oldlenp != NULL) { + size_t oldval = atomic_load_zu( + &arena->pa_shard.pac.oversize_threshold, ATOMIC_RELAXED); + READ(oldval, size_t); + } + if (newp != NULL) { + if (newlen != sizeof(size_t)) { + ret = EINVAL; + goto label_return; + } + atomic_store_zu(&arena->pa_shard.pac.oversize_threshold, + *(size_t *)newp, ATOMIC_RELAXED); + } + ret = 0; +label_return: + return ret; +} + +static int +arena_i_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen, bool dirty) { + int ret; + unsigned arena_ind; + arena_t *arena; + + MIB_UNSIGNED(arena_ind, 1); + arena = arena_get(tsd_tsdn(tsd), arena_ind, false); + if (arena == NULL) { + ret = EFAULT; + goto label_return; + } + extent_state_t state = dirty ? extent_state_dirty : extent_state_muzzy; + + if (oldp != NULL && oldlenp != NULL) { + size_t oldval = arena_decay_ms_get(arena, state); + READ(oldval, ssize_t); + } + if (newp != NULL) { + if (newlen != sizeof(ssize_t)) { + ret = EINVAL; + goto label_return; + } + if (arena_is_huge(arena_ind) && *(ssize_t *)newp > 0) { + /* + * By default the huge arena purges eagerly. If it is + * set to non-zero decay time afterwards, background + * thread might be needed. + */ + if (background_thread_create(tsd, arena_ind)) { + ret = EFAULT; + goto label_return; + } + } + + if (arena_decay_ms_set(tsd_tsdn(tsd), arena, state, + *(ssize_t *)newp)) { + ret = EFAULT; + goto label_return; + } + } + + ret = 0; +label_return: + return ret; +} + +static int +arena_i_dirty_decay_ms_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + return arena_i_decay_ms_ctl_impl(tsd, mib, miblen, oldp, oldlenp, newp, + newlen, true); +} + +static int +arena_i_muzzy_decay_ms_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + return arena_i_decay_ms_ctl_impl(tsd, mib, miblen, oldp, oldlenp, newp, + newlen, false); +} + +static int +arena_i_extent_hooks_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + arena_t *arena; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + MIB_UNSIGNED(arena_ind, 1); + if (arena_ind < narenas_total_get()) { + extent_hooks_t *old_extent_hooks; + arena = arena_get(tsd_tsdn(tsd), arena_ind, false); + if (arena == NULL) { + if (arena_ind >= narenas_auto) { + ret = EFAULT; + goto label_return; + } + old_extent_hooks = + (extent_hooks_t *)&ehooks_default_extent_hooks; + READ(old_extent_hooks, extent_hooks_t *); + if (newp != NULL) { + /* Initialize a new arena as a side effect. */ + extent_hooks_t *new_extent_hooks + JEMALLOC_CC_SILENCE_INIT(NULL); + WRITE(new_extent_hooks, extent_hooks_t *); + arena_config_t config = arena_config_default; + config.extent_hooks = new_extent_hooks; + + arena = arena_init(tsd_tsdn(tsd), arena_ind, + &config); + if (arena == NULL) { + ret = EFAULT; + goto label_return; + } + } + } else { + if (newp != NULL) { + extent_hooks_t *new_extent_hooks + JEMALLOC_CC_SILENCE_INIT(NULL); + WRITE(new_extent_hooks, extent_hooks_t *); + old_extent_hooks = arena_set_extent_hooks(tsd, + arena, new_extent_hooks); + READ(old_extent_hooks, extent_hooks_t *); + } else { + old_extent_hooks = + ehooks_get_extent_hooks_ptr( + arena_get_ehooks(arena)); + READ(old_extent_hooks, extent_hooks_t *); + } + } + } else { + ret = EFAULT; + goto label_return; + } + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +arena_i_retain_grow_limit_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + unsigned arena_ind; + arena_t *arena; + + if (!opt_retain) { + /* Only relevant when retain is enabled. */ + return ENOENT; + } + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + MIB_UNSIGNED(arena_ind, 1); + if (arena_ind < narenas_total_get() && (arena = + arena_get(tsd_tsdn(tsd), arena_ind, false)) != NULL) { + size_t old_limit, new_limit; + if (newp != NULL) { + WRITE(new_limit, size_t); + } + bool err = arena_retain_grow_limit_get_set(tsd, arena, + &old_limit, newp != NULL ? &new_limit : NULL); + if (!err) { + READ(old_limit, size_t); + ret = 0; + } else { + ret = EFAULT; + } + } else { + ret = EFAULT; + } +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static const ctl_named_node_t * +arena_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, + size_t i) { + const ctl_named_node_t *ret; + + malloc_mutex_lock(tsdn, &ctl_mtx); + switch (i) { + case MALLCTL_ARENAS_ALL: + case MALLCTL_ARENAS_DESTROYED: + break; + default: + if (i > ctl_arenas->narenas) { + ret = NULL; + goto label_return; + } + break; + } + + ret = super_arena_i_node; +label_return: + malloc_mutex_unlock(tsdn, &ctl_mtx); + return ret; +} + +/******************************************************************************/ + +static int +arenas_narenas_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned narenas; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + READONLY(); + narenas = ctl_arenas->narenas; + READ(narenas, unsigned); + + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +arenas_decay_ms_ctl_impl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen, bool dirty) { + int ret; + + if (oldp != NULL && oldlenp != NULL) { + size_t oldval = (dirty ? arena_dirty_decay_ms_default_get() : + arena_muzzy_decay_ms_default_get()); + READ(oldval, ssize_t); + } + if (newp != NULL) { + if (newlen != sizeof(ssize_t)) { + ret = EINVAL; + goto label_return; + } + if (dirty ? arena_dirty_decay_ms_default_set(*(ssize_t *)newp) + : arena_muzzy_decay_ms_default_set(*(ssize_t *)newp)) { + ret = EFAULT; + goto label_return; + } + } + + ret = 0; +label_return: + return ret; +} + +static int +arenas_dirty_decay_ms_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + return arenas_decay_ms_ctl_impl(tsd, mib, miblen, oldp, oldlenp, newp, + newlen, true); +} + +static int +arenas_muzzy_decay_ms_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + return arenas_decay_ms_ctl_impl(tsd, mib, miblen, oldp, oldlenp, newp, + newlen, false); +} + +CTL_RO_NL_GEN(arenas_quantum, QUANTUM, size_t) +CTL_RO_NL_GEN(arenas_page, PAGE, size_t) +CTL_RO_NL_GEN(arenas_tcache_max, tcache_maxclass, size_t) +CTL_RO_NL_GEN(arenas_nbins, SC_NBINS, unsigned) +CTL_RO_NL_GEN(arenas_nhbins, nhbins, unsigned) +CTL_RO_NL_GEN(arenas_bin_i_size, bin_infos[mib[2]].reg_size, size_t) +CTL_RO_NL_GEN(arenas_bin_i_nregs, bin_infos[mib[2]].nregs, uint32_t) +CTL_RO_NL_GEN(arenas_bin_i_slab_size, bin_infos[mib[2]].slab_size, size_t) +CTL_RO_NL_GEN(arenas_bin_i_nshards, bin_infos[mib[2]].n_shards, uint32_t) +static const ctl_named_node_t * +arenas_bin_i_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t i) { + if (i > SC_NBINS) { + return NULL; + } + return super_arenas_bin_i_node; +} + +CTL_RO_NL_GEN(arenas_nlextents, SC_NSIZES - SC_NBINS, unsigned) +CTL_RO_NL_GEN(arenas_lextent_i_size, sz_index2size(SC_NBINS+(szind_t)mib[2]), + size_t) +static const ctl_named_node_t * +arenas_lextent_i_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t i) { + if (i > SC_NSIZES - SC_NBINS) { + return NULL; + } + return super_arenas_lextent_i_node; +} + +static int +arenas_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + + VERIFY_READ(unsigned); + arena_config_t config = arena_config_default; + WRITE(config.extent_hooks, extent_hooks_t *); + if ((arena_ind = ctl_arena_init(tsd, &config)) == UINT_MAX) { + ret = EAGAIN; + goto label_return; + } + READ(arena_ind, unsigned); + + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +experimental_arenas_create_ext_ctl(tsd_t *tsd, + const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned arena_ind; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + + arena_config_t config = arena_config_default; + VERIFY_READ(unsigned); + WRITE(config, arena_config_t); + + if ((arena_ind = ctl_arena_init(tsd, &config)) == UINT_MAX) { + ret = EAGAIN; + goto label_return; + } + READ(arena_ind, unsigned); + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +arenas_lookup_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + unsigned arena_ind; + void *ptr; + edata_t *edata; + arena_t *arena; + + ptr = NULL; + ret = EINVAL; + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + WRITE(ptr, void *); + edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr); + if (edata == NULL) { + goto label_return; + } + + arena = arena_get_from_edata(edata); + if (arena == NULL) { + goto label_return; + } + + arena_ind = arena_ind_get(arena); + READ(arena_ind, unsigned); + + ret = 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +/******************************************************************************/ + +static int +prof_thread_active_init_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + bool oldval; + + if (!config_prof) { + return ENOENT; + } + + if (newp != NULL) { + if (!opt_prof) { + ret = ENOENT; + goto label_return; + } + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + oldval = prof_thread_active_init_set(tsd_tsdn(tsd), + *(bool *)newp); + } else { + oldval = opt_prof ? prof_thread_active_init_get(tsd_tsdn(tsd)) : + false; + } + READ(oldval, bool); + + ret = 0; +label_return: + return ret; +} + +static int +prof_active_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + bool oldval; + + if (!config_prof) { + ret = ENOENT; + goto label_return; + } + + if (newp != NULL) { + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + bool val = *(bool *)newp; + if (!opt_prof) { + if (val) { + ret = ENOENT; + goto label_return; + } else { + /* No change needed (already off). */ + oldval = false; + } + } else { + oldval = prof_active_set(tsd_tsdn(tsd), val); + } + } else { + oldval = opt_prof ? prof_active_get(tsd_tsdn(tsd)) : false; + } + READ(oldval, bool); + + ret = 0; +label_return: + return ret; +} + +static int +prof_dump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + const char *filename = NULL; + + if (!config_prof || !opt_prof) { + return ENOENT; + } + + WRITEONLY(); + WRITE(filename, const char *); + + if (prof_mdump(tsd, filename)) { + ret = EFAULT; + goto label_return; + } + + ret = 0; +label_return: + return ret; +} + +static int +prof_gdump_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + bool oldval; + + if (!config_prof) { + return ENOENT; + } + + if (newp != NULL) { + if (!opt_prof) { + ret = ENOENT; + goto label_return; + } + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + oldval = prof_gdump_set(tsd_tsdn(tsd), *(bool *)newp); + } else { + oldval = opt_prof ? prof_gdump_get(tsd_tsdn(tsd)) : false; + } + READ(oldval, bool); + + ret = 0; +label_return: + return ret; +} + +static int +prof_prefix_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + const char *prefix = NULL; + + if (!config_prof || !opt_prof) { + return ENOENT; + } + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + WRITEONLY(); + WRITE(prefix, const char *); + + ret = prof_prefix_set(tsd_tsdn(tsd), prefix) ? EFAULT : 0; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +prof_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + size_t lg_sample = lg_prof_sample; + + if (!config_prof || !opt_prof) { + return ENOENT; + } + + WRITEONLY(); + WRITE(lg_sample, size_t); + if (lg_sample >= (sizeof(uint64_t) << 3)) { + lg_sample = (sizeof(uint64_t) << 3) - 1; + } + + prof_reset(tsd, lg_sample); + + ret = 0; +label_return: + return ret; +} + +CTL_RO_NL_CGEN(config_prof, prof_interval, prof_interval, uint64_t) +CTL_RO_NL_CGEN(config_prof, lg_prof_sample, lg_prof_sample, size_t) + +static int +prof_log_start_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + const char *filename = NULL; + + if (!config_prof || !opt_prof) { + return ENOENT; + } + + WRITEONLY(); + WRITE(filename, const char *); + + if (prof_log_start(tsd_tsdn(tsd), filename)) { + ret = EFAULT; + goto label_return; + } + + ret = 0; +label_return: + return ret; +} + +static int +prof_log_stop_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) { + if (!config_prof || !opt_prof) { + return ENOENT; + } + + if (prof_log_stop(tsd_tsdn(tsd))) { + return EFAULT; + } + + return 0; +} + +static int +experimental_hooks_prof_backtrace_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + if (oldp == NULL && newp == NULL) { + ret = EINVAL; + goto label_return; + } + if (oldp != NULL) { + prof_backtrace_hook_t old_hook = + prof_backtrace_hook_get(); + READ(old_hook, prof_backtrace_hook_t); + } + if (newp != NULL) { + if (!opt_prof) { + ret = ENOENT; + goto label_return; + } + prof_backtrace_hook_t new_hook JEMALLOC_CC_SILENCE_INIT(NULL); + WRITE(new_hook, prof_backtrace_hook_t); + if (new_hook == NULL) { + ret = EINVAL; + goto label_return; + } + prof_backtrace_hook_set(new_hook); + } + ret = 0; +label_return: + return ret; +} + +static int +experimental_hooks_prof_dump_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + if (oldp == NULL && newp == NULL) { + ret = EINVAL; + goto label_return; + } + if (oldp != NULL) { + prof_dump_hook_t old_hook = + prof_dump_hook_get(); + READ(old_hook, prof_dump_hook_t); + } + if (newp != NULL) { + if (!opt_prof) { + ret = ENOENT; + goto label_return; + } + prof_dump_hook_t new_hook JEMALLOC_CC_SILENCE_INIT(NULL); + WRITE(new_hook, prof_dump_hook_t); + prof_dump_hook_set(new_hook); + } + ret = 0; +label_return: + return ret; +} + +/* For integration test purpose only. No plan to move out of experimental. */ +static int +experimental_hooks_safety_check_abort_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + WRITEONLY(); + if (newp != NULL) { + if (newlen != sizeof(safety_check_abort_hook_t)) { + ret = EINVAL; + goto label_return; + } + safety_check_abort_hook_t hook JEMALLOC_CC_SILENCE_INIT(NULL); + WRITE(hook, safety_check_abort_hook_t); + safety_check_set_abort(hook); + } + ret = 0; +label_return: + return ret; +} + +/******************************************************************************/ + +CTL_RO_CGEN(config_stats, stats_allocated, ctl_stats->allocated, size_t) +CTL_RO_CGEN(config_stats, stats_active, ctl_stats->active, size_t) +CTL_RO_CGEN(config_stats, stats_metadata, ctl_stats->metadata, size_t) +CTL_RO_CGEN(config_stats, stats_metadata_thp, ctl_stats->metadata_thp, size_t) +CTL_RO_CGEN(config_stats, stats_resident, ctl_stats->resident, size_t) +CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats->mapped, size_t) +CTL_RO_CGEN(config_stats, stats_retained, ctl_stats->retained, size_t) + +CTL_RO_CGEN(config_stats, stats_background_thread_num_threads, + ctl_stats->background_thread.num_threads, size_t) +CTL_RO_CGEN(config_stats, stats_background_thread_num_runs, + ctl_stats->background_thread.num_runs, uint64_t) +CTL_RO_CGEN(config_stats, stats_background_thread_run_interval, + nstime_ns(&ctl_stats->background_thread.run_interval), uint64_t) + +CTL_RO_CGEN(config_stats, stats_zero_reallocs, + atomic_load_zu(&zero_realloc_count, ATOMIC_RELAXED), size_t) + +CTL_RO_GEN(stats_arenas_i_dss, arenas_i(mib[2])->dss, const char *) +CTL_RO_GEN(stats_arenas_i_dirty_decay_ms, arenas_i(mib[2])->dirty_decay_ms, + ssize_t) +CTL_RO_GEN(stats_arenas_i_muzzy_decay_ms, arenas_i(mib[2])->muzzy_decay_ms, + ssize_t) +CTL_RO_GEN(stats_arenas_i_nthreads, arenas_i(mib[2])->nthreads, unsigned) +CTL_RO_GEN(stats_arenas_i_uptime, + nstime_ns(&arenas_i(mib[2])->astats->astats.uptime), uint64_t) +CTL_RO_GEN(stats_arenas_i_pactive, arenas_i(mib[2])->pactive, size_t) +CTL_RO_GEN(stats_arenas_i_pdirty, arenas_i(mib[2])->pdirty, size_t) +CTL_RO_GEN(stats_arenas_i_pmuzzy, arenas_i(mib[2])->pmuzzy, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_mapped, + arenas_i(mib[2])->astats->astats.mapped, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_retained, + arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.retained, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_extent_avail, + arenas_i(mib[2])->astats->astats.pa_shard_stats.edata_avail, size_t) + +CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_npurge, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.decay_dirty.npurge), + uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_nmadvise, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.decay_dirty.nmadvise), + uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_dirty_purged, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.decay_dirty.purged), + uint64_t) + +CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_npurge, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.decay_muzzy.npurge), + uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_nmadvise, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.decay_muzzy.nmadvise), + uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_muzzy_purged, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.decay_muzzy.purged), + uint64_t) + +CTL_RO_CGEN(config_stats, stats_arenas_i_base, + arenas_i(mib[2])->astats->astats.base, + size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_internal, + atomic_load_zu(&arenas_i(mib[2])->astats->astats.internal, ATOMIC_RELAXED), + size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_metadata_thp, + arenas_i(mib[2])->astats->astats.metadata_thp, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_tcache_bytes, + arenas_i(mib[2])->astats->astats.tcache_bytes, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_tcache_stashed_bytes, + arenas_i(mib[2])->astats->astats.tcache_stashed_bytes, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_resident, + arenas_i(mib[2])->astats->astats.resident, + size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_abandoned_vm, + atomic_load_zu( + &arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.abandoned_vm, + ATOMIC_RELAXED), size_t) + +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_sec_bytes, + arenas_i(mib[2])->astats->secstats.bytes, size_t) + +CTL_RO_CGEN(config_stats, stats_arenas_i_small_allocated, + arenas_i(mib[2])->astats->allocated_small, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_small_nmalloc, + arenas_i(mib[2])->astats->nmalloc_small, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_small_ndalloc, + arenas_i(mib[2])->astats->ndalloc_small, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_small_nrequests, + arenas_i(mib[2])->astats->nrequests_small, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_small_nfills, + arenas_i(mib[2])->astats->nfills_small, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_small_nflushes, + arenas_i(mib[2])->astats->nflushes_small, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_large_allocated, + arenas_i(mib[2])->astats->astats.allocated_large, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_large_nmalloc, + arenas_i(mib[2])->astats->astats.nmalloc_large, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_large_ndalloc, + arenas_i(mib[2])->astats->astats.ndalloc_large, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_large_nrequests, + arenas_i(mib[2])->astats->astats.nrequests_large, uint64_t) +/* + * Note: "nmalloc_large" here instead of "nfills" in the read. This is + * intentional (large has no batch fill). + */ +CTL_RO_CGEN(config_stats, stats_arenas_i_large_nfills, + arenas_i(mib[2])->astats->astats.nmalloc_large, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_large_nflushes, + arenas_i(mib[2])->astats->astats.nflushes_large, uint64_t) + +/* Lock profiling related APIs below. */ +#define RO_MUTEX_CTL_GEN(n, l) \ +CTL_RO_CGEN(config_stats, stats_##n##_num_ops, \ + l.n_lock_ops, uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_num_wait, \ + l.n_wait_times, uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_num_spin_acq, \ + l.n_spin_acquired, uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_num_owner_switch, \ + l.n_owner_switches, uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_total_wait_time, \ + nstime_ns(&l.tot_wait_time), uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_max_wait_time, \ + nstime_ns(&l.max_wait_time), uint64_t) \ +CTL_RO_CGEN(config_stats, stats_##n##_max_num_thds, \ + l.max_n_thds, uint32_t) + +/* Global mutexes. */ +#define OP(mtx) \ + RO_MUTEX_CTL_GEN(mutexes_##mtx, \ + ctl_stats->mutex_prof_data[global_prof_mutex_##mtx]) +MUTEX_PROF_GLOBAL_MUTEXES +#undef OP + +/* Per arena mutexes */ +#define OP(mtx) RO_MUTEX_CTL_GEN(arenas_i_mutexes_##mtx, \ + arenas_i(mib[2])->astats->astats.mutex_prof_data[arena_prof_mutex_##mtx]) +MUTEX_PROF_ARENA_MUTEXES +#undef OP + +/* tcache bin mutex */ +RO_MUTEX_CTL_GEN(arenas_i_bins_j_mutex, + arenas_i(mib[2])->astats->bstats[mib[4]].mutex_data) +#undef RO_MUTEX_CTL_GEN + +/* Resets all mutex stats, including global, arena and bin mutexes. */ +static int +stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) { + if (!config_stats) { + return ENOENT; + } + + tsdn_t *tsdn = tsd_tsdn(tsd); + +#define MUTEX_PROF_RESET(mtx) \ + malloc_mutex_lock(tsdn, &mtx); \ + malloc_mutex_prof_data_reset(tsdn, &mtx); \ + malloc_mutex_unlock(tsdn, &mtx); + + /* Global mutexes: ctl and prof. */ + MUTEX_PROF_RESET(ctl_mtx); + if (have_background_thread) { + MUTEX_PROF_RESET(background_thread_lock); + } + if (config_prof && opt_prof) { + MUTEX_PROF_RESET(bt2gctx_mtx); + MUTEX_PROF_RESET(tdatas_mtx); + MUTEX_PROF_RESET(prof_dump_mtx); + MUTEX_PROF_RESET(prof_recent_alloc_mtx); + MUTEX_PROF_RESET(prof_recent_dump_mtx); + MUTEX_PROF_RESET(prof_stats_mtx); + } + + /* Per arena mutexes. */ + unsigned n = narenas_total_get(); + + for (unsigned i = 0; i < n; i++) { + arena_t *arena = arena_get(tsdn, i, false); + if (!arena) { + continue; + } + MUTEX_PROF_RESET(arena->large_mtx); + MUTEX_PROF_RESET(arena->pa_shard.edata_cache.mtx); + MUTEX_PROF_RESET(arena->pa_shard.pac.ecache_dirty.mtx); + MUTEX_PROF_RESET(arena->pa_shard.pac.ecache_muzzy.mtx); + MUTEX_PROF_RESET(arena->pa_shard.pac.ecache_retained.mtx); + MUTEX_PROF_RESET(arena->pa_shard.pac.decay_dirty.mtx); + MUTEX_PROF_RESET(arena->pa_shard.pac.decay_muzzy.mtx); + MUTEX_PROF_RESET(arena->tcache_ql_mtx); + MUTEX_PROF_RESET(arena->base->mtx); + + for (szind_t j = 0; j < SC_NBINS; j++) { + for (unsigned k = 0; k < bin_infos[j].n_shards; k++) { + bin_t *bin = arena_get_bin(arena, j, k); + MUTEX_PROF_RESET(bin->lock); + } + } + } +#undef MUTEX_PROF_RESET + return 0; +} + +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nmalloc, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nmalloc, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_ndalloc, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.ndalloc, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nrequests, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nrequests, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_curregs, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.curregs, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nfills, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nfills, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nflushes, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nflushes, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nslabs, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nslabs, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nreslabs, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.reslabs, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_curslabs, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.curslabs, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_bins_j_nonfull_slabs, + arenas_i(mib[2])->astats->bstats[mib[4]].stats_data.nonfull_slabs, size_t) + +static const ctl_named_node_t * +stats_arenas_i_bins_j_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t j) { + if (j > SC_NBINS) { + return NULL; + } + return super_stats_arenas_i_bins_j_node; +} + +CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_nmalloc, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->lstats[mib[4]].nmalloc), uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_ndalloc, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->lstats[mib[4]].ndalloc), uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_nrequests, + locked_read_u64_unsynchronized( + &arenas_i(mib[2])->astats->lstats[mib[4]].nrequests), uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_lextents_j_curlextents, + arenas_i(mib[2])->astats->lstats[mib[4]].curlextents, size_t) + +static const ctl_named_node_t * +stats_arenas_i_lextents_j_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t j) { + if (j > SC_NSIZES - SC_NBINS) { + return NULL; + } + return super_stats_arenas_i_lextents_j_node; +} + +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_ndirty, + arenas_i(mib[2])->astats->estats[mib[4]].ndirty, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_nmuzzy, + arenas_i(mib[2])->astats->estats[mib[4]].nmuzzy, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_nretained, + arenas_i(mib[2])->astats->estats[mib[4]].nretained, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_dirty_bytes, + arenas_i(mib[2])->astats->estats[mib[4]].dirty_bytes, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_muzzy_bytes, + arenas_i(mib[2])->astats->estats[mib[4]].muzzy_bytes, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_retained_bytes, + arenas_i(mib[2])->astats->estats[mib[4]].retained_bytes, size_t); + +static const ctl_named_node_t * +stats_arenas_i_extents_j_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t j) { + if (j >= SC_NPSIZES) { + return NULL; + } + return super_stats_arenas_i_extents_j_node; +} + +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_npurge_passes, + arenas_i(mib[2])->astats->hpastats.nonderived_stats.npurge_passes, uint64_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_npurges, + arenas_i(mib[2])->astats->hpastats.nonderived_stats.npurges, uint64_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nhugifies, + arenas_i(mib[2])->astats->hpastats.nonderived_stats.nhugifies, uint64_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_ndehugifies, + arenas_i(mib[2])->astats->hpastats.nonderived_stats.ndehugifies, uint64_t); + +/* Full, nonhuge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_npageslabs_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs[0].npageslabs, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_nactive_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs[0].nactive, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_ndirty_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs[0].ndirty, size_t); + +/* Full, huge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_npageslabs_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs[1].npageslabs, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_nactive_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs[1].nactive, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_full_slabs_ndirty_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.full_slabs[1].ndirty, size_t); + +/* Empty, nonhuge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_empty_slabs_npageslabs_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.empty_slabs[0].npageslabs, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_empty_slabs_nactive_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.empty_slabs[0].nactive, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_empty_slabs_ndirty_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.empty_slabs[0].ndirty, size_t); + +/* Empty, huge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_empty_slabs_npageslabs_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.empty_slabs[1].npageslabs, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_empty_slabs_nactive_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.empty_slabs[1].nactive, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_empty_slabs_ndirty_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.empty_slabs[1].ndirty, size_t); + +/* Nonfull, nonhuge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]][0].npageslabs, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]][0].nactive, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_ndirty_nonhuge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]][0].ndirty, + size_t); + +/* Nonfull, huge */ +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_npageslabs_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]][1].npageslabs, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_nactive_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]][1].nactive, + size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nonfull_slabs_j_ndirty_huge, + arenas_i(mib[2])->astats->hpastats.psset_stats.nonfull_slabs[mib[5]][1].ndirty, + size_t); + +static const ctl_named_node_t * +stats_arenas_i_hpa_shard_nonfull_slabs_j_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t j) { + if (j >= PSSET_NPSIZES) { + return NULL; + } + return super_stats_arenas_i_hpa_shard_nonfull_slabs_j_node; +} + +static bool +ctl_arenas_i_verify(size_t i) { + size_t a = arenas_i2a_impl(i, true, true); + if (a == UINT_MAX || !ctl_arenas->arenas[a]->initialized) { + return true; + } + + return false; +} + +static const ctl_named_node_t * +stats_arenas_i_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t i) { + const ctl_named_node_t *ret; + + malloc_mutex_lock(tsdn, &ctl_mtx); + if (ctl_arenas_i_verify(i)) { + ret = NULL; + goto label_return; + } + + ret = super_stats_arenas_i_node; +label_return: + malloc_mutex_unlock(tsdn, &ctl_mtx); + return ret; +} + +static int +experimental_hooks_install_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + if (oldp == NULL || oldlenp == NULL|| newp == NULL) { + ret = EINVAL; + goto label_return; + } + /* + * Note: this is a *private* struct. This is an experimental interface; + * forcing the user to know the jemalloc internals well enough to + * extract the ABI hopefully ensures nobody gets too comfortable with + * this API, which can change at a moment's notice. + */ + hooks_t hooks; + WRITE(hooks, hooks_t); + void *handle = hook_install(tsd_tsdn(tsd), &hooks); + if (handle == NULL) { + ret = EAGAIN; + goto label_return; + } + READ(handle, void *); + + ret = 0; +label_return: + return ret; +} + +static int +experimental_hooks_remove_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + WRITEONLY(); + void *handle = NULL; + WRITE(handle, void *); + if (handle == NULL) { + ret = EINVAL; + goto label_return; + } + hook_remove(tsd_tsdn(tsd), handle); + ret = 0; +label_return: + return ret; +} + +static int +experimental_thread_activity_callback_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + if (!config_stats) { + return ENOENT; + } + + activity_callback_thunk_t t_old = tsd_activity_callback_thunk_get(tsd); + READ(t_old, activity_callback_thunk_t); + + if (newp != NULL) { + /* + * This initialization is unnecessary. If it's omitted, though, + * clang gets confused and warns on the subsequent use of t_new. + */ + activity_callback_thunk_t t_new = {NULL, NULL}; + WRITE(t_new, activity_callback_thunk_t); + tsd_activity_callback_thunk_set(tsd, t_new); + } + ret = 0; +label_return: + return ret; +} + +/* + * Output six memory utilization entries for an input pointer, the first one of + * type (void *) and the remaining five of type size_t, describing the following + * (in the same order): + * + * (a) memory address of the extent a potential reallocation would go into, + * == the five fields below describe about the extent the pointer resides in == + * (b) number of free regions in the extent, + * (c) number of regions in the extent, + * (d) size of the extent in terms of bytes, + * (e) total number of free regions in the bin the extent belongs to, and + * (f) total number of regions in the bin the extent belongs to. + * + * Note that "(e)" and "(f)" are only available when stats are enabled; + * otherwise their values are undefined. + * + * This API is mainly intended for small class allocations, where extents are + * used as slab. Note that if the bin the extent belongs to is completely + * full, "(a)" will be NULL. + * + * In case of large class allocations, "(a)" will be NULL, and "(e)" and "(f)" + * will be zero (if stats are enabled; otherwise undefined). The other three + * fields will be properly set though the values are trivial: "(b)" will be 0, + * "(c)" will be 1, and "(d)" will be the usable size. + * + * The input pointer and size are respectively passed in by newp and newlen, + * and the output fields and size are respectively oldp and *oldlenp. + * + * It can be beneficial to define the following macros to make it easier to + * access the output: + * + * #define SLABCUR_READ(out) (*(void **)out) + * #define COUNTS(out) ((size_t *)((void **)out + 1)) + * #define NFREE_READ(out) COUNTS(out)[0] + * #define NREGS_READ(out) COUNTS(out)[1] + * #define SIZE_READ(out) COUNTS(out)[2] + * #define BIN_NFREE_READ(out) COUNTS(out)[3] + * #define BIN_NREGS_READ(out) COUNTS(out)[4] + * + * and then write e.g. NFREE_READ(oldp) to fetch the output. See the unit test + * test_query in test/unit/extent_util.c for an example. + * + * For a typical defragmentation workflow making use of this API for + * understanding the fragmentation level, please refer to the comment for + * experimental_utilization_batch_query_ctl. + * + * It's up to the application how to determine the significance of + * fragmentation relying on the outputs returned. Possible choices are: + * + * (a) if extent utilization ratio is below certain threshold, + * (b) if extent memory consumption is above certain threshold, + * (c) if extent utilization ratio is significantly below bin utilization ratio, + * (d) if input pointer deviates a lot from potential reallocation address, or + * (e) some selection/combination of the above. + * + * The caller needs to make sure that the input/output arguments are valid, + * in particular, that the size of the output is correct, i.e.: + * + * *oldlenp = sizeof(void *) + sizeof(size_t) * 5 + * + * Otherwise, the function immediately returns EINVAL without touching anything. + * + * In the rare case where there's no associated extent found for the input + * pointer, the function zeros out all output fields and return. Please refer + * to the comment for experimental_utilization_batch_query_ctl to understand the + * motivation from C++. + */ +static int +experimental_utilization_query_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + assert(sizeof(inspect_extent_util_stats_verbose_t) + == sizeof(void *) + sizeof(size_t) * 5); + + if (oldp == NULL || oldlenp == NULL + || *oldlenp != sizeof(inspect_extent_util_stats_verbose_t) + || newp == NULL) { + ret = EINVAL; + goto label_return; + } + + void *ptr = NULL; + WRITE(ptr, void *); + inspect_extent_util_stats_verbose_t *util_stats + = (inspect_extent_util_stats_verbose_t *)oldp; + inspect_extent_util_stats_verbose_get(tsd_tsdn(tsd), ptr, + &util_stats->nfree, &util_stats->nregs, &util_stats->size, + &util_stats->bin_nfree, &util_stats->bin_nregs, + &util_stats->slabcur_addr); + ret = 0; + +label_return: + return ret; +} + +/* + * Given an input array of pointers, output three memory utilization entries of + * type size_t for each input pointer about the extent it resides in: + * + * (a) number of free regions in the extent, + * (b) number of regions in the extent, and + * (c) size of the extent in terms of bytes. + * + * This API is mainly intended for small class allocations, where extents are + * used as slab. In case of large class allocations, the outputs are trivial: + * "(a)" will be 0, "(b)" will be 1, and "(c)" will be the usable size. + * + * Note that multiple input pointers may reside on a same extent so the output + * fields may contain duplicates. + * + * The format of the input/output looks like: + * + * input[0]: 1st_pointer_to_query | output[0]: 1st_extent_n_free_regions + * | output[1]: 1st_extent_n_regions + * | output[2]: 1st_extent_size + * input[1]: 2nd_pointer_to_query | output[3]: 2nd_extent_n_free_regions + * | output[4]: 2nd_extent_n_regions + * | output[5]: 2nd_extent_size + * ... | ... + * + * The input array and size are respectively passed in by newp and newlen, and + * the output array and size are respectively oldp and *oldlenp. + * + * It can be beneficial to define the following macros to make it easier to + * access the output: + * + * #define NFREE_READ(out, i) out[(i) * 3] + * #define NREGS_READ(out, i) out[(i) * 3 + 1] + * #define SIZE_READ(out, i) out[(i) * 3 + 2] + * + * and then write e.g. NFREE_READ(oldp, i) to fetch the output. See the unit + * test test_batch in test/unit/extent_util.c for a concrete example. + * + * A typical workflow would be composed of the following steps: + * + * (1) flush tcache: mallctl("thread.tcache.flush", ...) + * (2) initialize input array of pointers to query fragmentation + * (3) allocate output array to hold utilization statistics + * (4) query utilization: mallctl("experimental.utilization.batch_query", ...) + * (5) (optional) decide if it's worthwhile to defragment; otherwise stop here + * (6) disable tcache: mallctl("thread.tcache.enabled", ...) + * (7) defragment allocations with significant fragmentation, e.g.: + * for each allocation { + * if it's fragmented { + * malloc(...); + * memcpy(...); + * free(...); + * } + * } + * (8) enable tcache: mallctl("thread.tcache.enabled", ...) + * + * The application can determine the significance of fragmentation themselves + * relying on the statistics returned, both at the overall level i.e. step "(5)" + * and at individual allocation level i.e. within step "(7)". Possible choices + * are: + * + * (a) whether memory utilization ratio is below certain threshold, + * (b) whether memory consumption is above certain threshold, or + * (c) some combination of the two. + * + * The caller needs to make sure that the input/output arrays are valid and + * their sizes are proper as well as matched, meaning: + * + * (a) newlen = n_pointers * sizeof(const void *) + * (b) *oldlenp = n_pointers * sizeof(size_t) * 3 + * (c) n_pointers > 0 + * + * Otherwise, the function immediately returns EINVAL without touching anything. + * + * In the rare case where there's no associated extent found for some pointers, + * rather than immediately terminating the computation and raising an error, + * the function simply zeros out the corresponding output fields and continues + * the computation until all input pointers are handled. The motivations of + * such a design are as follows: + * + * (a) The function always either processes nothing or processes everything, and + * never leaves the output half touched and half untouched. + * + * (b) It facilitates usage needs especially common in C++. A vast variety of + * C++ objects are instantiated with multiple dynamic memory allocations. For + * example, std::string and std::vector typically use at least two allocations, + * one for the metadata and one for the actual content. Other types may use + * even more allocations. When inquiring about utilization statistics, the + * caller often wants to examine into all such allocations, especially internal + * one(s), rather than just the topmost one. The issue comes when some + * implementations do certain optimizations to reduce/aggregate some internal + * allocations, e.g. putting short strings directly into the metadata, and such + * decisions are not known to the caller. Therefore, we permit pointers to + * memory usages that may not be returned by previous malloc calls, and we + * provide the caller a convenient way to identify such cases. + */ +static int +experimental_utilization_batch_query_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + assert(sizeof(inspect_extent_util_stats_t) == sizeof(size_t) * 3); + + const size_t len = newlen / sizeof(const void *); + if (oldp == NULL || oldlenp == NULL || newp == NULL || newlen == 0 + || newlen != len * sizeof(const void *) + || *oldlenp != len * sizeof(inspect_extent_util_stats_t)) { + ret = EINVAL; + goto label_return; + } + + void **ptrs = (void **)newp; + inspect_extent_util_stats_t *util_stats = + (inspect_extent_util_stats_t *)oldp; + size_t i; + for (i = 0; i < len; ++i) { + inspect_extent_util_stats_get(tsd_tsdn(tsd), ptrs[i], + &util_stats[i].nfree, &util_stats[i].nregs, + &util_stats[i].size); + } + ret = 0; + +label_return: + return ret; +} + +static const ctl_named_node_t * +experimental_arenas_i_index(tsdn_t *tsdn, const size_t *mib, + size_t miblen, size_t i) { + const ctl_named_node_t *ret; + + malloc_mutex_lock(tsdn, &ctl_mtx); + if (ctl_arenas_i_verify(i)) { + ret = NULL; + goto label_return; + } + ret = super_experimental_arenas_i_node; +label_return: + malloc_mutex_unlock(tsdn, &ctl_mtx); + return ret; +} + +static int +experimental_arenas_i_pactivep_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + if (!config_stats) { + return ENOENT; + } + if (oldp == NULL || oldlenp == NULL || *oldlenp != sizeof(size_t *)) { + return EINVAL; + } + + unsigned arena_ind; + arena_t *arena; + int ret; + size_t *pactivep; + + malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); + READONLY(); + MIB_UNSIGNED(arena_ind, 2); + if (arena_ind < narenas_total_get() && (arena = + arena_get(tsd_tsdn(tsd), arena_ind, false)) != NULL) { +#if defined(JEMALLOC_GCC_ATOMIC_ATOMICS) || \ + defined(JEMALLOC_GCC_SYNC_ATOMICS) || defined(_MSC_VER) + /* Expose the underlying counter for fast read. */ + pactivep = (size_t *)&(arena->pa_shard.nactive.repr); + READ(pactivep, size_t *); + ret = 0; +#else + ret = EFAULT; +#endif + } else { + ret = EFAULT; + } +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &ctl_mtx); + return ret; +} + +static int +experimental_prof_recent_alloc_max_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + if (!(config_prof && opt_prof)) { + ret = ENOENT; + goto label_return; + } + + ssize_t old_max; + if (newp != NULL) { + if (newlen != sizeof(ssize_t)) { + ret = EINVAL; + goto label_return; + } + ssize_t max = *(ssize_t *)newp; + if (max < -1) { + ret = EINVAL; + goto label_return; + } + old_max = prof_recent_alloc_max_ctl_write(tsd, max); + } else { + old_max = prof_recent_alloc_max_ctl_read(); + } + READ(old_max, ssize_t); + + ret = 0; + +label_return: + return ret; +} + +typedef struct write_cb_packet_s write_cb_packet_t; +struct write_cb_packet_s { + write_cb_t *write_cb; + void *cbopaque; +}; + +static int +experimental_prof_recent_alloc_dump_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + if (!(config_prof && opt_prof)) { + ret = ENOENT; + goto label_return; + } + + assert(sizeof(write_cb_packet_t) == sizeof(void *) * 2); + + WRITEONLY(); + write_cb_packet_t write_cb_packet; + ASSURED_WRITE(write_cb_packet, write_cb_packet_t); + + prof_recent_alloc_dump(tsd, write_cb_packet.write_cb, + write_cb_packet.cbopaque); + + ret = 0; + +label_return: + return ret; +} + +typedef struct batch_alloc_packet_s batch_alloc_packet_t; +struct batch_alloc_packet_s { + void **ptrs; + size_t num; + size_t size; + int flags; +}; + +static int +experimental_batch_alloc_ctl(tsd_t *tsd, const size_t *mib, + size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + + VERIFY_READ(size_t); + + batch_alloc_packet_t batch_alloc_packet; + ASSURED_WRITE(batch_alloc_packet, batch_alloc_packet_t); + size_t filled = batch_alloc(batch_alloc_packet.ptrs, + batch_alloc_packet.num, batch_alloc_packet.size, + batch_alloc_packet.flags); + READ(filled, size_t); + + ret = 0; + +label_return: + return ret; +} + +static int +prof_stats_bins_i_live_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned binind; + prof_stats_t stats; + + if (!(config_prof && opt_prof && opt_prof_stats)) { + ret = ENOENT; + goto label_return; + } + + READONLY(); + MIB_UNSIGNED(binind, 3); + if (binind >= SC_NBINS) { + ret = EINVAL; + goto label_return; + } + prof_stats_get_live(tsd, (szind_t)binind, &stats); + READ(stats, prof_stats_t); + + ret = 0; +label_return: + return ret; +} + +static int +prof_stats_bins_i_accum_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned binind; + prof_stats_t stats; + + if (!(config_prof && opt_prof && opt_prof_stats)) { + ret = ENOENT; + goto label_return; + } + + READONLY(); + MIB_UNSIGNED(binind, 3); + if (binind >= SC_NBINS) { + ret = EINVAL; + goto label_return; + } + prof_stats_get_accum(tsd, (szind_t)binind, &stats); + READ(stats, prof_stats_t); + + ret = 0; +label_return: + return ret; +} + +static const ctl_named_node_t * +prof_stats_bins_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, + size_t i) { + if (!(config_prof && opt_prof && opt_prof_stats)) { + return NULL; + } + if (i >= SC_NBINS) { + return NULL; + } + return super_prof_stats_bins_i_node; +} + +static int +prof_stats_lextents_i_live_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned lextent_ind; + prof_stats_t stats; + + if (!(config_prof && opt_prof && opt_prof_stats)) { + ret = ENOENT; + goto label_return; + } + + READONLY(); + MIB_UNSIGNED(lextent_ind, 3); + if (lextent_ind >= SC_NSIZES - SC_NBINS) { + ret = EINVAL; + goto label_return; + } + prof_stats_get_live(tsd, (szind_t)(lextent_ind + SC_NBINS), &stats); + READ(stats, prof_stats_t); + + ret = 0; +label_return: + return ret; +} + +static int +prof_stats_lextents_i_accum_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, + void *oldp, size_t *oldlenp, void *newp, size_t newlen) { + int ret; + unsigned lextent_ind; + prof_stats_t stats; + + if (!(config_prof && opt_prof && opt_prof_stats)) { + ret = ENOENT; + goto label_return; + } + + READONLY(); + MIB_UNSIGNED(lextent_ind, 3); + if (lextent_ind >= SC_NSIZES - SC_NBINS) { + ret = EINVAL; + goto label_return; + } + prof_stats_get_accum(tsd, (szind_t)(lextent_ind + SC_NBINS), &stats); + READ(stats, prof_stats_t); + + ret = 0; +label_return: + return ret; +} + +static const ctl_named_node_t * +prof_stats_lextents_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, + size_t i) { + if (!(config_prof && opt_prof && opt_prof_stats)) { + return NULL; + } + if (i >= SC_NSIZES - SC_NBINS) { + return NULL; + } + return super_prof_stats_lextents_i_node; +} diff --git a/BeefRT/JEMalloc/src/decay.c b/BeefRT/JEMalloc/src/decay.c new file mode 100644 index 00000000..d801b2bc --- /dev/null +++ b/BeefRT/JEMalloc/src/decay.c @@ -0,0 +1,295 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/decay.h" + +static const uint64_t h_steps[SMOOTHSTEP_NSTEPS] = { +#define STEP(step, h, x, y) \ + h, + SMOOTHSTEP +#undef STEP +}; + +/* + * Generate a new deadline that is uniformly random within the next epoch after + * the current one. + */ +void +decay_deadline_init(decay_t *decay) { + nstime_copy(&decay->deadline, &decay->epoch); + nstime_add(&decay->deadline, &decay->interval); + if (decay_ms_read(decay) > 0) { + nstime_t jitter; + + nstime_init(&jitter, prng_range_u64(&decay->jitter_state, + nstime_ns(&decay->interval))); + nstime_add(&decay->deadline, &jitter); + } +} + +void +decay_reinit(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms) { + atomic_store_zd(&decay->time_ms, decay_ms, ATOMIC_RELAXED); + if (decay_ms > 0) { + nstime_init(&decay->interval, (uint64_t)decay_ms * + KQU(1000000)); + nstime_idivide(&decay->interval, SMOOTHSTEP_NSTEPS); + } + + nstime_copy(&decay->epoch, cur_time); + decay->jitter_state = (uint64_t)(uintptr_t)decay; + decay_deadline_init(decay); + decay->nunpurged = 0; + memset(decay->backlog, 0, SMOOTHSTEP_NSTEPS * sizeof(size_t)); +} + +bool +decay_init(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms) { + if (config_debug) { + for (size_t i = 0; i < sizeof(decay_t); i++) { + assert(((char *)decay)[i] == 0); + } + decay->ceil_npages = 0; + } + if (malloc_mutex_init(&decay->mtx, "decay", WITNESS_RANK_DECAY, + malloc_mutex_rank_exclusive)) { + return true; + } + decay->purging = false; + decay_reinit(decay, cur_time, decay_ms); + return false; +} + +bool +decay_ms_valid(ssize_t decay_ms) { + if (decay_ms < -1) { + return false; + } + if (decay_ms == -1 || (uint64_t)decay_ms <= NSTIME_SEC_MAX * + KQU(1000)) { + return true; + } + return false; +} + +static void +decay_maybe_update_time(decay_t *decay, nstime_t *new_time) { + if (unlikely(!nstime_monotonic() && nstime_compare(&decay->epoch, + new_time) > 0)) { + /* + * Time went backwards. Move the epoch back in time and + * generate a new deadline, with the expectation that time + * typically flows forward for long enough periods of time that + * epochs complete. Unfortunately, this strategy is susceptible + * to clock jitter triggering premature epoch advances, but + * clock jitter estimation and compensation isn't feasible here + * because calls into this code are event-driven. + */ + nstime_copy(&decay->epoch, new_time); + decay_deadline_init(decay); + } else { + /* Verify that time does not go backwards. */ + assert(nstime_compare(&decay->epoch, new_time) <= 0); + } +} + +static size_t +decay_backlog_npages_limit(const decay_t *decay) { + /* + * For each element of decay_backlog, multiply by the corresponding + * fixed-point smoothstep decay factor. Sum the products, then divide + * to round down to the nearest whole number of pages. + */ + uint64_t sum = 0; + for (unsigned i = 0; i < SMOOTHSTEP_NSTEPS; i++) { + sum += decay->backlog[i] * h_steps[i]; + } + size_t npages_limit_backlog = (size_t)(sum >> SMOOTHSTEP_BFP); + + return npages_limit_backlog; +} + +/* + * Update backlog, assuming that 'nadvance_u64' time intervals have passed. + * Trailing 'nadvance_u64' records should be erased and 'current_npages' is + * placed as the newest record. + */ +static void +decay_backlog_update(decay_t *decay, uint64_t nadvance_u64, + size_t current_npages) { + if (nadvance_u64 >= SMOOTHSTEP_NSTEPS) { + memset(decay->backlog, 0, (SMOOTHSTEP_NSTEPS-1) * + sizeof(size_t)); + } else { + size_t nadvance_z = (size_t)nadvance_u64; + + assert((uint64_t)nadvance_z == nadvance_u64); + + memmove(decay->backlog, &decay->backlog[nadvance_z], + (SMOOTHSTEP_NSTEPS - nadvance_z) * sizeof(size_t)); + if (nadvance_z > 1) { + memset(&decay->backlog[SMOOTHSTEP_NSTEPS - + nadvance_z], 0, (nadvance_z-1) * sizeof(size_t)); + } + } + + size_t npages_delta = (current_npages > decay->nunpurged) ? + current_npages - decay->nunpurged : 0; + decay->backlog[SMOOTHSTEP_NSTEPS-1] = npages_delta; + + if (config_debug) { + if (current_npages > decay->ceil_npages) { + decay->ceil_npages = current_npages; + } + size_t npages_limit = decay_backlog_npages_limit(decay); + assert(decay->ceil_npages >= npages_limit); + if (decay->ceil_npages > npages_limit) { + decay->ceil_npages = npages_limit; + } + } +} + +static inline bool +decay_deadline_reached(const decay_t *decay, const nstime_t *time) { + return (nstime_compare(&decay->deadline, time) <= 0); +} + +uint64_t +decay_npages_purge_in(decay_t *decay, nstime_t *time, size_t npages_new) { + uint64_t decay_interval_ns = decay_epoch_duration_ns(decay); + size_t n_epoch = (size_t)(nstime_ns(time) / decay_interval_ns); + + uint64_t npages_purge; + if (n_epoch >= SMOOTHSTEP_NSTEPS) { + npages_purge = npages_new; + } else { + uint64_t h_steps_max = h_steps[SMOOTHSTEP_NSTEPS - 1]; + assert(h_steps_max >= + h_steps[SMOOTHSTEP_NSTEPS - 1 - n_epoch]); + npages_purge = npages_new * (h_steps_max - + h_steps[SMOOTHSTEP_NSTEPS - 1 - n_epoch]); + npages_purge >>= SMOOTHSTEP_BFP; + } + return npages_purge; +} + +bool +decay_maybe_advance_epoch(decay_t *decay, nstime_t *new_time, + size_t npages_current) { + /* Handle possible non-monotonicity of time. */ + decay_maybe_update_time(decay, new_time); + + if (!decay_deadline_reached(decay, new_time)) { + return false; + } + nstime_t delta; + nstime_copy(&delta, new_time); + nstime_subtract(&delta, &decay->epoch); + + uint64_t nadvance_u64 = nstime_divide(&delta, &decay->interval); + assert(nadvance_u64 > 0); + + /* Add nadvance_u64 decay intervals to epoch. */ + nstime_copy(&delta, &decay->interval); + nstime_imultiply(&delta, nadvance_u64); + nstime_add(&decay->epoch, &delta); + + /* Set a new deadline. */ + decay_deadline_init(decay); + + /* Update the backlog. */ + decay_backlog_update(decay, nadvance_u64, npages_current); + + decay->npages_limit = decay_backlog_npages_limit(decay); + decay->nunpurged = (decay->npages_limit > npages_current) ? + decay->npages_limit : npages_current; + + return true; +} + +/* + * Calculate how many pages should be purged after 'interval'. + * + * First, calculate how many pages should remain at the moment, then subtract + * the number of pages that should remain after 'interval'. The difference is + * how many pages should be purged until then. + * + * The number of pages that should remain at a specific moment is calculated + * like this: pages(now) = sum(backlog[i] * h_steps[i]). After 'interval' + * passes, backlog would shift 'interval' positions to the left and sigmoid + * curve would be applied starting with backlog[interval]. + * + * The implementation doesn't directly map to the description, but it's + * essentially the same calculation, optimized to avoid iterating over + * [interval..SMOOTHSTEP_NSTEPS) twice. + */ +static inline size_t +decay_npurge_after_interval(decay_t *decay, size_t interval) { + size_t i; + uint64_t sum = 0; + for (i = 0; i < interval; i++) { + sum += decay->backlog[i] * h_steps[i]; + } + for (; i < SMOOTHSTEP_NSTEPS; i++) { + sum += decay->backlog[i] * + (h_steps[i] - h_steps[i - interval]); + } + + return (size_t)(sum >> SMOOTHSTEP_BFP); +} + +uint64_t decay_ns_until_purge(decay_t *decay, size_t npages_current, + uint64_t npages_threshold) { + if (!decay_gradually(decay)) { + return DECAY_UNBOUNDED_TIME_TO_PURGE; + } + uint64_t decay_interval_ns = decay_epoch_duration_ns(decay); + assert(decay_interval_ns > 0); + if (npages_current == 0) { + unsigned i; + for (i = 0; i < SMOOTHSTEP_NSTEPS; i++) { + if (decay->backlog[i] > 0) { + break; + } + } + if (i == SMOOTHSTEP_NSTEPS) { + /* No dirty pages recorded. Sleep indefinitely. */ + return DECAY_UNBOUNDED_TIME_TO_PURGE; + } + } + if (npages_current <= npages_threshold) { + /* Use max interval. */ + return decay_interval_ns * SMOOTHSTEP_NSTEPS; + } + + /* Minimal 2 intervals to ensure reaching next epoch deadline. */ + size_t lb = 2; + size_t ub = SMOOTHSTEP_NSTEPS; + + size_t npurge_lb, npurge_ub; + npurge_lb = decay_npurge_after_interval(decay, lb); + if (npurge_lb > npages_threshold) { + return decay_interval_ns * lb; + } + npurge_ub = decay_npurge_after_interval(decay, ub); + if (npurge_ub < npages_threshold) { + return decay_interval_ns * ub; + } + + unsigned n_search = 0; + size_t target, npurge; + while ((npurge_lb + npages_threshold < npurge_ub) && (lb + 2 < ub)) { + target = (lb + ub) / 2; + npurge = decay_npurge_after_interval(decay, target); + if (npurge > npages_threshold) { + ub = target; + npurge_ub = npurge; + } else { + lb = target; + npurge_lb = npurge; + } + assert(n_search < lg_floor(SMOOTHSTEP_NSTEPS) + 1); + ++n_search; + } + return decay_interval_ns * (ub + lb) / 2; +} diff --git a/BeefRT/JEMalloc/src/div.c b/BeefRT/JEMalloc/src/div.c new file mode 100644 index 00000000..808892a1 --- /dev/null +++ b/BeefRT/JEMalloc/src/div.c @@ -0,0 +1,55 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +#include "jemalloc/internal/div.h" + +#include "jemalloc/internal/assert.h" + +/* + * Suppose we have n = q * d, all integers. We know n and d, and want q = n / d. + * + * For any k, we have (here, all division is exact; not C-style rounding): + * floor(ceil(2^k / d) * n / 2^k) = floor((2^k + r) / d * n / 2^k), where + * r = (-2^k) mod d. + * + * Expanding this out: + * ... = floor(2^k / d * n / 2^k + r / d * n / 2^k) + * = floor(n / d + (r / d) * (n / 2^k)). + * + * The fractional part of n / d is 0 (because of the assumption that d divides n + * exactly), so we have: + * ... = n / d + floor((r / d) * (n / 2^k)) + * + * So that our initial expression is equal to the quantity we seek, so long as + * (r / d) * (n / 2^k) < 1. + * + * r is a remainder mod d, so r < d and r / d < 1 always. We can make + * n / 2 ^ k < 1 by setting k = 32. This gets us a value of magic that works. + */ + +void +div_init(div_info_t *div_info, size_t d) { + /* Nonsensical. */ + assert(d != 0); + /* + * This would make the value of magic too high to fit into a uint32_t + * (we would want magic = 2^32 exactly). This would mess with code gen + * on 32-bit machines. + */ + assert(d != 1); + + uint64_t two_to_k = ((uint64_t)1 << 32); + uint32_t magic = (uint32_t)(two_to_k / d); + + /* + * We want magic = ceil(2^k / d), but C gives us floor. We have to + * increment it unless the result was exact (i.e. unless d is a power of + * two). + */ + if (two_to_k % d != 0) { + magic++; + } + div_info->magic = magic; +#ifdef JEMALLOC_DEBUG + div_info->d = d; +#endif +} diff --git a/BeefRT/JEMalloc/src/ecache.c b/BeefRT/JEMalloc/src/ecache.c new file mode 100644 index 00000000..a242227d --- /dev/null +++ b/BeefRT/JEMalloc/src/ecache.c @@ -0,0 +1,35 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/san.h" + +bool +ecache_init(tsdn_t *tsdn, ecache_t *ecache, extent_state_t state, unsigned ind, + bool delay_coalesce) { + if (malloc_mutex_init(&ecache->mtx, "extents", WITNESS_RANK_EXTENTS, + malloc_mutex_rank_exclusive)) { + return true; + } + ecache->state = state; + ecache->ind = ind; + ecache->delay_coalesce = delay_coalesce; + eset_init(&ecache->eset, state); + eset_init(&ecache->guarded_eset, state); + + return false; +} + +void +ecache_prefork(tsdn_t *tsdn, ecache_t *ecache) { + malloc_mutex_prefork(tsdn, &ecache->mtx); +} + +void +ecache_postfork_parent(tsdn_t *tsdn, ecache_t *ecache) { + malloc_mutex_postfork_parent(tsdn, &ecache->mtx); +} + +void +ecache_postfork_child(tsdn_t *tsdn, ecache_t *ecache) { + malloc_mutex_postfork_child(tsdn, &ecache->mtx); +} diff --git a/BeefRT/JEMalloc/src/edata.c b/BeefRT/JEMalloc/src/edata.c new file mode 100644 index 00000000..82b6f565 --- /dev/null +++ b/BeefRT/JEMalloc/src/edata.c @@ -0,0 +1,6 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +ph_gen(, edata_avail, edata_t, avail_link, + edata_esnead_comp) +ph_gen(, edata_heap, edata_t, heap_link, edata_snad_comp) diff --git a/BeefRT/JEMalloc/src/edata_cache.c b/BeefRT/JEMalloc/src/edata_cache.c new file mode 100644 index 00000000..6bc1848c --- /dev/null +++ b/BeefRT/JEMalloc/src/edata_cache.c @@ -0,0 +1,154 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +bool +edata_cache_init(edata_cache_t *edata_cache, base_t *base) { + edata_avail_new(&edata_cache->avail); + /* + * This is not strictly necessary, since the edata_cache_t is only + * created inside an arena, which is zeroed on creation. But this is + * handy as a safety measure. + */ + atomic_store_zu(&edata_cache->count, 0, ATOMIC_RELAXED); + if (malloc_mutex_init(&edata_cache->mtx, "edata_cache", + WITNESS_RANK_EDATA_CACHE, malloc_mutex_rank_exclusive)) { + return true; + } + edata_cache->base = base; + return false; +} + +edata_t * +edata_cache_get(tsdn_t *tsdn, edata_cache_t *edata_cache) { + malloc_mutex_lock(tsdn, &edata_cache->mtx); + edata_t *edata = edata_avail_first(&edata_cache->avail); + if (edata == NULL) { + malloc_mutex_unlock(tsdn, &edata_cache->mtx); + return base_alloc_edata(tsdn, edata_cache->base); + } + edata_avail_remove(&edata_cache->avail, edata); + atomic_load_sub_store_zu(&edata_cache->count, 1); + malloc_mutex_unlock(tsdn, &edata_cache->mtx); + return edata; +} + +void +edata_cache_put(tsdn_t *tsdn, edata_cache_t *edata_cache, edata_t *edata) { + malloc_mutex_lock(tsdn, &edata_cache->mtx); + edata_avail_insert(&edata_cache->avail, edata); + atomic_load_add_store_zu(&edata_cache->count, 1); + malloc_mutex_unlock(tsdn, &edata_cache->mtx); +} + +void +edata_cache_prefork(tsdn_t *tsdn, edata_cache_t *edata_cache) { + malloc_mutex_prefork(tsdn, &edata_cache->mtx); +} + +void +edata_cache_postfork_parent(tsdn_t *tsdn, edata_cache_t *edata_cache) { + malloc_mutex_postfork_parent(tsdn, &edata_cache->mtx); +} + +void +edata_cache_postfork_child(tsdn_t *tsdn, edata_cache_t *edata_cache) { + malloc_mutex_postfork_child(tsdn, &edata_cache->mtx); +} + +void +edata_cache_fast_init(edata_cache_fast_t *ecs, edata_cache_t *fallback) { + edata_list_inactive_init(&ecs->list); + ecs->fallback = fallback; + ecs->disabled = false; +} + +static void +edata_cache_fast_try_fill_from_fallback(tsdn_t *tsdn, + edata_cache_fast_t *ecs) { + edata_t *edata; + malloc_mutex_lock(tsdn, &ecs->fallback->mtx); + for (int i = 0; i < EDATA_CACHE_FAST_FILL; i++) { + edata = edata_avail_remove_first(&ecs->fallback->avail); + if (edata == NULL) { + break; + } + edata_list_inactive_append(&ecs->list, edata); + atomic_load_sub_store_zu(&ecs->fallback->count, 1); + } + malloc_mutex_unlock(tsdn, &ecs->fallback->mtx); +} + +edata_t * +edata_cache_fast_get(tsdn_t *tsdn, edata_cache_fast_t *ecs) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_EDATA_CACHE, 0); + + if (ecs->disabled) { + assert(edata_list_inactive_first(&ecs->list) == NULL); + return edata_cache_get(tsdn, ecs->fallback); + } + + edata_t *edata = edata_list_inactive_first(&ecs->list); + if (edata != NULL) { + edata_list_inactive_remove(&ecs->list, edata); + return edata; + } + /* Slow path; requires synchronization. */ + edata_cache_fast_try_fill_from_fallback(tsdn, ecs); + edata = edata_list_inactive_first(&ecs->list); + if (edata != NULL) { + edata_list_inactive_remove(&ecs->list, edata); + } else { + /* + * Slowest path (fallback was also empty); allocate something + * new. + */ + edata = base_alloc_edata(tsdn, ecs->fallback->base); + } + return edata; +} + +static void +edata_cache_fast_flush_all(tsdn_t *tsdn, edata_cache_fast_t *ecs) { + /* + * You could imagine smarter cache management policies (like + * only flushing down to some threshold in anticipation of + * future get requests). But just flushing everything provides + * a good opportunity to defrag too, and lets us share code between the + * flush and disable pathways. + */ + edata_t *edata; + size_t nflushed = 0; + malloc_mutex_lock(tsdn, &ecs->fallback->mtx); + while ((edata = edata_list_inactive_first(&ecs->list)) != NULL) { + edata_list_inactive_remove(&ecs->list, edata); + edata_avail_insert(&ecs->fallback->avail, edata); + nflushed++; + } + atomic_load_add_store_zu(&ecs->fallback->count, nflushed); + malloc_mutex_unlock(tsdn, &ecs->fallback->mtx); +} + +void +edata_cache_fast_put(tsdn_t *tsdn, edata_cache_fast_t *ecs, edata_t *edata) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_EDATA_CACHE, 0); + + if (ecs->disabled) { + assert(edata_list_inactive_first(&ecs->list) == NULL); + edata_cache_put(tsdn, ecs->fallback, edata); + return; + } + + /* + * Prepend rather than append, to do LIFO ordering in the hopes of some + * cache locality. + */ + edata_list_inactive_prepend(&ecs->list, edata); +} + +void +edata_cache_fast_disable(tsdn_t *tsdn, edata_cache_fast_t *ecs) { + edata_cache_fast_flush_all(tsdn, ecs); + ecs->disabled = true; +} diff --git a/BeefRT/JEMalloc/src/ehooks.c b/BeefRT/JEMalloc/src/ehooks.c new file mode 100644 index 00000000..383e9de6 --- /dev/null +++ b/BeefRT/JEMalloc/src/ehooks.c @@ -0,0 +1,275 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/ehooks.h" +#include "jemalloc/internal/extent_mmap.h" + +void +ehooks_init(ehooks_t *ehooks, extent_hooks_t *extent_hooks, unsigned ind) { + /* All other hooks are optional; this one is not. */ + assert(extent_hooks->alloc != NULL); + ehooks->ind = ind; + ehooks_set_extent_hooks_ptr(ehooks, extent_hooks); +} + +/* + * If the caller specifies (!*zero), it is still possible to receive zeroed + * memory, in which case *zero is toggled to true. arena_extent_alloc() takes + * advantage of this to avoid demanding zeroed extents, but taking advantage of + * them if they are returned. + */ +static void * +extent_alloc_core(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, + size_t alignment, bool *zero, bool *commit, dss_prec_t dss_prec) { + void *ret; + + assert(size != 0); + assert(alignment != 0); + + /* "primary" dss. */ + if (have_dss && dss_prec == dss_prec_primary && (ret = + extent_alloc_dss(tsdn, arena, new_addr, size, alignment, zero, + commit)) != NULL) { + return ret; + } + /* mmap. */ + if ((ret = extent_alloc_mmap(new_addr, size, alignment, zero, commit)) + != NULL) { + return ret; + } + /* "secondary" dss. */ + if (have_dss && dss_prec == dss_prec_secondary && (ret = + extent_alloc_dss(tsdn, arena, new_addr, size, alignment, zero, + commit)) != NULL) { + return ret; + } + + /* All strategies for allocation failed. */ + return NULL; +} + +void * +ehooks_default_alloc_impl(tsdn_t *tsdn, void *new_addr, size_t size, + size_t alignment, bool *zero, bool *commit, unsigned arena_ind) { + arena_t *arena = arena_get(tsdn, arena_ind, false); + /* NULL arena indicates arena_create. */ + assert(arena != NULL || alignment == HUGEPAGE); + dss_prec_t dss = (arena == NULL) ? dss_prec_disabled : + (dss_prec_t)atomic_load_u(&arena->dss_prec, ATOMIC_RELAXED); + void *ret = extent_alloc_core(tsdn, arena, new_addr, size, alignment, + zero, commit, dss); + if (have_madvise_huge && ret) { + pages_set_thp_state(ret, size); + } + return ret; +} + +static void * +ehooks_default_alloc(extent_hooks_t *extent_hooks, void *new_addr, size_t size, + size_t alignment, bool *zero, bool *commit, unsigned arena_ind) { + return ehooks_default_alloc_impl(tsdn_fetch(), new_addr, size, + ALIGNMENT_CEILING(alignment, PAGE), zero, commit, arena_ind); +} + +bool +ehooks_default_dalloc_impl(void *addr, size_t size) { + if (!have_dss || !extent_in_dss(addr)) { + return extent_dalloc_mmap(addr, size); + } + return true; +} + +static bool +ehooks_default_dalloc(extent_hooks_t *extent_hooks, void *addr, size_t size, + bool committed, unsigned arena_ind) { + return ehooks_default_dalloc_impl(addr, size); +} + +void +ehooks_default_destroy_impl(void *addr, size_t size) { + if (!have_dss || !extent_in_dss(addr)) { + pages_unmap(addr, size); + } +} + +static void +ehooks_default_destroy(extent_hooks_t *extent_hooks, void *addr, size_t size, + bool committed, unsigned arena_ind) { + ehooks_default_destroy_impl(addr, size); +} + +bool +ehooks_default_commit_impl(void *addr, size_t offset, size_t length) { + return pages_commit((void *)((uintptr_t)addr + (uintptr_t)offset), + length); +} + +static bool +ehooks_default_commit(extent_hooks_t *extent_hooks, void *addr, size_t size, + size_t offset, size_t length, unsigned arena_ind) { + return ehooks_default_commit_impl(addr, offset, length); +} + +bool +ehooks_default_decommit_impl(void *addr, size_t offset, size_t length) { + return pages_decommit((void *)((uintptr_t)addr + (uintptr_t)offset), + length); +} + +static bool +ehooks_default_decommit(extent_hooks_t *extent_hooks, void *addr, size_t size, + size_t offset, size_t length, unsigned arena_ind) { + return ehooks_default_decommit_impl(addr, offset, length); +} + +#ifdef PAGES_CAN_PURGE_LAZY +bool +ehooks_default_purge_lazy_impl(void *addr, size_t offset, size_t length) { + return pages_purge_lazy((void *)((uintptr_t)addr + (uintptr_t)offset), + length); +} + +static bool +ehooks_default_purge_lazy(extent_hooks_t *extent_hooks, void *addr, size_t size, + size_t offset, size_t length, unsigned arena_ind) { + assert(addr != NULL); + assert((offset & PAGE_MASK) == 0); + assert(length != 0); + assert((length & PAGE_MASK) == 0); + return ehooks_default_purge_lazy_impl(addr, offset, length); +} +#endif + +#ifdef PAGES_CAN_PURGE_FORCED +bool +ehooks_default_purge_forced_impl(void *addr, size_t offset, size_t length) { + return pages_purge_forced((void *)((uintptr_t)addr + + (uintptr_t)offset), length); +} + +static bool +ehooks_default_purge_forced(extent_hooks_t *extent_hooks, void *addr, + size_t size, size_t offset, size_t length, unsigned arena_ind) { + assert(addr != NULL); + assert((offset & PAGE_MASK) == 0); + assert(length != 0); + assert((length & PAGE_MASK) == 0); + return ehooks_default_purge_forced_impl(addr, offset, length); +} +#endif + +bool +ehooks_default_split_impl() { + if (!maps_coalesce) { + /* + * Without retain, only whole regions can be purged (required by + * MEM_RELEASE on Windows) -- therefore disallow splitting. See + * comments in extent_head_no_merge(). + */ + return !opt_retain; + } + + return false; +} + +static bool +ehooks_default_split(extent_hooks_t *extent_hooks, void *addr, size_t size, + size_t size_a, size_t size_b, bool committed, unsigned arena_ind) { + return ehooks_default_split_impl(); +} + +bool +ehooks_default_merge_impl(tsdn_t *tsdn, void *addr_a, void *addr_b) { + assert(addr_a < addr_b); + /* + * For non-DSS cases -- + * a) W/o maps_coalesce, merge is not always allowed (Windows): + * 1) w/o retain, never merge (first branch below). + * 2) with retain, only merge extents from the same VirtualAlloc + * region (in which case MEM_DECOMMIT is utilized for purging). + * + * b) With maps_coalesce, it's always possible to merge. + * 1) w/o retain, always allow merge (only about dirty / muzzy). + * 2) with retain, to preserve the SN / first-fit, merge is still + * disallowed if b is a head extent, i.e. no merging across + * different mmap regions. + * + * a2) and b2) are implemented in emap_try_acquire_edata_neighbor, and + * sanity checked in the second branch below. + */ + if (!maps_coalesce && !opt_retain) { + return true; + } + if (config_debug) { + edata_t *a = emap_edata_lookup(tsdn, &arena_emap_global, + addr_a); + bool head_a = edata_is_head_get(a); + edata_t *b = emap_edata_lookup(tsdn, &arena_emap_global, + addr_b); + bool head_b = edata_is_head_get(b); + emap_assert_mapped(tsdn, &arena_emap_global, a); + emap_assert_mapped(tsdn, &arena_emap_global, b); + assert(extent_neighbor_head_state_mergeable(head_a, head_b, + /* forward */ true)); + } + if (have_dss && !extent_dss_mergeable(addr_a, addr_b)) { + return true; + } + + return false; +} + +bool +ehooks_default_merge(extent_hooks_t *extent_hooks, void *addr_a, size_t size_a, + void *addr_b, size_t size_b, bool committed, unsigned arena_ind) { + tsdn_t *tsdn = tsdn_fetch(); + + return ehooks_default_merge_impl(tsdn, addr_a, addr_b); +} + +void +ehooks_default_zero_impl(void *addr, size_t size) { + /* + * By default, we try to zero out memory using OS-provided demand-zeroed + * pages. If the user has specifically requested hugepages, though, we + * don't want to purge in the middle of a hugepage (which would break it + * up), so we act conservatively and use memset. + */ + bool needs_memset = true; + if (opt_thp != thp_mode_always) { + needs_memset = pages_purge_forced(addr, size); + } + if (needs_memset) { + memset(addr, 0, size); + } +} + +void +ehooks_default_guard_impl(void *guard1, void *guard2) { + pages_mark_guards(guard1, guard2); +} + +void +ehooks_default_unguard_impl(void *guard1, void *guard2) { + pages_unmark_guards(guard1, guard2); +} + +const extent_hooks_t ehooks_default_extent_hooks = { + ehooks_default_alloc, + ehooks_default_dalloc, + ehooks_default_destroy, + ehooks_default_commit, + ehooks_default_decommit, +#ifdef PAGES_CAN_PURGE_LAZY + ehooks_default_purge_lazy, +#else + NULL, +#endif +#ifdef PAGES_CAN_PURGE_FORCED + ehooks_default_purge_forced, +#else + NULL, +#endif + ehooks_default_split, + ehooks_default_merge +}; diff --git a/BeefRT/JEMalloc/src/emap.c b/BeefRT/JEMalloc/src/emap.c new file mode 100644 index 00000000..9cc95a72 --- /dev/null +++ b/BeefRT/JEMalloc/src/emap.c @@ -0,0 +1,386 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/emap.h" + +enum emap_lock_result_e { + emap_lock_result_success, + emap_lock_result_failure, + emap_lock_result_no_extent +}; +typedef enum emap_lock_result_e emap_lock_result_t; + +bool +emap_init(emap_t *emap, base_t *base, bool zeroed) { + return rtree_new(&emap->rtree, base, zeroed); +} + +void +emap_update_edata_state(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_state_t state) { + witness_assert_positive_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE); + + edata_state_set(edata, state); + + EMAP_DECLARE_RTREE_CTX; + rtree_leaf_elm_t *elm1 = rtree_leaf_elm_lookup(tsdn, &emap->rtree, + rtree_ctx, (uintptr_t)edata_base_get(edata), /* dependent */ true, + /* init_missing */ false); + assert(elm1 != NULL); + rtree_leaf_elm_t *elm2 = edata_size_get(edata) == PAGE ? NULL : + rtree_leaf_elm_lookup(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_last_get(edata), /* dependent */ true, + /* init_missing */ false); + + rtree_leaf_elm_state_update(tsdn, &emap->rtree, elm1, elm2, state); + + emap_assert_mapped(tsdn, emap, edata); +} + +static inline edata_t * +emap_try_acquire_edata_neighbor_impl(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_pai_t pai, extent_state_t expected_state, bool forward, + bool expanding) { + witness_assert_positive_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE); + assert(!edata_guarded_get(edata)); + assert(!expanding || forward); + assert(!edata_state_in_transition(expected_state)); + assert(expected_state == extent_state_dirty || + expected_state == extent_state_muzzy || + expected_state == extent_state_retained); + + void *neighbor_addr = forward ? edata_past_get(edata) : + edata_before_get(edata); + /* + * This is subtle; the rtree code asserts that its input pointer is + * non-NULL, and this is a useful thing to check. But it's possible + * that edata corresponds to an address of (void *)PAGE (in practice, + * this has only been observed on FreeBSD when address-space + * randomization is on, but it could in principle happen anywhere). In + * this case, edata_before_get(edata) is NULL, triggering the assert. + */ + if (neighbor_addr == NULL) { + return NULL; + } + + EMAP_DECLARE_RTREE_CTX; + rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &emap->rtree, + rtree_ctx, (uintptr_t)neighbor_addr, /* dependent*/ false, + /* init_missing */ false); + if (elm == NULL) { + return NULL; + } + + rtree_contents_t neighbor_contents = rtree_leaf_elm_read(tsdn, + &emap->rtree, elm, /* dependent */ true); + if (!extent_can_acquire_neighbor(edata, neighbor_contents, pai, + expected_state, forward, expanding)) { + return NULL; + } + + /* From this point, the neighbor edata can be safely acquired. */ + edata_t *neighbor = neighbor_contents.edata; + assert(edata_state_get(neighbor) == expected_state); + emap_update_edata_state(tsdn, emap, neighbor, extent_state_merging); + if (expanding) { + extent_assert_can_expand(edata, neighbor); + } else { + extent_assert_can_coalesce(edata, neighbor); + } + + return neighbor; +} + +edata_t * +emap_try_acquire_edata_neighbor(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_pai_t pai, extent_state_t expected_state, bool forward) { + return emap_try_acquire_edata_neighbor_impl(tsdn, emap, edata, pai, + expected_state, forward, /* expand */ false); +} + +edata_t * +emap_try_acquire_edata_neighbor_expand(tsdn_t *tsdn, emap_t *emap, + edata_t *edata, extent_pai_t pai, extent_state_t expected_state) { + /* Try expanding forward. */ + return emap_try_acquire_edata_neighbor_impl(tsdn, emap, edata, pai, + expected_state, /* forward */ true, /* expand */ true); +} + +void +emap_release_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + extent_state_t new_state) { + assert(emap_edata_in_transition(tsdn, emap, edata)); + assert(emap_edata_is_acquired(tsdn, emap, edata)); + + emap_update_edata_state(tsdn, emap, edata, new_state); +} + +static bool +emap_rtree_leaf_elms_lookup(tsdn_t *tsdn, emap_t *emap, rtree_ctx_t *rtree_ctx, + const edata_t *edata, bool dependent, bool init_missing, + rtree_leaf_elm_t **r_elm_a, rtree_leaf_elm_t **r_elm_b) { + *r_elm_a = rtree_leaf_elm_lookup(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_base_get(edata), dependent, init_missing); + if (!dependent && *r_elm_a == NULL) { + return true; + } + assert(*r_elm_a != NULL); + + *r_elm_b = rtree_leaf_elm_lookup(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_last_get(edata), dependent, init_missing); + if (!dependent && *r_elm_b == NULL) { + return true; + } + assert(*r_elm_b != NULL); + + return false; +} + +static void +emap_rtree_write_acquired(tsdn_t *tsdn, emap_t *emap, rtree_leaf_elm_t *elm_a, + rtree_leaf_elm_t *elm_b, edata_t *edata, szind_t szind, bool slab) { + rtree_contents_t contents; + contents.edata = edata; + contents.metadata.szind = szind; + contents.metadata.slab = slab; + contents.metadata.is_head = (edata == NULL) ? false : + edata_is_head_get(edata); + contents.metadata.state = (edata == NULL) ? 0 : edata_state_get(edata); + rtree_leaf_elm_write(tsdn, &emap->rtree, elm_a, contents); + if (elm_b != NULL) { + rtree_leaf_elm_write(tsdn, &emap->rtree, elm_b, contents); + } +} + +bool +emap_register_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + szind_t szind, bool slab) { + assert(edata_state_get(edata) == extent_state_active); + EMAP_DECLARE_RTREE_CTX; + + rtree_leaf_elm_t *elm_a, *elm_b; + bool err = emap_rtree_leaf_elms_lookup(tsdn, emap, rtree_ctx, edata, + false, true, &elm_a, &elm_b); + if (err) { + return true; + } + assert(rtree_leaf_elm_read(tsdn, &emap->rtree, elm_a, + /* dependent */ false).edata == NULL); + assert(rtree_leaf_elm_read(tsdn, &emap->rtree, elm_b, + /* dependent */ false).edata == NULL); + emap_rtree_write_acquired(tsdn, emap, elm_a, elm_b, edata, szind, slab); + return false; +} + +/* Invoked *after* emap_register_boundary. */ +void +emap_register_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata, + szind_t szind) { + EMAP_DECLARE_RTREE_CTX; + + assert(edata_slab_get(edata)); + assert(edata_state_get(edata) == extent_state_active); + + if (config_debug) { + /* Making sure the boundary is registered already. */ + rtree_leaf_elm_t *elm_a, *elm_b; + bool err = emap_rtree_leaf_elms_lookup(tsdn, emap, rtree_ctx, + edata, /* dependent */ true, /* init_missing */ false, + &elm_a, &elm_b); + assert(!err); + rtree_contents_t contents_a, contents_b; + contents_a = rtree_leaf_elm_read(tsdn, &emap->rtree, elm_a, + /* dependent */ true); + contents_b = rtree_leaf_elm_read(tsdn, &emap->rtree, elm_b, + /* dependent */ true); + assert(contents_a.edata == edata && contents_b.edata == edata); + assert(contents_a.metadata.slab && contents_b.metadata.slab); + } + + rtree_contents_t contents; + contents.edata = edata; + contents.metadata.szind = szind; + contents.metadata.slab = true; + contents.metadata.state = extent_state_active; + contents.metadata.is_head = false; /* Not allowed to access. */ + + assert(edata_size_get(edata) > (2 << LG_PAGE)); + rtree_write_range(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_base_get(edata) + PAGE, + (uintptr_t)edata_last_get(edata) - PAGE, contents); +} + +void +emap_deregister_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + /* + * The edata must be either in an acquired state, or protected by state + * based locks. + */ + if (!emap_edata_is_acquired(tsdn, emap, edata)) { + witness_assert_positive_depth_to_rank( + tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE); + } + + EMAP_DECLARE_RTREE_CTX; + rtree_leaf_elm_t *elm_a, *elm_b; + + emap_rtree_leaf_elms_lookup(tsdn, emap, rtree_ctx, edata, + true, false, &elm_a, &elm_b); + emap_rtree_write_acquired(tsdn, emap, elm_a, elm_b, NULL, SC_NSIZES, + false); +} + +void +emap_deregister_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + EMAP_DECLARE_RTREE_CTX; + + assert(edata_slab_get(edata)); + if (edata_size_get(edata) > (2 << LG_PAGE)) { + rtree_clear_range(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_base_get(edata) + PAGE, + (uintptr_t)edata_last_get(edata) - PAGE); + } +} + +void +emap_remap(tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind, + bool slab) { + EMAP_DECLARE_RTREE_CTX; + + if (szind != SC_NSIZES) { + rtree_contents_t contents; + contents.edata = edata; + contents.metadata.szind = szind; + contents.metadata.slab = slab; + contents.metadata.is_head = edata_is_head_get(edata); + contents.metadata.state = edata_state_get(edata); + + rtree_write(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_addr_get(edata), contents); + /* + * Recall that this is called only for active->inactive and + * inactive->active transitions (since only active extents have + * meaningful values for szind and slab). Active, non-slab + * extents only need to handle lookups at their head (on + * deallocation), so we don't bother filling in the end + * boundary. + * + * For slab extents, we do the end-mapping change. This still + * leaves the interior unmodified; an emap_register_interior + * call is coming in those cases, though. + */ + if (slab && edata_size_get(edata) > PAGE) { + uintptr_t key = (uintptr_t)edata_past_get(edata) + - (uintptr_t)PAGE; + rtree_write(tsdn, &emap->rtree, rtree_ctx, key, + contents); + } + } +} + +bool +emap_split_prepare(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare, + edata_t *edata, size_t size_a, edata_t *trail, size_t size_b) { + EMAP_DECLARE_RTREE_CTX; + + /* + * We use incorrect constants for things like arena ind, zero, ranged, + * and commit state, and head status. This is a fake edata_t, used to + * facilitate a lookup. + */ + edata_t lead = {0}; + edata_init(&lead, 0U, edata_addr_get(edata), size_a, false, 0, 0, + extent_state_active, false, false, EXTENT_PAI_PAC, EXTENT_NOT_HEAD); + + emap_rtree_leaf_elms_lookup(tsdn, emap, rtree_ctx, &lead, false, true, + &prepare->lead_elm_a, &prepare->lead_elm_b); + emap_rtree_leaf_elms_lookup(tsdn, emap, rtree_ctx, trail, false, true, + &prepare->trail_elm_a, &prepare->trail_elm_b); + + if (prepare->lead_elm_a == NULL || prepare->lead_elm_b == NULL + || prepare->trail_elm_a == NULL || prepare->trail_elm_b == NULL) { + return true; + } + return false; +} + +void +emap_split_commit(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare, + edata_t *lead, size_t size_a, edata_t *trail, size_t size_b) { + /* + * We should think about not writing to the lead leaf element. We can + * get into situations where a racing realloc-like call can disagree + * with a size lookup request. I think it's fine to declare that these + * situations are race bugs, but there's an argument to be made that for + * things like xallocx, a size lookup call should return either the old + * size or the new size, but not anything else. + */ + emap_rtree_write_acquired(tsdn, emap, prepare->lead_elm_a, + prepare->lead_elm_b, lead, SC_NSIZES, /* slab */ false); + emap_rtree_write_acquired(tsdn, emap, prepare->trail_elm_a, + prepare->trail_elm_b, trail, SC_NSIZES, /* slab */ false); +} + +void +emap_merge_prepare(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare, + edata_t *lead, edata_t *trail) { + EMAP_DECLARE_RTREE_CTX; + emap_rtree_leaf_elms_lookup(tsdn, emap, rtree_ctx, lead, true, false, + &prepare->lead_elm_a, &prepare->lead_elm_b); + emap_rtree_leaf_elms_lookup(tsdn, emap, rtree_ctx, trail, true, false, + &prepare->trail_elm_a, &prepare->trail_elm_b); +} + +void +emap_merge_commit(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare, + edata_t *lead, edata_t *trail) { + rtree_contents_t clear_contents; + clear_contents.edata = NULL; + clear_contents.metadata.szind = SC_NSIZES; + clear_contents.metadata.slab = false; + clear_contents.metadata.is_head = false; + clear_contents.metadata.state = (extent_state_t)0; + + if (prepare->lead_elm_b != NULL) { + rtree_leaf_elm_write(tsdn, &emap->rtree, + prepare->lead_elm_b, clear_contents); + } + + rtree_leaf_elm_t *merged_b; + if (prepare->trail_elm_b != NULL) { + rtree_leaf_elm_write(tsdn, &emap->rtree, + prepare->trail_elm_a, clear_contents); + merged_b = prepare->trail_elm_b; + } else { + merged_b = prepare->trail_elm_a; + } + + emap_rtree_write_acquired(tsdn, emap, prepare->lead_elm_a, merged_b, + lead, SC_NSIZES, false); +} + +void +emap_do_assert_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + EMAP_DECLARE_RTREE_CTX; + + rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx, + (uintptr_t)edata_base_get(edata)); + assert(contents.edata == edata); + assert(contents.metadata.is_head == edata_is_head_get(edata)); + assert(contents.metadata.state == edata_state_get(edata)); +} + +void +emap_do_assert_not_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) { + emap_full_alloc_ctx_t context1 = {0}; + emap_full_alloc_ctx_try_lookup(tsdn, emap, edata_base_get(edata), + &context1); + assert(context1.edata == NULL); + + emap_full_alloc_ctx_t context2 = {0}; + emap_full_alloc_ctx_try_lookup(tsdn, emap, edata_last_get(edata), + &context2); + assert(context2.edata == NULL); +} diff --git a/BeefRT/JEMalloc/src/eset.c b/BeefRT/JEMalloc/src/eset.c new file mode 100644 index 00000000..6f8f335e --- /dev/null +++ b/BeefRT/JEMalloc/src/eset.c @@ -0,0 +1,282 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/eset.h" + +#define ESET_NPSIZES (SC_NPSIZES + 1) + +static void +eset_bin_init(eset_bin_t *bin) { + edata_heap_new(&bin->heap); + /* + * heap_min doesn't need initialization; it gets filled in when the bin + * goes from non-empty to empty. + */ +} + +static void +eset_bin_stats_init(eset_bin_stats_t *bin_stats) { + atomic_store_zu(&bin_stats->nextents, 0, ATOMIC_RELAXED); + atomic_store_zu(&bin_stats->nbytes, 0, ATOMIC_RELAXED); +} + +void +eset_init(eset_t *eset, extent_state_t state) { + for (unsigned i = 0; i < ESET_NPSIZES; i++) { + eset_bin_init(&eset->bins[i]); + eset_bin_stats_init(&eset->bin_stats[i]); + } + fb_init(eset->bitmap, ESET_NPSIZES); + edata_list_inactive_init(&eset->lru); + eset->state = state; +} + +size_t +eset_npages_get(eset_t *eset) { + return atomic_load_zu(&eset->npages, ATOMIC_RELAXED); +} + +size_t +eset_nextents_get(eset_t *eset, pszind_t pind) { + return atomic_load_zu(&eset->bin_stats[pind].nextents, ATOMIC_RELAXED); +} + +size_t +eset_nbytes_get(eset_t *eset, pszind_t pind) { + return atomic_load_zu(&eset->bin_stats[pind].nbytes, ATOMIC_RELAXED); +} + +static void +eset_stats_add(eset_t *eset, pszind_t pind, size_t sz) { + size_t cur = atomic_load_zu(&eset->bin_stats[pind].nextents, + ATOMIC_RELAXED); + atomic_store_zu(&eset->bin_stats[pind].nextents, cur + 1, + ATOMIC_RELAXED); + cur = atomic_load_zu(&eset->bin_stats[pind].nbytes, ATOMIC_RELAXED); + atomic_store_zu(&eset->bin_stats[pind].nbytes, cur + sz, + ATOMIC_RELAXED); +} + +static void +eset_stats_sub(eset_t *eset, pszind_t pind, size_t sz) { + size_t cur = atomic_load_zu(&eset->bin_stats[pind].nextents, + ATOMIC_RELAXED); + atomic_store_zu(&eset->bin_stats[pind].nextents, cur - 1, + ATOMIC_RELAXED); + cur = atomic_load_zu(&eset->bin_stats[pind].nbytes, ATOMIC_RELAXED); + atomic_store_zu(&eset->bin_stats[pind].nbytes, cur - sz, + ATOMIC_RELAXED); +} + +void +eset_insert(eset_t *eset, edata_t *edata) { + assert(edata_state_get(edata) == eset->state); + + size_t size = edata_size_get(edata); + size_t psz = sz_psz_quantize_floor(size); + pszind_t pind = sz_psz2ind(psz); + + edata_cmp_summary_t edata_cmp_summary = edata_cmp_summary_get(edata); + if (edata_heap_empty(&eset->bins[pind].heap)) { + fb_set(eset->bitmap, ESET_NPSIZES, (size_t)pind); + /* Only element is automatically the min element. */ + eset->bins[pind].heap_min = edata_cmp_summary; + } else { + /* + * There's already a min element; update the summary if we're + * about to insert a lower one. + */ + if (edata_cmp_summary_comp(edata_cmp_summary, + eset->bins[pind].heap_min) < 0) { + eset->bins[pind].heap_min = edata_cmp_summary; + } + } + edata_heap_insert(&eset->bins[pind].heap, edata); + + if (config_stats) { + eset_stats_add(eset, pind, size); + } + + edata_list_inactive_append(&eset->lru, edata); + size_t npages = size >> LG_PAGE; + /* + * All modifications to npages hold the mutex (as asserted above), so we + * don't need an atomic fetch-add; we can get by with a load followed by + * a store. + */ + size_t cur_eset_npages = + atomic_load_zu(&eset->npages, ATOMIC_RELAXED); + atomic_store_zu(&eset->npages, cur_eset_npages + npages, + ATOMIC_RELAXED); +} + +void +eset_remove(eset_t *eset, edata_t *edata) { + assert(edata_state_get(edata) == eset->state || + edata_state_in_transition(edata_state_get(edata))); + + size_t size = edata_size_get(edata); + size_t psz = sz_psz_quantize_floor(size); + pszind_t pind = sz_psz2ind(psz); + if (config_stats) { + eset_stats_sub(eset, pind, size); + } + + edata_cmp_summary_t edata_cmp_summary = edata_cmp_summary_get(edata); + edata_heap_remove(&eset->bins[pind].heap, edata); + if (edata_heap_empty(&eset->bins[pind].heap)) { + fb_unset(eset->bitmap, ESET_NPSIZES, (size_t)pind); + } else { + /* + * This is a little weird; we compare if the summaries are + * equal, rather than if the edata we removed was the heap + * minimum. The reason why is that getting the heap minimum + * can cause a pairing heap merge operation. We can avoid this + * if we only update the min if it's changed, in which case the + * summaries of the removed element and the min element should + * compare equal. + */ + if (edata_cmp_summary_comp(edata_cmp_summary, + eset->bins[pind].heap_min) == 0) { + eset->bins[pind].heap_min = edata_cmp_summary_get( + edata_heap_first(&eset->bins[pind].heap)); + } + } + edata_list_inactive_remove(&eset->lru, edata); + size_t npages = size >> LG_PAGE; + /* + * As in eset_insert, we hold eset->mtx and so don't need atomic + * operations for updating eset->npages. + */ + size_t cur_extents_npages = + atomic_load_zu(&eset->npages, ATOMIC_RELAXED); + assert(cur_extents_npages >= npages); + atomic_store_zu(&eset->npages, + cur_extents_npages - (size >> LG_PAGE), ATOMIC_RELAXED); +} + +/* + * Find an extent with size [min_size, max_size) to satisfy the alignment + * requirement. For each size, try only the first extent in the heap. + */ +static edata_t * +eset_fit_alignment(eset_t *eset, size_t min_size, size_t max_size, + size_t alignment) { + pszind_t pind = sz_psz2ind(sz_psz_quantize_ceil(min_size)); + pszind_t pind_max = sz_psz2ind(sz_psz_quantize_ceil(max_size)); + + for (pszind_t i = + (pszind_t)fb_ffs(eset->bitmap, ESET_NPSIZES, (size_t)pind); + i < pind_max; + i = (pszind_t)fb_ffs(eset->bitmap, ESET_NPSIZES, (size_t)i + 1)) { + assert(i < SC_NPSIZES); + assert(!edata_heap_empty(&eset->bins[i].heap)); + edata_t *edata = edata_heap_first(&eset->bins[i].heap); + uintptr_t base = (uintptr_t)edata_base_get(edata); + size_t candidate_size = edata_size_get(edata); + assert(candidate_size >= min_size); + + uintptr_t next_align = ALIGNMENT_CEILING((uintptr_t)base, + PAGE_CEILING(alignment)); + if (base > next_align || base + candidate_size <= next_align) { + /* Overflow or not crossing the next alignment. */ + continue; + } + + size_t leadsize = next_align - base; + if (candidate_size - leadsize >= min_size) { + return edata; + } + } + + return NULL; +} + +/* + * Do first-fit extent selection, i.e. select the oldest/lowest extent that is + * large enough. + * + * lg_max_fit is the (log of the) maximum ratio between the requested size and + * the returned size that we'll allow. This can reduce fragmentation by + * avoiding reusing and splitting large extents for smaller sizes. In practice, + * it's set to opt_lg_extent_max_active_fit for the dirty eset and SC_PTR_BITS + * for others. + */ +static edata_t * +eset_first_fit(eset_t *eset, size_t size, bool exact_only, + unsigned lg_max_fit) { + edata_t *ret = NULL; + edata_cmp_summary_t ret_summ JEMALLOC_CC_SILENCE_INIT({0}); + + pszind_t pind = sz_psz2ind(sz_psz_quantize_ceil(size)); + + if (exact_only) { + return edata_heap_empty(&eset->bins[pind].heap) ? NULL : + edata_heap_first(&eset->bins[pind].heap); + } + + for (pszind_t i = + (pszind_t)fb_ffs(eset->bitmap, ESET_NPSIZES, (size_t)pind); + i < ESET_NPSIZES; + i = (pszind_t)fb_ffs(eset->bitmap, ESET_NPSIZES, (size_t)i + 1)) { + assert(!edata_heap_empty(&eset->bins[i].heap)); + if (lg_max_fit == SC_PTR_BITS) { + /* + * We'll shift by this below, and shifting out all the + * bits is undefined. Decreasing is safe, since the + * page size is larger than 1 byte. + */ + lg_max_fit = SC_PTR_BITS - 1; + } + if ((sz_pind2sz(i) >> lg_max_fit) > size) { + break; + } + if (ret == NULL || edata_cmp_summary_comp( + eset->bins[i].heap_min, ret_summ) < 0) { + /* + * We grab the edata as early as possible, even though + * we might change it later. Practically, a large + * portion of eset_fit calls succeed at the first valid + * index, so this doesn't cost much, and we get the + * effect of prefetching the edata as early as possible. + */ + edata_t *edata = edata_heap_first(&eset->bins[i].heap); + assert(edata_size_get(edata) >= size); + assert(ret == NULL || edata_snad_comp(edata, ret) < 0); + assert(ret == NULL || edata_cmp_summary_comp( + eset->bins[i].heap_min, + edata_cmp_summary_get(edata)) == 0); + ret = edata; + ret_summ = eset->bins[i].heap_min; + } + if (i == SC_NPSIZES) { + break; + } + assert(i < SC_NPSIZES); + } + + return ret; +} + +edata_t * +eset_fit(eset_t *eset, size_t esize, size_t alignment, bool exact_only, + unsigned lg_max_fit) { + size_t max_size = esize + PAGE_CEILING(alignment) - PAGE; + /* Beware size_t wrap-around. */ + if (max_size < esize) { + return NULL; + } + + edata_t *edata = eset_first_fit(eset, max_size, exact_only, lg_max_fit); + + if (alignment > PAGE && edata == NULL) { + /* + * max_size guarantees the alignment requirement but is rather + * pessimistic. Next we try to satisfy the aligned allocation + * with sizes in [esize, max_size). + */ + edata = eset_fit_alignment(eset, esize, max_size, alignment); + } + + return edata; +} diff --git a/BeefRT/JEMalloc/src/exp_grow.c b/BeefRT/JEMalloc/src/exp_grow.c new file mode 100644 index 00000000..386471f4 --- /dev/null +++ b/BeefRT/JEMalloc/src/exp_grow.c @@ -0,0 +1,8 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +void +exp_grow_init(exp_grow_t *exp_grow) { + exp_grow->next = sz_psz2ind(HUGEPAGE); + exp_grow->limit = sz_psz2ind(SC_LARGE_MAXCLASS); +} diff --git a/BeefRT/JEMalloc/src/extent.c b/BeefRT/JEMalloc/src/extent.c new file mode 100644 index 00000000..cf3d1f31 --- /dev/null +++ b/BeefRT/JEMalloc/src/extent.c @@ -0,0 +1,1326 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/emap.h" +#include "jemalloc/internal/extent_dss.h" +#include "jemalloc/internal/extent_mmap.h" +#include "jemalloc/internal/ph.h" +#include "jemalloc/internal/mutex.h" + +/******************************************************************************/ +/* Data. */ + +size_t opt_lg_extent_max_active_fit = LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT; + +static bool extent_commit_impl(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length, bool growing_retained); +static bool extent_purge_lazy_impl(tsdn_t *tsdn, ehooks_t *ehooks, + edata_t *edata, size_t offset, size_t length, bool growing_retained); +static bool extent_purge_forced_impl(tsdn_t *tsdn, ehooks_t *ehooks, + edata_t *edata, size_t offset, size_t length, bool growing_retained); +static edata_t *extent_split_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata, size_t size_a, size_t size_b, bool holding_core_locks); +static bool extent_merge_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *a, edata_t *b, bool holding_core_locks); + +/* Used exclusively for gdump triggering. */ +static atomic_zu_t curpages; +static atomic_zu_t highpages; + +/******************************************************************************/ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + +static void extent_deregister(tsdn_t *tsdn, pac_t *pac, edata_t *edata); +static edata_t *extent_recycle(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *expand_edata, size_t usize, size_t alignment, + bool zero, bool *commit, bool growing_retained, bool guarded); +static edata_t *extent_try_coalesce(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *edata, bool *coalesced); +static edata_t *extent_alloc_retained(tsdn_t *tsdn, pac_t *pac, + ehooks_t *ehooks, edata_t *expand_edata, size_t size, size_t alignment, + bool zero, bool *commit, bool guarded); + +/******************************************************************************/ + +size_t +extent_sn_next(pac_t *pac) { + return atomic_fetch_add_zu(&pac->extent_sn_next, 1, ATOMIC_RELAXED); +} + +static inline bool +extent_may_force_decay(pac_t *pac) { + return !(pac_decay_ms_get(pac, extent_state_dirty) == -1 + || pac_decay_ms_get(pac, extent_state_muzzy) == -1); +} + +static bool +extent_try_delayed_coalesce(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *edata) { + emap_update_edata_state(tsdn, pac->emap, edata, extent_state_active); + + bool coalesced; + edata = extent_try_coalesce(tsdn, pac, ehooks, ecache, + edata, &coalesced); + emap_update_edata_state(tsdn, pac->emap, edata, ecache->state); + + if (!coalesced) { + return true; + } + eset_insert(&ecache->eset, edata); + return false; +} + +edata_t * +ecache_alloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *expand_edata, size_t size, size_t alignment, bool zero, + bool guarded) { + assert(size != 0); + assert(alignment != 0); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + bool commit = true; + edata_t *edata = extent_recycle(tsdn, pac, ehooks, ecache, expand_edata, + size, alignment, zero, &commit, false, guarded); + assert(edata == NULL || edata_pai_get(edata) == EXTENT_PAI_PAC); + assert(edata == NULL || edata_guarded_get(edata) == guarded); + return edata; +} + +edata_t * +ecache_alloc_grow(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *expand_edata, size_t size, size_t alignment, bool zero, + bool guarded) { + assert(size != 0); + assert(alignment != 0); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + bool commit = true; + edata_t *edata = extent_alloc_retained(tsdn, pac, ehooks, expand_edata, + size, alignment, zero, &commit, guarded); + if (edata == NULL) { + if (opt_retain && expand_edata != NULL) { + /* + * When retain is enabled and trying to expand, we do + * not attempt extent_alloc_wrapper which does mmap that + * is very unlikely to succeed (unless it happens to be + * at the end). + */ + return NULL; + } + if (guarded) { + /* + * Means no cached guarded extents available (and no + * grow_retained was attempted). The pac_alloc flow + * will alloc regular extents to make new guarded ones. + */ + return NULL; + } + void *new_addr = (expand_edata == NULL) ? NULL : + edata_past_get(expand_edata); + edata = extent_alloc_wrapper(tsdn, pac, ehooks, new_addr, + size, alignment, zero, &commit, + /* growing_retained */ false); + } + + assert(edata == NULL || edata_pai_get(edata) == EXTENT_PAI_PAC); + return edata; +} + +void +ecache_dalloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *edata) { + assert(edata_base_get(edata) != NULL); + assert(edata_size_get(edata) != 0); + assert(edata_pai_get(edata) == EXTENT_PAI_PAC); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + edata_addr_set(edata, edata_base_get(edata)); + edata_zeroed_set(edata, false); + + extent_record(tsdn, pac, ehooks, ecache, edata); +} + +edata_t * +ecache_evict(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, size_t npages_min) { + malloc_mutex_lock(tsdn, &ecache->mtx); + + /* + * Get the LRU coalesced extent, if any. If coalescing was delayed, + * the loop will iterate until the LRU extent is fully coalesced. + */ + edata_t *edata; + while (true) { + /* Get the LRU extent, if any. */ + eset_t *eset = &ecache->eset; + edata = edata_list_inactive_first(&eset->lru); + if (edata == NULL) { + /* + * Next check if there are guarded extents. They are + * more expensive to purge (since they are not + * mergeable), thus in favor of caching them longer. + */ + eset = &ecache->guarded_eset; + edata = edata_list_inactive_first(&eset->lru); + if (edata == NULL) { + goto label_return; + } + } + /* Check the eviction limit. */ + size_t extents_npages = ecache_npages_get(ecache); + if (extents_npages <= npages_min) { + edata = NULL; + goto label_return; + } + eset_remove(eset, edata); + if (!ecache->delay_coalesce || edata_guarded_get(edata)) { + break; + } + /* Try to coalesce. */ + if (extent_try_delayed_coalesce(tsdn, pac, ehooks, ecache, + edata)) { + break; + } + /* + * The LRU extent was just coalesced and the result placed in + * the LRU at its neighbor's position. Start over. + */ + } + + /* + * Either mark the extent active or deregister it to protect against + * concurrent operations. + */ + switch (ecache->state) { + case extent_state_active: + not_reached(); + case extent_state_dirty: + case extent_state_muzzy: + emap_update_edata_state(tsdn, pac->emap, edata, + extent_state_active); + break; + case extent_state_retained: + extent_deregister(tsdn, pac, edata); + break; + default: + not_reached(); + } + +label_return: + malloc_mutex_unlock(tsdn, &ecache->mtx); + return edata; +} + +/* + * This can only happen when we fail to allocate a new extent struct (which + * indicates OOM), e.g. when trying to split an existing extent. + */ +static void +extents_abandon_vm(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *edata, bool growing_retained) { + size_t sz = edata_size_get(edata); + if (config_stats) { + atomic_fetch_add_zu(&pac->stats->abandoned_vm, sz, + ATOMIC_RELAXED); + } + /* + * Leak extent after making sure its pages have already been purged, so + * that this is only a virtual memory leak. + */ + if (ecache->state == extent_state_dirty) { + if (extent_purge_lazy_impl(tsdn, ehooks, edata, 0, sz, + growing_retained)) { + extent_purge_forced_impl(tsdn, ehooks, edata, 0, + edata_size_get(edata), growing_retained); + } + } + edata_cache_put(tsdn, pac->edata_cache, edata); +} + +static void +extent_deactivate_locked_impl(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache, + edata_t *edata) { + malloc_mutex_assert_owner(tsdn, &ecache->mtx); + assert(edata_arena_ind_get(edata) == ecache_ind_get(ecache)); + + emap_update_edata_state(tsdn, pac->emap, edata, ecache->state); + eset_t *eset = edata_guarded_get(edata) ? &ecache->guarded_eset : + &ecache->eset; + eset_insert(eset, edata); +} + +static void +extent_deactivate_locked(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache, + edata_t *edata) { + assert(edata_state_get(edata) == extent_state_active); + extent_deactivate_locked_impl(tsdn, pac, ecache, edata); +} + +static void +extent_deactivate_check_state_locked(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache, + edata_t *edata, extent_state_t expected_state) { + assert(edata_state_get(edata) == expected_state); + extent_deactivate_locked_impl(tsdn, pac, ecache, edata); +} + +static void +extent_activate_locked(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache, eset_t *eset, + edata_t *edata) { + assert(edata_arena_ind_get(edata) == ecache_ind_get(ecache)); + assert(edata_state_get(edata) == ecache->state || + edata_state_get(edata) == extent_state_merging); + + eset_remove(eset, edata); + emap_update_edata_state(tsdn, pac->emap, edata, extent_state_active); +} + +void +extent_gdump_add(tsdn_t *tsdn, const edata_t *edata) { + cassert(config_prof); + /* prof_gdump() requirement. */ + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + if (opt_prof && edata_state_get(edata) == extent_state_active) { + size_t nadd = edata_size_get(edata) >> LG_PAGE; + size_t cur = atomic_fetch_add_zu(&curpages, nadd, + ATOMIC_RELAXED) + nadd; + size_t high = atomic_load_zu(&highpages, ATOMIC_RELAXED); + while (cur > high && !atomic_compare_exchange_weak_zu( + &highpages, &high, cur, ATOMIC_RELAXED, ATOMIC_RELAXED)) { + /* + * Don't refresh cur, because it may have decreased + * since this thread lost the highpages update race. + * Note that high is updated in case of CAS failure. + */ + } + if (cur > high && prof_gdump_get_unlocked()) { + prof_gdump(tsdn); + } + } +} + +static void +extent_gdump_sub(tsdn_t *tsdn, const edata_t *edata) { + cassert(config_prof); + + if (opt_prof && edata_state_get(edata) == extent_state_active) { + size_t nsub = edata_size_get(edata) >> LG_PAGE; + assert(atomic_load_zu(&curpages, ATOMIC_RELAXED) >= nsub); + atomic_fetch_sub_zu(&curpages, nsub, ATOMIC_RELAXED); + } +} + +static bool +extent_register_impl(tsdn_t *tsdn, pac_t *pac, edata_t *edata, bool gdump_add) { + assert(edata_state_get(edata) == extent_state_active); + /* + * No locking needed, as the edata must be in active state, which + * prevents other threads from accessing the edata. + */ + if (emap_register_boundary(tsdn, pac->emap, edata, SC_NSIZES, + /* slab */ false)) { + return true; + } + + if (config_prof && gdump_add) { + extent_gdump_add(tsdn, edata); + } + + return false; +} + +static bool +extent_register(tsdn_t *tsdn, pac_t *pac, edata_t *edata) { + return extent_register_impl(tsdn, pac, edata, true); +} + +static bool +extent_register_no_gdump_add(tsdn_t *tsdn, pac_t *pac, edata_t *edata) { + return extent_register_impl(tsdn, pac, edata, false); +} + +static void +extent_reregister(tsdn_t *tsdn, pac_t *pac, edata_t *edata) { + bool err = extent_register(tsdn, pac, edata); + assert(!err); +} + +/* + * Removes all pointers to the given extent from the global rtree. + */ +static void +extent_deregister_impl(tsdn_t *tsdn, pac_t *pac, edata_t *edata, + bool gdump) { + emap_deregister_boundary(tsdn, pac->emap, edata); + + if (config_prof && gdump) { + extent_gdump_sub(tsdn, edata); + } +} + +static void +extent_deregister(tsdn_t *tsdn, pac_t *pac, edata_t *edata) { + extent_deregister_impl(tsdn, pac, edata, true); +} + +static void +extent_deregister_no_gdump_sub(tsdn_t *tsdn, pac_t *pac, + edata_t *edata) { + extent_deregister_impl(tsdn, pac, edata, false); +} + +/* + * Tries to find and remove an extent from ecache that can be used for the + * given allocation request. + */ +static edata_t * +extent_recycle_extract(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment, + bool guarded) { + malloc_mutex_assert_owner(tsdn, &ecache->mtx); + assert(alignment > 0); + if (config_debug && expand_edata != NULL) { + /* + * Non-NULL expand_edata indicates in-place expanding realloc. + * new_addr must either refer to a non-existing extent, or to + * the base of an extant extent, since only active slabs support + * interior lookups (which of course cannot be recycled). + */ + void *new_addr = edata_past_get(expand_edata); + assert(PAGE_ADDR2BASE(new_addr) == new_addr); + assert(alignment <= PAGE); + } + + edata_t *edata; + eset_t *eset = guarded ? &ecache->guarded_eset : &ecache->eset; + if (expand_edata != NULL) { + edata = emap_try_acquire_edata_neighbor_expand(tsdn, pac->emap, + expand_edata, EXTENT_PAI_PAC, ecache->state); + if (edata != NULL) { + extent_assert_can_expand(expand_edata, edata); + if (edata_size_get(edata) < size) { + emap_release_edata(tsdn, pac->emap, edata, + ecache->state); + edata = NULL; + } + } + } else { + /* + * A large extent might be broken up from its original size to + * some small size to satisfy a small request. When that small + * request is freed, though, it won't merge back with the larger + * extent if delayed coalescing is on. The large extent can + * then no longer satify a request for its original size. To + * limit this effect, when delayed coalescing is enabled, we + * put a cap on how big an extent we can split for a request. + */ + unsigned lg_max_fit = ecache->delay_coalesce + ? (unsigned)opt_lg_extent_max_active_fit : SC_PTR_BITS; + + /* + * If split and merge are not allowed (Windows w/o retain), try + * exact fit only. + * + * For simplicity purposes, splitting guarded extents is not + * supported. Hence, we do only exact fit for guarded + * allocations. + */ + bool exact_only = (!maps_coalesce && !opt_retain) || guarded; + edata = eset_fit(eset, size, alignment, exact_only, + lg_max_fit); + } + if (edata == NULL) { + return NULL; + } + assert(!guarded || edata_guarded_get(edata)); + extent_activate_locked(tsdn, pac, ecache, eset, edata); + + return edata; +} + +/* + * Given an allocation request and an extent guaranteed to be able to satisfy + * it, this splits off lead and trail extents, leaving edata pointing to an + * extent satisfying the allocation. + * This function doesn't put lead or trail into any ecache; it's the caller's + * job to ensure that they can be reused. + */ +typedef enum { + /* + * Split successfully. lead, edata, and trail, are modified to extents + * describing the ranges before, in, and after the given allocation. + */ + extent_split_interior_ok, + /* + * The extent can't satisfy the given allocation request. None of the + * input edata_t *s are touched. + */ + extent_split_interior_cant_alloc, + /* + * In a potentially invalid state. Must leak (if *to_leak is non-NULL), + * and salvage what's still salvageable (if *to_salvage is non-NULL). + * None of lead, edata, or trail are valid. + */ + extent_split_interior_error +} extent_split_interior_result_t; + +static extent_split_interior_result_t +extent_split_interior(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + /* The result of splitting, in case of success. */ + edata_t **edata, edata_t **lead, edata_t **trail, + /* The mess to clean up, in case of error. */ + edata_t **to_leak, edata_t **to_salvage, + edata_t *expand_edata, size_t size, size_t alignment) { + size_t leadsize = ALIGNMENT_CEILING((uintptr_t)edata_base_get(*edata), + PAGE_CEILING(alignment)) - (uintptr_t)edata_base_get(*edata); + assert(expand_edata == NULL || leadsize == 0); + if (edata_size_get(*edata) < leadsize + size) { + return extent_split_interior_cant_alloc; + } + size_t trailsize = edata_size_get(*edata) - leadsize - size; + + *lead = NULL; + *trail = NULL; + *to_leak = NULL; + *to_salvage = NULL; + + /* Split the lead. */ + if (leadsize != 0) { + assert(!edata_guarded_get(*edata)); + *lead = *edata; + *edata = extent_split_impl(tsdn, pac, ehooks, *lead, leadsize, + size + trailsize, /* holding_core_locks*/ true); + if (*edata == NULL) { + *to_leak = *lead; + *lead = NULL; + return extent_split_interior_error; + } + } + + /* Split the trail. */ + if (trailsize != 0) { + assert(!edata_guarded_get(*edata)); + *trail = extent_split_impl(tsdn, pac, ehooks, *edata, size, + trailsize, /* holding_core_locks */ true); + if (*trail == NULL) { + *to_leak = *edata; + *to_salvage = *lead; + *lead = NULL; + *edata = NULL; + return extent_split_interior_error; + } + } + + return extent_split_interior_ok; +} + +/* + * This fulfills the indicated allocation request out of the given extent (which + * the caller should have ensured was big enough). If there's any unused space + * before or after the resulting allocation, that space is given its own extent + * and put back into ecache. + */ +static edata_t * +extent_recycle_split(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment, + edata_t *edata, bool growing_retained) { + assert(!edata_guarded_get(edata) || size == edata_size_get(edata)); + malloc_mutex_assert_owner(tsdn, &ecache->mtx); + + edata_t *lead; + edata_t *trail; + edata_t *to_leak JEMALLOC_CC_SILENCE_INIT(NULL); + edata_t *to_salvage JEMALLOC_CC_SILENCE_INIT(NULL); + + extent_split_interior_result_t result = extent_split_interior( + tsdn, pac, ehooks, &edata, &lead, &trail, &to_leak, &to_salvage, + expand_edata, size, alignment); + + if (!maps_coalesce && result != extent_split_interior_ok + && !opt_retain) { + /* + * Split isn't supported (implies Windows w/o retain). Avoid + * leaking the extent. + */ + assert(to_leak != NULL && lead == NULL && trail == NULL); + extent_deactivate_locked(tsdn, pac, ecache, to_leak); + return NULL; + } + + if (result == extent_split_interior_ok) { + if (lead != NULL) { + extent_deactivate_locked(tsdn, pac, ecache, lead); + } + if (trail != NULL) { + extent_deactivate_locked(tsdn, pac, ecache, trail); + } + return edata; + } else { + /* + * We should have picked an extent that was large enough to + * fulfill our allocation request. + */ + assert(result == extent_split_interior_error); + if (to_salvage != NULL) { + extent_deregister(tsdn, pac, to_salvage); + } + if (to_leak != NULL) { + extent_deregister_no_gdump_sub(tsdn, pac, to_leak); + /* + * May go down the purge path (which assume no ecache + * locks). Only happens with OOM caused split failures. + */ + malloc_mutex_unlock(tsdn, &ecache->mtx); + extents_abandon_vm(tsdn, pac, ehooks, ecache, to_leak, + growing_retained); + malloc_mutex_lock(tsdn, &ecache->mtx); + } + return NULL; + } + unreachable(); +} + +/* + * Tries to satisfy the given allocation request by reusing one of the extents + * in the given ecache_t. + */ +static edata_t * +extent_recycle(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *expand_edata, size_t size, size_t alignment, bool zero, + bool *commit, bool growing_retained, bool guarded) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, growing_retained ? 1 : 0); + assert(!guarded || expand_edata == NULL); + assert(!guarded || alignment <= PAGE); + + malloc_mutex_lock(tsdn, &ecache->mtx); + + edata_t *edata = extent_recycle_extract(tsdn, pac, ehooks, ecache, + expand_edata, size, alignment, guarded); + if (edata == NULL) { + malloc_mutex_unlock(tsdn, &ecache->mtx); + return NULL; + } + + edata = extent_recycle_split(tsdn, pac, ehooks, ecache, expand_edata, + size, alignment, edata, growing_retained); + malloc_mutex_unlock(tsdn, &ecache->mtx); + if (edata == NULL) { + return NULL; + } + + assert(edata_state_get(edata) == extent_state_active); + if (extent_commit_zero(tsdn, ehooks, edata, *commit, zero, + growing_retained)) { + extent_record(tsdn, pac, ehooks, ecache, edata); + return NULL; + } + if (edata_committed_get(edata)) { + /* + * This reverses the purpose of this variable - previously it + * was treated as an input parameter, now it turns into an + * output parameter, reporting if the edata has actually been + * committed. + */ + *commit = true; + } + return edata; +} + +/* + * If virtual memory is retained, create increasingly larger extents from which + * to split requested extents in order to limit the total number of disjoint + * virtual memory ranges retained by each shard. + */ +static edata_t * +extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + size_t size, size_t alignment, bool zero, bool *commit) { + malloc_mutex_assert_owner(tsdn, &pac->grow_mtx); + + size_t alloc_size_min = size + PAGE_CEILING(alignment) - PAGE; + /* Beware size_t wrap-around. */ + if (alloc_size_min < size) { + goto label_err; + } + /* + * Find the next extent size in the series that would be large enough to + * satisfy this request. + */ + size_t alloc_size; + pszind_t exp_grow_skip; + bool err = exp_grow_size_prepare(&pac->exp_grow, alloc_size_min, + &alloc_size, &exp_grow_skip); + if (err) { + goto label_err; + } + + edata_t *edata = edata_cache_get(tsdn, pac->edata_cache); + if (edata == NULL) { + goto label_err; + } + bool zeroed = false; + bool committed = false; + + void *ptr = ehooks_alloc(tsdn, ehooks, NULL, alloc_size, PAGE, &zeroed, + &committed); + + if (ptr == NULL) { + edata_cache_put(tsdn, pac->edata_cache, edata); + goto label_err; + } + + edata_init(edata, ecache_ind_get(&pac->ecache_retained), ptr, + alloc_size, false, SC_NSIZES, extent_sn_next(pac), + extent_state_active, zeroed, committed, EXTENT_PAI_PAC, + EXTENT_IS_HEAD); + + if (extent_register_no_gdump_add(tsdn, pac, edata)) { + edata_cache_put(tsdn, pac->edata_cache, edata); + goto label_err; + } + + if (edata_committed_get(edata)) { + *commit = true; + } + + edata_t *lead; + edata_t *trail; + edata_t *to_leak JEMALLOC_CC_SILENCE_INIT(NULL); + edata_t *to_salvage JEMALLOC_CC_SILENCE_INIT(NULL); + + extent_split_interior_result_t result = extent_split_interior(tsdn, + pac, ehooks, &edata, &lead, &trail, &to_leak, &to_salvage, NULL, + size, alignment); + + if (result == extent_split_interior_ok) { + if (lead != NULL) { + extent_record(tsdn, pac, ehooks, &pac->ecache_retained, + lead); + } + if (trail != NULL) { + extent_record(tsdn, pac, ehooks, &pac->ecache_retained, + trail); + } + } else { + /* + * We should have allocated a sufficiently large extent; the + * cant_alloc case should not occur. + */ + assert(result == extent_split_interior_error); + if (to_salvage != NULL) { + if (config_prof) { + extent_gdump_add(tsdn, to_salvage); + } + extent_record(tsdn, pac, ehooks, &pac->ecache_retained, + to_salvage); + } + if (to_leak != NULL) { + extent_deregister_no_gdump_sub(tsdn, pac, to_leak); + extents_abandon_vm(tsdn, pac, ehooks, + &pac->ecache_retained, to_leak, true); + } + goto label_err; + } + + if (*commit && !edata_committed_get(edata)) { + if (extent_commit_impl(tsdn, ehooks, edata, 0, + edata_size_get(edata), true)) { + extent_record(tsdn, pac, ehooks, + &pac->ecache_retained, edata); + goto label_err; + } + /* A successful commit should return zeroed memory. */ + if (config_debug) { + void *addr = edata_addr_get(edata); + size_t *p = (size_t *)(uintptr_t)addr; + /* Check the first page only. */ + for (size_t i = 0; i < PAGE / sizeof(size_t); i++) { + assert(p[i] == 0); + } + } + } + + /* + * Increment extent_grow_next if doing so wouldn't exceed the allowed + * range. + */ + /* All opportunities for failure are past. */ + exp_grow_size_commit(&pac->exp_grow, exp_grow_skip); + malloc_mutex_unlock(tsdn, &pac->grow_mtx); + + if (config_prof) { + /* Adjust gdump stats now that extent is final size. */ + extent_gdump_add(tsdn, edata); + } + if (zero && !edata_zeroed_get(edata)) { + ehooks_zero(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata)); + } + return edata; +label_err: + malloc_mutex_unlock(tsdn, &pac->grow_mtx); + return NULL; +} + +static edata_t * +extent_alloc_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *expand_edata, size_t size, size_t alignment, bool zero, + bool *commit, bool guarded) { + assert(size != 0); + assert(alignment != 0); + + malloc_mutex_lock(tsdn, &pac->grow_mtx); + + edata_t *edata = extent_recycle(tsdn, pac, ehooks, + &pac->ecache_retained, expand_edata, size, alignment, zero, commit, + /* growing_retained */ true, guarded); + if (edata != NULL) { + malloc_mutex_unlock(tsdn, &pac->grow_mtx); + if (config_prof) { + extent_gdump_add(tsdn, edata); + } + } else if (opt_retain && expand_edata == NULL && !guarded) { + edata = extent_grow_retained(tsdn, pac, ehooks, size, + alignment, zero, commit); + /* extent_grow_retained() always releases pac->grow_mtx. */ + } else { + malloc_mutex_unlock(tsdn, &pac->grow_mtx); + } + malloc_mutex_assert_not_owner(tsdn, &pac->grow_mtx); + + return edata; +} + +static bool +extent_coalesce(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *inner, edata_t *outer, bool forward) { + extent_assert_can_coalesce(inner, outer); + eset_remove(&ecache->eset, outer); + + bool err = extent_merge_impl(tsdn, pac, ehooks, + forward ? inner : outer, forward ? outer : inner, + /* holding_core_locks */ true); + if (err) { + extent_deactivate_check_state_locked(tsdn, pac, ecache, outer, + extent_state_merging); + } + + return err; +} + +static edata_t * +extent_try_coalesce_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *edata, bool *coalesced) { + assert(!edata_guarded_get(edata)); + /* + * We avoid checking / locking inactive neighbors for large size + * classes, since they are eagerly coalesced on deallocation which can + * cause lock contention. + */ + /* + * Continue attempting to coalesce until failure, to protect against + * races with other threads that are thwarted by this one. + */ + bool again; + do { + again = false; + + /* Try to coalesce forward. */ + edata_t *next = emap_try_acquire_edata_neighbor(tsdn, pac->emap, + edata, EXTENT_PAI_PAC, ecache->state, /* forward */ true); + if (next != NULL) { + if (!extent_coalesce(tsdn, pac, ehooks, ecache, edata, + next, true)) { + if (ecache->delay_coalesce) { + /* Do minimal coalescing. */ + *coalesced = true; + return edata; + } + again = true; + } + } + + /* Try to coalesce backward. */ + edata_t *prev = emap_try_acquire_edata_neighbor(tsdn, pac->emap, + edata, EXTENT_PAI_PAC, ecache->state, /* forward */ false); + if (prev != NULL) { + if (!extent_coalesce(tsdn, pac, ehooks, ecache, edata, + prev, false)) { + edata = prev; + if (ecache->delay_coalesce) { + /* Do minimal coalescing. */ + *coalesced = true; + return edata; + } + again = true; + } + } + } while (again); + + if (ecache->delay_coalesce) { + *coalesced = false; + } + return edata; +} + +static edata_t * +extent_try_coalesce(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *edata, bool *coalesced) { + return extent_try_coalesce_impl(tsdn, pac, ehooks, ecache, edata, + coalesced); +} + +static edata_t * +extent_try_coalesce_large(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *edata, bool *coalesced) { + return extent_try_coalesce_impl(tsdn, pac, ehooks, ecache, edata, + coalesced); +} + +/* Purge a single extent to retained / unmapped directly. */ +static void +extent_maximally_purge(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata) { + size_t extent_size = edata_size_get(edata); + extent_dalloc_wrapper(tsdn, pac, ehooks, edata); + if (config_stats) { + /* Update stats accordingly. */ + LOCKEDINT_MTX_LOCK(tsdn, *pac->stats_mtx); + locked_inc_u64(tsdn, + LOCKEDINT_MTX(*pac->stats_mtx), + &pac->stats->decay_dirty.nmadvise, 1); + locked_inc_u64(tsdn, + LOCKEDINT_MTX(*pac->stats_mtx), + &pac->stats->decay_dirty.purged, + extent_size >> LG_PAGE); + LOCKEDINT_MTX_UNLOCK(tsdn, *pac->stats_mtx); + atomic_fetch_sub_zu(&pac->stats->pac_mapped, extent_size, + ATOMIC_RELAXED); + } +} + +/* + * Does the metadata management portions of putting an unused extent into the + * given ecache_t (coalesces and inserts into the eset). + */ +void +extent_record(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, + edata_t *edata) { + assert((ecache->state != extent_state_dirty && + ecache->state != extent_state_muzzy) || + !edata_zeroed_get(edata)); + + malloc_mutex_lock(tsdn, &ecache->mtx); + + emap_assert_mapped(tsdn, pac->emap, edata); + + if (edata_guarded_get(edata)) { + goto label_skip_coalesce; + } + if (!ecache->delay_coalesce) { + edata = extent_try_coalesce(tsdn, pac, ehooks, ecache, edata, + NULL); + } else if (edata_size_get(edata) >= SC_LARGE_MINCLASS) { + assert(ecache == &pac->ecache_dirty); + /* Always coalesce large extents eagerly. */ + bool coalesced; + do { + assert(edata_state_get(edata) == extent_state_active); + edata = extent_try_coalesce_large(tsdn, pac, ehooks, + ecache, edata, &coalesced); + } while (coalesced); + if (edata_size_get(edata) >= + atomic_load_zu(&pac->oversize_threshold, ATOMIC_RELAXED) + && extent_may_force_decay(pac)) { + /* Shortcut to purge the oversize extent eagerly. */ + malloc_mutex_unlock(tsdn, &ecache->mtx); + extent_maximally_purge(tsdn, pac, ehooks, edata); + return; + } + } +label_skip_coalesce: + extent_deactivate_locked(tsdn, pac, ecache, edata); + + malloc_mutex_unlock(tsdn, &ecache->mtx); +} + +void +extent_dalloc_gap(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + if (extent_register(tsdn, pac, edata)) { + edata_cache_put(tsdn, pac->edata_cache, edata); + return; + } + extent_dalloc_wrapper(tsdn, pac, ehooks, edata); +} + +static bool +extent_dalloc_wrapper_try(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata) { + bool err; + + assert(edata_base_get(edata) != NULL); + assert(edata_size_get(edata) != 0); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + edata_addr_set(edata, edata_base_get(edata)); + + /* Try to deallocate. */ + err = ehooks_dalloc(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), edata_committed_get(edata)); + + if (!err) { + edata_cache_put(tsdn, pac->edata_cache, edata); + } + + return err; +} + +edata_t * +extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + void *new_addr, size_t size, size_t alignment, bool zero, bool *commit, + bool growing_retained) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, growing_retained ? 1 : 0); + + edata_t *edata = edata_cache_get(tsdn, pac->edata_cache); + if (edata == NULL) { + return NULL; + } + size_t palignment = ALIGNMENT_CEILING(alignment, PAGE); + void *addr = ehooks_alloc(tsdn, ehooks, new_addr, size, palignment, + &zero, commit); + if (addr == NULL) { + edata_cache_put(tsdn, pac->edata_cache, edata); + return NULL; + } + edata_init(edata, ecache_ind_get(&pac->ecache_dirty), addr, + size, /* slab */ false, SC_NSIZES, extent_sn_next(pac), + extent_state_active, zero, *commit, EXTENT_PAI_PAC, + opt_retain ? EXTENT_IS_HEAD : EXTENT_NOT_HEAD); + /* + * Retained memory is not counted towards gdump. Only if an extent is + * allocated as a separate mapping, i.e. growing_retained is false, then + * gdump should be updated. + */ + bool gdump_add = !growing_retained; + if (extent_register_impl(tsdn, pac, edata, gdump_add)) { + edata_cache_put(tsdn, pac->edata_cache, edata); + return NULL; + } + + return edata; +} + +void +extent_dalloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata) { + assert(edata_pai_get(edata) == EXTENT_PAI_PAC); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + /* Avoid calling the default extent_dalloc unless have to. */ + if (!ehooks_dalloc_will_fail(ehooks)) { + /* Remove guard pages for dalloc / unmap. */ + if (edata_guarded_get(edata)) { + assert(ehooks_are_default(ehooks)); + san_unguard_pages_two_sided(tsdn, ehooks, edata, + pac->emap); + } + /* + * Deregister first to avoid a race with other allocating + * threads, and reregister if deallocation fails. + */ + extent_deregister(tsdn, pac, edata); + if (!extent_dalloc_wrapper_try(tsdn, pac, ehooks, edata)) { + return; + } + extent_reregister(tsdn, pac, edata); + } + + /* Try to decommit; purge if that fails. */ + bool zeroed; + if (!edata_committed_get(edata)) { + zeroed = true; + } else if (!extent_decommit_wrapper(tsdn, ehooks, edata, 0, + edata_size_get(edata))) { + zeroed = true; + } else if (!ehooks_purge_forced(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), 0, edata_size_get(edata))) { + zeroed = true; + } else if (edata_state_get(edata) == extent_state_muzzy || + !ehooks_purge_lazy(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), 0, edata_size_get(edata))) { + zeroed = false; + } else { + zeroed = false; + } + edata_zeroed_set(edata, zeroed); + + if (config_prof) { + extent_gdump_sub(tsdn, edata); + } + + extent_record(tsdn, pac, ehooks, &pac->ecache_retained, edata); +} + +void +extent_destroy_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata) { + assert(edata_base_get(edata) != NULL); + assert(edata_size_get(edata) != 0); + extent_state_t state = edata_state_get(edata); + assert(state == extent_state_retained || state == extent_state_active); + assert(emap_edata_is_acquired(tsdn, pac->emap, edata)); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + if (edata_guarded_get(edata)) { + assert(opt_retain); + san_unguard_pages_pre_destroy(tsdn, ehooks, edata, pac->emap); + } + edata_addr_set(edata, edata_base_get(edata)); + + /* Try to destroy; silently fail otherwise. */ + ehooks_destroy(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), edata_committed_get(edata)); + + edata_cache_put(tsdn, pac->edata_cache, edata); +} + +static bool +extent_commit_impl(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length, bool growing_retained) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, growing_retained ? 1 : 0); + bool err = ehooks_commit(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), offset, length); + edata_committed_set(edata, edata_committed_get(edata) || !err); + return err; +} + +bool +extent_commit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length) { + return extent_commit_impl(tsdn, ehooks, edata, offset, length, + /* growing_retained */ false); +} + +bool +extent_decommit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + bool err = ehooks_decommit(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), offset, length); + edata_committed_set(edata, edata_committed_get(edata) && err); + return err; +} + +static bool +extent_purge_lazy_impl(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length, bool growing_retained) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, growing_retained ? 1 : 0); + bool err = ehooks_purge_lazy(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), offset, length); + return err; +} + +bool +extent_purge_lazy_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length) { + return extent_purge_lazy_impl(tsdn, ehooks, edata, offset, + length, false); +} + +static bool +extent_purge_forced_impl(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length, bool growing_retained) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, growing_retained ? 1 : 0); + bool err = ehooks_purge_forced(tsdn, ehooks, edata_base_get(edata), + edata_size_get(edata), offset, length); + return err; +} + +bool +extent_purge_forced_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + size_t offset, size_t length) { + return extent_purge_forced_impl(tsdn, ehooks, edata, offset, length, + false); +} + +/* + * Accepts the extent to split, and the characteristics of each side of the + * split. The 'a' parameters go with the 'lead' of the resulting pair of + * extents (the lower addressed portion of the split), and the 'b' parameters go + * with the trail (the higher addressed portion). This makes 'extent' the lead, + * and returns the trail (except in case of error). + */ +static edata_t * +extent_split_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata, size_t size_a, size_t size_b, bool holding_core_locks) { + assert(edata_size_get(edata) == size_a + size_b); + /* Only the shrink path may split w/o holding core locks. */ + if (holding_core_locks) { + witness_assert_positive_depth_to_rank( + tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE); + } else { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + } + + if (ehooks_split_will_fail(ehooks)) { + return NULL; + } + + edata_t *trail = edata_cache_get(tsdn, pac->edata_cache); + if (trail == NULL) { + goto label_error_a; + } + + edata_init(trail, edata_arena_ind_get(edata), + (void *)((uintptr_t)edata_base_get(edata) + size_a), size_b, + /* slab */ false, SC_NSIZES, edata_sn_get(edata), + edata_state_get(edata), edata_zeroed_get(edata), + edata_committed_get(edata), EXTENT_PAI_PAC, EXTENT_NOT_HEAD); + emap_prepare_t prepare; + bool err = emap_split_prepare(tsdn, pac->emap, &prepare, edata, + size_a, trail, size_b); + if (err) { + goto label_error_b; + } + + /* + * No need to acquire trail or edata, because: 1) trail was new (just + * allocated); and 2) edata is either an active allocation (the shrink + * path), or in an acquired state (extracted from the ecache on the + * extent_recycle_split path). + */ + assert(emap_edata_is_acquired(tsdn, pac->emap, edata)); + assert(emap_edata_is_acquired(tsdn, pac->emap, trail)); + + err = ehooks_split(tsdn, ehooks, edata_base_get(edata), size_a + size_b, + size_a, size_b, edata_committed_get(edata)); + + if (err) { + goto label_error_b; + } + + edata_size_set(edata, size_a); + emap_split_commit(tsdn, pac->emap, &prepare, edata, size_a, trail, + size_b); + + return trail; +label_error_b: + edata_cache_put(tsdn, pac->edata_cache, trail); +label_error_a: + return NULL; +} + +edata_t * +extent_split_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata, + size_t size_a, size_t size_b, bool holding_core_locks) { + return extent_split_impl(tsdn, pac, ehooks, edata, size_a, size_b, + holding_core_locks); +} + +static bool +extent_merge_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *a, + edata_t *b, bool holding_core_locks) { + /* Only the expanding path may merge w/o holding ecache locks. */ + if (holding_core_locks) { + witness_assert_positive_depth_to_rank( + tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE); + } else { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + } + + assert(edata_base_get(a) < edata_base_get(b)); + assert(edata_arena_ind_get(a) == edata_arena_ind_get(b)); + assert(edata_arena_ind_get(a) == ehooks_ind_get(ehooks)); + emap_assert_mapped(tsdn, pac->emap, a); + emap_assert_mapped(tsdn, pac->emap, b); + + bool err = ehooks_merge(tsdn, ehooks, edata_base_get(a), + edata_size_get(a), edata_base_get(b), edata_size_get(b), + edata_committed_get(a)); + + if (err) { + return true; + } + + /* + * The rtree writes must happen while all the relevant elements are + * owned, so the following code uses decomposed helper functions rather + * than extent_{,de}register() to do things in the right order. + */ + emap_prepare_t prepare; + emap_merge_prepare(tsdn, pac->emap, &prepare, a, b); + + assert(edata_state_get(a) == extent_state_active || + edata_state_get(a) == extent_state_merging); + edata_state_set(a, extent_state_active); + edata_size_set(a, edata_size_get(a) + edata_size_get(b)); + edata_sn_set(a, (edata_sn_get(a) < edata_sn_get(b)) ? + edata_sn_get(a) : edata_sn_get(b)); + edata_zeroed_set(a, edata_zeroed_get(a) && edata_zeroed_get(b)); + + emap_merge_commit(tsdn, pac->emap, &prepare, a, b); + + edata_cache_put(tsdn, pac->edata_cache, b); + + return false; +} + +bool +extent_merge_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *a, edata_t *b) { + return extent_merge_impl(tsdn, pac, ehooks, a, b, + /* holding_core_locks */ false); +} + +bool +extent_commit_zero(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + bool commit, bool zero, bool growing_retained) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, growing_retained ? 1 : 0); + + if (commit && !edata_committed_get(edata)) { + if (extent_commit_impl(tsdn, ehooks, edata, 0, + edata_size_get(edata), growing_retained)) { + return true; + } + } + if (zero && !edata_zeroed_get(edata)) { + void *addr = edata_base_get(edata); + size_t size = edata_size_get(edata); + ehooks_zero(tsdn, ehooks, addr, size); + } + return false; +} + +bool +extent_boot(void) { + assert(sizeof(slab_data_t) >= sizeof(e_prof_info_t)); + + if (have_dss) { + extent_dss_boot(); + } + + return false; +} diff --git a/BeefRT/JEMalloc/src/extent_dss.c b/BeefRT/JEMalloc/src/extent_dss.c new file mode 100644 index 00000000..9a35bacf --- /dev/null +++ b/BeefRT/JEMalloc/src/extent_dss.c @@ -0,0 +1,277 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/extent_dss.h" +#include "jemalloc/internal/spin.h" + +/******************************************************************************/ +/* Data. */ + +const char *opt_dss = DSS_DEFAULT; + +const char *dss_prec_names[] = { + "disabled", + "primary", + "secondary", + "N/A" +}; + +/* + * Current dss precedence default, used when creating new arenas. NB: This is + * stored as unsigned rather than dss_prec_t because in principle there's no + * guarantee that sizeof(dss_prec_t) is the same as sizeof(unsigned), and we use + * atomic operations to synchronize the setting. + */ +static atomic_u_t dss_prec_default = ATOMIC_INIT( + (unsigned)DSS_PREC_DEFAULT); + +/* Base address of the DSS. */ +static void *dss_base; +/* Atomic boolean indicating whether a thread is currently extending DSS. */ +static atomic_b_t dss_extending; +/* Atomic boolean indicating whether the DSS is exhausted. */ +static atomic_b_t dss_exhausted; +/* Atomic current upper limit on DSS addresses. */ +static atomic_p_t dss_max; + +/******************************************************************************/ + +static void * +extent_dss_sbrk(intptr_t increment) { +#ifdef JEMALLOC_DSS + return sbrk(increment); +#else + not_implemented(); + return NULL; +#endif +} + +dss_prec_t +extent_dss_prec_get(void) { + dss_prec_t ret; + + if (!have_dss) { + return dss_prec_disabled; + } + ret = (dss_prec_t)atomic_load_u(&dss_prec_default, ATOMIC_ACQUIRE); + return ret; +} + +bool +extent_dss_prec_set(dss_prec_t dss_prec) { + if (!have_dss) { + return (dss_prec != dss_prec_disabled); + } + atomic_store_u(&dss_prec_default, (unsigned)dss_prec, ATOMIC_RELEASE); + return false; +} + +static void +extent_dss_extending_start(void) { + spin_t spinner = SPIN_INITIALIZER; + while (true) { + bool expected = false; + if (atomic_compare_exchange_weak_b(&dss_extending, &expected, + true, ATOMIC_ACQ_REL, ATOMIC_RELAXED)) { + break; + } + spin_adaptive(&spinner); + } +} + +static void +extent_dss_extending_finish(void) { + assert(atomic_load_b(&dss_extending, ATOMIC_RELAXED)); + + atomic_store_b(&dss_extending, false, ATOMIC_RELEASE); +} + +static void * +extent_dss_max_update(void *new_addr) { + /* + * Get the current end of the DSS as max_cur and assure that dss_max is + * up to date. + */ + void *max_cur = extent_dss_sbrk(0); + if (max_cur == (void *)-1) { + return NULL; + } + atomic_store_p(&dss_max, max_cur, ATOMIC_RELEASE); + /* Fixed new_addr can only be supported if it is at the edge of DSS. */ + if (new_addr != NULL && max_cur != new_addr) { + return NULL; + } + return max_cur; +} + +void * +extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, + size_t alignment, bool *zero, bool *commit) { + edata_t *gap; + + cassert(have_dss); + assert(size > 0); + assert(alignment == ALIGNMENT_CEILING(alignment, PAGE)); + + /* + * sbrk() uses a signed increment argument, so take care not to + * interpret a large allocation request as a negative increment. + */ + if ((intptr_t)size < 0) { + return NULL; + } + + gap = edata_cache_get(tsdn, &arena->pa_shard.edata_cache); + if (gap == NULL) { + return NULL; + } + + extent_dss_extending_start(); + if (!atomic_load_b(&dss_exhausted, ATOMIC_ACQUIRE)) { + /* + * The loop is necessary to recover from races with other + * threads that are using the DSS for something other than + * malloc. + */ + while (true) { + void *max_cur = extent_dss_max_update(new_addr); + if (max_cur == NULL) { + goto label_oom; + } + + bool head_state = opt_retain ? EXTENT_IS_HEAD : + EXTENT_NOT_HEAD; + /* + * Compute how much page-aligned gap space (if any) is + * necessary to satisfy alignment. This space can be + * recycled for later use. + */ + void *gap_addr_page = (void *)(PAGE_CEILING( + (uintptr_t)max_cur)); + void *ret = (void *)ALIGNMENT_CEILING( + (uintptr_t)gap_addr_page, alignment); + size_t gap_size_page = (uintptr_t)ret - + (uintptr_t)gap_addr_page; + if (gap_size_page != 0) { + edata_init(gap, arena_ind_get(arena), + gap_addr_page, gap_size_page, false, + SC_NSIZES, extent_sn_next( + &arena->pa_shard.pac), + extent_state_active, false, true, + EXTENT_PAI_PAC, head_state); + } + /* + * Compute the address just past the end of the desired + * allocation space. + */ + void *dss_next = (void *)((uintptr_t)ret + size); + if ((uintptr_t)ret < (uintptr_t)max_cur || + (uintptr_t)dss_next < (uintptr_t)max_cur) { + goto label_oom; /* Wrap-around. */ + } + /* Compute the increment, including subpage bytes. */ + void *gap_addr_subpage = max_cur; + size_t gap_size_subpage = (uintptr_t)ret - + (uintptr_t)gap_addr_subpage; + intptr_t incr = gap_size_subpage + size; + + assert((uintptr_t)max_cur + incr == (uintptr_t)ret + + size); + + /* Try to allocate. */ + void *dss_prev = extent_dss_sbrk(incr); + if (dss_prev == max_cur) { + /* Success. */ + atomic_store_p(&dss_max, dss_next, + ATOMIC_RELEASE); + extent_dss_extending_finish(); + + if (gap_size_page != 0) { + ehooks_t *ehooks = arena_get_ehooks( + arena); + extent_dalloc_gap(tsdn, + &arena->pa_shard.pac, ehooks, gap); + } else { + edata_cache_put(tsdn, + &arena->pa_shard.edata_cache, gap); + } + if (!*commit) { + *commit = pages_decommit(ret, size); + } + if (*zero && *commit) { + edata_t edata = {0}; + ehooks_t *ehooks = arena_get_ehooks( + arena); + + edata_init(&edata, + arena_ind_get(arena), ret, size, + size, false, SC_NSIZES, + extent_state_active, false, true, + EXTENT_PAI_PAC, head_state); + if (extent_purge_forced_wrapper(tsdn, + ehooks, &edata, 0, size)) { + memset(ret, 0, size); + } + } + return ret; + } + /* + * Failure, whether due to OOM or a race with a raw + * sbrk() call from outside the allocator. + */ + if (dss_prev == (void *)-1) { + /* OOM. */ + atomic_store_b(&dss_exhausted, true, + ATOMIC_RELEASE); + goto label_oom; + } + } + } +label_oom: + extent_dss_extending_finish(); + edata_cache_put(tsdn, &arena->pa_shard.edata_cache, gap); + return NULL; +} + +static bool +extent_in_dss_helper(void *addr, void *max) { + return ((uintptr_t)addr >= (uintptr_t)dss_base && (uintptr_t)addr < + (uintptr_t)max); +} + +bool +extent_in_dss(void *addr) { + cassert(have_dss); + + return extent_in_dss_helper(addr, atomic_load_p(&dss_max, + ATOMIC_ACQUIRE)); +} + +bool +extent_dss_mergeable(void *addr_a, void *addr_b) { + void *max; + + cassert(have_dss); + + if ((uintptr_t)addr_a < (uintptr_t)dss_base && (uintptr_t)addr_b < + (uintptr_t)dss_base) { + return true; + } + + max = atomic_load_p(&dss_max, ATOMIC_ACQUIRE); + return (extent_in_dss_helper(addr_a, max) == + extent_in_dss_helper(addr_b, max)); +} + +void +extent_dss_boot(void) { + cassert(have_dss); + + dss_base = extent_dss_sbrk(0); + atomic_store_b(&dss_extending, false, ATOMIC_RELAXED); + atomic_store_b(&dss_exhausted, dss_base == (void *)-1, ATOMIC_RELAXED); + atomic_store_p(&dss_max, dss_base, ATOMIC_RELAXED); +} + +/******************************************************************************/ diff --git a/BeefRT/JEMalloc/src/extent_mmap.c b/BeefRT/JEMalloc/src/extent_mmap.c new file mode 100644 index 00000000..5f0ee2d2 --- /dev/null +++ b/BeefRT/JEMalloc/src/extent_mmap.c @@ -0,0 +1,41 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/extent_mmap.h" + +/******************************************************************************/ +/* Data. */ + +bool opt_retain = +#ifdef JEMALLOC_RETAIN + true +#else + false +#endif + ; + +/******************************************************************************/ + +void * +extent_alloc_mmap(void *new_addr, size_t size, size_t alignment, bool *zero, + bool *commit) { + assert(alignment == ALIGNMENT_CEILING(alignment, PAGE)); + void *ret = pages_map(new_addr, size, alignment, commit); + if (ret == NULL) { + return NULL; + } + assert(ret != NULL); + if (*commit) { + *zero = true; + } + return ret; +} + +bool +extent_dalloc_mmap(void *addr, size_t size) { + if (!opt_retain) { + pages_unmap(addr, size); + } + return opt_retain; +} diff --git a/BeefRT/JEMalloc/src/fxp.c b/BeefRT/JEMalloc/src/fxp.c new file mode 100644 index 00000000..96585f0a --- /dev/null +++ b/BeefRT/JEMalloc/src/fxp.c @@ -0,0 +1,124 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/fxp.h" + +static bool +fxp_isdigit(char c) { + return '0' <= c && c <= '9'; +} + +bool +fxp_parse(fxp_t *result, const char *str, char **end) { + /* + * Using malloc_strtoumax in this method isn't as handy as you might + * expect (I tried). In the fractional part, significant leading zeros + * mean that you still need to do your own parsing, now with trickier + * math. In the integer part, the casting (uintmax_t to uint32_t) + * forces more reasoning about bounds than just checking for overflow as + * we parse. + */ + uint32_t integer_part = 0; + + const char *cur = str; + + /* The string must start with a digit or a decimal point. */ + if (*cur != '.' && !fxp_isdigit(*cur)) { + return true; + } + + while ('0' <= *cur && *cur <= '9') { + integer_part *= 10; + integer_part += *cur - '0'; + if (integer_part >= (1U << 16)) { + return true; + } + cur++; + } + + /* + * We've parsed all digits at the beginning of the string, without + * overflow. Either we're done, or there's a fractional part. + */ + if (*cur != '.') { + *result = (integer_part << 16); + if (end != NULL) { + *end = (char *)cur; + } + return false; + } + + /* There's a fractional part. */ + cur++; + if (!fxp_isdigit(*cur)) { + /* Shouldn't end on the decimal point. */ + return true; + } + + /* + * We use a lot of precision for the fractional part, even though we'll + * discard most of it; this lets us get exact values for the important + * special case where the denominator is a small power of 2 (for + * instance, 1/512 == 0.001953125 is exactly representable even with + * only 16 bits of fractional precision). We need to left-shift by 16 + * before dividing so we pick the number of digits to be + * floor(log(2**48)) = 14. + */ + uint64_t fractional_part = 0; + uint64_t frac_div = 1; + for (int i = 0; i < FXP_FRACTIONAL_PART_DIGITS; i++) { + fractional_part *= 10; + frac_div *= 10; + if (fxp_isdigit(*cur)) { + fractional_part += *cur - '0'; + cur++; + } + } + /* + * We only parse the first maxdigits characters, but we can still ignore + * any digits after that. + */ + while (fxp_isdigit(*cur)) { + cur++; + } + + assert(fractional_part < frac_div); + uint32_t fractional_repr = (uint32_t)( + (fractional_part << 16) / frac_div); + + /* Success! */ + *result = (integer_part << 16) + fractional_repr; + if (end != NULL) { + *end = (char *)cur; + } + return false; +} + +void +fxp_print(fxp_t a, char buf[FXP_BUF_SIZE]) { + uint32_t integer_part = fxp_round_down(a); + uint32_t fractional_part = (a & ((1U << 16) - 1)); + + int leading_fraction_zeros = 0; + uint64_t fraction_digits = fractional_part; + for (int i = 0; i < FXP_FRACTIONAL_PART_DIGITS; i++) { + if (fraction_digits < (1U << 16) + && fraction_digits * 10 >= (1U << 16)) { + leading_fraction_zeros = i; + } + fraction_digits *= 10; + } + fraction_digits >>= 16; + while (fraction_digits > 0 && fraction_digits % 10 == 0) { + fraction_digits /= 10; + } + + size_t printed = malloc_snprintf(buf, FXP_BUF_SIZE, "%"FMTu32".", + integer_part); + for (int i = 0; i < leading_fraction_zeros; i++) { + buf[printed] = '0'; + printed++; + } + malloc_snprintf(&buf[printed], FXP_BUF_SIZE - printed, "%"FMTu64, + fraction_digits); +} diff --git a/BeefRT/JEMalloc/src/hook.c b/BeefRT/JEMalloc/src/hook.c new file mode 100644 index 00000000..493edbbe --- /dev/null +++ b/BeefRT/JEMalloc/src/hook.c @@ -0,0 +1,195 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +#include "jemalloc/internal/hook.h" + +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/seq.h" + +typedef struct hooks_internal_s hooks_internal_t; +struct hooks_internal_s { + hooks_t hooks; + bool in_use; +}; + +seq_define(hooks_internal_t, hooks) + +static atomic_u_t nhooks = ATOMIC_INIT(0); +static seq_hooks_t hooks[HOOK_MAX]; +static malloc_mutex_t hooks_mu; + +bool +hook_boot() { + return malloc_mutex_init(&hooks_mu, "hooks", WITNESS_RANK_HOOK, + malloc_mutex_rank_exclusive); +} + +static void * +hook_install_locked(hooks_t *to_install) { + hooks_internal_t hooks_internal; + for (int i = 0; i < HOOK_MAX; i++) { + bool success = seq_try_load_hooks(&hooks_internal, &hooks[i]); + /* We hold mu; no concurrent access. */ + assert(success); + if (!hooks_internal.in_use) { + hooks_internal.hooks = *to_install; + hooks_internal.in_use = true; + seq_store_hooks(&hooks[i], &hooks_internal); + atomic_store_u(&nhooks, + atomic_load_u(&nhooks, ATOMIC_RELAXED) + 1, + ATOMIC_RELAXED); + return &hooks[i]; + } + } + return NULL; +} + +void * +hook_install(tsdn_t *tsdn, hooks_t *to_install) { + malloc_mutex_lock(tsdn, &hooks_mu); + void *ret = hook_install_locked(to_install); + if (ret != NULL) { + tsd_global_slow_inc(tsdn); + } + malloc_mutex_unlock(tsdn, &hooks_mu); + return ret; +} + +static void +hook_remove_locked(seq_hooks_t *to_remove) { + hooks_internal_t hooks_internal; + bool success = seq_try_load_hooks(&hooks_internal, to_remove); + /* We hold mu; no concurrent access. */ + assert(success); + /* Should only remove hooks that were added. */ + assert(hooks_internal.in_use); + hooks_internal.in_use = false; + seq_store_hooks(to_remove, &hooks_internal); + atomic_store_u(&nhooks, atomic_load_u(&nhooks, ATOMIC_RELAXED) - 1, + ATOMIC_RELAXED); +} + +void +hook_remove(tsdn_t *tsdn, void *opaque) { + if (config_debug) { + char *hooks_begin = (char *)&hooks[0]; + char *hooks_end = (char *)&hooks[HOOK_MAX]; + char *hook = (char *)opaque; + assert(hooks_begin <= hook && hook < hooks_end + && (hook - hooks_begin) % sizeof(seq_hooks_t) == 0); + } + malloc_mutex_lock(tsdn, &hooks_mu); + hook_remove_locked((seq_hooks_t *)opaque); + tsd_global_slow_dec(tsdn); + malloc_mutex_unlock(tsdn, &hooks_mu); +} + +#define FOR_EACH_HOOK_BEGIN(hooks_internal_ptr) \ +for (int for_each_hook_counter = 0; \ + for_each_hook_counter < HOOK_MAX; \ + for_each_hook_counter++) { \ + bool for_each_hook_success = seq_try_load_hooks( \ + (hooks_internal_ptr), &hooks[for_each_hook_counter]); \ + if (!for_each_hook_success) { \ + continue; \ + } \ + if (!(hooks_internal_ptr)->in_use) { \ + continue; \ + } +#define FOR_EACH_HOOK_END \ +} + +static bool * +hook_reentrantp() { + /* + * We prevent user reentrancy within hooks. This is basically just a + * thread-local bool that triggers an early-exit. + * + * We don't fold in_hook into reentrancy. There are two reasons for + * this: + * - Right now, we turn on reentrancy during things like extent hook + * execution. Allocating during extent hooks is not officially + * supported, but we don't want to break it for the time being. These + * sorts of allocations should probably still be hooked, though. + * - If a hook allocates, we may want it to be relatively fast (after + * all, it executes on every allocator operation). Turning on + * reentrancy is a fairly heavyweight mode (disabling tcache, + * redirecting to arena 0, etc.). It's possible we may one day want + * to turn on reentrant mode here, if it proves too difficult to keep + * this working. But that's fairly easy for us to see; OTOH, people + * not using hooks because they're too slow is easy for us to miss. + * + * The tricky part is + * that this code might get invoked even if we don't have access to tsd. + * This function mimics getting a pointer to thread-local data, except + * that it might secretly return a pointer to some global data if we + * know that the caller will take the early-exit path. + * If we return a bool that indicates that we are reentrant, then the + * caller will go down the early exit path, leaving the global + * untouched. + */ + static bool in_hook_global = true; + tsdn_t *tsdn = tsdn_fetch(); + bool *in_hook = tsdn_in_hookp_get(tsdn); + if (in_hook!= NULL) { + return in_hook; + } + return &in_hook_global; +} + +#define HOOK_PROLOGUE \ + if (likely(atomic_load_u(&nhooks, ATOMIC_RELAXED) == 0)) { \ + return; \ + } \ + bool *in_hook = hook_reentrantp(); \ + if (*in_hook) { \ + return; \ + } \ + *in_hook = true; + +#define HOOK_EPILOGUE \ + *in_hook = false; + +void +hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw, + uintptr_t args_raw[3]) { + HOOK_PROLOGUE + + hooks_internal_t hook; + FOR_EACH_HOOK_BEGIN(&hook) + hook_alloc h = hook.hooks.alloc_hook; + if (h != NULL) { + h(hook.hooks.extra, type, result, result_raw, args_raw); + } + FOR_EACH_HOOK_END + + HOOK_EPILOGUE +} + +void +hook_invoke_dalloc(hook_dalloc_t type, void *address, uintptr_t args_raw[3]) { + HOOK_PROLOGUE + hooks_internal_t hook; + FOR_EACH_HOOK_BEGIN(&hook) + hook_dalloc h = hook.hooks.dalloc_hook; + if (h != NULL) { + h(hook.hooks.extra, type, address, args_raw); + } + FOR_EACH_HOOK_END + HOOK_EPILOGUE +} + +void +hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize, + size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]) { + HOOK_PROLOGUE + hooks_internal_t hook; + FOR_EACH_HOOK_BEGIN(&hook) + hook_expand h = hook.hooks.expand_hook; + if (h != NULL) { + h(hook.hooks.extra, type, address, old_usize, new_usize, + result_raw, args_raw); + } + FOR_EACH_HOOK_END + HOOK_EPILOGUE +} diff --git a/BeefRT/JEMalloc/src/hpa.c b/BeefRT/JEMalloc/src/hpa.c new file mode 100644 index 00000000..7e2aeba0 --- /dev/null +++ b/BeefRT/JEMalloc/src/hpa.c @@ -0,0 +1,1044 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/hpa.h" + +#include "jemalloc/internal/fb.h" +#include "jemalloc/internal/witness.h" + +#define HPA_EDEN_SIZE (128 * HUGEPAGE) + +static edata_t *hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size, + size_t alignment, bool zero, bool guarded, bool frequent_reuse, + bool *deferred_work_generated); +static size_t hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, + size_t nallocs, edata_list_active_t *results, bool *deferred_work_generated); +static bool hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated); +static bool hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool *deferred_work_generated); +static void hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated); +static void hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self, + edata_list_active_t *list, bool *deferred_work_generated); +static uint64_t hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self); + +bool +hpa_supported() { +#ifdef _WIN32 + /* + * At least until the API and implementation is somewhat settled, we + * don't want to try to debug the VM subsystem on the hardest-to-test + * platform. + */ + return false; +#endif + if (!pages_can_hugify) { + return false; + } + /* + * We fundamentally rely on a address-space-hungry growth strategy for + * hugepages. + */ + if (LG_SIZEOF_PTR != 3) { + return false; + } + /* + * If we couldn't detect the value of HUGEPAGE, HUGEPAGE_PAGES becomes + * this sentinel value -- see the comment in pages.h. + */ + if (HUGEPAGE_PAGES == 1) { + return false; + } + return true; +} + +static void +hpa_do_consistency_checks(hpa_shard_t *shard) { + assert(shard->base != NULL); +} + +bool +hpa_central_init(hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks) { + /* malloc_conf processing should have filtered out these cases. */ + assert(hpa_supported()); + bool err; + err = malloc_mutex_init(¢ral->grow_mtx, "hpa_central_grow", + WITNESS_RANK_HPA_CENTRAL_GROW, malloc_mutex_rank_exclusive); + if (err) { + return true; + } + err = malloc_mutex_init(¢ral->mtx, "hpa_central", + WITNESS_RANK_HPA_CENTRAL, malloc_mutex_rank_exclusive); + if (err) { + return true; + } + central->base = base; + central->eden = NULL; + central->eden_len = 0; + central->age_counter = 0; + central->hooks = *hooks; + return false; +} + +static hpdata_t * +hpa_alloc_ps(tsdn_t *tsdn, hpa_central_t *central) { + return (hpdata_t *)base_alloc(tsdn, central->base, sizeof(hpdata_t), + CACHELINE); +} + +hpdata_t * +hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size, + bool *oom) { + /* Don't yet support big allocations; these should get filtered out. */ + assert(size <= HUGEPAGE); + /* + * Should only try to extract from the central allocator if the local + * shard is exhausted. We should hold the grow_mtx on that shard. + */ + witness_assert_positive_depth_to_rank( + tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_HPA_SHARD_GROW); + + malloc_mutex_lock(tsdn, ¢ral->grow_mtx); + *oom = false; + + hpdata_t *ps = NULL; + + /* Is eden a perfect fit? */ + if (central->eden != NULL && central->eden_len == HUGEPAGE) { + ps = hpa_alloc_ps(tsdn, central); + if (ps == NULL) { + *oom = true; + malloc_mutex_unlock(tsdn, ¢ral->grow_mtx); + return NULL; + } + hpdata_init(ps, central->eden, central->age_counter++); + central->eden = NULL; + central->eden_len = 0; + malloc_mutex_unlock(tsdn, ¢ral->grow_mtx); + return ps; + } + + /* + * We're about to try to allocate from eden by splitting. If eden is + * NULL, we have to allocate it too. Otherwise, we just have to + * allocate an edata_t for the new psset. + */ + if (central->eden == NULL) { + /* + * During development, we're primarily concerned with systems + * with overcommit. Eventually, we should be more careful here. + */ + bool commit = true; + /* Allocate address space, bailing if we fail. */ + void *new_eden = pages_map(NULL, HPA_EDEN_SIZE, HUGEPAGE, + &commit); + if (new_eden == NULL) { + *oom = true; + malloc_mutex_unlock(tsdn, ¢ral->grow_mtx); + return NULL; + } + ps = hpa_alloc_ps(tsdn, central); + if (ps == NULL) { + pages_unmap(new_eden, HPA_EDEN_SIZE); + *oom = true; + malloc_mutex_unlock(tsdn, ¢ral->grow_mtx); + return NULL; + } + central->eden = new_eden; + central->eden_len = HPA_EDEN_SIZE; + } else { + /* Eden is already nonempty; only need an edata for ps. */ + ps = hpa_alloc_ps(tsdn, central); + if (ps == NULL) { + *oom = true; + malloc_mutex_unlock(tsdn, ¢ral->grow_mtx); + return NULL; + } + } + assert(ps != NULL); + assert(central->eden != NULL); + assert(central->eden_len > HUGEPAGE); + assert(central->eden_len % HUGEPAGE == 0); + assert(HUGEPAGE_ADDR2BASE(central->eden) == central->eden); + + hpdata_init(ps, central->eden, central->age_counter++); + + char *eden_char = (char *)central->eden; + eden_char += HUGEPAGE; + central->eden = (void *)eden_char; + central->eden_len -= HUGEPAGE; + + malloc_mutex_unlock(tsdn, ¢ral->grow_mtx); + + return ps; +} + +bool +hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap, + base_t *base, edata_cache_t *edata_cache, unsigned ind, + const hpa_shard_opts_t *opts) { + /* malloc_conf processing should have filtered out these cases. */ + assert(hpa_supported()); + bool err; + err = malloc_mutex_init(&shard->grow_mtx, "hpa_shard_grow", + WITNESS_RANK_HPA_SHARD_GROW, malloc_mutex_rank_exclusive); + if (err) { + return true; + } + err = malloc_mutex_init(&shard->mtx, "hpa_shard", + WITNESS_RANK_HPA_SHARD, malloc_mutex_rank_exclusive); + if (err) { + return true; + } + + assert(edata_cache != NULL); + shard->central = central; + shard->base = base; + edata_cache_fast_init(&shard->ecf, edata_cache); + psset_init(&shard->psset); + shard->age_counter = 0; + shard->ind = ind; + shard->emap = emap; + + shard->opts = *opts; + + shard->npending_purge = 0; + nstime_init_zero(&shard->last_purge); + + shard->stats.npurge_passes = 0; + shard->stats.npurges = 0; + shard->stats.nhugifies = 0; + shard->stats.ndehugifies = 0; + + /* + * Fill these in last, so that if an hpa_shard gets used despite + * initialization failing, we'll at least crash instead of just + * operating on corrupted data. + */ + shard->pai.alloc = &hpa_alloc; + shard->pai.alloc_batch = &hpa_alloc_batch; + shard->pai.expand = &hpa_expand; + shard->pai.shrink = &hpa_shrink; + shard->pai.dalloc = &hpa_dalloc; + shard->pai.dalloc_batch = &hpa_dalloc_batch; + shard->pai.time_until_deferred_work = &hpa_time_until_deferred_work; + + hpa_do_consistency_checks(shard); + + return false; +} + +/* + * Note that the stats functions here follow the usual stats naming conventions; + * "merge" obtains the stats from some live object of instance, while "accum" + * only combines the stats from one stats objet to another. Hence the lack of + * locking here. + */ +static void +hpa_shard_nonderived_stats_accum(hpa_shard_nonderived_stats_t *dst, + hpa_shard_nonderived_stats_t *src) { + dst->npurge_passes += src->npurge_passes; + dst->npurges += src->npurges; + dst->nhugifies += src->nhugifies; + dst->ndehugifies += src->ndehugifies; +} + +void +hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) { + psset_stats_accum(&dst->psset_stats, &src->psset_stats); + hpa_shard_nonderived_stats_accum(&dst->nonderived_stats, + &src->nonderived_stats); +} + +void +hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard, + hpa_shard_stats_t *dst) { + hpa_do_consistency_checks(shard); + + malloc_mutex_lock(tsdn, &shard->grow_mtx); + malloc_mutex_lock(tsdn, &shard->mtx); + psset_stats_accum(&dst->psset_stats, &shard->psset.stats); + hpa_shard_nonderived_stats_accum(&dst->nonderived_stats, &shard->stats); + malloc_mutex_unlock(tsdn, &shard->mtx); + malloc_mutex_unlock(tsdn, &shard->grow_mtx); +} + +static bool +hpa_good_hugification_candidate(hpa_shard_t *shard, hpdata_t *ps) { + /* + * Note that this needs to be >= rather than just >, because of the + * important special case in which the hugification threshold is exactly + * HUGEPAGE. + */ + return hpdata_nactive_get(ps) * PAGE + >= shard->opts.hugification_threshold; +} + +static size_t +hpa_adjusted_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + return psset_ndirty(&shard->psset) - shard->npending_purge; +} + +static size_t +hpa_ndirty_max(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + if (shard->opts.dirty_mult == (fxp_t)-1) { + return (size_t)-1; + } + return fxp_mul_frac(psset_nactive(&shard->psset), + shard->opts.dirty_mult); +} + +static bool +hpa_hugify_blocked_by_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + hpdata_t *to_hugify = psset_pick_hugify(&shard->psset); + if (to_hugify == NULL) { + return false; + } + return hpa_adjusted_ndirty(tsdn, shard) + + hpdata_nretained_get(to_hugify) > hpa_ndirty_max(tsdn, shard); +} + +static bool +hpa_should_purge(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + if (hpa_adjusted_ndirty(tsdn, shard) > hpa_ndirty_max(tsdn, shard)) { + return true; + } + if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) { + return true; + } + return false; +} + +static void +hpa_update_purge_hugify_eligibility(tsdn_t *tsdn, hpa_shard_t *shard, + hpdata_t *ps) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + if (hpdata_changing_state_get(ps)) { + hpdata_purge_allowed_set(ps, false); + hpdata_disallow_hugify(ps); + return; + } + /* + * Hugepages are distinctly costly to purge, so try to avoid it unless + * they're *particularly* full of dirty pages. Eventually, we should + * use a smarter / more dynamic heuristic for situations where we have + * to manually hugify. + * + * In situations where we don't manually hugify, this problem is + * reduced. The "bad" situation we're trying to avoid is one's that's + * common in some Linux configurations (where both enabled and defrag + * are set to madvise) that can lead to long latency spikes on the first + * access after a hugification. The ideal policy in such configurations + * is probably time-based for both purging and hugifying; only hugify a + * hugepage if it's met the criteria for some extended period of time, + * and only dehugify it if it's failed to meet the criteria for an + * extended period of time. When background threads are on, we should + * try to take this hit on one of them, as well. + * + * I think the ideal setting is THP always enabled, and defrag set to + * deferred; in that case we don't need any explicit calls on the + * allocator's end at all; we just try to pack allocations in a + * hugepage-friendly manner and let the OS hugify in the background. + */ + hpdata_purge_allowed_set(ps, hpdata_ndirty_get(ps) > 0); + if (hpa_good_hugification_candidate(shard, ps) + && !hpdata_huge_get(ps)) { + nstime_t now; + shard->central->hooks.curtime(&now, /* first_reading */ true); + hpdata_allow_hugify(ps, now); + } + /* + * Once a hugepage has become eligible for hugification, we don't mark + * it as ineligible just because it stops meeting the criteria (this + * could lead to situations where a hugepage that spends most of its + * time meeting the criteria never quite getting hugified if there are + * intervening deallocations). The idea is that the hugification delay + * will allow them to get purged, reseting their "hugify-allowed" bit. + * If they don't get purged, then the hugification isn't hurting and + * might help. As an exception, we don't hugify hugepages that are now + * empty; it definitely doesn't help there until the hugepage gets + * reused, which is likely not for a while. + */ + if (hpdata_nactive_get(ps) == 0) { + hpdata_disallow_hugify(ps); + } +} + +static bool +hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + hpdata_t *to_hugify = psset_pick_hugify(&shard->psset); + return to_hugify != NULL || hpa_should_purge(tsdn, shard); +} + +/* Returns whether or not we purged anything. */ +static bool +hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + + hpdata_t *to_purge = psset_pick_purge(&shard->psset); + if (to_purge == NULL) { + return false; + } + assert(hpdata_purge_allowed_get(to_purge)); + assert(!hpdata_changing_state_get(to_purge)); + + /* + * Don't let anyone else purge or hugify this page while + * we're purging it (allocations and deallocations are + * OK). + */ + psset_update_begin(&shard->psset, to_purge); + assert(hpdata_alloc_allowed_get(to_purge)); + hpdata_mid_purge_set(to_purge, true); + hpdata_purge_allowed_set(to_purge, false); + hpdata_disallow_hugify(to_purge); + /* + * Unlike with hugification (where concurrent + * allocations are allowed), concurrent allocation out + * of a hugepage being purged is unsafe; we might hand + * out an extent for an allocation and then purge it + * (clearing out user data). + */ + hpdata_alloc_allowed_set(to_purge, false); + psset_update_end(&shard->psset, to_purge); + + /* Gather all the metadata we'll need during the purge. */ + bool dehugify = hpdata_huge_get(to_purge); + hpdata_purge_state_t purge_state; + size_t num_to_purge = hpdata_purge_begin(to_purge, &purge_state); + + shard->npending_purge += num_to_purge; + + malloc_mutex_unlock(tsdn, &shard->mtx); + + /* Actually do the purging, now that the lock is dropped. */ + if (dehugify) { + shard->central->hooks.dehugify(hpdata_addr_get(to_purge), + HUGEPAGE); + } + size_t total_purged = 0; + uint64_t purges_this_pass = 0; + void *purge_addr; + size_t purge_size; + while (hpdata_purge_next(to_purge, &purge_state, &purge_addr, + &purge_size)) { + total_purged += purge_size; + assert(total_purged <= HUGEPAGE); + purges_this_pass++; + shard->central->hooks.purge(purge_addr, purge_size); + } + + malloc_mutex_lock(tsdn, &shard->mtx); + /* The shard updates */ + shard->npending_purge -= num_to_purge; + shard->stats.npurge_passes++; + shard->stats.npurges += purges_this_pass; + shard->central->hooks.curtime(&shard->last_purge, + /* first_reading */ false); + if (dehugify) { + shard->stats.ndehugifies++; + } + + /* The hpdata updates. */ + psset_update_begin(&shard->psset, to_purge); + if (dehugify) { + hpdata_dehugify(to_purge); + } + hpdata_purge_end(to_purge, &purge_state); + hpdata_mid_purge_set(to_purge, false); + + hpdata_alloc_allowed_set(to_purge, true); + hpa_update_purge_hugify_eligibility(tsdn, shard, to_purge); + + psset_update_end(&shard->psset, to_purge); + + return true; +} + +/* Returns whether or not we hugified anything. */ +static bool +hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + + if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) { + return false; + } + + hpdata_t *to_hugify = psset_pick_hugify(&shard->psset); + if (to_hugify == NULL) { + return false; + } + assert(hpdata_hugify_allowed_get(to_hugify)); + assert(!hpdata_changing_state_get(to_hugify)); + + /* Make sure that it's been hugifiable for long enough. */ + nstime_t time_hugify_allowed = hpdata_time_hugify_allowed(to_hugify); + uint64_t millis = shard->central->hooks.ms_since(&time_hugify_allowed); + if (millis < shard->opts.hugify_delay_ms) { + return false; + } + + /* + * Don't let anyone else purge or hugify this page while + * we're hugifying it (allocations and deallocations are + * OK). + */ + psset_update_begin(&shard->psset, to_hugify); + hpdata_mid_hugify_set(to_hugify, true); + hpdata_purge_allowed_set(to_hugify, false); + hpdata_disallow_hugify(to_hugify); + assert(hpdata_alloc_allowed_get(to_hugify)); + psset_update_end(&shard->psset, to_hugify); + + malloc_mutex_unlock(tsdn, &shard->mtx); + + shard->central->hooks.hugify(hpdata_addr_get(to_hugify), HUGEPAGE); + + malloc_mutex_lock(tsdn, &shard->mtx); + shard->stats.nhugifies++; + + psset_update_begin(&shard->psset, to_hugify); + hpdata_hugify(to_hugify); + hpdata_mid_hugify_set(to_hugify, false); + hpa_update_purge_hugify_eligibility(tsdn, shard, to_hugify); + psset_update_end(&shard->psset, to_hugify); + + return true; +} + +/* + * Execution of deferred work is forced if it's triggered by an explicit + * hpa_shard_do_deferred_work() call. + */ +static void +hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard, + bool forced) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + if (!forced && shard->opts.deferral_allowed) { + return; + } + /* + * If we're on a background thread, do work so long as there's work to + * be done. Otherwise, bound latency to not be *too* bad by doing at + * most a small fixed number of operations. + */ + bool hugified = false; + bool purged = false; + size_t max_ops = (forced ? (size_t)-1 : 16); + size_t nops = 0; + do { + /* + * Always purge before hugifying, to make sure we get some + * ability to hit our quiescence targets. + */ + purged = false; + while (hpa_should_purge(tsdn, shard) && nops < max_ops) { + purged = hpa_try_purge(tsdn, shard); + if (purged) { + nops++; + } + } + hugified = hpa_try_hugify(tsdn, shard); + if (hugified) { + nops++; + } + malloc_mutex_assert_owner(tsdn, &shard->mtx); + malloc_mutex_assert_owner(tsdn, &shard->mtx); + } while ((hugified || purged) && nops < max_ops); +} + +static edata_t * +hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, + bool *oom) { + bool err; + edata_t *edata = edata_cache_fast_get(tsdn, &shard->ecf); + if (edata == NULL) { + *oom = true; + return NULL; + } + + hpdata_t *ps = psset_pick_alloc(&shard->psset, size); + if (ps == NULL) { + edata_cache_fast_put(tsdn, &shard->ecf, edata); + return NULL; + } + + psset_update_begin(&shard->psset, ps); + + if (hpdata_empty(ps)) { + /* + * If the pageslab used to be empty, treat it as though it's + * brand new for fragmentation-avoidance purposes; what we're + * trying to approximate is the age of the allocations *in* that + * pageslab, and the allocations in the new pageslab are + * definitionally the youngest in this hpa shard. + */ + hpdata_age_set(ps, shard->age_counter++); + } + + void *addr = hpdata_reserve_alloc(ps, size); + edata_init(edata, shard->ind, addr, size, /* slab */ false, + SC_NSIZES, /* sn */ hpdata_age_get(ps), extent_state_active, + /* zeroed */ false, /* committed */ true, EXTENT_PAI_HPA, + EXTENT_NOT_HEAD); + edata_ps_set(edata, ps); + + /* + * This could theoretically be moved outside of the critical section, + * but that introduces the potential for a race. Without the lock, the + * (initially nonempty, since this is the reuse pathway) pageslab we + * allocated out of could become otherwise empty while the lock is + * dropped. This would force us to deal with a pageslab eviction down + * the error pathway, which is a pain. + */ + err = emap_register_boundary(tsdn, shard->emap, edata, + SC_NSIZES, /* slab */ false); + if (err) { + hpdata_unreserve(ps, edata_addr_get(edata), + edata_size_get(edata)); + /* + * We should arguably reset dirty state here, but this would + * require some sort of prepare + commit functionality that's a + * little much to deal with for now. + * + * We don't have a do_deferred_work down this pathway, on the + * principle that we didn't *really* affect shard state (we + * tweaked the stats, but our tweaks weren't really accurate). + */ + psset_update_end(&shard->psset, ps); + edata_cache_fast_put(tsdn, &shard->ecf, edata); + *oom = true; + return NULL; + } + + hpa_update_purge_hugify_eligibility(tsdn, shard, ps); + psset_update_end(&shard->psset, ps); + return edata; +} + +static size_t +hpa_try_alloc_batch_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, + bool *oom, size_t nallocs, edata_list_active_t *results, + bool *deferred_work_generated) { + malloc_mutex_lock(tsdn, &shard->mtx); + size_t nsuccess = 0; + for (; nsuccess < nallocs; nsuccess++) { + edata_t *edata = hpa_try_alloc_one_no_grow(tsdn, shard, size, + oom); + if (edata == NULL) { + break; + } + edata_list_active_append(results, edata); + } + + hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false); + *deferred_work_generated = hpa_shard_has_deferred_work(tsdn, shard); + malloc_mutex_unlock(tsdn, &shard->mtx); + return nsuccess; +} + +static size_t +hpa_alloc_batch_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, + size_t nallocs, edata_list_active_t *results, + bool *deferred_work_generated) { + assert(size <= shard->opts.slab_max_alloc); + bool oom = false; + + size_t nsuccess = hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom, + nallocs, results, deferred_work_generated); + + if (nsuccess == nallocs || oom) { + return nsuccess; + } + + /* + * We didn't OOM, but weren't able to fill everything requested of us; + * try to grow. + */ + malloc_mutex_lock(tsdn, &shard->grow_mtx); + /* + * Check for grow races; maybe some earlier thread expanded the psset + * in between when we dropped the main mutex and grabbed the grow mutex. + */ + nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom, + nallocs - nsuccess, results, deferred_work_generated); + if (nsuccess == nallocs || oom) { + malloc_mutex_unlock(tsdn, &shard->grow_mtx); + return nsuccess; + } + + /* + * Note that we don't hold shard->mtx here (while growing); + * deallocations (and allocations of smaller sizes) may still succeed + * while we're doing this potentially expensive system call. + */ + hpdata_t *ps = hpa_central_extract(tsdn, shard->central, size, &oom); + if (ps == NULL) { + malloc_mutex_unlock(tsdn, &shard->grow_mtx); + return nsuccess; + } + + /* + * We got the pageslab; allocate from it. This does an unlock followed + * by a lock on the same mutex, and holds the grow mutex while doing + * deferred work, but this is an uncommon path; the simplicity is worth + * it. + */ + malloc_mutex_lock(tsdn, &shard->mtx); + psset_insert(&shard->psset, ps); + malloc_mutex_unlock(tsdn, &shard->mtx); + + nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom, + nallocs - nsuccess, results, deferred_work_generated); + /* + * Drop grow_mtx before doing deferred work; other threads blocked on it + * should be allowed to proceed while we're working. + */ + malloc_mutex_unlock(tsdn, &shard->grow_mtx); + + return nsuccess; +} + +static hpa_shard_t * +hpa_from_pai(pai_t *self) { + assert(self->alloc = &hpa_alloc); + assert(self->expand = &hpa_expand); + assert(self->shrink = &hpa_shrink); + assert(self->dalloc = &hpa_dalloc); + return (hpa_shard_t *)self; +} + +static size_t +hpa_alloc_batch(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs, + edata_list_active_t *results, bool *deferred_work_generated) { + assert(nallocs > 0); + assert((size & PAGE_MASK) == 0); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + hpa_shard_t *shard = hpa_from_pai(self); + + if (size > shard->opts.slab_max_alloc) { + return 0; + } + + size_t nsuccess = hpa_alloc_batch_psset(tsdn, shard, size, nallocs, + results, deferred_work_generated); + + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + /* + * Guard the sanity checks with config_debug because the loop cannot be + * proven non-circular by the compiler, even if everything within the + * loop is optimized away. + */ + if (config_debug) { + edata_t *edata; + ql_foreach(edata, &results->head, ql_link_active) { + emap_assert_mapped(tsdn, shard->emap, edata); + assert(edata_pai_get(edata) == EXTENT_PAI_HPA); + assert(edata_state_get(edata) == extent_state_active); + assert(edata_arena_ind_get(edata) == shard->ind); + assert(edata_szind_get_maybe_invalid(edata) == + SC_NSIZES); + assert(!edata_slab_get(edata)); + assert(edata_committed_get(edata)); + assert(edata_base_get(edata) == edata_addr_get(edata)); + assert(edata_base_get(edata) != NULL); + } + } + return nsuccess; +} + +static edata_t * +hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero, + bool guarded, bool frequent_reuse, bool *deferred_work_generated) { + assert((size & PAGE_MASK) == 0); + assert(!guarded); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + /* We don't handle alignment or zeroing for now. */ + if (alignment > PAGE || zero) { + return NULL; + } + /* + * An alloc with alignment == PAGE and zero == false is equivalent to a + * batch alloc of 1. Just do that, so we can share code. + */ + edata_list_active_t results; + edata_list_active_init(&results); + size_t nallocs = hpa_alloc_batch(tsdn, self, size, /* nallocs */ 1, + &results, deferred_work_generated); + assert(nallocs == 0 || nallocs == 1); + edata_t *edata = edata_list_active_first(&results); + return edata; +} + +static bool +hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, + size_t new_size, bool zero, bool *deferred_work_generated) { + /* Expand not yet supported. */ + return true; +} + +static bool +hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool *deferred_work_generated) { + /* Shrink not yet supported. */ + return true; +} + +static void +hpa_dalloc_prepare_unlocked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) { + malloc_mutex_assert_not_owner(tsdn, &shard->mtx); + + assert(edata_pai_get(edata) == EXTENT_PAI_HPA); + assert(edata_state_get(edata) == extent_state_active); + assert(edata_arena_ind_get(edata) == shard->ind); + assert(edata_szind_get_maybe_invalid(edata) == SC_NSIZES); + assert(edata_committed_get(edata)); + assert(edata_base_get(edata) != NULL); + + /* + * Another thread shouldn't be trying to touch the metadata of an + * allocation being freed. The one exception is a merge attempt from a + * lower-addressed PAC extent; in this case we have a nominal race on + * the edata metadata bits, but in practice the fact that the PAI bits + * are different will prevent any further access. The race is bad, but + * benign in practice, and the long term plan is to track enough state + * in the rtree to prevent these merge attempts in the first place. + */ + edata_addr_set(edata, edata_base_get(edata)); + edata_zeroed_set(edata, false); + emap_deregister_boundary(tsdn, shard->emap, edata); +} + +static void +hpa_dalloc_locked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + + /* + * Release the metadata early, to avoid having to remember to do it + * while we're also doing tricky purging logic. First, we need to grab + * a few bits of metadata from it. + * + * Note that the shard mutex protects ps's metadata too; it wouldn't be + * correct to try to read most information out of it without the lock. + */ + hpdata_t *ps = edata_ps_get(edata); + /* Currently, all edatas come from pageslabs. */ + assert(ps != NULL); + void *unreserve_addr = edata_addr_get(edata); + size_t unreserve_size = edata_size_get(edata); + edata_cache_fast_put(tsdn, &shard->ecf, edata); + + psset_update_begin(&shard->psset, ps); + hpdata_unreserve(ps, unreserve_addr, unreserve_size); + hpa_update_purge_hugify_eligibility(tsdn, shard, ps); + psset_update_end(&shard->psset, ps); +} + +static void +hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list, + bool *deferred_work_generated) { + hpa_shard_t *shard = hpa_from_pai(self); + + edata_t *edata; + ql_foreach(edata, &list->head, ql_link_active) { + hpa_dalloc_prepare_unlocked(tsdn, shard, edata); + } + + malloc_mutex_lock(tsdn, &shard->mtx); + /* Now, remove from the list. */ + while ((edata = edata_list_active_first(list)) != NULL) { + edata_list_active_remove(list, edata); + hpa_dalloc_locked(tsdn, shard, edata); + } + hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false); + *deferred_work_generated = + hpa_shard_has_deferred_work(tsdn, shard); + + malloc_mutex_unlock(tsdn, &shard->mtx); +} + +static void +hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated) { + assert(!edata_guarded_get(edata)); + /* Just a dalloc_batch of size 1; this lets us share logic. */ + edata_list_active_t dalloc_list; + edata_list_active_init(&dalloc_list); + edata_list_active_append(&dalloc_list, edata); + hpa_dalloc_batch(tsdn, self, &dalloc_list, deferred_work_generated); +} + +/* + * Calculate time until either purging or hugification ought to happen. + * Called by background threads. + */ +static uint64_t +hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) { + hpa_shard_t *shard = hpa_from_pai(self); + uint64_t time_ns = BACKGROUND_THREAD_DEFERRED_MAX; + + malloc_mutex_lock(tsdn, &shard->mtx); + + hpdata_t *to_hugify = psset_pick_hugify(&shard->psset); + if (to_hugify != NULL) { + nstime_t time_hugify_allowed = + hpdata_time_hugify_allowed(to_hugify); + uint64_t since_hugify_allowed_ms = + shard->central->hooks.ms_since(&time_hugify_allowed); + /* + * If not enough time has passed since hugification was allowed, + * sleep for the rest. + */ + if (since_hugify_allowed_ms < shard->opts.hugify_delay_ms) { + time_ns = shard->opts.hugify_delay_ms - + since_hugify_allowed_ms; + time_ns *= 1000 * 1000; + } else { + malloc_mutex_unlock(tsdn, &shard->mtx); + return BACKGROUND_THREAD_DEFERRED_MIN; + } + } + + if (hpa_should_purge(tsdn, shard)) { + /* + * If we haven't purged before, no need to check interval + * between purges. Simply purge as soon as possible. + */ + if (shard->stats.npurge_passes == 0) { + malloc_mutex_unlock(tsdn, &shard->mtx); + return BACKGROUND_THREAD_DEFERRED_MIN; + } + uint64_t since_last_purge_ms = shard->central->hooks.ms_since( + &shard->last_purge); + + if (since_last_purge_ms < shard->opts.min_purge_interval_ms) { + uint64_t until_purge_ns; + until_purge_ns = shard->opts.min_purge_interval_ms - + since_last_purge_ms; + until_purge_ns *= 1000 * 1000; + + if (until_purge_ns < time_ns) { + time_ns = until_purge_ns; + } + } else { + time_ns = BACKGROUND_THREAD_DEFERRED_MIN; + } + } + malloc_mutex_unlock(tsdn, &shard->mtx); + return time_ns; +} + +void +hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) { + hpa_do_consistency_checks(shard); + + malloc_mutex_lock(tsdn, &shard->mtx); + edata_cache_fast_disable(tsdn, &shard->ecf); + malloc_mutex_unlock(tsdn, &shard->mtx); +} + +static void +hpa_shard_assert_stats_empty(psset_bin_stats_t *bin_stats) { + assert(bin_stats->npageslabs == 0); + assert(bin_stats->nactive == 0); +} + +static void +hpa_assert_empty(tsdn_t *tsdn, hpa_shard_t *shard, psset_t *psset) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + for (int huge = 0; huge <= 1; huge++) { + hpa_shard_assert_stats_empty(&psset->stats.full_slabs[huge]); + for (pszind_t i = 0; i < PSSET_NPSIZES; i++) { + hpa_shard_assert_stats_empty( + &psset->stats.nonfull_slabs[i][huge]); + } + } +} + +void +hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) { + hpa_do_consistency_checks(shard); + /* + * By the time we're here, the arena code should have dalloc'd all the + * active extents, which means we should have eventually evicted + * everything from the psset, so it shouldn't be able to serve even a + * 1-page allocation. + */ + if (config_debug) { + malloc_mutex_lock(tsdn, &shard->mtx); + hpa_assert_empty(tsdn, shard, &shard->psset); + malloc_mutex_unlock(tsdn, &shard->mtx); + } + hpdata_t *ps; + while ((ps = psset_pick_alloc(&shard->psset, PAGE)) != NULL) { + /* There should be no allocations anywhere. */ + assert(hpdata_empty(ps)); + psset_remove(&shard->psset, ps); + shard->central->hooks.unmap(hpdata_addr_get(ps), HUGEPAGE); + } +} + +void +hpa_shard_set_deferral_allowed(tsdn_t *tsdn, hpa_shard_t *shard, + bool deferral_allowed) { + hpa_do_consistency_checks(shard); + + malloc_mutex_lock(tsdn, &shard->mtx); + bool deferral_previously_allowed = shard->opts.deferral_allowed; + shard->opts.deferral_allowed = deferral_allowed; + if (deferral_previously_allowed && !deferral_allowed) { + hpa_shard_maybe_do_deferred_work(tsdn, shard, + /* forced */ true); + } + malloc_mutex_unlock(tsdn, &shard->mtx); +} + +void +hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) { + hpa_do_consistency_checks(shard); + + malloc_mutex_lock(tsdn, &shard->mtx); + hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ true); + malloc_mutex_unlock(tsdn, &shard->mtx); +} + +void +hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) { + hpa_do_consistency_checks(shard); + + malloc_mutex_prefork(tsdn, &shard->grow_mtx); +} + +void +hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard) { + hpa_do_consistency_checks(shard); + + malloc_mutex_prefork(tsdn, &shard->mtx); +} + +void +hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard) { + hpa_do_consistency_checks(shard); + + malloc_mutex_postfork_parent(tsdn, &shard->grow_mtx); + malloc_mutex_postfork_parent(tsdn, &shard->mtx); +} + +void +hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) { + hpa_do_consistency_checks(shard); + + malloc_mutex_postfork_child(tsdn, &shard->grow_mtx); + malloc_mutex_postfork_child(tsdn, &shard->mtx); +} diff --git a/BeefRT/JEMalloc/src/hpa_hooks.c b/BeefRT/JEMalloc/src/hpa_hooks.c new file mode 100644 index 00000000..ade581e8 --- /dev/null +++ b/BeefRT/JEMalloc/src/hpa_hooks.c @@ -0,0 +1,63 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/hpa_hooks.h" + +static void *hpa_hooks_map(size_t size); +static void hpa_hooks_unmap(void *ptr, size_t size); +static void hpa_hooks_purge(void *ptr, size_t size); +static void hpa_hooks_hugify(void *ptr, size_t size); +static void hpa_hooks_dehugify(void *ptr, size_t size); +static void hpa_hooks_curtime(nstime_t *r_nstime, bool first_reading); +static uint64_t hpa_hooks_ms_since(nstime_t *past_nstime); + +hpa_hooks_t hpa_hooks_default = { + &hpa_hooks_map, + &hpa_hooks_unmap, + &hpa_hooks_purge, + &hpa_hooks_hugify, + &hpa_hooks_dehugify, + &hpa_hooks_curtime, + &hpa_hooks_ms_since +}; + +static void * +hpa_hooks_map(size_t size) { + bool commit = true; + return pages_map(NULL, size, HUGEPAGE, &commit); +} + +static void +hpa_hooks_unmap(void *ptr, size_t size) { + pages_unmap(ptr, size); +} + +static void +hpa_hooks_purge(void *ptr, size_t size) { + pages_purge_forced(ptr, size); +} + +static void +hpa_hooks_hugify(void *ptr, size_t size) { + bool err = pages_huge(ptr, size); + (void)err; +} + +static void +hpa_hooks_dehugify(void *ptr, size_t size) { + bool err = pages_nohuge(ptr, size); + (void)err; +} + +static void +hpa_hooks_curtime(nstime_t *r_nstime, bool first_reading) { + if (first_reading) { + nstime_init_zero(r_nstime); + } + nstime_update(r_nstime); +} + +static uint64_t +hpa_hooks_ms_since(nstime_t *past_nstime) { + return nstime_ns_since(past_nstime) / 1000 / 1000; +} diff --git a/BeefRT/JEMalloc/src/hpdata.c b/BeefRT/JEMalloc/src/hpdata.c new file mode 100644 index 00000000..e7d7294c --- /dev/null +++ b/BeefRT/JEMalloc/src/hpdata.c @@ -0,0 +1,325 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/hpdata.h" + +static int +hpdata_age_comp(const hpdata_t *a, const hpdata_t *b) { + uint64_t a_age = hpdata_age_get(a); + uint64_t b_age = hpdata_age_get(b); + /* + * hpdata ages are operation counts in the psset; no two should be the + * same. + */ + assert(a_age != b_age); + return (a_age > b_age) - (a_age < b_age); +} + +ph_gen(, hpdata_age_heap, hpdata_t, age_link, hpdata_age_comp) + +void +hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age) { + hpdata_addr_set(hpdata, addr); + hpdata_age_set(hpdata, age); + hpdata->h_huge = false; + hpdata->h_alloc_allowed = true; + hpdata->h_in_psset_alloc_container = false; + hpdata->h_purge_allowed = false; + hpdata->h_hugify_allowed = false; + hpdata->h_in_psset_hugify_container = false; + hpdata->h_mid_purge = false; + hpdata->h_mid_hugify = false; + hpdata->h_updating = false; + hpdata->h_in_psset = false; + hpdata_longest_free_range_set(hpdata, HUGEPAGE_PAGES); + hpdata->h_nactive = 0; + fb_init(hpdata->active_pages, HUGEPAGE_PAGES); + hpdata->h_ntouched = 0; + fb_init(hpdata->touched_pages, HUGEPAGE_PAGES); + + hpdata_assert_consistent(hpdata); +} + +void * +hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz) { + hpdata_assert_consistent(hpdata); + /* + * This is a metadata change; the hpdata should therefore either not be + * in the psset, or should have explicitly marked itself as being + * mid-update. + */ + assert(!hpdata->h_in_psset || hpdata->h_updating); + assert(hpdata->h_alloc_allowed); + assert((sz & PAGE_MASK) == 0); + size_t npages = sz >> LG_PAGE; + assert(npages <= hpdata_longest_free_range_get(hpdata)); + + size_t result; + + size_t start = 0; + /* + * These are dead stores, but the compiler will issue warnings on them + * since it can't tell statically that found is always true below. + */ + size_t begin = 0; + size_t len = 0; + + size_t largest_unchosen_range = 0; + while (true) { + bool found = fb_urange_iter(hpdata->active_pages, + HUGEPAGE_PAGES, start, &begin, &len); + /* + * A precondition to this function is that hpdata must be able + * to serve the allocation. + */ + assert(found); + assert(len <= hpdata_longest_free_range_get(hpdata)); + if (len >= npages) { + /* + * We use first-fit within the page slabs; this gives + * bounded worst-case fragmentation within a slab. It's + * not necessarily right; we could experiment with + * various other options. + */ + break; + } + if (len > largest_unchosen_range) { + largest_unchosen_range = len; + } + start = begin + len; + } + /* We found a range; remember it. */ + result = begin; + fb_set_range(hpdata->active_pages, HUGEPAGE_PAGES, begin, npages); + hpdata->h_nactive += npages; + + /* + * We might be about to dirty some memory for the first time; update our + * count if so. + */ + size_t new_dirty = fb_ucount(hpdata->touched_pages, HUGEPAGE_PAGES, + result, npages); + fb_set_range(hpdata->touched_pages, HUGEPAGE_PAGES, result, npages); + hpdata->h_ntouched += new_dirty; + + /* + * If we allocated out of a range that was the longest in the hpdata, it + * might be the only one of that size and we'll have to adjust the + * metadata. + */ + if (len == hpdata_longest_free_range_get(hpdata)) { + start = begin + npages; + while (start < HUGEPAGE_PAGES) { + bool found = fb_urange_iter(hpdata->active_pages, + HUGEPAGE_PAGES, start, &begin, &len); + if (!found) { + break; + } + assert(len <= hpdata_longest_free_range_get(hpdata)); + if (len == hpdata_longest_free_range_get(hpdata)) { + largest_unchosen_range = len; + break; + } + if (len > largest_unchosen_range) { + largest_unchosen_range = len; + } + start = begin + len; + } + hpdata_longest_free_range_set(hpdata, largest_unchosen_range); + } + + hpdata_assert_consistent(hpdata); + return (void *)( + (uintptr_t)hpdata_addr_get(hpdata) + (result << LG_PAGE)); +} + +void +hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz) { + hpdata_assert_consistent(hpdata); + /* See the comment in reserve. */ + assert(!hpdata->h_in_psset || hpdata->h_updating); + assert(((uintptr_t)addr & PAGE_MASK) == 0); + assert((sz & PAGE_MASK) == 0); + size_t begin = ((uintptr_t)addr - (uintptr_t)hpdata_addr_get(hpdata)) + >> LG_PAGE; + assert(begin < HUGEPAGE_PAGES); + size_t npages = sz >> LG_PAGE; + size_t old_longest_range = hpdata_longest_free_range_get(hpdata); + + fb_unset_range(hpdata->active_pages, HUGEPAGE_PAGES, begin, npages); + /* We might have just created a new, larger range. */ + size_t new_begin = (fb_fls(hpdata->active_pages, HUGEPAGE_PAGES, + begin) + 1); + size_t new_end = fb_ffs(hpdata->active_pages, HUGEPAGE_PAGES, + begin + npages - 1); + size_t new_range_len = new_end - new_begin; + + if (new_range_len > old_longest_range) { + hpdata_longest_free_range_set(hpdata, new_range_len); + } + + hpdata->h_nactive -= npages; + + hpdata_assert_consistent(hpdata); +} + +size_t +hpdata_purge_begin(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) { + hpdata_assert_consistent(hpdata); + /* + * See the comment below; we might purge any inactive extent, so it's + * unsafe for any other thread to turn any inactive extent active while + * we're operating on it. + */ + assert(!hpdata_alloc_allowed_get(hpdata)); + + purge_state->npurged = 0; + purge_state->next_purge_search_begin = 0; + + /* + * Initialize to_purge. + * + * It's possible to end up in situations where two dirty extents are + * separated by a retained extent: + * - 1 page allocated. + * - 1 page allocated. + * - 1 pages allocated. + * + * If the middle page is freed and purged, and then the first and third + * pages are freed, and then another purge pass happens, the hpdata + * looks like this: + * - 1 page dirty. + * - 1 page retained. + * - 1 page dirty. + * + * But it's safe to do a single 3-page purge. + * + * We do this by first computing the dirty pages, and then filling in + * any gaps by extending each range in the dirty bitmap to extend until + * the next active page. This purges more pages, but the expensive part + * of purging is the TLB shootdowns, rather than the kernel state + * tracking; doing a little bit more of the latter is fine if it saves + * us from doing some of the former. + */ + + /* + * The dirty pages are those that are touched but not active. Note that + * in a normal-ish case, HUGEPAGE_PAGES is something like 512 and the + * fb_group_t is 64 bits, so this is 64 bytes, spread across 8 + * fb_group_ts. + */ + fb_group_t dirty_pages[FB_NGROUPS(HUGEPAGE_PAGES)]; + fb_init(dirty_pages, HUGEPAGE_PAGES); + fb_bit_not(dirty_pages, hpdata->active_pages, HUGEPAGE_PAGES); + fb_bit_and(dirty_pages, dirty_pages, hpdata->touched_pages, + HUGEPAGE_PAGES); + + fb_init(purge_state->to_purge, HUGEPAGE_PAGES); + size_t next_bit = 0; + while (next_bit < HUGEPAGE_PAGES) { + size_t next_dirty = fb_ffs(dirty_pages, HUGEPAGE_PAGES, + next_bit); + /* Recall that fb_ffs returns nbits if no set bit is found. */ + if (next_dirty == HUGEPAGE_PAGES) { + break; + } + size_t next_active = fb_ffs(hpdata->active_pages, + HUGEPAGE_PAGES, next_dirty); + /* + * Don't purge past the end of the dirty extent, into retained + * pages. This helps the kernel a tiny bit, but honestly it's + * mostly helpful for testing (where we tend to write test cases + * that think in terms of the dirty ranges). + */ + ssize_t last_dirty = fb_fls(dirty_pages, HUGEPAGE_PAGES, + next_active - 1); + assert(last_dirty >= 0); + assert((size_t)last_dirty >= next_dirty); + assert((size_t)last_dirty - next_dirty + 1 <= HUGEPAGE_PAGES); + + fb_set_range(purge_state->to_purge, HUGEPAGE_PAGES, next_dirty, + last_dirty - next_dirty + 1); + next_bit = next_active + 1; + } + + /* We should purge, at least, everything dirty. */ + size_t ndirty = hpdata->h_ntouched - hpdata->h_nactive; + purge_state->ndirty_to_purge = ndirty; + assert(ndirty <= fb_scount( + purge_state->to_purge, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES)); + assert(ndirty == fb_scount(dirty_pages, HUGEPAGE_PAGES, 0, + HUGEPAGE_PAGES)); + + hpdata_assert_consistent(hpdata); + + return ndirty; +} + +bool +hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state, + void **r_purge_addr, size_t *r_purge_size) { + /* + * Note that we don't have a consistency check here; we're accessing + * hpdata without synchronization, and therefore have no right to expect + * a consistent state. + */ + assert(!hpdata_alloc_allowed_get(hpdata)); + + if (purge_state->next_purge_search_begin == HUGEPAGE_PAGES) { + return false; + } + size_t purge_begin; + size_t purge_len; + bool found_range = fb_srange_iter(purge_state->to_purge, HUGEPAGE_PAGES, + purge_state->next_purge_search_begin, &purge_begin, &purge_len); + if (!found_range) { + return false; + } + + *r_purge_addr = (void *)( + (uintptr_t)hpdata_addr_get(hpdata) + purge_begin * PAGE); + *r_purge_size = purge_len * PAGE; + + purge_state->next_purge_search_begin = purge_begin + purge_len; + purge_state->npurged += purge_len; + assert(purge_state->npurged <= HUGEPAGE_PAGES); + + return true; +} + +void +hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) { + assert(!hpdata_alloc_allowed_get(hpdata)); + hpdata_assert_consistent(hpdata); + /* See the comment in reserve. */ + assert(!hpdata->h_in_psset || hpdata->h_updating); + + assert(purge_state->npurged == fb_scount(purge_state->to_purge, + HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES)); + assert(purge_state->npurged >= purge_state->ndirty_to_purge); + + fb_bit_not(purge_state->to_purge, purge_state->to_purge, + HUGEPAGE_PAGES); + fb_bit_and(hpdata->touched_pages, hpdata->touched_pages, + purge_state->to_purge, HUGEPAGE_PAGES); + assert(hpdata->h_ntouched >= purge_state->ndirty_to_purge); + hpdata->h_ntouched -= purge_state->ndirty_to_purge; + + hpdata_assert_consistent(hpdata); +} + +void +hpdata_hugify(hpdata_t *hpdata) { + hpdata_assert_consistent(hpdata); + hpdata->h_huge = true; + fb_set_range(hpdata->touched_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES); + hpdata->h_ntouched = HUGEPAGE_PAGES; + hpdata_assert_consistent(hpdata); +} + +void +hpdata_dehugify(hpdata_t *hpdata) { + hpdata_assert_consistent(hpdata); + hpdata->h_huge = false; + hpdata_assert_consistent(hpdata); +} diff --git a/BeefRT/JEMalloc/src/inspect.c b/BeefRT/JEMalloc/src/inspect.c new file mode 100644 index 00000000..911b5d52 --- /dev/null +++ b/BeefRT/JEMalloc/src/inspect.c @@ -0,0 +1,77 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +void +inspect_extent_util_stats_get(tsdn_t *tsdn, const void *ptr, size_t *nfree, + size_t *nregs, size_t *size) { + assert(ptr != NULL && nfree != NULL && nregs != NULL && size != NULL); + + const edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + if (unlikely(edata == NULL)) { + *nfree = *nregs = *size = 0; + return; + } + + *size = edata_size_get(edata); + if (!edata_slab_get(edata)) { + *nfree = 0; + *nregs = 1; + } else { + *nfree = edata_nfree_get(edata); + *nregs = bin_infos[edata_szind_get(edata)].nregs; + assert(*nfree <= *nregs); + assert(*nfree * edata_usize_get(edata) <= *size); + } +} + +void +inspect_extent_util_stats_verbose_get(tsdn_t *tsdn, const void *ptr, + size_t *nfree, size_t *nregs, size_t *size, size_t *bin_nfree, + size_t *bin_nregs, void **slabcur_addr) { + assert(ptr != NULL && nfree != NULL && nregs != NULL && size != NULL + && bin_nfree != NULL && bin_nregs != NULL && slabcur_addr != NULL); + + const edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + if (unlikely(edata == NULL)) { + *nfree = *nregs = *size = *bin_nfree = *bin_nregs = 0; + *slabcur_addr = NULL; + return; + } + + *size = edata_size_get(edata); + if (!edata_slab_get(edata)) { + *nfree = *bin_nfree = *bin_nregs = 0; + *nregs = 1; + *slabcur_addr = NULL; + return; + } + + *nfree = edata_nfree_get(edata); + const szind_t szind = edata_szind_get(edata); + *nregs = bin_infos[szind].nregs; + assert(*nfree <= *nregs); + assert(*nfree * edata_usize_get(edata) <= *size); + + arena_t *arena = (arena_t *)atomic_load_p( + &arenas[edata_arena_ind_get(edata)], ATOMIC_RELAXED); + assert(arena != NULL); + const unsigned binshard = edata_binshard_get(edata); + bin_t *bin = arena_get_bin(arena, szind, binshard); + + malloc_mutex_lock(tsdn, &bin->lock); + if (config_stats) { + *bin_nregs = *nregs * bin->stats.curslabs; + assert(*bin_nregs >= bin->stats.curregs); + *bin_nfree = *bin_nregs - bin->stats.curregs; + } else { + *bin_nfree = *bin_nregs = 0; + } + edata_t *slab; + if (bin->slabcur != NULL) { + slab = bin->slabcur; + } else { + slab = edata_heap_first(&bin->slabs_nonfull); + } + *slabcur_addr = slab != NULL ? edata_addr_get(slab) : NULL; + malloc_mutex_unlock(tsdn, &bin->lock); +} diff --git a/BeefRT/JEMalloc/src/jemalloc.c b/BeefRT/JEMalloc/src/jemalloc.c new file mode 100644 index 00000000..7655de4e --- /dev/null +++ b/BeefRT/JEMalloc/src/jemalloc.c @@ -0,0 +1,4476 @@ +#define JEMALLOC_C_ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/buf_writer.h" +#include "jemalloc/internal/ctl.h" +#include "jemalloc/internal/emap.h" +#include "jemalloc/internal/extent_dss.h" +#include "jemalloc/internal/extent_mmap.h" +#include "jemalloc/internal/fxp.h" +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/hook.h" +#include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/log.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/nstime.h" +#include "jemalloc/internal/rtree.h" +#include "jemalloc/internal/safety_check.h" +#include "jemalloc/internal/sc.h" +#include "jemalloc/internal/spin.h" +#include "jemalloc/internal/sz.h" +#include "jemalloc/internal/ticker.h" +#include "jemalloc/internal/thread_event.h" +#include "jemalloc/internal/util.h" + +/******************************************************************************/ +/* Data. */ + +/* Runtime configuration options. */ +const char *je_malloc_conf +#ifndef _WIN32 + JEMALLOC_ATTR(weak) +#endif + ; +/* + * The usual rule is that the closer to runtime you are, the higher priority + * your configuration settings are (so the jemalloc config options get lower + * priority than the per-binary setting, which gets lower priority than the /etc + * setting, which gets lower priority than the environment settings). + * + * But it's a fairly common use case in some testing environments for a user to + * be able to control the binary, but nothing else (e.g. a performancy canary + * uses the production OS and environment variables, but can run any binary in + * those circumstances). For these use cases, it's handy to have an in-binary + * mechanism for overriding environment variable settings, with the idea that if + * the results are positive they get promoted to the official settings, and + * moved from the binary to the environment variable. + * + * We don't actually want this to be widespread, so we'll give it a silly name + * and not mention it in headers or documentation. + */ +const char *je_malloc_conf_2_conf_harder +#ifndef _WIN32 + JEMALLOC_ATTR(weak) +#endif + ; + +bool opt_abort = +#ifdef JEMALLOC_DEBUG + true +#else + false +#endif + ; +bool opt_abort_conf = +#ifdef JEMALLOC_DEBUG + true +#else + false +#endif + ; +/* Intentionally default off, even with debug builds. */ +bool opt_confirm_conf = false; +const char *opt_junk = +#if (defined(JEMALLOC_DEBUG) && defined(JEMALLOC_FILL)) + "true" +#else + "false" +#endif + ; +bool opt_junk_alloc = +#if (defined(JEMALLOC_DEBUG) && defined(JEMALLOC_FILL)) + true +#else + false +#endif + ; +bool opt_junk_free = +#if (defined(JEMALLOC_DEBUG) && defined(JEMALLOC_FILL)) + true +#else + false +#endif + ; +bool opt_trust_madvise = +#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS + false +#else + true +#endif + ; + +bool opt_cache_oblivious = +#ifdef JEMALLOC_CACHE_OBLIVIOUS + true +#else + false +#endif + ; + +zero_realloc_action_t opt_zero_realloc_action = +#ifdef JEMALLOC_ZERO_REALLOC_DEFAULT_FREE + zero_realloc_action_free +#else + zero_realloc_action_alloc +#endif + ; + +atomic_zu_t zero_realloc_count = ATOMIC_INIT(0); + +const char *zero_realloc_mode_names[] = { + "alloc", + "free", + "abort", +}; + +/* + * These are the documented values for junk fill debugging facilities -- see the + * man page. + */ +static const uint8_t junk_alloc_byte = 0xa5; +static const uint8_t junk_free_byte = 0x5a; + +static void default_junk_alloc(void *ptr, size_t usize) { + memset(ptr, junk_alloc_byte, usize); +} + +static void default_junk_free(void *ptr, size_t usize) { + memset(ptr, junk_free_byte, usize); +} + +void (*junk_alloc_callback)(void *ptr, size_t size) = &default_junk_alloc; +void (*junk_free_callback)(void *ptr, size_t size) = &default_junk_free; + +bool opt_utrace = false; +bool opt_xmalloc = false; +bool opt_experimental_infallible_new = false; +bool opt_zero = false; +unsigned opt_narenas = 0; +fxp_t opt_narenas_ratio = FXP_INIT_INT(4); + +unsigned ncpus; + +/* Protects arenas initialization. */ +malloc_mutex_t arenas_lock; + +/* The global hpa, and whether it's on. */ +bool opt_hpa = false; +hpa_shard_opts_t opt_hpa_opts = HPA_SHARD_OPTS_DEFAULT; +sec_opts_t opt_hpa_sec_opts = SEC_OPTS_DEFAULT; + +/* + * Arenas that are used to service external requests. Not all elements of the + * arenas array are necessarily used; arenas are created lazily as needed. + * + * arenas[0..narenas_auto) are used for automatic multiplexing of threads and + * arenas. arenas[narenas_auto..narenas_total) are only used if the application + * takes some action to create them and allocate from them. + * + * Points to an arena_t. + */ +JEMALLOC_ALIGNED(CACHELINE) +atomic_p_t arenas[MALLOCX_ARENA_LIMIT]; +static atomic_u_t narenas_total; /* Use narenas_total_*(). */ +/* Below three are read-only after initialization. */ +static arena_t *a0; /* arenas[0]. */ +unsigned narenas_auto; +unsigned manual_arena_base; + +malloc_init_t malloc_init_state = malloc_init_uninitialized; + +/* False should be the common case. Set to true to trigger initialization. */ +bool malloc_slow = true; + +/* When malloc_slow is true, set the corresponding bits for sanity check. */ +enum { + flag_opt_junk_alloc = (1U), + flag_opt_junk_free = (1U << 1), + flag_opt_zero = (1U << 2), + flag_opt_utrace = (1U << 3), + flag_opt_xmalloc = (1U << 4) +}; +static uint8_t malloc_slow_flags; + +#ifdef JEMALLOC_THREADED_INIT +/* Used to let the initializing thread recursively allocate. */ +# define NO_INITIALIZER ((unsigned long)0) +# define INITIALIZER pthread_self() +# define IS_INITIALIZER (malloc_initializer == pthread_self()) +static pthread_t malloc_initializer = NO_INITIALIZER; +#else +# define NO_INITIALIZER false +# define INITIALIZER true +# define IS_INITIALIZER malloc_initializer +static bool malloc_initializer = NO_INITIALIZER; +#endif + +/* Used to avoid initialization races. */ +#ifdef _WIN32 +#if _WIN32_WINNT >= 0x0600 +static malloc_mutex_t init_lock = SRWLOCK_INIT; +#else +static malloc_mutex_t init_lock; +static bool init_lock_initialized = false; + +JEMALLOC_ATTR(constructor) +static void WINAPI +_init_init_lock(void) { + /* + * If another constructor in the same binary is using mallctl to e.g. + * set up extent hooks, it may end up running before this one, and + * malloc_init_hard will crash trying to lock the uninitialized lock. So + * we force an initialization of the lock in malloc_init_hard as well. + * We don't try to care about atomicity of the accessed to the + * init_lock_initialized boolean, since it really only matters early in + * the process creation, before any separate thread normally starts + * doing anything. + */ + if (!init_lock_initialized) { + malloc_mutex_init(&init_lock, "init", WITNESS_RANK_INIT, + malloc_mutex_rank_exclusive); + } + init_lock_initialized = true; +} + +#ifdef _MSC_VER +# pragma section(".CRT$XCU", read) +JEMALLOC_SECTION(".CRT$XCU") JEMALLOC_ATTR(used) +static const void (WINAPI *init_init_lock)(void) = _init_init_lock; +#endif +#endif +#else +static malloc_mutex_t init_lock = MALLOC_MUTEX_INITIALIZER; +#endif + +typedef struct { + void *p; /* Input pointer (as in realloc(p, s)). */ + size_t s; /* Request size. */ + void *r; /* Result pointer. */ +} malloc_utrace_t; + +#ifdef JEMALLOC_UTRACE +# define UTRACE(a, b, c) do { \ + if (unlikely(opt_utrace)) { \ + int utrace_serrno = errno; \ + malloc_utrace_t ut; \ + ut.p = (a); \ + ut.s = (b); \ + ut.r = (c); \ + UTRACE_CALL(&ut, sizeof(ut)); \ + errno = utrace_serrno; \ + } \ +} while (0) +#else +# define UTRACE(a, b, c) +#endif + +/* Whether encountered any invalid config options. */ +static bool had_conf_error = false; + +/******************************************************************************/ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + +static bool malloc_init_hard_a0(void); +static bool malloc_init_hard(void); + +/******************************************************************************/ +/* + * Begin miscellaneous support functions. + */ + +JEMALLOC_ALWAYS_INLINE bool +malloc_init_a0(void) { + if (unlikely(malloc_init_state == malloc_init_uninitialized)) { + return malloc_init_hard_a0(); + } + return false; +} + +JEMALLOC_ALWAYS_INLINE bool +malloc_init(void) { + if (unlikely(!malloc_initialized()) && malloc_init_hard()) { + return true; + } + return false; +} + +/* + * The a0*() functions are used instead of i{d,}alloc() in situations that + * cannot tolerate TLS variable access. + */ + +static void * +a0ialloc(size_t size, bool zero, bool is_internal) { + if (unlikely(malloc_init_a0())) { + return NULL; + } + + return iallocztm(TSDN_NULL, size, sz_size2index(size), zero, NULL, + is_internal, arena_get(TSDN_NULL, 0, true), true); +} + +static void +a0idalloc(void *ptr, bool is_internal) { + idalloctm(TSDN_NULL, ptr, NULL, NULL, is_internal, true); +} + +void * +a0malloc(size_t size) { + return a0ialloc(size, false, true); +} + +void +a0dalloc(void *ptr) { + a0idalloc(ptr, true); +} + +/* + * FreeBSD's libc uses the bootstrap_*() functions in bootstrap-sensitive + * situations that cannot tolerate TLS variable access (TLS allocation and very + * early internal data structure initialization). + */ + +void * +bootstrap_malloc(size_t size) { + if (unlikely(size == 0)) { + size = 1; + } + + return a0ialloc(size, false, false); +} + +void * +bootstrap_calloc(size_t num, size_t size) { + size_t num_size; + + num_size = num * size; + if (unlikely(num_size == 0)) { + assert(num == 0 || size == 0); + num_size = 1; + } + + return a0ialloc(num_size, true, false); +} + +void +bootstrap_free(void *ptr) { + if (unlikely(ptr == NULL)) { + return; + } + + a0idalloc(ptr, false); +} + +void +arena_set(unsigned ind, arena_t *arena) { + atomic_store_p(&arenas[ind], arena, ATOMIC_RELEASE); +} + +static void +narenas_total_set(unsigned narenas) { + atomic_store_u(&narenas_total, narenas, ATOMIC_RELEASE); +} + +static void +narenas_total_inc(void) { + atomic_fetch_add_u(&narenas_total, 1, ATOMIC_RELEASE); +} + +unsigned +narenas_total_get(void) { + return atomic_load_u(&narenas_total, ATOMIC_ACQUIRE); +} + +/* Create a new arena and insert it into the arenas array at index ind. */ +static arena_t * +arena_init_locked(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) { + arena_t *arena; + + assert(ind <= narenas_total_get()); + if (ind >= MALLOCX_ARENA_LIMIT) { + return NULL; + } + if (ind == narenas_total_get()) { + narenas_total_inc(); + } + + /* + * Another thread may have already initialized arenas[ind] if it's an + * auto arena. + */ + arena = arena_get(tsdn, ind, false); + if (arena != NULL) { + assert(arena_is_auto(arena)); + return arena; + } + + /* Actually initialize the arena. */ + arena = arena_new(tsdn, ind, config); + + return arena; +} + +static void +arena_new_create_background_thread(tsdn_t *tsdn, unsigned ind) { + if (ind == 0) { + return; + } + /* + * Avoid creating a new background thread just for the huge arena, which + * purges eagerly by default. + */ + if (have_background_thread && !arena_is_huge(ind)) { + if (background_thread_create(tsdn_tsd(tsdn), ind)) { + malloc_printf(": error in background thread " + "creation for arena %u. Abort.\n", ind); + abort(); + } + } +} + +arena_t * +arena_init(tsdn_t *tsdn, unsigned ind, const arena_config_t *config) { + arena_t *arena; + + malloc_mutex_lock(tsdn, &arenas_lock); + arena = arena_init_locked(tsdn, ind, config); + malloc_mutex_unlock(tsdn, &arenas_lock); + + arena_new_create_background_thread(tsdn, ind); + + return arena; +} + +static void +arena_bind(tsd_t *tsd, unsigned ind, bool internal) { + arena_t *arena = arena_get(tsd_tsdn(tsd), ind, false); + arena_nthreads_inc(arena, internal); + + if (internal) { + tsd_iarena_set(tsd, arena); + } else { + tsd_arena_set(tsd, arena); + unsigned shard = atomic_fetch_add_u(&arena->binshard_next, 1, + ATOMIC_RELAXED); + tsd_binshards_t *bins = tsd_binshardsp_get(tsd); + for (unsigned i = 0; i < SC_NBINS; i++) { + assert(bin_infos[i].n_shards > 0 && + bin_infos[i].n_shards <= BIN_SHARDS_MAX); + bins->binshard[i] = shard % bin_infos[i].n_shards; + } + } +} + +void +arena_migrate(tsd_t *tsd, arena_t *oldarena, arena_t *newarena) { + assert(oldarena != NULL); + assert(newarena != NULL); + + arena_nthreads_dec(oldarena, false); + arena_nthreads_inc(newarena, false); + tsd_arena_set(tsd, newarena); + + if (arena_nthreads_get(oldarena, false) == 0) { + /* Purge if the old arena has no associated threads anymore. */ + arena_decay(tsd_tsdn(tsd), oldarena, + /* is_background_thread */ false, /* all */ true); + } +} + +static void +arena_unbind(tsd_t *tsd, unsigned ind, bool internal) { + arena_t *arena; + + arena = arena_get(tsd_tsdn(tsd), ind, false); + arena_nthreads_dec(arena, internal); + + if (internal) { + tsd_iarena_set(tsd, NULL); + } else { + tsd_arena_set(tsd, NULL); + } +} + +/* Slow path, called only by arena_choose(). */ +arena_t * +arena_choose_hard(tsd_t *tsd, bool internal) { + arena_t *ret JEMALLOC_CC_SILENCE_INIT(NULL); + + if (have_percpu_arena && PERCPU_ARENA_ENABLED(opt_percpu_arena)) { + unsigned choose = percpu_arena_choose(); + ret = arena_get(tsd_tsdn(tsd), choose, true); + assert(ret != NULL); + arena_bind(tsd, arena_ind_get(ret), false); + arena_bind(tsd, arena_ind_get(ret), true); + + return ret; + } + + if (narenas_auto > 1) { + unsigned i, j, choose[2], first_null; + bool is_new_arena[2]; + + /* + * Determine binding for both non-internal and internal + * allocation. + * + * choose[0]: For application allocation. + * choose[1]: For internal metadata allocation. + */ + + for (j = 0; j < 2; j++) { + choose[j] = 0; + is_new_arena[j] = false; + } + + first_null = narenas_auto; + malloc_mutex_lock(tsd_tsdn(tsd), &arenas_lock); + assert(arena_get(tsd_tsdn(tsd), 0, false) != NULL); + for (i = 1; i < narenas_auto; i++) { + if (arena_get(tsd_tsdn(tsd), i, false) != NULL) { + /* + * Choose the first arena that has the lowest + * number of threads assigned to it. + */ + for (j = 0; j < 2; j++) { + if (arena_nthreads_get(arena_get( + tsd_tsdn(tsd), i, false), !!j) < + arena_nthreads_get(arena_get( + tsd_tsdn(tsd), choose[j], false), + !!j)) { + choose[j] = i; + } + } + } else if (first_null == narenas_auto) { + /* + * Record the index of the first uninitialized + * arena, in case all extant arenas are in use. + * + * NB: It is possible for there to be + * discontinuities in terms of initialized + * versus uninitialized arenas, due to the + * "thread.arena" mallctl. + */ + first_null = i; + } + } + + for (j = 0; j < 2; j++) { + if (arena_nthreads_get(arena_get(tsd_tsdn(tsd), + choose[j], false), !!j) == 0 || first_null == + narenas_auto) { + /* + * Use an unloaded arena, or the least loaded + * arena if all arenas are already initialized. + */ + if (!!j == internal) { + ret = arena_get(tsd_tsdn(tsd), + choose[j], false); + } + } else { + arena_t *arena; + + /* Initialize a new arena. */ + choose[j] = first_null; + arena = arena_init_locked(tsd_tsdn(tsd), + choose[j], &arena_config_default); + if (arena == NULL) { + malloc_mutex_unlock(tsd_tsdn(tsd), + &arenas_lock); + return NULL; + } + is_new_arena[j] = true; + if (!!j == internal) { + ret = arena; + } + } + arena_bind(tsd, choose[j], !!j); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &arenas_lock); + + for (j = 0; j < 2; j++) { + if (is_new_arena[j]) { + assert(choose[j] > 0); + arena_new_create_background_thread( + tsd_tsdn(tsd), choose[j]); + } + } + + } else { + ret = arena_get(tsd_tsdn(tsd), 0, false); + arena_bind(tsd, 0, false); + arena_bind(tsd, 0, true); + } + + return ret; +} + +void +iarena_cleanup(tsd_t *tsd) { + arena_t *iarena; + + iarena = tsd_iarena_get(tsd); + if (iarena != NULL) { + arena_unbind(tsd, arena_ind_get(iarena), true); + } +} + +void +arena_cleanup(tsd_t *tsd) { + arena_t *arena; + + arena = tsd_arena_get(tsd); + if (arena != NULL) { + arena_unbind(tsd, arena_ind_get(arena), false); + } +} + +static void +stats_print_atexit(void) { + if (config_stats) { + tsdn_t *tsdn; + unsigned narenas, i; + + tsdn = tsdn_fetch(); + + /* + * Merge stats from extant threads. This is racy, since + * individual threads do not lock when recording tcache stats + * events. As a consequence, the final stats may be slightly + * out of date by the time they are reported, if other threads + * continue to allocate. + */ + for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { + arena_t *arena = arena_get(tsdn, i, false); + if (arena != NULL) { + tcache_slow_t *tcache_slow; + + malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx); + ql_foreach(tcache_slow, &arena->tcache_ql, + link) { + tcache_stats_merge(tsdn, + tcache_slow->tcache, arena); + } + malloc_mutex_unlock(tsdn, + &arena->tcache_ql_mtx); + } + } + } + je_malloc_stats_print(NULL, NULL, opt_stats_print_opts); +} + +/* + * Ensure that we don't hold any locks upon entry to or exit from allocator + * code (in a "broad" sense that doesn't count a reentrant allocation as an + * entrance or exit). + */ +JEMALLOC_ALWAYS_INLINE void +check_entry_exit_locking(tsdn_t *tsdn) { + if (!config_debug) { + return; + } + if (tsdn_null(tsdn)) { + return; + } + tsd_t *tsd = tsdn_tsd(tsdn); + /* + * It's possible we hold locks at entry/exit if we're in a nested + * allocation. + */ + int8_t reentrancy_level = tsd_reentrancy_level_get(tsd); + if (reentrancy_level != 0) { + return; + } + witness_assert_lockless(tsdn_witness_tsdp_get(tsdn)); +} + +/* + * End miscellaneous support functions. + */ +/******************************************************************************/ +/* + * Begin initialization functions. + */ + +static char * +jemalloc_secure_getenv(const char *name) { +#ifdef JEMALLOC_HAVE_SECURE_GETENV + return secure_getenv(name); +#else +# ifdef JEMALLOC_HAVE_ISSETUGID + if (issetugid() != 0) { + return NULL; + } +# endif + return getenv(name); +#endif +} + +static unsigned +malloc_ncpus(void) { + long result; + +#ifdef _WIN32 + SYSTEM_INFO si; + GetSystemInfo(&si); + result = si.dwNumberOfProcessors; +#elif defined(CPU_COUNT) + /* + * glibc >= 2.6 has the CPU_COUNT macro. + * + * glibc's sysconf() uses isspace(). glibc allocates for the first time + * *before* setting up the isspace tables. Therefore we need a + * different method to get the number of CPUs. + * + * The getaffinity approach is also preferred when only a subset of CPUs + * is available, to avoid using more arenas than necessary. + */ + { +# if defined(__FreeBSD__) || defined(__DragonFly__) + cpuset_t set; +# else + cpu_set_t set; +# endif +# if defined(JEMALLOC_HAVE_SCHED_SETAFFINITY) + sched_getaffinity(0, sizeof(set), &set); +# else + pthread_getaffinity_np(pthread_self(), sizeof(set), &set); +# endif + result = CPU_COUNT(&set); + } +#else + result = sysconf(_SC_NPROCESSORS_ONLN); +#endif + return ((result == -1) ? 1 : (unsigned)result); +} + +/* + * Ensure that number of CPUs is determistinc, i.e. it is the same based on: + * - sched_getaffinity() + * - _SC_NPROCESSORS_ONLN + * - _SC_NPROCESSORS_CONF + * Since otherwise tricky things is possible with percpu arenas in use. + */ +static bool +malloc_cpu_count_is_deterministic() +{ +#ifdef _WIN32 + return true; +#else + long cpu_onln = sysconf(_SC_NPROCESSORS_ONLN); + long cpu_conf = sysconf(_SC_NPROCESSORS_CONF); + if (cpu_onln != cpu_conf) { + return false; + } +# if defined(CPU_COUNT) +# if defined(__FreeBSD__) || defined(__DragonFly__) + cpuset_t set; +# else + cpu_set_t set; +# endif /* __FreeBSD__ */ +# if defined(JEMALLOC_HAVE_SCHED_SETAFFINITY) + sched_getaffinity(0, sizeof(set), &set); +# else /* !JEMALLOC_HAVE_SCHED_SETAFFINITY */ + pthread_getaffinity_np(pthread_self(), sizeof(set), &set); +# endif /* JEMALLOC_HAVE_SCHED_SETAFFINITY */ + long cpu_affinity = CPU_COUNT(&set); + if (cpu_affinity != cpu_conf) { + return false; + } +# endif /* CPU_COUNT */ + return true; +#endif +} + +static void +init_opt_stats_opts(const char *v, size_t vlen, char *dest) { + size_t opts_len = strlen(dest); + assert(opts_len <= stats_print_tot_num_options); + + for (size_t i = 0; i < vlen; i++) { + switch (v[i]) { +#define OPTION(o, v, d, s) case o: break; + STATS_PRINT_OPTIONS +#undef OPTION + default: continue; + } + + if (strchr(dest, v[i]) != NULL) { + /* Ignore repeated. */ + continue; + } + + dest[opts_len++] = v[i]; + dest[opts_len] = '\0'; + assert(opts_len <= stats_print_tot_num_options); + } + assert(opts_len == strlen(dest)); +} + +/* Reads the next size pair in a multi-sized option. */ +static bool +malloc_conf_multi_sizes_next(const char **slab_size_segment_cur, + size_t *vlen_left, size_t *slab_start, size_t *slab_end, size_t *new_size) { + const char *cur = *slab_size_segment_cur; + char *end; + uintmax_t um; + + set_errno(0); + + /* First number, then '-' */ + um = malloc_strtoumax(cur, &end, 0); + if (get_errno() != 0 || *end != '-') { + return true; + } + *slab_start = (size_t)um; + cur = end + 1; + + /* Second number, then ':' */ + um = malloc_strtoumax(cur, &end, 0); + if (get_errno() != 0 || *end != ':') { + return true; + } + *slab_end = (size_t)um; + cur = end + 1; + + /* Last number */ + um = malloc_strtoumax(cur, &end, 0); + if (get_errno() != 0) { + return true; + } + *new_size = (size_t)um; + + /* Consume the separator if there is one. */ + if (*end == '|') { + end++; + } + + *vlen_left -= end - *slab_size_segment_cur; + *slab_size_segment_cur = end; + + return false; +} + +static bool +malloc_conf_next(char const **opts_p, char const **k_p, size_t *klen_p, + char const **v_p, size_t *vlen_p) { + bool accept; + const char *opts = *opts_p; + + *k_p = opts; + + for (accept = false; !accept;) { + switch (*opts) { + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + case '0': case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + case '_': + opts++; + break; + case ':': + opts++; + *klen_p = (uintptr_t)opts - 1 - (uintptr_t)*k_p; + *v_p = opts; + accept = true; + break; + case '\0': + if (opts != *opts_p) { + malloc_write(": Conf string ends " + "with key\n"); + had_conf_error = true; + } + return true; + default: + malloc_write(": Malformed conf string\n"); + had_conf_error = true; + return true; + } + } + + for (accept = false; !accept;) { + switch (*opts) { + case ',': + opts++; + /* + * Look ahead one character here, because the next time + * this function is called, it will assume that end of + * input has been cleanly reached if no input remains, + * but we have optimistically already consumed the + * comma if one exists. + */ + if (*opts == '\0') { + malloc_write(": Conf string ends " + "with comma\n"); + had_conf_error = true; + } + *vlen_p = (uintptr_t)opts - 1 - (uintptr_t)*v_p; + accept = true; + break; + case '\0': + *vlen_p = (uintptr_t)opts - (uintptr_t)*v_p; + accept = true; + break; + default: + opts++; + break; + } + } + + *opts_p = opts; + return false; +} + +static void +malloc_abort_invalid_conf(void) { + assert(opt_abort_conf); + malloc_printf(": Abort (abort_conf:true) on invalid conf " + "value (see above).\n"); + abort(); +} + +static void +malloc_conf_error(const char *msg, const char *k, size_t klen, const char *v, + size_t vlen) { + malloc_printf(": %s: %.*s:%.*s\n", msg, (int)klen, k, + (int)vlen, v); + /* If abort_conf is set, error out after processing all options. */ + const char *experimental = "experimental_"; + if (strncmp(k, experimental, strlen(experimental)) == 0) { + /* However, tolerate experimental features. */ + return; + } + had_conf_error = true; +} + +static void +malloc_slow_flag_init(void) { + /* + * Combine the runtime options into malloc_slow for fast path. Called + * after processing all the options. + */ + malloc_slow_flags |= (opt_junk_alloc ? flag_opt_junk_alloc : 0) + | (opt_junk_free ? flag_opt_junk_free : 0) + | (opt_zero ? flag_opt_zero : 0) + | (opt_utrace ? flag_opt_utrace : 0) + | (opt_xmalloc ? flag_opt_xmalloc : 0); + + malloc_slow = (malloc_slow_flags != 0); +} + +/* Number of sources for initializing malloc_conf */ +#define MALLOC_CONF_NSOURCES 5 + +static const char * +obtain_malloc_conf(unsigned which_source, char buf[PATH_MAX + 1]) { + if (config_debug) { + static unsigned read_source = 0; + /* + * Each source should only be read once, to minimize # of + * syscalls on init. + */ + assert(read_source++ == which_source); + } + assert(which_source < MALLOC_CONF_NSOURCES); + + const char *ret; + switch (which_source) { + case 0: + ret = config_malloc_conf; + break; + case 1: + if (je_malloc_conf != NULL) { + /* Use options that were compiled into the program. */ + ret = je_malloc_conf; + } else { + /* No configuration specified. */ + ret = NULL; + } + break; + case 2: { + ssize_t linklen = 0; +#ifndef _WIN32 + int saved_errno = errno; + const char *linkname = +# ifdef JEMALLOC_PREFIX + "/etc/"JEMALLOC_PREFIX"malloc.conf" +# else + "/etc/malloc.conf" +# endif + ; + + /* + * Try to use the contents of the "/etc/malloc.conf" symbolic + * link's name. + */ +#ifndef JEMALLOC_READLINKAT + linklen = readlink(linkname, buf, PATH_MAX); +#else + linklen = readlinkat(AT_FDCWD, linkname, buf, PATH_MAX); +#endif + if (linklen == -1) { + /* No configuration specified. */ + linklen = 0; + /* Restore errno. */ + set_errno(saved_errno); + } +#endif + buf[linklen] = '\0'; + ret = buf; + break; + } case 3: { + const char *envname = +#ifdef JEMALLOC_PREFIX + JEMALLOC_CPREFIX"MALLOC_CONF" +#else + "MALLOC_CONF" +#endif + ; + + if ((ret = jemalloc_secure_getenv(envname)) != NULL) { + /* + * Do nothing; opts is already initialized to the value + * of the MALLOC_CONF environment variable. + */ + } else { + /* No configuration specified. */ + ret = NULL; + } + break; + } case 4: { + ret = je_malloc_conf_2_conf_harder; + break; + } default: + not_reached(); + ret = NULL; + } + return ret; +} + +static void +malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], + bool initial_call, const char *opts_cache[MALLOC_CONF_NSOURCES], + char buf[PATH_MAX + 1]) { + static const char *opts_explain[MALLOC_CONF_NSOURCES] = { + "string specified via --with-malloc-conf", + "string pointed to by the global variable malloc_conf", + "\"name\" of the file referenced by the symbolic link named " + "/etc/malloc.conf", + "value of the environment variable MALLOC_CONF", + "string pointed to by the global variable " + "malloc_conf_2_conf_harder", + }; + unsigned i; + const char *opts, *k, *v; + size_t klen, vlen; + + for (i = 0; i < MALLOC_CONF_NSOURCES; i++) { + /* Get runtime configuration. */ + if (initial_call) { + opts_cache[i] = obtain_malloc_conf(i, buf); + } + opts = opts_cache[i]; + if (!initial_call && opt_confirm_conf) { + malloc_printf( + ": malloc_conf #%u (%s): \"%s\"\n", + i + 1, opts_explain[i], opts != NULL ? opts : ""); + } + if (opts == NULL) { + continue; + } + + while (*opts != '\0' && !malloc_conf_next(&opts, &k, &klen, &v, + &vlen)) { + +#define CONF_ERROR(msg, k, klen, v, vlen) \ + if (!initial_call) { \ + malloc_conf_error( \ + msg, k, klen, v, vlen); \ + cur_opt_valid = false; \ + } +#define CONF_CONTINUE { \ + if (!initial_call && opt_confirm_conf \ + && cur_opt_valid) { \ + malloc_printf(": -- " \ + "Set conf value: %.*s:%.*s" \ + "\n", (int)klen, k, \ + (int)vlen, v); \ + } \ + continue; \ + } +#define CONF_MATCH(n) \ + (sizeof(n)-1 == klen && strncmp(n, k, klen) == 0) +#define CONF_MATCH_VALUE(n) \ + (sizeof(n)-1 == vlen && strncmp(n, v, vlen) == 0) +#define CONF_HANDLE_BOOL(o, n) \ + if (CONF_MATCH(n)) { \ + if (CONF_MATCH_VALUE("true")) { \ + o = true; \ + } else if (CONF_MATCH_VALUE("false")) { \ + o = false; \ + } else { \ + CONF_ERROR("Invalid conf value",\ + k, klen, v, vlen); \ + } \ + CONF_CONTINUE; \ + } + /* + * One of the CONF_MIN macros below expands, in one of the use points, + * to "unsigned integer < 0", which is always false, triggering the + * GCC -Wtype-limits warning, which we disable here and re-enable below. + */ + JEMALLOC_DIAGNOSTIC_PUSH + JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS + +#define CONF_DONT_CHECK_MIN(um, min) false +#define CONF_CHECK_MIN(um, min) ((um) < (min)) +#define CONF_DONT_CHECK_MAX(um, max) false +#define CONF_CHECK_MAX(um, max) ((um) > (max)) + +#define CONF_VALUE_READ(max_t, result) \ + char *end; \ + set_errno(0); \ + result = (max_t)malloc_strtoumax(v, &end, 0); +#define CONF_VALUE_READ_FAIL() \ + (get_errno() != 0 || (uintptr_t)end - (uintptr_t)v != vlen) + +#define CONF_HANDLE_T(t, max_t, o, n, min, max, check_min, check_max, clip) \ + if (CONF_MATCH(n)) { \ + max_t mv; \ + CONF_VALUE_READ(max_t, mv) \ + if (CONF_VALUE_READ_FAIL()) { \ + CONF_ERROR("Invalid conf value",\ + k, klen, v, vlen); \ + } else if (clip) { \ + if (check_min(mv, (t)(min))) { \ + o = (t)(min); \ + } else if ( \ + check_max(mv, (t)(max))) { \ + o = (t)(max); \ + } else { \ + o = (t)mv; \ + } \ + } else { \ + if (check_min(mv, (t)(min)) || \ + check_max(mv, (t)(max))) { \ + CONF_ERROR( \ + "Out-of-range " \ + "conf value", \ + k, klen, v, vlen); \ + } else { \ + o = (t)mv; \ + } \ + } \ + CONF_CONTINUE; \ + } +#define CONF_HANDLE_T_U(t, o, n, min, max, check_min, check_max, clip) \ + CONF_HANDLE_T(t, uintmax_t, o, n, min, max, check_min, \ + check_max, clip) +#define CONF_HANDLE_T_SIGNED(t, o, n, min, max, check_min, check_max, clip)\ + CONF_HANDLE_T(t, intmax_t, o, n, min, max, check_min, \ + check_max, clip) + +#define CONF_HANDLE_UNSIGNED(o, n, min, max, check_min, check_max, \ + clip) \ + CONF_HANDLE_T_U(unsigned, o, n, min, max, \ + check_min, check_max, clip) +#define CONF_HANDLE_SIZE_T(o, n, min, max, check_min, check_max, clip) \ + CONF_HANDLE_T_U(size_t, o, n, min, max, \ + check_min, check_max, clip) +#define CONF_HANDLE_INT64_T(o, n, min, max, check_min, check_max, clip) \ + CONF_HANDLE_T_SIGNED(int64_t, o, n, min, max, \ + check_min, check_max, clip) +#define CONF_HANDLE_UINT64_T(o, n, min, max, check_min, check_max, clip)\ + CONF_HANDLE_T_U(uint64_t, o, n, min, max, \ + check_min, check_max, clip) +#define CONF_HANDLE_SSIZE_T(o, n, min, max) \ + CONF_HANDLE_T_SIGNED(ssize_t, o, n, min, max, \ + CONF_CHECK_MIN, CONF_CHECK_MAX, false) +#define CONF_HANDLE_CHAR_P(o, n, d) \ + if (CONF_MATCH(n)) { \ + size_t cpylen = (vlen <= \ + sizeof(o)-1) ? vlen : \ + sizeof(o)-1; \ + strncpy(o, v, cpylen); \ + o[cpylen] = '\0'; \ + CONF_CONTINUE; \ + } + + bool cur_opt_valid = true; + + CONF_HANDLE_BOOL(opt_confirm_conf, "confirm_conf") + if (initial_call) { + continue; + } + + CONF_HANDLE_BOOL(opt_abort, "abort") + CONF_HANDLE_BOOL(opt_abort_conf, "abort_conf") + CONF_HANDLE_BOOL(opt_trust_madvise, "trust_madvise") + if (strncmp("metadata_thp", k, klen) == 0) { + int m; + bool match = false; + for (m = 0; m < metadata_thp_mode_limit; m++) { + if (strncmp(metadata_thp_mode_names[m], + v, vlen) == 0) { + opt_metadata_thp = m; + match = true; + break; + } + } + if (!match) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + CONF_HANDLE_BOOL(opt_retain, "retain") + if (strncmp("dss", k, klen) == 0) { + int m; + bool match = false; + for (m = 0; m < dss_prec_limit; m++) { + if (strncmp(dss_prec_names[m], v, vlen) + == 0) { + if (extent_dss_prec_set(m)) { + CONF_ERROR( + "Error setting dss", + k, klen, v, vlen); + } else { + opt_dss = + dss_prec_names[m]; + match = true; + break; + } + } + } + if (!match) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + if (CONF_MATCH("narenas")) { + if (CONF_MATCH_VALUE("default")) { + opt_narenas = 0; + CONF_CONTINUE; + } else { + CONF_HANDLE_UNSIGNED(opt_narenas, + "narenas", 1, UINT_MAX, + CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, + /* clip */ false) + } + } + if (CONF_MATCH("narenas_ratio")) { + char *end; + bool err = fxp_parse(&opt_narenas_ratio, v, + &end); + if (err || (size_t)(end - v) != vlen) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + if (CONF_MATCH("bin_shards")) { + const char *bin_shards_segment_cur = v; + size_t vlen_left = vlen; + do { + size_t size_start; + size_t size_end; + size_t nshards; + bool err = malloc_conf_multi_sizes_next( + &bin_shards_segment_cur, &vlen_left, + &size_start, &size_end, &nshards); + if (err || bin_update_shard_size( + bin_shard_sizes, size_start, + size_end, nshards)) { + CONF_ERROR( + "Invalid settings for " + "bin_shards", k, klen, v, + vlen); + break; + } + } while (vlen_left > 0); + CONF_CONTINUE; + } + CONF_HANDLE_INT64_T(opt_mutex_max_spin, + "mutex_max_spin", -1, INT64_MAX, CONF_CHECK_MIN, + CONF_DONT_CHECK_MAX, false); + CONF_HANDLE_SSIZE_T(opt_dirty_decay_ms, + "dirty_decay_ms", -1, NSTIME_SEC_MAX * KQU(1000) < + QU(SSIZE_MAX) ? NSTIME_SEC_MAX * KQU(1000) : + SSIZE_MAX); + CONF_HANDLE_SSIZE_T(opt_muzzy_decay_ms, + "muzzy_decay_ms", -1, NSTIME_SEC_MAX * KQU(1000) < + QU(SSIZE_MAX) ? NSTIME_SEC_MAX * KQU(1000) : + SSIZE_MAX); + CONF_HANDLE_BOOL(opt_stats_print, "stats_print") + if (CONF_MATCH("stats_print_opts")) { + init_opt_stats_opts(v, vlen, + opt_stats_print_opts); + CONF_CONTINUE; + } + CONF_HANDLE_INT64_T(opt_stats_interval, + "stats_interval", -1, INT64_MAX, + CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, false) + if (CONF_MATCH("stats_interval_opts")) { + init_opt_stats_opts(v, vlen, + opt_stats_interval_opts); + CONF_CONTINUE; + } + if (config_fill) { + if (CONF_MATCH("junk")) { + if (CONF_MATCH_VALUE("true")) { + opt_junk = "true"; + opt_junk_alloc = opt_junk_free = + true; + } else if (CONF_MATCH_VALUE("false")) { + opt_junk = "false"; + opt_junk_alloc = opt_junk_free = + false; + } else if (CONF_MATCH_VALUE("alloc")) { + opt_junk = "alloc"; + opt_junk_alloc = true; + opt_junk_free = false; + } else if (CONF_MATCH_VALUE("free")) { + opt_junk = "free"; + opt_junk_alloc = false; + opt_junk_free = true; + } else { + CONF_ERROR( + "Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + CONF_HANDLE_BOOL(opt_zero, "zero") + } + if (config_utrace) { + CONF_HANDLE_BOOL(opt_utrace, "utrace") + } + if (config_xmalloc) { + CONF_HANDLE_BOOL(opt_xmalloc, "xmalloc") + } + if (config_enable_cxx) { + CONF_HANDLE_BOOL( + opt_experimental_infallible_new, + "experimental_infallible_new") + } + + CONF_HANDLE_BOOL(opt_tcache, "tcache") + CONF_HANDLE_SIZE_T(opt_tcache_max, "tcache_max", + 0, TCACHE_MAXCLASS_LIMIT, CONF_DONT_CHECK_MIN, + CONF_CHECK_MAX, /* clip */ true) + if (CONF_MATCH("lg_tcache_max")) { + size_t m; + CONF_VALUE_READ(size_t, m) + if (CONF_VALUE_READ_FAIL()) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } else { + /* clip if necessary */ + if (m > TCACHE_LG_MAXCLASS_LIMIT) { + m = TCACHE_LG_MAXCLASS_LIMIT; + } + opt_tcache_max = (size_t)1 << m; + } + CONF_CONTINUE; + } + /* + * Anyone trying to set a value outside -16 to 16 is + * deeply confused. + */ + CONF_HANDLE_SSIZE_T(opt_lg_tcache_nslots_mul, + "lg_tcache_nslots_mul", -16, 16) + /* Ditto with values past 2048. */ + CONF_HANDLE_UNSIGNED(opt_tcache_nslots_small_min, + "tcache_nslots_small_min", 1, 2048, + CONF_CHECK_MIN, CONF_CHECK_MAX, /* clip */ true) + CONF_HANDLE_UNSIGNED(opt_tcache_nslots_small_max, + "tcache_nslots_small_max", 1, 2048, + CONF_CHECK_MIN, CONF_CHECK_MAX, /* clip */ true) + CONF_HANDLE_UNSIGNED(opt_tcache_nslots_large, + "tcache_nslots_large", 1, 2048, + CONF_CHECK_MIN, CONF_CHECK_MAX, /* clip */ true) + CONF_HANDLE_SIZE_T(opt_tcache_gc_incr_bytes, + "tcache_gc_incr_bytes", 1024, SIZE_T_MAX, + CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, + /* clip */ true) + CONF_HANDLE_SIZE_T(opt_tcache_gc_delay_bytes, + "tcache_gc_delay_bytes", 0, SIZE_T_MAX, + CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, + /* clip */ false) + CONF_HANDLE_UNSIGNED(opt_lg_tcache_flush_small_div, + "lg_tcache_flush_small_div", 1, 16, + CONF_CHECK_MIN, CONF_CHECK_MAX, /* clip */ true) + CONF_HANDLE_UNSIGNED(opt_lg_tcache_flush_large_div, + "lg_tcache_flush_large_div", 1, 16, + CONF_CHECK_MIN, CONF_CHECK_MAX, /* clip */ true) + + /* + * The runtime option of oversize_threshold remains + * undocumented. It may be tweaked in the next major + * release (6.0). The default value 8M is rather + * conservative / safe. Tuning it further down may + * improve fragmentation a bit more, but may also cause + * contention on the huge arena. + */ + CONF_HANDLE_SIZE_T(opt_oversize_threshold, + "oversize_threshold", 0, SC_LARGE_MAXCLASS, + CONF_DONT_CHECK_MIN, CONF_CHECK_MAX, false) + CONF_HANDLE_SIZE_T(opt_lg_extent_max_active_fit, + "lg_extent_max_active_fit", 0, + (sizeof(size_t) << 3), CONF_DONT_CHECK_MIN, + CONF_CHECK_MAX, false) + + if (strncmp("percpu_arena", k, klen) == 0) { + bool match = false; + for (int m = percpu_arena_mode_names_base; m < + percpu_arena_mode_names_limit; m++) { + if (strncmp(percpu_arena_mode_names[m], + v, vlen) == 0) { + if (!have_percpu_arena) { + CONF_ERROR( + "No getcpu support", + k, klen, v, vlen); + } + opt_percpu_arena = m; + match = true; + break; + } + } + if (!match) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + CONF_HANDLE_BOOL(opt_background_thread, + "background_thread"); + CONF_HANDLE_SIZE_T(opt_max_background_threads, + "max_background_threads", 1, + opt_max_background_threads, + CONF_CHECK_MIN, CONF_CHECK_MAX, + true); + CONF_HANDLE_BOOL(opt_hpa, "hpa") + CONF_HANDLE_SIZE_T(opt_hpa_opts.slab_max_alloc, + "hpa_slab_max_alloc", PAGE, HUGEPAGE, + CONF_CHECK_MIN, CONF_CHECK_MAX, true); + + /* + * Accept either a ratio-based or an exact hugification + * threshold. + */ + CONF_HANDLE_SIZE_T(opt_hpa_opts.hugification_threshold, + "hpa_hugification_threshold", PAGE, HUGEPAGE, + CONF_CHECK_MIN, CONF_CHECK_MAX, true); + if (CONF_MATCH("hpa_hugification_threshold_ratio")) { + fxp_t ratio; + char *end; + bool err = fxp_parse(&ratio, v, + &end); + if (err || (size_t)(end - v) != vlen + || ratio > FXP_INIT_INT(1)) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } else { + opt_hpa_opts.hugification_threshold = + fxp_mul_frac(HUGEPAGE, ratio); + } + CONF_CONTINUE; + } + + CONF_HANDLE_UINT64_T( + opt_hpa_opts.hugify_delay_ms, "hpa_hugify_delay_ms", + 0, 0, CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, + false); + + CONF_HANDLE_UINT64_T( + opt_hpa_opts.min_purge_interval_ms, + "hpa_min_purge_interval_ms", 0, 0, + CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, false); + + if (CONF_MATCH("hpa_dirty_mult")) { + if (CONF_MATCH_VALUE("-1")) { + opt_hpa_opts.dirty_mult = (fxp_t)-1; + CONF_CONTINUE; + } + fxp_t ratio; + char *end; + bool err = fxp_parse(&ratio, v, + &end); + if (err || (size_t)(end - v) != vlen) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } else { + opt_hpa_opts.dirty_mult = ratio; + } + CONF_CONTINUE; + } + + CONF_HANDLE_SIZE_T(opt_hpa_sec_opts.nshards, + "hpa_sec_nshards", 0, 0, CONF_CHECK_MIN, + CONF_DONT_CHECK_MAX, true); + CONF_HANDLE_SIZE_T(opt_hpa_sec_opts.max_alloc, + "hpa_sec_max_alloc", PAGE, 0, CONF_CHECK_MIN, + CONF_DONT_CHECK_MAX, true); + CONF_HANDLE_SIZE_T(opt_hpa_sec_opts.max_bytes, + "hpa_sec_max_bytes", PAGE, 0, CONF_CHECK_MIN, + CONF_DONT_CHECK_MAX, true); + CONF_HANDLE_SIZE_T(opt_hpa_sec_opts.bytes_after_flush, + "hpa_sec_bytes_after_flush", PAGE, 0, + CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true); + CONF_HANDLE_SIZE_T(opt_hpa_sec_opts.batch_fill_extra, + "hpa_sec_batch_fill_extra", 0, HUGEPAGE_PAGES, + CONF_CHECK_MIN, CONF_CHECK_MAX, true); + + if (CONF_MATCH("slab_sizes")) { + if (CONF_MATCH_VALUE("default")) { + sc_data_init(sc_data); + CONF_CONTINUE; + } + bool err; + const char *slab_size_segment_cur = v; + size_t vlen_left = vlen; + do { + size_t slab_start; + size_t slab_end; + size_t pgs; + err = malloc_conf_multi_sizes_next( + &slab_size_segment_cur, + &vlen_left, &slab_start, &slab_end, + &pgs); + if (!err) { + sc_data_update_slab_size( + sc_data, slab_start, + slab_end, (int)pgs); + } else { + CONF_ERROR("Invalid settings " + "for slab_sizes", + k, klen, v, vlen); + } + } while (!err && vlen_left > 0); + CONF_CONTINUE; + } + if (config_prof) { + CONF_HANDLE_BOOL(opt_prof, "prof") + CONF_HANDLE_CHAR_P(opt_prof_prefix, + "prof_prefix", "jeprof") + CONF_HANDLE_BOOL(opt_prof_active, "prof_active") + CONF_HANDLE_BOOL(opt_prof_thread_active_init, + "prof_thread_active_init") + CONF_HANDLE_SIZE_T(opt_lg_prof_sample, + "lg_prof_sample", 0, (sizeof(uint64_t) << 3) + - 1, CONF_DONT_CHECK_MIN, CONF_CHECK_MAX, + true) + CONF_HANDLE_BOOL(opt_prof_accum, "prof_accum") + CONF_HANDLE_SSIZE_T(opt_lg_prof_interval, + "lg_prof_interval", -1, + (sizeof(uint64_t) << 3) - 1) + CONF_HANDLE_BOOL(opt_prof_gdump, "prof_gdump") + CONF_HANDLE_BOOL(opt_prof_final, "prof_final") + CONF_HANDLE_BOOL(opt_prof_leak, "prof_leak") + CONF_HANDLE_BOOL(opt_prof_leak_error, + "prof_leak_error") + CONF_HANDLE_BOOL(opt_prof_log, "prof_log") + CONF_HANDLE_SSIZE_T(opt_prof_recent_alloc_max, + "prof_recent_alloc_max", -1, SSIZE_MAX) + CONF_HANDLE_BOOL(opt_prof_stats, "prof_stats") + CONF_HANDLE_BOOL(opt_prof_sys_thread_name, + "prof_sys_thread_name") + if (CONF_MATCH("prof_time_resolution")) { + if (CONF_MATCH_VALUE("default")) { + opt_prof_time_res = + prof_time_res_default; + } else if (CONF_MATCH_VALUE("high")) { + if (!config_high_res_timer) { + CONF_ERROR( + "No high resolution" + " timer support", + k, klen, v, vlen); + } else { + opt_prof_time_res = + prof_time_res_high; + } + } else { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + /* + * Undocumented. When set to false, don't + * correct for an unbiasing bug in jeprof + * attribution. This can be handy if you want + * to get consistent numbers from your binary + * across different jemalloc versions, even if + * those numbers are incorrect. The default is + * true. + */ + CONF_HANDLE_BOOL(opt_prof_unbias, "prof_unbias") + } + if (config_log) { + if (CONF_MATCH("log")) { + size_t cpylen = ( + vlen <= sizeof(log_var_names) ? + vlen : sizeof(log_var_names) - 1); + strncpy(log_var_names, v, cpylen); + log_var_names[cpylen] = '\0'; + CONF_CONTINUE; + } + } + if (CONF_MATCH("thp")) { + bool match = false; + for (int m = 0; m < thp_mode_names_limit; m++) { + if (strncmp(thp_mode_names[m],v, vlen) + == 0) { + if (!have_madvise_huge && !have_memcntl) { + CONF_ERROR( + "No THP support", + k, klen, v, vlen); + } + opt_thp = m; + match = true; + break; + } + } + if (!match) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + if (CONF_MATCH("zero_realloc")) { + if (CONF_MATCH_VALUE("alloc")) { + opt_zero_realloc_action + = zero_realloc_action_alloc; + } else if (CONF_MATCH_VALUE("free")) { + opt_zero_realloc_action + = zero_realloc_action_free; + } else if (CONF_MATCH_VALUE("abort")) { + opt_zero_realloc_action + = zero_realloc_action_abort; + } else { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + CONF_CONTINUE; + } + if (config_uaf_detection && + CONF_MATCH("lg_san_uaf_align")) { + ssize_t a; + CONF_VALUE_READ(ssize_t, a) + if (CONF_VALUE_READ_FAIL() || a < -1) { + CONF_ERROR("Invalid conf value", + k, klen, v, vlen); + } + if (a == -1) { + opt_lg_san_uaf_align = -1; + CONF_CONTINUE; + } + + /* clip if necessary */ + ssize_t max_allowed = (sizeof(size_t) << 3) - 1; + ssize_t min_allowed = LG_PAGE; + if (a > max_allowed) { + a = max_allowed; + } else if (a < min_allowed) { + a = min_allowed; + } + + opt_lg_san_uaf_align = a; + CONF_CONTINUE; + } + + CONF_HANDLE_SIZE_T(opt_san_guard_small, + "san_guard_small", 0, SIZE_T_MAX, + CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, false) + CONF_HANDLE_SIZE_T(opt_san_guard_large, + "san_guard_large", 0, SIZE_T_MAX, + CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, false) + + CONF_ERROR("Invalid conf pair", k, klen, v, vlen); +#undef CONF_ERROR +#undef CONF_CONTINUE +#undef CONF_MATCH +#undef CONF_MATCH_VALUE +#undef CONF_HANDLE_BOOL +#undef CONF_DONT_CHECK_MIN +#undef CONF_CHECK_MIN +#undef CONF_DONT_CHECK_MAX +#undef CONF_CHECK_MAX +#undef CONF_HANDLE_T +#undef CONF_HANDLE_T_U +#undef CONF_HANDLE_T_SIGNED +#undef CONF_HANDLE_UNSIGNED +#undef CONF_HANDLE_SIZE_T +#undef CONF_HANDLE_SSIZE_T +#undef CONF_HANDLE_CHAR_P + /* Re-enable diagnostic "-Wtype-limits" */ + JEMALLOC_DIAGNOSTIC_POP + } + if (opt_abort_conf && had_conf_error) { + malloc_abort_invalid_conf(); + } + } + atomic_store_b(&log_init_done, true, ATOMIC_RELEASE); +} + +static bool +malloc_conf_init_check_deps(void) { + if (opt_prof_leak_error && !opt_prof_final) { + malloc_printf(": prof_leak_error is set w/o " + "prof_final.\n"); + return true; + } + + return false; +} + +static void +malloc_conf_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]) { + const char *opts_cache[MALLOC_CONF_NSOURCES] = {NULL, NULL, NULL, NULL, + NULL}; + char buf[PATH_MAX + 1]; + + /* The first call only set the confirm_conf option and opts_cache */ + malloc_conf_init_helper(NULL, NULL, true, opts_cache, buf); + malloc_conf_init_helper(sc_data, bin_shard_sizes, false, opts_cache, + NULL); + if (malloc_conf_init_check_deps()) { + /* check_deps does warning msg only; abort below if needed. */ + if (opt_abort_conf) { + malloc_abort_invalid_conf(); + } + } +} + +#undef MALLOC_CONF_NSOURCES + +static bool +malloc_init_hard_needed(void) { + if (malloc_initialized() || (IS_INITIALIZER && malloc_init_state == + malloc_init_recursible)) { + /* + * Another thread initialized the allocator before this one + * acquired init_lock, or this thread is the initializing + * thread, and it is recursively allocating. + */ + return false; + } +#ifdef JEMALLOC_THREADED_INIT + if (malloc_initializer != NO_INITIALIZER && !IS_INITIALIZER) { + /* Busy-wait until the initializing thread completes. */ + spin_t spinner = SPIN_INITIALIZER; + do { + malloc_mutex_unlock(TSDN_NULL, &init_lock); + spin_adaptive(&spinner); + malloc_mutex_lock(TSDN_NULL, &init_lock); + } while (!malloc_initialized()); + return false; + } +#endif + return true; +} + +static bool +malloc_init_hard_a0_locked() { + malloc_initializer = INITIALIZER; + + JEMALLOC_DIAGNOSTIC_PUSH + JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS + sc_data_t sc_data = {0}; + JEMALLOC_DIAGNOSTIC_POP + + /* + * Ordering here is somewhat tricky; we need sc_boot() first, since that + * determines what the size classes will be, and then + * malloc_conf_init(), since any slab size tweaking will need to be done + * before sz_boot and bin_info_boot, which assume that the values they + * read out of sc_data_global are final. + */ + sc_boot(&sc_data); + unsigned bin_shard_sizes[SC_NBINS]; + bin_shard_sizes_boot(bin_shard_sizes); + /* + * prof_boot0 only initializes opt_prof_prefix. We need to do it before + * we parse malloc_conf options, in case malloc_conf parsing overwrites + * it. + */ + if (config_prof) { + prof_boot0(); + } + malloc_conf_init(&sc_data, bin_shard_sizes); + san_init(opt_lg_san_uaf_align); + sz_boot(&sc_data, opt_cache_oblivious); + bin_info_boot(&sc_data, bin_shard_sizes); + + if (opt_stats_print) { + /* Print statistics at exit. */ + if (atexit(stats_print_atexit) != 0) { + malloc_write(": Error in atexit()\n"); + if (opt_abort) { + abort(); + } + } + } + + if (stats_boot()) { + return true; + } + if (pages_boot()) { + return true; + } + if (base_boot(TSDN_NULL)) { + return true; + } + /* emap_global is static, hence zeroed. */ + if (emap_init(&arena_emap_global, b0get(), /* zeroed */ true)) { + return true; + } + if (extent_boot()) { + return true; + } + if (ctl_boot()) { + return true; + } + if (config_prof) { + prof_boot1(); + } + if (opt_hpa && !hpa_supported()) { + malloc_printf(": HPA not supported in the current " + "configuration; %s.", + opt_abort_conf ? "aborting" : "disabling"); + if (opt_abort_conf) { + malloc_abort_invalid_conf(); + } else { + opt_hpa = false; + } + } + if (arena_boot(&sc_data, b0get(), opt_hpa)) { + return true; + } + if (tcache_boot(TSDN_NULL, b0get())) { + return true; + } + if (malloc_mutex_init(&arenas_lock, "arenas", WITNESS_RANK_ARENAS, + malloc_mutex_rank_exclusive)) { + return true; + } + hook_boot(); + /* + * Create enough scaffolding to allow recursive allocation in + * malloc_ncpus(). + */ + narenas_auto = 1; + manual_arena_base = narenas_auto + 1; + memset(arenas, 0, sizeof(arena_t *) * narenas_auto); + /* + * Initialize one arena here. The rest are lazily created in + * arena_choose_hard(). + */ + if (arena_init(TSDN_NULL, 0, &arena_config_default) == NULL) { + return true; + } + a0 = arena_get(TSDN_NULL, 0, false); + + if (opt_hpa && !hpa_supported()) { + malloc_printf(": HPA not supported in the current " + "configuration; %s.", + opt_abort_conf ? "aborting" : "disabling"); + if (opt_abort_conf) { + malloc_abort_invalid_conf(); + } else { + opt_hpa = false; + } + } else if (opt_hpa) { + hpa_shard_opts_t hpa_shard_opts = opt_hpa_opts; + hpa_shard_opts.deferral_allowed = background_thread_enabled(); + if (pa_shard_enable_hpa(TSDN_NULL, &a0->pa_shard, + &hpa_shard_opts, &opt_hpa_sec_opts)) { + return true; + } + } + + malloc_init_state = malloc_init_a0_initialized; + + return false; +} + +static bool +malloc_init_hard_a0(void) { + bool ret; + + malloc_mutex_lock(TSDN_NULL, &init_lock); + ret = malloc_init_hard_a0_locked(); + malloc_mutex_unlock(TSDN_NULL, &init_lock); + return ret; +} + +/* Initialize data structures which may trigger recursive allocation. */ +static bool +malloc_init_hard_recursible(void) { + malloc_init_state = malloc_init_recursible; + + ncpus = malloc_ncpus(); + if (opt_percpu_arena != percpu_arena_disabled) { + bool cpu_count_is_deterministic = + malloc_cpu_count_is_deterministic(); + if (!cpu_count_is_deterministic) { + /* + * If # of CPU is not deterministic, and narenas not + * specified, disables per cpu arena since it may not + * detect CPU IDs properly. + */ + if (opt_narenas == 0) { + opt_percpu_arena = percpu_arena_disabled; + malloc_write(": Number of CPUs " + "detected is not deterministic. Per-CPU " + "arena disabled.\n"); + if (opt_abort_conf) { + malloc_abort_invalid_conf(); + } + if (opt_abort) { + abort(); + } + } + } + } + +#if (defined(JEMALLOC_HAVE_PTHREAD_ATFORK) && !defined(JEMALLOC_MUTEX_INIT_CB) \ + && !defined(JEMALLOC_ZONE) && !defined(_WIN32) && \ + !defined(__native_client__)) + /* LinuxThreads' pthread_atfork() allocates. */ + if (pthread_atfork(jemalloc_prefork, jemalloc_postfork_parent, + jemalloc_postfork_child) != 0) { + malloc_write(": Error in pthread_atfork()\n"); + if (opt_abort) { + abort(); + } + return true; + } +#endif + + if (background_thread_boot0()) { + return true; + } + + return false; +} + +static unsigned +malloc_narenas_default(void) { + assert(ncpus > 0); + /* + * For SMP systems, create more than one arena per CPU by + * default. + */ + if (ncpus > 1) { + fxp_t fxp_ncpus = FXP_INIT_INT(ncpus); + fxp_t goal = fxp_mul(fxp_ncpus, opt_narenas_ratio); + uint32_t int_goal = fxp_round_nearest(goal); + if (int_goal == 0) { + return 1; + } + return int_goal; + } else { + return 1; + } +} + +static percpu_arena_mode_t +percpu_arena_as_initialized(percpu_arena_mode_t mode) { + assert(!malloc_initialized()); + assert(mode <= percpu_arena_disabled); + + if (mode != percpu_arena_disabled) { + mode += percpu_arena_mode_enabled_base; + } + + return mode; +} + +static bool +malloc_init_narenas(void) { + assert(ncpus > 0); + + if (opt_percpu_arena != percpu_arena_disabled) { + if (!have_percpu_arena || malloc_getcpu() < 0) { + opt_percpu_arena = percpu_arena_disabled; + malloc_printf(": perCPU arena getcpu() not " + "available. Setting narenas to %u.\n", opt_narenas ? + opt_narenas : malloc_narenas_default()); + if (opt_abort) { + abort(); + } + } else { + if (ncpus >= MALLOCX_ARENA_LIMIT) { + malloc_printf(": narenas w/ percpu" + "arena beyond limit (%d)\n", ncpus); + if (opt_abort) { + abort(); + } + return true; + } + /* NB: opt_percpu_arena isn't fully initialized yet. */ + if (percpu_arena_as_initialized(opt_percpu_arena) == + per_phycpu_arena && ncpus % 2 != 0) { + malloc_printf(": invalid " + "configuration -- per physical CPU arena " + "with odd number (%u) of CPUs (no hyper " + "threading?).\n", ncpus); + if (opt_abort) + abort(); + } + unsigned n = percpu_arena_ind_limit( + percpu_arena_as_initialized(opt_percpu_arena)); + if (opt_narenas < n) { + /* + * If narenas is specified with percpu_arena + * enabled, actual narenas is set as the greater + * of the two. percpu_arena_choose will be free + * to use any of the arenas based on CPU + * id. This is conservative (at a small cost) + * but ensures correctness. + * + * If for some reason the ncpus determined at + * boot is not the actual number (e.g. because + * of affinity setting from numactl), reserving + * narenas this way provides a workaround for + * percpu_arena. + */ + opt_narenas = n; + } + } + } + if (opt_narenas == 0) { + opt_narenas = malloc_narenas_default(); + } + assert(opt_narenas > 0); + + narenas_auto = opt_narenas; + /* + * Limit the number of arenas to the indexing range of MALLOCX_ARENA(). + */ + if (narenas_auto >= MALLOCX_ARENA_LIMIT) { + narenas_auto = MALLOCX_ARENA_LIMIT - 1; + malloc_printf(": Reducing narenas to limit (%d)\n", + narenas_auto); + } + narenas_total_set(narenas_auto); + if (arena_init_huge()) { + narenas_total_inc(); + } + manual_arena_base = narenas_total_get(); + + return false; +} + +static void +malloc_init_percpu(void) { + opt_percpu_arena = percpu_arena_as_initialized(opt_percpu_arena); +} + +static bool +malloc_init_hard_finish(void) { + if (malloc_mutex_boot()) { + return true; + } + + malloc_init_state = malloc_init_initialized; + malloc_slow_flag_init(); + + return false; +} + +static void +malloc_init_hard_cleanup(tsdn_t *tsdn, bool reentrancy_set) { + malloc_mutex_assert_owner(tsdn, &init_lock); + malloc_mutex_unlock(tsdn, &init_lock); + if (reentrancy_set) { + assert(!tsdn_null(tsdn)); + tsd_t *tsd = tsdn_tsd(tsdn); + assert(tsd_reentrancy_level_get(tsd) > 0); + post_reentrancy(tsd); + } +} + +static bool +malloc_init_hard(void) { + tsd_t *tsd; + +#if defined(_WIN32) && _WIN32_WINNT < 0x0600 + _init_init_lock(); +#endif + malloc_mutex_lock(TSDN_NULL, &init_lock); + +#define UNLOCK_RETURN(tsdn, ret, reentrancy) \ + malloc_init_hard_cleanup(tsdn, reentrancy); \ + return ret; + + if (!malloc_init_hard_needed()) { + UNLOCK_RETURN(TSDN_NULL, false, false) + } + + if (malloc_init_state != malloc_init_a0_initialized && + malloc_init_hard_a0_locked()) { + UNLOCK_RETURN(TSDN_NULL, true, false) + } + + malloc_mutex_unlock(TSDN_NULL, &init_lock); + /* Recursive allocation relies on functional tsd. */ + tsd = malloc_tsd_boot0(); + if (tsd == NULL) { + return true; + } + if (malloc_init_hard_recursible()) { + return true; + } + + malloc_mutex_lock(tsd_tsdn(tsd), &init_lock); + /* Set reentrancy level to 1 during init. */ + pre_reentrancy(tsd, NULL); + /* Initialize narenas before prof_boot2 (for allocation). */ + if (malloc_init_narenas() + || background_thread_boot1(tsd_tsdn(tsd), b0get())) { + UNLOCK_RETURN(tsd_tsdn(tsd), true, true) + } + if (config_prof && prof_boot2(tsd, b0get())) { + UNLOCK_RETURN(tsd_tsdn(tsd), true, true) + } + + malloc_init_percpu(); + + if (malloc_init_hard_finish()) { + UNLOCK_RETURN(tsd_tsdn(tsd), true, true) + } + post_reentrancy(tsd); + malloc_mutex_unlock(tsd_tsdn(tsd), &init_lock); + + witness_assert_lockless(witness_tsd_tsdn( + tsd_witness_tsdp_get_unsafe(tsd))); + malloc_tsd_boot1(); + /* Update TSD after tsd_boot1. */ + tsd = tsd_fetch(); + if (opt_background_thread) { + assert(have_background_thread); + /* + * Need to finish init & unlock first before creating background + * threads (pthread_create depends on malloc). ctl_init (which + * sets isthreaded) needs to be called without holding any lock. + */ + background_thread_ctl_init(tsd_tsdn(tsd)); + if (background_thread_create(tsd, 0)) { + return true; + } + } +#undef UNLOCK_RETURN + return false; +} + +/* + * End initialization functions. + */ +/******************************************************************************/ +/* + * Begin allocation-path internal functions and data structures. + */ + +/* + * Settings determined by the documented behavior of the allocation functions. + */ +typedef struct static_opts_s static_opts_t; +struct static_opts_s { + /* Whether or not allocation size may overflow. */ + bool may_overflow; + + /* + * Whether or not allocations (with alignment) of size 0 should be + * treated as size 1. + */ + bool bump_empty_aligned_alloc; + /* + * Whether to assert that allocations are not of size 0 (after any + * bumping). + */ + bool assert_nonempty_alloc; + + /* + * Whether or not to modify the 'result' argument to malloc in case of + * error. + */ + bool null_out_result_on_error; + /* Whether to set errno when we encounter an error condition. */ + bool set_errno_on_error; + + /* + * The minimum valid alignment for functions requesting aligned storage. + */ + size_t min_alignment; + + /* The error string to use if we oom. */ + const char *oom_string; + /* The error string to use if the passed-in alignment is invalid. */ + const char *invalid_alignment_string; + + /* + * False if we're configured to skip some time-consuming operations. + * + * This isn't really a malloc "behavior", but it acts as a useful + * summary of several other static (or at least, static after program + * initialization) options. + */ + bool slow; + /* + * Return size. + */ + bool usize; +}; + +JEMALLOC_ALWAYS_INLINE void +static_opts_init(static_opts_t *static_opts) { + static_opts->may_overflow = false; + static_opts->bump_empty_aligned_alloc = false; + static_opts->assert_nonempty_alloc = false; + static_opts->null_out_result_on_error = false; + static_opts->set_errno_on_error = false; + static_opts->min_alignment = 0; + static_opts->oom_string = ""; + static_opts->invalid_alignment_string = ""; + static_opts->slow = false; + static_opts->usize = false; +} + +/* + * These correspond to the macros in jemalloc/jemalloc_macros.h. Broadly, we + * should have one constant here per magic value there. Note however that the + * representations need not be related. + */ +#define TCACHE_IND_NONE ((unsigned)-1) +#define TCACHE_IND_AUTOMATIC ((unsigned)-2) +#define ARENA_IND_AUTOMATIC ((unsigned)-1) + +typedef struct dynamic_opts_s dynamic_opts_t; +struct dynamic_opts_s { + void **result; + size_t usize; + size_t num_items; + size_t item_size; + size_t alignment; + bool zero; + unsigned tcache_ind; + unsigned arena_ind; +}; + +JEMALLOC_ALWAYS_INLINE void +dynamic_opts_init(dynamic_opts_t *dynamic_opts) { + dynamic_opts->result = NULL; + dynamic_opts->usize = 0; + dynamic_opts->num_items = 0; + dynamic_opts->item_size = 0; + dynamic_opts->alignment = 0; + dynamic_opts->zero = false; + dynamic_opts->tcache_ind = TCACHE_IND_AUTOMATIC; + dynamic_opts->arena_ind = ARENA_IND_AUTOMATIC; +} + +/* + * ind parameter is optional and is only checked and filled if alignment == 0; + * return true if result is out of range. + */ +JEMALLOC_ALWAYS_INLINE bool +aligned_usize_get(size_t size, size_t alignment, size_t *usize, szind_t *ind, + bool bump_empty_aligned_alloc) { + assert(usize != NULL); + if (alignment == 0) { + if (ind != NULL) { + *ind = sz_size2index(size); + if (unlikely(*ind >= SC_NSIZES)) { + return true; + } + *usize = sz_index2size(*ind); + assert(*usize > 0 && *usize <= SC_LARGE_MAXCLASS); + return false; + } + *usize = sz_s2u(size); + } else { + if (bump_empty_aligned_alloc && unlikely(size == 0)) { + size = 1; + } + *usize = sz_sa2u(size, alignment); + } + if (unlikely(*usize == 0 || *usize > SC_LARGE_MAXCLASS)) { + return true; + } + return false; +} + +JEMALLOC_ALWAYS_INLINE bool +zero_get(bool guarantee, bool slow) { + if (config_fill && slow && unlikely(opt_zero)) { + return true; + } else { + return guarantee; + } +} + +JEMALLOC_ALWAYS_INLINE tcache_t * +tcache_get_from_ind(tsd_t *tsd, unsigned tcache_ind, bool slow, bool is_alloc) { + tcache_t *tcache; + if (tcache_ind == TCACHE_IND_AUTOMATIC) { + if (likely(!slow)) { + /* Getting tcache ptr unconditionally. */ + tcache = tsd_tcachep_get(tsd); + assert(tcache == tcache_get(tsd)); + } else if (is_alloc || + likely(tsd_reentrancy_level_get(tsd) == 0)) { + tcache = tcache_get(tsd); + } else { + tcache = NULL; + } + } else { + /* + * Should not specify tcache on deallocation path when being + * reentrant. + */ + assert(is_alloc || tsd_reentrancy_level_get(tsd) == 0 || + tsd_state_nocleanup(tsd)); + if (tcache_ind == TCACHE_IND_NONE) { + tcache = NULL; + } else { + tcache = tcaches_get(tsd, tcache_ind); + } + } + return tcache; +} + +/* Return true if a manual arena is specified and arena_get() OOMs. */ +JEMALLOC_ALWAYS_INLINE bool +arena_get_from_ind(tsd_t *tsd, unsigned arena_ind, arena_t **arena_p) { + if (arena_ind == ARENA_IND_AUTOMATIC) { + /* + * In case of automatic arena management, we defer arena + * computation until as late as we can, hoping to fill the + * allocation out of the tcache. + */ + *arena_p = NULL; + } else { + *arena_p = arena_get(tsd_tsdn(tsd), arena_ind, true); + if (unlikely(*arena_p == NULL) && arena_ind >= narenas_auto) { + return true; + } + } + return false; +} + +/* ind is ignored if dopts->alignment > 0. */ +JEMALLOC_ALWAYS_INLINE void * +imalloc_no_sample(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd, + size_t size, size_t usize, szind_t ind) { + /* Fill in the tcache. */ + tcache_t *tcache = tcache_get_from_ind(tsd, dopts->tcache_ind, + sopts->slow, /* is_alloc */ true); + + /* Fill in the arena. */ + arena_t *arena; + if (arena_get_from_ind(tsd, dopts->arena_ind, &arena)) { + return NULL; + } + + if (unlikely(dopts->alignment != 0)) { + return ipalloct(tsd_tsdn(tsd), usize, dopts->alignment, + dopts->zero, tcache, arena); + } + + return iallocztm(tsd_tsdn(tsd), size, ind, dopts->zero, tcache, false, + arena, sopts->slow); +} + +JEMALLOC_ALWAYS_INLINE void * +imalloc_sample(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd, + size_t usize, szind_t ind) { + void *ret; + + /* + * For small allocations, sampling bumps the usize. If so, we allocate + * from the ind_large bucket. + */ + szind_t ind_large; + size_t bumped_usize = usize; + + dopts->alignment = prof_sample_align(dopts->alignment); + if (usize <= SC_SMALL_MAXCLASS) { + assert(((dopts->alignment == 0) ? + sz_s2u(SC_LARGE_MINCLASS) : + sz_sa2u(SC_LARGE_MINCLASS, dopts->alignment)) + == SC_LARGE_MINCLASS); + ind_large = sz_size2index(SC_LARGE_MINCLASS); + bumped_usize = sz_s2u(SC_LARGE_MINCLASS); + ret = imalloc_no_sample(sopts, dopts, tsd, bumped_usize, + bumped_usize, ind_large); + if (unlikely(ret == NULL)) { + return NULL; + } + arena_prof_promote(tsd_tsdn(tsd), ret, usize); + } else { + ret = imalloc_no_sample(sopts, dopts, tsd, usize, usize, ind); + } + assert(prof_sample_aligned(ret)); + + return ret; +} + +/* + * Returns true if the allocation will overflow, and false otherwise. Sets + * *size to the product either way. + */ +JEMALLOC_ALWAYS_INLINE bool +compute_size_with_overflow(bool may_overflow, dynamic_opts_t *dopts, + size_t *size) { + /* + * This function is just num_items * item_size, except that we may have + * to check for overflow. + */ + + if (!may_overflow) { + assert(dopts->num_items == 1); + *size = dopts->item_size; + return false; + } + + /* A size_t with its high-half bits all set to 1. */ + static const size_t high_bits = SIZE_T_MAX << (sizeof(size_t) * 8 / 2); + + *size = dopts->item_size * dopts->num_items; + + if (unlikely(*size == 0)) { + return (dopts->num_items != 0 && dopts->item_size != 0); + } + + /* + * We got a non-zero size, but we don't know if we overflowed to get + * there. To avoid having to do a divide, we'll be clever and note that + * if both A and B can be represented in N/2 bits, then their product + * can be represented in N bits (without the possibility of overflow). + */ + if (likely((high_bits & (dopts->num_items | dopts->item_size)) == 0)) { + return false; + } + if (likely(*size / dopts->item_size == dopts->num_items)) { + return false; + } + return true; +} + +JEMALLOC_ALWAYS_INLINE int +imalloc_body(static_opts_t *sopts, dynamic_opts_t *dopts, tsd_t *tsd) { + /* Where the actual allocated memory will live. */ + void *allocation = NULL; + /* Filled in by compute_size_with_overflow below. */ + size_t size = 0; + /* + * The zero initialization for ind is actually dead store, in that its + * value is reset before any branch on its value is taken. Sometimes + * though, it's convenient to pass it as arguments before this point. + * To avoid undefined behavior then, we initialize it with dummy stores. + */ + szind_t ind = 0; + /* usize will always be properly initialized. */ + size_t usize; + + /* Reentrancy is only checked on slow path. */ + int8_t reentrancy_level; + + /* Compute the amount of memory the user wants. */ + if (unlikely(compute_size_with_overflow(sopts->may_overflow, dopts, + &size))) { + goto label_oom; + } + + if (unlikely(dopts->alignment < sopts->min_alignment + || (dopts->alignment & (dopts->alignment - 1)) != 0)) { + goto label_invalid_alignment; + } + + /* This is the beginning of the "core" algorithm. */ + dopts->zero = zero_get(dopts->zero, sopts->slow); + if (aligned_usize_get(size, dopts->alignment, &usize, &ind, + sopts->bump_empty_aligned_alloc)) { + goto label_oom; + } + dopts->usize = usize; + /* Validate the user input. */ + if (sopts->assert_nonempty_alloc) { + assert (size != 0); + } + + check_entry_exit_locking(tsd_tsdn(tsd)); + + /* + * If we need to handle reentrancy, we can do it out of a + * known-initialized arena (i.e. arena 0). + */ + reentrancy_level = tsd_reentrancy_level_get(tsd); + if (sopts->slow && unlikely(reentrancy_level > 0)) { + /* + * We should never specify particular arenas or tcaches from + * within our internal allocations. + */ + assert(dopts->tcache_ind == TCACHE_IND_AUTOMATIC || + dopts->tcache_ind == TCACHE_IND_NONE); + assert(dopts->arena_ind == ARENA_IND_AUTOMATIC); + dopts->tcache_ind = TCACHE_IND_NONE; + /* We know that arena 0 has already been initialized. */ + dopts->arena_ind = 0; + } + + /* + * If dopts->alignment > 0, then ind is still 0, but usize was computed + * in the previous if statement. Down the positive alignment path, + * imalloc_no_sample and imalloc_sample will ignore ind. + */ + + /* If profiling is on, get our profiling context. */ + if (config_prof && opt_prof) { + bool prof_active = prof_active_get_unlocked(); + bool sample_event = te_prof_sample_event_lookahead(tsd, usize); + prof_tctx_t *tctx = prof_alloc_prep(tsd, prof_active, + sample_event); + + emap_alloc_ctx_t alloc_ctx; + if (likely((uintptr_t)tctx == (uintptr_t)1U)) { + alloc_ctx.slab = (usize <= SC_SMALL_MAXCLASS); + allocation = imalloc_no_sample( + sopts, dopts, tsd, usize, usize, ind); + } else if ((uintptr_t)tctx > (uintptr_t)1U) { + allocation = imalloc_sample( + sopts, dopts, tsd, usize, ind); + alloc_ctx.slab = false; + } else { + allocation = NULL; + } + + if (unlikely(allocation == NULL)) { + prof_alloc_rollback(tsd, tctx); + goto label_oom; + } + prof_malloc(tsd, allocation, size, usize, &alloc_ctx, tctx); + } else { + assert(!opt_prof); + allocation = imalloc_no_sample(sopts, dopts, tsd, size, usize, + ind); + if (unlikely(allocation == NULL)) { + goto label_oom; + } + } + + /* + * Allocation has been done at this point. We still have some + * post-allocation work to do though. + */ + + thread_alloc_event(tsd, usize); + + assert(dopts->alignment == 0 + || ((uintptr_t)allocation & (dopts->alignment - 1)) == ZU(0)); + + assert(usize == isalloc(tsd_tsdn(tsd), allocation)); + + if (config_fill && sopts->slow && !dopts->zero + && unlikely(opt_junk_alloc)) { + junk_alloc_callback(allocation, usize); + } + + if (sopts->slow) { + UTRACE(0, size, allocation); + } + + /* Success! */ + check_entry_exit_locking(tsd_tsdn(tsd)); + *dopts->result = allocation; + return 0; + +label_oom: + if (unlikely(sopts->slow) && config_xmalloc && unlikely(opt_xmalloc)) { + malloc_write(sopts->oom_string); + abort(); + } + + if (sopts->slow) { + UTRACE(NULL, size, NULL); + } + + check_entry_exit_locking(tsd_tsdn(tsd)); + + if (sopts->set_errno_on_error) { + set_errno(ENOMEM); + } + + if (sopts->null_out_result_on_error) { + *dopts->result = NULL; + } + + return ENOMEM; + + /* + * This label is only jumped to by one goto; we move it out of line + * anyways to avoid obscuring the non-error paths, and for symmetry with + * the oom case. + */ +label_invalid_alignment: + if (config_xmalloc && unlikely(opt_xmalloc)) { + malloc_write(sopts->invalid_alignment_string); + abort(); + } + + if (sopts->set_errno_on_error) { + set_errno(EINVAL); + } + + if (sopts->slow) { + UTRACE(NULL, size, NULL); + } + + check_entry_exit_locking(tsd_tsdn(tsd)); + + if (sopts->null_out_result_on_error) { + *dopts->result = NULL; + } + + return EINVAL; +} + +JEMALLOC_ALWAYS_INLINE bool +imalloc_init_check(static_opts_t *sopts, dynamic_opts_t *dopts) { + if (unlikely(!malloc_initialized()) && unlikely(malloc_init())) { + if (config_xmalloc && unlikely(opt_xmalloc)) { + malloc_write(sopts->oom_string); + abort(); + } + UTRACE(NULL, dopts->num_items * dopts->item_size, NULL); + set_errno(ENOMEM); + *dopts->result = NULL; + + return false; + } + + return true; +} + +/* Returns the errno-style error code of the allocation. */ +JEMALLOC_ALWAYS_INLINE int +imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) { + if (tsd_get_allocates() && !imalloc_init_check(sopts, dopts)) { + return ENOMEM; + } + + /* We always need the tsd. Let's grab it right away. */ + tsd_t *tsd = tsd_fetch(); + assert(tsd); + if (likely(tsd_fast(tsd))) { + /* Fast and common path. */ + tsd_assert_fast(tsd); + sopts->slow = false; + return imalloc_body(sopts, dopts, tsd); + } else { + if (!tsd_get_allocates() && !imalloc_init_check(sopts, dopts)) { + return ENOMEM; + } + + sopts->slow = true; + return imalloc_body(sopts, dopts, tsd); + } +} + +JEMALLOC_NOINLINE +void * +malloc_default(size_t size) { + void *ret; + static_opts_t sopts; + dynamic_opts_t dopts; + + /* + * This variant has logging hook on exit but not on entry. It's callled + * only by je_malloc, below, which emits the entry one for us (and, if + * it calls us, does so only via tail call). + */ + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.null_out_result_on_error = true; + sopts.set_errno_on_error = true; + sopts.oom_string = ": Error in malloc(): out of memory\n"; + + dopts.result = &ret; + dopts.num_items = 1; + dopts.item_size = size; + + imalloc(&sopts, &dopts); + /* + * Note that this branch gets optimized away -- it immediately follows + * the check on tsd_fast that sets sopts.slow. + */ + if (sopts.slow) { + uintptr_t args[3] = {size}; + hook_invoke_alloc(hook_alloc_malloc, ret, (uintptr_t)ret, args); + } + + LOG("core.malloc.exit", "result: %p", ret); + + return ret; +} + +/******************************************************************************/ +/* + * Begin malloc(3)-compatible functions. + */ + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) +je_malloc(size_t size) { + return imalloc_fastpath(size, &malloc_default); +} + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW +JEMALLOC_ATTR(nonnull(1)) +je_posix_memalign(void **memptr, size_t alignment, size_t size) { + int ret; + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.posix_memalign.entry", "mem ptr: %p, alignment: %zu, " + "size: %zu", memptr, alignment, size); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.bump_empty_aligned_alloc = true; + sopts.min_alignment = sizeof(void *); + sopts.oom_string = + ": Error allocating aligned memory: out of memory\n"; + sopts.invalid_alignment_string = + ": Error allocating aligned memory: invalid alignment\n"; + + dopts.result = memptr; + dopts.num_items = 1; + dopts.item_size = size; + dopts.alignment = alignment; + + ret = imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {(uintptr_t)memptr, (uintptr_t)alignment, + (uintptr_t)size}; + hook_invoke_alloc(hook_alloc_posix_memalign, *memptr, + (uintptr_t)ret, args); + } + + LOG("core.posix_memalign.exit", "result: %d, alloc ptr: %p", ret, + *memptr); + + return ret; +} + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(2) +je_aligned_alloc(size_t alignment, size_t size) { + void *ret; + + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.aligned_alloc.entry", "alignment: %zu, size: %zu\n", + alignment, size); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.bump_empty_aligned_alloc = true; + sopts.null_out_result_on_error = true; + sopts.set_errno_on_error = true; + sopts.min_alignment = 1; + sopts.oom_string = + ": Error allocating aligned memory: out of memory\n"; + sopts.invalid_alignment_string = + ": Error allocating aligned memory: invalid alignment\n"; + + dopts.result = &ret; + dopts.num_items = 1; + dopts.item_size = size; + dopts.alignment = alignment; + + imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {(uintptr_t)alignment, (uintptr_t)size}; + hook_invoke_alloc(hook_alloc_aligned_alloc, ret, + (uintptr_t)ret, args); + } + + LOG("core.aligned_alloc.exit", "result: %p", ret); + + return ret; +} + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2) +je_calloc(size_t num, size_t size) { + void *ret; + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.calloc.entry", "num: %zu, size: %zu\n", num, size); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.may_overflow = true; + sopts.null_out_result_on_error = true; + sopts.set_errno_on_error = true; + sopts.oom_string = ": Error in calloc(): out of memory\n"; + + dopts.result = &ret; + dopts.num_items = num; + dopts.item_size = size; + dopts.zero = true; + + imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {(uintptr_t)num, (uintptr_t)size}; + hook_invoke_alloc(hook_alloc_calloc, ret, (uintptr_t)ret, args); + } + + LOG("core.calloc.exit", "result: %p", ret); + + return ret; +} + +JEMALLOC_ALWAYS_INLINE void +ifree(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path) { + if (!slow_path) { + tsd_assert_fast(tsd); + } + check_entry_exit_locking(tsd_tsdn(tsd)); + if (tsd_reentrancy_level_get(tsd) != 0) { + assert(slow_path); + } + + assert(ptr != NULL); + assert(malloc_initialized() || IS_INITIALIZER); + + emap_alloc_ctx_t alloc_ctx; + emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, + &alloc_ctx); + assert(alloc_ctx.szind != SC_NSIZES); + + size_t usize = sz_index2size(alloc_ctx.szind); + if (config_prof && opt_prof) { + prof_free(tsd, ptr, usize, &alloc_ctx); + } + + if (likely(!slow_path)) { + idalloctm(tsd_tsdn(tsd), ptr, tcache, &alloc_ctx, false, + false); + } else { + if (config_fill && slow_path && opt_junk_free) { + junk_free_callback(ptr, usize); + } + idalloctm(tsd_tsdn(tsd), ptr, tcache, &alloc_ctx, false, + true); + } + thread_dalloc_event(tsd, usize); +} + +JEMALLOC_ALWAYS_INLINE bool +maybe_check_alloc_ctx(tsd_t *tsd, void *ptr, emap_alloc_ctx_t *alloc_ctx) { + if (config_opt_size_checks) { + emap_alloc_ctx_t dbg_ctx; + emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, + &dbg_ctx); + if (alloc_ctx->szind != dbg_ctx.szind) { + safety_check_fail_sized_dealloc( + /* current_dealloc */ true, ptr, + /* true_size */ sz_size2index(dbg_ctx.szind), + /* input_size */ sz_size2index(alloc_ctx->szind)); + return true; + } + if (alloc_ctx->slab != dbg_ctx.slab) { + safety_check_fail( + "Internal heap corruption detected: " + "mismatch in slab bit"); + return true; + } + } + return false; +} + +JEMALLOC_ALWAYS_INLINE void +isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache, bool slow_path) { + if (!slow_path) { + tsd_assert_fast(tsd); + } + check_entry_exit_locking(tsd_tsdn(tsd)); + if (tsd_reentrancy_level_get(tsd) != 0) { + assert(slow_path); + } + + assert(ptr != NULL); + assert(malloc_initialized() || IS_INITIALIZER); + + emap_alloc_ctx_t alloc_ctx; + if (!config_prof) { + alloc_ctx.szind = sz_size2index(usize); + alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS); + } else { + if (likely(!prof_sample_aligned(ptr))) { + /* + * When the ptr is not page aligned, it was not sampled. + * usize can be trusted to determine szind and slab. + */ + alloc_ctx.szind = sz_size2index(usize); + alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS); + } else if (opt_prof) { + emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, + ptr, &alloc_ctx); + + if (config_opt_safety_checks) { + /* Small alloc may have !slab (sampled). */ + if (unlikely(alloc_ctx.szind != + sz_size2index(usize))) { + safety_check_fail_sized_dealloc( + /* current_dealloc */ true, ptr, + /* true_size */ sz_index2size( + alloc_ctx.szind), + /* input_size */ usize); + } + } + } else { + alloc_ctx.szind = sz_size2index(usize); + alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS); + } + } + bool fail = maybe_check_alloc_ctx(tsd, ptr, &alloc_ctx); + if (fail) { + /* + * This is a heap corruption bug. In real life we'll crash; for + * the unit test we just want to avoid breaking anything too + * badly to get a test result out. Let's leak instead of trying + * to free. + */ + return; + } + + if (config_prof && opt_prof) { + prof_free(tsd, ptr, usize, &alloc_ctx); + } + if (likely(!slow_path)) { + isdalloct(tsd_tsdn(tsd), ptr, usize, tcache, &alloc_ctx, + false); + } else { + if (config_fill && slow_path && opt_junk_free) { + junk_free_callback(ptr, usize); + } + isdalloct(tsd_tsdn(tsd), ptr, usize, tcache, &alloc_ctx, + true); + } + thread_dalloc_event(tsd, usize); +} + +JEMALLOC_NOINLINE +void +free_default(void *ptr) { + UTRACE(ptr, 0, 0); + if (likely(ptr != NULL)) { + /* + * We avoid setting up tsd fully (e.g. tcache, arena binding) + * based on only free() calls -- other activities trigger the + * minimal to full transition. This is because free() may + * happen during thread shutdown after tls deallocation: if a + * thread never had any malloc activities until then, a + * fully-setup tsd won't be destructed properly. + */ + tsd_t *tsd = tsd_fetch_min(); + check_entry_exit_locking(tsd_tsdn(tsd)); + + if (likely(tsd_fast(tsd))) { + tcache_t *tcache = tcache_get_from_ind(tsd, + TCACHE_IND_AUTOMATIC, /* slow */ false, + /* is_alloc */ false); + ifree(tsd, ptr, tcache, /* slow */ false); + } else { + tcache_t *tcache = tcache_get_from_ind(tsd, + TCACHE_IND_AUTOMATIC, /* slow */ true, + /* is_alloc */ false); + uintptr_t args_raw[3] = {(uintptr_t)ptr}; + hook_invoke_dalloc(hook_dalloc_free, ptr, args_raw); + ifree(tsd, ptr, tcache, /* slow */ true); + } + + check_entry_exit_locking(tsd_tsdn(tsd)); + } +} + +JEMALLOC_ALWAYS_INLINE bool +free_fastpath_nonfast_aligned(void *ptr, bool check_prof) { + /* + * free_fastpath do not handle two uncommon cases: 1) sampled profiled + * objects and 2) sampled junk & stash for use-after-free detection. + * Both have special alignments which are used to escape the fastpath. + * + * prof_sample is page-aligned, which covers the UAF check when both + * are enabled (the assertion below). Avoiding redundant checks since + * this is on the fastpath -- at most one runtime branch from this. + */ + if (config_debug && cache_bin_nonfast_aligned(ptr)) { + assert(prof_sample_aligned(ptr)); + } + + if (config_prof && check_prof) { + /* When prof is enabled, the prof_sample alignment is enough. */ + if (prof_sample_aligned(ptr)) { + return true; + } else { + return false; + } + } + + if (config_uaf_detection) { + if (cache_bin_nonfast_aligned(ptr)) { + return true; + } else { + return false; + } + } + + return false; +} + +/* Returns whether or not the free attempt was successful. */ +JEMALLOC_ALWAYS_INLINE +bool free_fastpath(void *ptr, size_t size, bool size_hint) { + tsd_t *tsd = tsd_get(false); + /* The branch gets optimized away unless tsd_get_allocates(). */ + if (unlikely(tsd == NULL)) { + return false; + } + /* + * The tsd_fast() / initialized checks are folded into the branch + * testing (deallocated_after >= threshold) later in this function. + * The threshold will be set to 0 when !tsd_fast. + */ + assert(tsd_fast(tsd) || + *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd) == 0); + + emap_alloc_ctx_t alloc_ctx; + if (!size_hint) { + bool err = emap_alloc_ctx_try_lookup_fast(tsd, + &arena_emap_global, ptr, &alloc_ctx); + + /* Note: profiled objects will have alloc_ctx.slab set */ + if (unlikely(err || !alloc_ctx.slab || + free_fastpath_nonfast_aligned(ptr, + /* check_prof */ false))) { + return false; + } + assert(alloc_ctx.szind != SC_NSIZES); + } else { + /* + * Check for both sizes that are too large, and for sampled / + * special aligned objects. The alignment check will also check + * for null ptr. + */ + if (unlikely(size > SC_LOOKUP_MAXCLASS || + free_fastpath_nonfast_aligned(ptr, + /* check_prof */ true))) { + return false; + } + alloc_ctx.szind = sz_size2index_lookup(size); + /* Max lookup class must be small. */ + assert(alloc_ctx.szind < SC_NBINS); + /* This is a dead store, except when opt size checking is on. */ + alloc_ctx.slab = true; + } + /* + * Currently the fastpath only handles small sizes. The branch on + * SC_LOOKUP_MAXCLASS makes sure of it. This lets us avoid checking + * tcache szind upper limit (i.e. tcache_maxclass) as well. + */ + assert(alloc_ctx.slab); + + uint64_t deallocated, threshold; + te_free_fastpath_ctx(tsd, &deallocated, &threshold); + + size_t usize = sz_index2size(alloc_ctx.szind); + uint64_t deallocated_after = deallocated + usize; + /* + * Check for events and tsd non-nominal (fast_threshold will be set to + * 0) in a single branch. Note that this handles the uninitialized case + * as well (TSD init will be triggered on the non-fastpath). Therefore + * anything depends on a functional TSD (e.g. the alloc_ctx sanity check + * below) needs to be after this branch. + */ + if (unlikely(deallocated_after >= threshold)) { + return false; + } + assert(tsd_fast(tsd)); + bool fail = maybe_check_alloc_ctx(tsd, ptr, &alloc_ctx); + if (fail) { + /* See the comment in isfree. */ + return true; + } + + tcache_t *tcache = tcache_get_from_ind(tsd, TCACHE_IND_AUTOMATIC, + /* slow */ false, /* is_alloc */ false); + cache_bin_t *bin = &tcache->bins[alloc_ctx.szind]; + + /* + * If junking were enabled, this is where we would do it. It's not + * though, since we ensured above that we're on the fast path. Assert + * that to double-check. + */ + assert(!opt_junk_free); + + if (!cache_bin_dalloc_easy(bin, ptr)) { + return false; + } + + *tsd_thread_deallocatedp_get(tsd) = deallocated_after; + + return true; +} + +JEMALLOC_EXPORT void JEMALLOC_NOTHROW +je_free(void *ptr) { + LOG("core.free.entry", "ptr: %p", ptr); + + if (!free_fastpath(ptr, 0, false)) { + free_default(ptr); + } + + LOG("core.free.exit", ""); +} + +/* + * End malloc(3)-compatible functions. + */ +/******************************************************************************/ +/* + * Begin non-standard override functions. + */ + +#ifdef JEMALLOC_OVERRIDE_MEMALIGN +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ATTR(malloc) +je_memalign(size_t alignment, size_t size) { + void *ret; + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.memalign.entry", "alignment: %zu, size: %zu\n", alignment, + size); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.min_alignment = 1; + sopts.oom_string = + ": Error allocating aligned memory: out of memory\n"; + sopts.invalid_alignment_string = + ": Error allocating aligned memory: invalid alignment\n"; + sopts.null_out_result_on_error = true; + + dopts.result = &ret; + dopts.num_items = 1; + dopts.item_size = size; + dopts.alignment = alignment; + + imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {alignment, size}; + hook_invoke_alloc(hook_alloc_memalign, ret, (uintptr_t)ret, + args); + } + + LOG("core.memalign.exit", "result: %p", ret); + return ret; +} +#endif + +#ifdef JEMALLOC_OVERRIDE_VALLOC +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ATTR(malloc) +je_valloc(size_t size) { + void *ret; + + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.valloc.entry", "size: %zu\n", size); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.null_out_result_on_error = true; + sopts.min_alignment = PAGE; + sopts.oom_string = + ": Error allocating aligned memory: out of memory\n"; + sopts.invalid_alignment_string = + ": Error allocating aligned memory: invalid alignment\n"; + + dopts.result = &ret; + dopts.num_items = 1; + dopts.item_size = size; + dopts.alignment = PAGE; + + imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {size}; + hook_invoke_alloc(hook_alloc_valloc, ret, (uintptr_t)ret, args); + } + + LOG("core.valloc.exit", "result: %p\n", ret); + return ret; +} +#endif + +#if defined(JEMALLOC_IS_MALLOC) && defined(JEMALLOC_GLIBC_MALLOC_HOOK) +/* + * glibc provides the RTLD_DEEPBIND flag for dlopen which can make it possible + * to inconsistently reference libc's malloc(3)-compatible functions + * (https://bugzilla.mozilla.org/show_bug.cgi?id=493541). + * + * These definitions interpose hooks in glibc. The functions are actually + * passed an extra argument for the caller return address, which will be + * ignored. + */ +#include // defines __GLIBC__ if we are compiling against glibc + +JEMALLOC_EXPORT void (*__free_hook)(void *ptr) = je_free; +JEMALLOC_EXPORT void *(*__malloc_hook)(size_t size) = je_malloc; +JEMALLOC_EXPORT void *(*__realloc_hook)(void *ptr, size_t size) = je_realloc; +# ifdef JEMALLOC_GLIBC_MEMALIGN_HOOK +JEMALLOC_EXPORT void *(*__memalign_hook)(size_t alignment, size_t size) = + je_memalign; +# endif + +# ifdef __GLIBC__ +/* + * To enable static linking with glibc, the libc specific malloc interface must + * be implemented also, so none of glibc's malloc.o functions are added to the + * link. + */ +# define ALIAS(je_fn) __attribute__((alias (#je_fn), used)) +/* To force macro expansion of je_ prefix before stringification. */ +# define PREALIAS(je_fn) ALIAS(je_fn) +# ifdef JEMALLOC_OVERRIDE___LIBC_CALLOC +void *__libc_calloc(size_t n, size_t size) PREALIAS(je_calloc); +# endif +# ifdef JEMALLOC_OVERRIDE___LIBC_FREE +void __libc_free(void* ptr) PREALIAS(je_free); +# endif +# ifdef JEMALLOC_OVERRIDE___LIBC_MALLOC +void *__libc_malloc(size_t size) PREALIAS(je_malloc); +# endif +# ifdef JEMALLOC_OVERRIDE___LIBC_MEMALIGN +void *__libc_memalign(size_t align, size_t s) PREALIAS(je_memalign); +# endif +# ifdef JEMALLOC_OVERRIDE___LIBC_REALLOC +void *__libc_realloc(void* ptr, size_t size) PREALIAS(je_realloc); +# endif +# ifdef JEMALLOC_OVERRIDE___LIBC_VALLOC +void *__libc_valloc(size_t size) PREALIAS(je_valloc); +# endif +# ifdef JEMALLOC_OVERRIDE___POSIX_MEMALIGN +int __posix_memalign(void** r, size_t a, size_t s) PREALIAS(je_posix_memalign); +# endif +# undef PREALIAS +# undef ALIAS +# endif +#endif + +/* + * End non-standard override functions. + */ +/******************************************************************************/ +/* + * Begin non-standard functions. + */ + +JEMALLOC_ALWAYS_INLINE unsigned +mallocx_tcache_get(int flags) { + if (likely((flags & MALLOCX_TCACHE_MASK) == 0)) { + return TCACHE_IND_AUTOMATIC; + } else if ((flags & MALLOCX_TCACHE_MASK) == MALLOCX_TCACHE_NONE) { + return TCACHE_IND_NONE; + } else { + return MALLOCX_TCACHE_GET(flags); + } +} + +JEMALLOC_ALWAYS_INLINE unsigned +mallocx_arena_get(int flags) { + if (unlikely((flags & MALLOCX_ARENA_MASK) != 0)) { + return MALLOCX_ARENA_GET(flags); + } else { + return ARENA_IND_AUTOMATIC; + } +} + +#ifdef JEMALLOC_EXPERIMENTAL_SMALLOCX_API + +#define JEMALLOC_SMALLOCX_CONCAT_HELPER(x, y) x ## y +#define JEMALLOC_SMALLOCX_CONCAT_HELPER2(x, y) \ + JEMALLOC_SMALLOCX_CONCAT_HELPER(x, y) + +typedef struct { + void *ptr; + size_t size; +} smallocx_return_t; + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +smallocx_return_t JEMALLOC_NOTHROW +/* + * The attribute JEMALLOC_ATTR(malloc) cannot be used due to: + * - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86488 + */ +JEMALLOC_SMALLOCX_CONCAT_HELPER2(je_smallocx_, JEMALLOC_VERSION_GID_IDENT) + (size_t size, int flags) { + /* + * Note: the attribute JEMALLOC_ALLOC_SIZE(1) cannot be + * used here because it makes writing beyond the `size` + * of the `ptr` undefined behavior, but the objective + * of this function is to allow writing beyond `size` + * up to `smallocx_return_t::size`. + */ + smallocx_return_t ret; + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.smallocx.entry", "size: %zu, flags: %d", size, flags); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.assert_nonempty_alloc = true; + sopts.null_out_result_on_error = true; + sopts.oom_string = ": Error in mallocx(): out of memory\n"; + sopts.usize = true; + + dopts.result = &ret.ptr; + dopts.num_items = 1; + dopts.item_size = size; + if (unlikely(flags != 0)) { + dopts.alignment = MALLOCX_ALIGN_GET(flags); + dopts.zero = MALLOCX_ZERO_GET(flags); + dopts.tcache_ind = mallocx_tcache_get(flags); + dopts.arena_ind = mallocx_arena_get(flags); + } + + imalloc(&sopts, &dopts); + assert(dopts.usize == je_nallocx(size, flags)); + ret.size = dopts.usize; + + LOG("core.smallocx.exit", "result: %p, size: %zu", ret.ptr, ret.size); + return ret; +} +#undef JEMALLOC_SMALLOCX_CONCAT_HELPER +#undef JEMALLOC_SMALLOCX_CONCAT_HELPER2 +#endif + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) +je_mallocx(size_t size, int flags) { + void *ret; + static_opts_t sopts; + dynamic_opts_t dopts; + + LOG("core.mallocx.entry", "size: %zu, flags: %d", size, flags); + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.assert_nonempty_alloc = true; + sopts.null_out_result_on_error = true; + sopts.oom_string = ": Error in mallocx(): out of memory\n"; + + dopts.result = &ret; + dopts.num_items = 1; + dopts.item_size = size; + if (unlikely(flags != 0)) { + dopts.alignment = MALLOCX_ALIGN_GET(flags); + dopts.zero = MALLOCX_ZERO_GET(flags); + dopts.tcache_ind = mallocx_tcache_get(flags); + dopts.arena_ind = mallocx_arena_get(flags); + } + + imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {size, flags}; + hook_invoke_alloc(hook_alloc_mallocx, ret, (uintptr_t)ret, + args); + } + + LOG("core.mallocx.exit", "result: %p", ret); + return ret; +} + +static void * +irallocx_prof_sample(tsdn_t *tsdn, void *old_ptr, size_t old_usize, + size_t usize, size_t alignment, bool zero, tcache_t *tcache, arena_t *arena, + prof_tctx_t *tctx, hook_ralloc_args_t *hook_args) { + void *p; + + if (tctx == NULL) { + return NULL; + } + + alignment = prof_sample_align(alignment); + if (usize <= SC_SMALL_MAXCLASS) { + p = iralloct(tsdn, old_ptr, old_usize, + SC_LARGE_MINCLASS, alignment, zero, tcache, + arena, hook_args); + if (p == NULL) { + return NULL; + } + arena_prof_promote(tsdn, p, usize); + } else { + p = iralloct(tsdn, old_ptr, old_usize, usize, alignment, zero, + tcache, arena, hook_args); + } + assert(prof_sample_aligned(p)); + + return p; +} + +JEMALLOC_ALWAYS_INLINE void * +irallocx_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size, + size_t alignment, size_t usize, bool zero, tcache_t *tcache, + arena_t *arena, emap_alloc_ctx_t *alloc_ctx, + hook_ralloc_args_t *hook_args) { + prof_info_t old_prof_info; + prof_info_get_and_reset_recent(tsd, old_ptr, alloc_ctx, &old_prof_info); + bool prof_active = prof_active_get_unlocked(); + bool sample_event = te_prof_sample_event_lookahead(tsd, usize); + prof_tctx_t *tctx = prof_alloc_prep(tsd, prof_active, sample_event); + void *p; + if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { + p = irallocx_prof_sample(tsd_tsdn(tsd), old_ptr, old_usize, + usize, alignment, zero, tcache, arena, tctx, hook_args); + } else { + p = iralloct(tsd_tsdn(tsd), old_ptr, old_usize, size, alignment, + zero, tcache, arena, hook_args); + } + if (unlikely(p == NULL)) { + prof_alloc_rollback(tsd, tctx); + return NULL; + } + assert(usize == isalloc(tsd_tsdn(tsd), p)); + prof_realloc(tsd, p, size, usize, tctx, prof_active, old_ptr, + old_usize, &old_prof_info, sample_event); + + return p; +} + +static void * +do_rallocx(void *ptr, size_t size, int flags, bool is_realloc) { + void *p; + tsd_t *tsd; + size_t usize; + size_t old_usize; + size_t alignment = MALLOCX_ALIGN_GET(flags); + arena_t *arena; + + assert(ptr != NULL); + assert(size != 0); + assert(malloc_initialized() || IS_INITIALIZER); + tsd = tsd_fetch(); + check_entry_exit_locking(tsd_tsdn(tsd)); + + bool zero = zero_get(MALLOCX_ZERO_GET(flags), /* slow */ true); + + unsigned arena_ind = mallocx_arena_get(flags); + if (arena_get_from_ind(tsd, arena_ind, &arena)) { + goto label_oom; + } + + unsigned tcache_ind = mallocx_tcache_get(flags); + tcache_t *tcache = tcache_get_from_ind(tsd, tcache_ind, + /* slow */ true, /* is_alloc */ true); + + emap_alloc_ctx_t alloc_ctx; + emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, + &alloc_ctx); + assert(alloc_ctx.szind != SC_NSIZES); + old_usize = sz_index2size(alloc_ctx.szind); + assert(old_usize == isalloc(tsd_tsdn(tsd), ptr)); + if (aligned_usize_get(size, alignment, &usize, NULL, false)) { + goto label_oom; + } + + hook_ralloc_args_t hook_args = {is_realloc, {(uintptr_t)ptr, size, + flags, 0}}; + if (config_prof && opt_prof) { + p = irallocx_prof(tsd, ptr, old_usize, size, alignment, usize, + zero, tcache, arena, &alloc_ctx, &hook_args); + if (unlikely(p == NULL)) { + goto label_oom; + } + } else { + p = iralloct(tsd_tsdn(tsd), ptr, old_usize, size, alignment, + zero, tcache, arena, &hook_args); + if (unlikely(p == NULL)) { + goto label_oom; + } + assert(usize == isalloc(tsd_tsdn(tsd), p)); + } + assert(alignment == 0 || ((uintptr_t)p & (alignment - 1)) == ZU(0)); + thread_alloc_event(tsd, usize); + thread_dalloc_event(tsd, old_usize); + + UTRACE(ptr, size, p); + check_entry_exit_locking(tsd_tsdn(tsd)); + + if (config_fill && unlikely(opt_junk_alloc) && usize > old_usize + && !zero) { + size_t excess_len = usize - old_usize; + void *excess_start = (void *)((uintptr_t)p + old_usize); + junk_alloc_callback(excess_start, excess_len); + } + + return p; +label_oom: + if (config_xmalloc && unlikely(opt_xmalloc)) { + malloc_write(": Error in rallocx(): out of memory\n"); + abort(); + } + UTRACE(ptr, size, 0); + check_entry_exit_locking(tsd_tsdn(tsd)); + + return NULL; +} + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ALLOC_SIZE(2) +je_rallocx(void *ptr, size_t size, int flags) { + LOG("core.rallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr, + size, flags); + void *ret = do_rallocx(ptr, size, flags, false); + LOG("core.rallocx.exit", "result: %p", ret); + return ret; +} + +static void * +do_realloc_nonnull_zero(void *ptr) { + if (config_stats) { + atomic_fetch_add_zu(&zero_realloc_count, 1, ATOMIC_RELAXED); + } + if (opt_zero_realloc_action == zero_realloc_action_alloc) { + /* + * The user might have gotten an alloc setting while expecting a + * free setting. If that's the case, we at least try to + * reduce the harm, and turn off the tcache while allocating, so + * that we'll get a true first fit. + */ + return do_rallocx(ptr, 1, MALLOCX_TCACHE_NONE, true); + } else if (opt_zero_realloc_action == zero_realloc_action_free) { + UTRACE(ptr, 0, 0); + tsd_t *tsd = tsd_fetch(); + check_entry_exit_locking(tsd_tsdn(tsd)); + + tcache_t *tcache = tcache_get_from_ind(tsd, + TCACHE_IND_AUTOMATIC, /* slow */ true, + /* is_alloc */ false); + uintptr_t args[3] = {(uintptr_t)ptr, 0}; + hook_invoke_dalloc(hook_dalloc_realloc, ptr, args); + ifree(tsd, ptr, tcache, true); + + check_entry_exit_locking(tsd_tsdn(tsd)); + return NULL; + } else { + safety_check_fail("Called realloc(non-null-ptr, 0) with " + "zero_realloc:abort set\n"); + /* In real code, this will never run; the safety check failure + * will call abort. In the unit test, we just want to bail out + * without corrupting internal state that the test needs to + * finish. + */ + return NULL; + } +} + +JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN +void JEMALLOC_NOTHROW * +JEMALLOC_ALLOC_SIZE(2) +je_realloc(void *ptr, size_t size) { + LOG("core.realloc.entry", "ptr: %p, size: %zu\n", ptr, size); + + if (likely(ptr != NULL && size != 0)) { + void *ret = do_rallocx(ptr, size, 0, true); + LOG("core.realloc.exit", "result: %p", ret); + return ret; + } else if (ptr != NULL && size == 0) { + void *ret = do_realloc_nonnull_zero(ptr); + LOG("core.realloc.exit", "result: %p", ret); + return ret; + } else { + /* realloc(NULL, size) is equivalent to malloc(size). */ + void *ret; + + static_opts_t sopts; + dynamic_opts_t dopts; + + static_opts_init(&sopts); + dynamic_opts_init(&dopts); + + sopts.null_out_result_on_error = true; + sopts.set_errno_on_error = true; + sopts.oom_string = + ": Error in realloc(): out of memory\n"; + + dopts.result = &ret; + dopts.num_items = 1; + dopts.item_size = size; + + imalloc(&sopts, &dopts); + if (sopts.slow) { + uintptr_t args[3] = {(uintptr_t)ptr, size}; + hook_invoke_alloc(hook_alloc_realloc, ret, + (uintptr_t)ret, args); + } + LOG("core.realloc.exit", "result: %p", ret); + return ret; + } +} + +JEMALLOC_ALWAYS_INLINE size_t +ixallocx_helper(tsdn_t *tsdn, void *ptr, size_t old_usize, size_t size, + size_t extra, size_t alignment, bool zero) { + size_t newsize; + + if (ixalloc(tsdn, ptr, old_usize, size, extra, alignment, zero, + &newsize)) { + return old_usize; + } + + return newsize; +} + +static size_t +ixallocx_prof_sample(tsdn_t *tsdn, void *ptr, size_t old_usize, size_t size, + size_t extra, size_t alignment, bool zero, prof_tctx_t *tctx) { + /* Sampled allocation needs to be page aligned. */ + if (tctx == NULL || !prof_sample_aligned(ptr)) { + return old_usize; + } + + return ixallocx_helper(tsdn, ptr, old_usize, size, extra, alignment, + zero); +} + +JEMALLOC_ALWAYS_INLINE size_t +ixallocx_prof(tsd_t *tsd, void *ptr, size_t old_usize, size_t size, + size_t extra, size_t alignment, bool zero, emap_alloc_ctx_t *alloc_ctx) { + /* + * old_prof_info is only used for asserting that the profiling info + * isn't changed by the ixalloc() call. + */ + prof_info_t old_prof_info; + prof_info_get(tsd, ptr, alloc_ctx, &old_prof_info); + + /* + * usize isn't knowable before ixalloc() returns when extra is non-zero. + * Therefore, compute its maximum possible value and use that in + * prof_alloc_prep() to decide whether to capture a backtrace. + * prof_realloc() will use the actual usize to decide whether to sample. + */ + size_t usize_max; + if (aligned_usize_get(size + extra, alignment, &usize_max, NULL, + false)) { + /* + * usize_max is out of range, and chances are that allocation + * will fail, but use the maximum possible value and carry on + * with prof_alloc_prep(), just in case allocation succeeds. + */ + usize_max = SC_LARGE_MAXCLASS; + } + bool prof_active = prof_active_get_unlocked(); + bool sample_event = te_prof_sample_event_lookahead(tsd, usize_max); + prof_tctx_t *tctx = prof_alloc_prep(tsd, prof_active, sample_event); + + size_t usize; + if (unlikely((uintptr_t)tctx != (uintptr_t)1U)) { + usize = ixallocx_prof_sample(tsd_tsdn(tsd), ptr, old_usize, + size, extra, alignment, zero, tctx); + } else { + usize = ixallocx_helper(tsd_tsdn(tsd), ptr, old_usize, size, + extra, alignment, zero); + } + + /* + * At this point we can still safely get the original profiling + * information associated with the ptr, because (a) the edata_t object + * associated with the ptr still lives and (b) the profiling info + * fields are not touched. "(a)" is asserted in the outer je_xallocx() + * function, and "(b)" is indirectly verified below by checking that + * the alloc_tctx field is unchanged. + */ + prof_info_t prof_info; + if (usize == old_usize) { + prof_info_get(tsd, ptr, alloc_ctx, &prof_info); + prof_alloc_rollback(tsd, tctx); + } else { + prof_info_get_and_reset_recent(tsd, ptr, alloc_ctx, &prof_info); + assert(usize <= usize_max); + sample_event = te_prof_sample_event_lookahead(tsd, usize); + prof_realloc(tsd, ptr, size, usize, tctx, prof_active, ptr, + old_usize, &prof_info, sample_event); + } + + assert(old_prof_info.alloc_tctx == prof_info.alloc_tctx); + return usize; +} + +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW +je_xallocx(void *ptr, size_t size, size_t extra, int flags) { + tsd_t *tsd; + size_t usize, old_usize; + size_t alignment = MALLOCX_ALIGN_GET(flags); + bool zero = zero_get(MALLOCX_ZERO_GET(flags), /* slow */ true); + + LOG("core.xallocx.entry", "ptr: %p, size: %zu, extra: %zu, " + "flags: %d", ptr, size, extra, flags); + + assert(ptr != NULL); + assert(size != 0); + assert(SIZE_T_MAX - size >= extra); + assert(malloc_initialized() || IS_INITIALIZER); + tsd = tsd_fetch(); + check_entry_exit_locking(tsd_tsdn(tsd)); + + /* + * old_edata is only for verifying that xallocx() keeps the edata_t + * object associated with the ptr (though the content of the edata_t + * object can be changed). + */ + edata_t *old_edata = emap_edata_lookup(tsd_tsdn(tsd), + &arena_emap_global, ptr); + + emap_alloc_ctx_t alloc_ctx; + emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, + &alloc_ctx); + assert(alloc_ctx.szind != SC_NSIZES); + old_usize = sz_index2size(alloc_ctx.szind); + assert(old_usize == isalloc(tsd_tsdn(tsd), ptr)); + /* + * The API explicitly absolves itself of protecting against (size + + * extra) numerical overflow, but we may need to clamp extra to avoid + * exceeding SC_LARGE_MAXCLASS. + * + * Ordinarily, size limit checking is handled deeper down, but here we + * have to check as part of (size + extra) clamping, since we need the + * clamped value in the above helper functions. + */ + if (unlikely(size > SC_LARGE_MAXCLASS)) { + usize = old_usize; + goto label_not_resized; + } + if (unlikely(SC_LARGE_MAXCLASS - size < extra)) { + extra = SC_LARGE_MAXCLASS - size; + } + + if (config_prof && opt_prof) { + usize = ixallocx_prof(tsd, ptr, old_usize, size, extra, + alignment, zero, &alloc_ctx); + } else { + usize = ixallocx_helper(tsd_tsdn(tsd), ptr, old_usize, size, + extra, alignment, zero); + } + + /* + * xallocx() should keep using the same edata_t object (though its + * content can be changed). + */ + assert(emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr) + == old_edata); + + if (unlikely(usize == old_usize)) { + goto label_not_resized; + } + thread_alloc_event(tsd, usize); + thread_dalloc_event(tsd, old_usize); + + if (config_fill && unlikely(opt_junk_alloc) && usize > old_usize && + !zero) { + size_t excess_len = usize - old_usize; + void *excess_start = (void *)((uintptr_t)ptr + old_usize); + junk_alloc_callback(excess_start, excess_len); + } +label_not_resized: + if (unlikely(!tsd_fast(tsd))) { + uintptr_t args[4] = {(uintptr_t)ptr, size, extra, flags}; + hook_invoke_expand(hook_expand_xallocx, ptr, old_usize, + usize, (uintptr_t)usize, args); + } + + UTRACE(ptr, size, ptr); + check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.xallocx.exit", "result: %zu", usize); + return usize; +} + +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW +JEMALLOC_ATTR(pure) +je_sallocx(const void *ptr, int flags) { + size_t usize; + tsdn_t *tsdn; + + LOG("core.sallocx.entry", "ptr: %p, flags: %d", ptr, flags); + + assert(malloc_initialized() || IS_INITIALIZER); + assert(ptr != NULL); + + tsdn = tsdn_fetch(); + check_entry_exit_locking(tsdn); + + if (config_debug || force_ivsalloc) { + usize = ivsalloc(tsdn, ptr); + assert(force_ivsalloc || usize != 0); + } else { + usize = isalloc(tsdn, ptr); + } + + check_entry_exit_locking(tsdn); + + LOG("core.sallocx.exit", "result: %zu", usize); + return usize; +} + +JEMALLOC_EXPORT void JEMALLOC_NOTHROW +je_dallocx(void *ptr, int flags) { + LOG("core.dallocx.entry", "ptr: %p, flags: %d", ptr, flags); + + assert(ptr != NULL); + assert(malloc_initialized() || IS_INITIALIZER); + + tsd_t *tsd = tsd_fetch_min(); + bool fast = tsd_fast(tsd); + check_entry_exit_locking(tsd_tsdn(tsd)); + + unsigned tcache_ind = mallocx_tcache_get(flags); + tcache_t *tcache = tcache_get_from_ind(tsd, tcache_ind, !fast, + /* is_alloc */ false); + + UTRACE(ptr, 0, 0); + if (likely(fast)) { + tsd_assert_fast(tsd); + ifree(tsd, ptr, tcache, false); + } else { + uintptr_t args_raw[3] = {(uintptr_t)ptr, flags}; + hook_invoke_dalloc(hook_dalloc_dallocx, ptr, args_raw); + ifree(tsd, ptr, tcache, true); + } + check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.dallocx.exit", ""); +} + +JEMALLOC_ALWAYS_INLINE size_t +inallocx(tsdn_t *tsdn, size_t size, int flags) { + check_entry_exit_locking(tsdn); + size_t usize; + /* In case of out of range, let the user see it rather than fail. */ + aligned_usize_get(size, MALLOCX_ALIGN_GET(flags), &usize, NULL, false); + check_entry_exit_locking(tsdn); + return usize; +} + +JEMALLOC_NOINLINE void +sdallocx_default(void *ptr, size_t size, int flags) { + assert(ptr != NULL); + assert(malloc_initialized() || IS_INITIALIZER); + + tsd_t *tsd = tsd_fetch_min(); + bool fast = tsd_fast(tsd); + size_t usize = inallocx(tsd_tsdn(tsd), size, flags); + check_entry_exit_locking(tsd_tsdn(tsd)); + + unsigned tcache_ind = mallocx_tcache_get(flags); + tcache_t *tcache = tcache_get_from_ind(tsd, tcache_ind, !fast, + /* is_alloc */ false); + + UTRACE(ptr, 0, 0); + if (likely(fast)) { + tsd_assert_fast(tsd); + isfree(tsd, ptr, usize, tcache, false); + } else { + uintptr_t args_raw[3] = {(uintptr_t)ptr, size, flags}; + hook_invoke_dalloc(hook_dalloc_sdallocx, ptr, args_raw); + isfree(tsd, ptr, usize, tcache, true); + } + check_entry_exit_locking(tsd_tsdn(tsd)); +} + +JEMALLOC_EXPORT void JEMALLOC_NOTHROW +je_sdallocx(void *ptr, size_t size, int flags) { + LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr, + size, flags); + + if (flags != 0 || !free_fastpath(ptr, size, true)) { + sdallocx_default(ptr, size, flags); + } + + LOG("core.sdallocx.exit", ""); +} + +void JEMALLOC_NOTHROW +je_sdallocx_noflags(void *ptr, size_t size) { + LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: 0", ptr, + size); + + if (!free_fastpath(ptr, size, true)) { + sdallocx_default(ptr, size, 0); + } + + LOG("core.sdallocx.exit", ""); +} + +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW +JEMALLOC_ATTR(pure) +je_nallocx(size_t size, int flags) { + size_t usize; + tsdn_t *tsdn; + + assert(size != 0); + + if (unlikely(malloc_init())) { + LOG("core.nallocx.exit", "result: %zu", ZU(0)); + return 0; + } + + tsdn = tsdn_fetch(); + check_entry_exit_locking(tsdn); + + usize = inallocx(tsdn, size, flags); + if (unlikely(usize > SC_LARGE_MAXCLASS)) { + LOG("core.nallocx.exit", "result: %zu", ZU(0)); + return 0; + } + + check_entry_exit_locking(tsdn); + LOG("core.nallocx.exit", "result: %zu", usize); + return usize; +} + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW +je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) { + int ret; + tsd_t *tsd; + + LOG("core.mallctl.entry", "name: %s", name); + + if (unlikely(malloc_init())) { + LOG("core.mallctl.exit", "result: %d", EAGAIN); + return EAGAIN; + } + + tsd = tsd_fetch(); + check_entry_exit_locking(tsd_tsdn(tsd)); + ret = ctl_byname(tsd, name, oldp, oldlenp, newp, newlen); + check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.mallctl.exit", "result: %d", ret); + return ret; +} + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW +je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) { + int ret; + + LOG("core.mallctlnametomib.entry", "name: %s", name); + + if (unlikely(malloc_init())) { + LOG("core.mallctlnametomib.exit", "result: %d", EAGAIN); + return EAGAIN; + } + + tsd_t *tsd = tsd_fetch(); + check_entry_exit_locking(tsd_tsdn(tsd)); + ret = ctl_nametomib(tsd, name, mibp, miblenp); + check_entry_exit_locking(tsd_tsdn(tsd)); + + LOG("core.mallctlnametomib.exit", "result: %d", ret); + return ret; +} + +JEMALLOC_EXPORT int JEMALLOC_NOTHROW +je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) { + int ret; + tsd_t *tsd; + + LOG("core.mallctlbymib.entry", ""); + + if (unlikely(malloc_init())) { + LOG("core.mallctlbymib.exit", "result: %d", EAGAIN); + return EAGAIN; + } + + tsd = tsd_fetch(); + check_entry_exit_locking(tsd_tsdn(tsd)); + ret = ctl_bymib(tsd, mib, miblen, oldp, oldlenp, newp, newlen); + check_entry_exit_locking(tsd_tsdn(tsd)); + LOG("core.mallctlbymib.exit", "result: %d", ret); + return ret; +} + +#define STATS_PRINT_BUFSIZE 65536 +JEMALLOC_EXPORT void JEMALLOC_NOTHROW +je_malloc_stats_print(void (*write_cb)(void *, const char *), void *cbopaque, + const char *opts) { + tsdn_t *tsdn; + + LOG("core.malloc_stats_print.entry", ""); + + tsdn = tsdn_fetch(); + check_entry_exit_locking(tsdn); + + if (config_debug) { + stats_print(write_cb, cbopaque, opts); + } else { + buf_writer_t buf_writer; + buf_writer_init(tsdn, &buf_writer, write_cb, cbopaque, NULL, + STATS_PRINT_BUFSIZE); + stats_print(buf_writer_cb, &buf_writer, opts); + buf_writer_terminate(tsdn, &buf_writer); + } + + check_entry_exit_locking(tsdn); + LOG("core.malloc_stats_print.exit", ""); +} +#undef STATS_PRINT_BUFSIZE + +JEMALLOC_ALWAYS_INLINE size_t +je_malloc_usable_size_impl(JEMALLOC_USABLE_SIZE_CONST void *ptr) { + assert(malloc_initialized() || IS_INITIALIZER); + + tsdn_t *tsdn = tsdn_fetch(); + check_entry_exit_locking(tsdn); + + size_t ret; + if (unlikely(ptr == NULL)) { + ret = 0; + } else { + if (config_debug || force_ivsalloc) { + ret = ivsalloc(tsdn, ptr); + assert(force_ivsalloc || ret != 0); + } else { + ret = isalloc(tsdn, ptr); + } + } + check_entry_exit_locking(tsdn); + + return ret; +} + +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW +je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) { + LOG("core.malloc_usable_size.entry", "ptr: %p", ptr); + + size_t ret = je_malloc_usable_size_impl(ptr); + + LOG("core.malloc_usable_size.exit", "result: %zu", ret); + return ret; +} + +#ifdef JEMALLOC_HAVE_MALLOC_SIZE +JEMALLOC_EXPORT size_t JEMALLOC_NOTHROW +je_malloc_size(const void *ptr) { + LOG("core.malloc_size.entry", "ptr: %p", ptr); + + size_t ret = je_malloc_usable_size_impl(ptr); + + LOG("core.malloc_size.exit", "result: %zu", ret); + return ret; +} +#endif + +static void +batch_alloc_prof_sample_assert(tsd_t *tsd, size_t batch, size_t usize) { + assert(config_prof && opt_prof); + bool prof_sample_event = te_prof_sample_event_lookahead(tsd, + batch * usize); + assert(!prof_sample_event); + size_t surplus; + prof_sample_event = te_prof_sample_event_lookahead_surplus(tsd, + (batch + 1) * usize, &surplus); + assert(prof_sample_event); + assert(surplus < usize); +} + +size_t +batch_alloc(void **ptrs, size_t num, size_t size, int flags) { + LOG("core.batch_alloc.entry", + "ptrs: %p, num: %zu, size: %zu, flags: %d", ptrs, num, size, flags); + + tsd_t *tsd = tsd_fetch(); + check_entry_exit_locking(tsd_tsdn(tsd)); + + size_t filled = 0; + + if (unlikely(tsd == NULL || tsd_reentrancy_level_get(tsd) > 0)) { + goto label_done; + } + + size_t alignment = MALLOCX_ALIGN_GET(flags); + size_t usize; + if (aligned_usize_get(size, alignment, &usize, NULL, false)) { + goto label_done; + } + szind_t ind = sz_size2index(usize); + bool zero = zero_get(MALLOCX_ZERO_GET(flags), /* slow */ true); + + /* + * The cache bin and arena will be lazily initialized; it's hard to + * know in advance whether each of them needs to be initialized. + */ + cache_bin_t *bin = NULL; + arena_t *arena = NULL; + + size_t nregs = 0; + if (likely(ind < SC_NBINS)) { + nregs = bin_infos[ind].nregs; + assert(nregs > 0); + } + + while (filled < num) { + size_t batch = num - filled; + size_t surplus = SIZE_MAX; /* Dead store. */ + bool prof_sample_event = config_prof && opt_prof + && prof_active_get_unlocked() + && te_prof_sample_event_lookahead_surplus(tsd, + batch * usize, &surplus); + + if (prof_sample_event) { + /* + * Adjust so that the batch does not trigger prof + * sampling. + */ + batch -= surplus / usize + 1; + batch_alloc_prof_sample_assert(tsd, batch, usize); + } + + size_t progress = 0; + + if (likely(ind < SC_NBINS) && batch >= nregs) { + if (arena == NULL) { + unsigned arena_ind = mallocx_arena_get(flags); + if (arena_get_from_ind(tsd, arena_ind, + &arena)) { + goto label_done; + } + if (arena == NULL) { + arena = arena_choose(tsd, NULL); + } + if (unlikely(arena == NULL)) { + goto label_done; + } + } + size_t arena_batch = batch - batch % nregs; + size_t n = arena_fill_small_fresh(tsd_tsdn(tsd), arena, + ind, ptrs + filled, arena_batch, zero); + progress += n; + filled += n; + } + + if (likely(ind < nhbins) && progress < batch) { + if (bin == NULL) { + unsigned tcache_ind = mallocx_tcache_get(flags); + tcache_t *tcache = tcache_get_from_ind(tsd, + tcache_ind, /* slow */ true, + /* is_alloc */ true); + if (tcache != NULL) { + bin = &tcache->bins[ind]; + } + } + /* + * If we don't have a tcache bin, we don't want to + * immediately give up, because there's the possibility + * that the user explicitly requested to bypass the + * tcache, or that the user explicitly turned off the + * tcache; in such cases, we go through the slow path, + * i.e. the mallocx() call at the end of the while loop. + */ + if (bin != NULL) { + size_t bin_batch = batch - progress; + /* + * n can be less than bin_batch, meaning that + * the cache bin does not have enough memory. + * In such cases, we rely on the slow path, + * i.e. the mallocx() call at the end of the + * while loop, to fill in the cache, and in the + * next iteration of the while loop, the tcache + * will contain a lot of memory, and we can + * harvest them here. Compared to the + * alternative approach where we directly go to + * the arena bins here, the overhead of our + * current approach should usually be minimal, + * since we never try to fetch more memory than + * what a slab contains via the tcache. An + * additional benefit is that the tcache will + * not be empty for the next allocation request. + */ + size_t n = cache_bin_alloc_batch(bin, bin_batch, + ptrs + filled); + if (config_stats) { + bin->tstats.nrequests += n; + } + if (zero) { + for (size_t i = 0; i < n; ++i) { + memset(ptrs[filled + i], 0, + usize); + } + } + if (config_prof && opt_prof + && unlikely(ind >= SC_NBINS)) { + for (size_t i = 0; i < n; ++i) { + prof_tctx_reset_sampled(tsd, + ptrs[filled + i]); + } + } + progress += n; + filled += n; + } + } + + /* + * For thread events other than prof sampling, trigger them as + * if there's a single allocation of size (n * usize). This is + * fine because: + * (a) these events do not alter the allocation itself, and + * (b) it's possible that some event would have been triggered + * multiple times, instead of only once, if the allocations + * were handled individually, but it would do no harm (or + * even be beneficial) to coalesce the triggerings. + */ + thread_alloc_event(tsd, progress * usize); + + if (progress < batch || prof_sample_event) { + void *p = je_mallocx(size, flags); + if (p == NULL) { /* OOM */ + break; + } + if (progress == batch) { + assert(prof_sampled(tsd, p)); + } + ptrs[filled++] = p; + } + } + +label_done: + check_entry_exit_locking(tsd_tsdn(tsd)); + LOG("core.batch_alloc.exit", "result: %zu", filled); + return filled; +} + +/* + * End non-standard functions. + */ +/******************************************************************************/ +/* + * The following functions are used by threading libraries for protection of + * malloc during fork(). + */ + +/* + * If an application creates a thread before doing any allocation in the main + * thread, then calls fork(2) in the main thread followed by memory allocation + * in the child process, a race can occur that results in deadlock within the + * child: the main thread may have forked while the created thread had + * partially initialized the allocator. Ordinarily jemalloc prevents + * fork/malloc races via the following functions it registers during + * initialization using pthread_atfork(), but of course that does no good if + * the allocator isn't fully initialized at fork time. The following library + * constructor is a partial solution to this problem. It may still be possible + * to trigger the deadlock described above, but doing so would involve forking + * via a library constructor that runs before jemalloc's runs. + */ +#ifndef JEMALLOC_JET +JEMALLOC_ATTR(constructor) +static void +jemalloc_constructor(void) { + malloc_init(); +} +#endif + +#ifndef JEMALLOC_MUTEX_INIT_CB +void +jemalloc_prefork(void) +#else +JEMALLOC_EXPORT void +_malloc_prefork(void) +#endif +{ + tsd_t *tsd; + unsigned i, j, narenas; + arena_t *arena; + +#ifdef JEMALLOC_MUTEX_INIT_CB + if (!malloc_initialized()) { + return; + } +#endif + assert(malloc_initialized()); + + tsd = tsd_fetch(); + + narenas = narenas_total_get(); + + witness_prefork(tsd_witness_tsdp_get(tsd)); + /* Acquire all mutexes in a safe order. */ + ctl_prefork(tsd_tsdn(tsd)); + tcache_prefork(tsd_tsdn(tsd)); + malloc_mutex_prefork(tsd_tsdn(tsd), &arenas_lock); + if (have_background_thread) { + background_thread_prefork0(tsd_tsdn(tsd)); + } + prof_prefork0(tsd_tsdn(tsd)); + if (have_background_thread) { + background_thread_prefork1(tsd_tsdn(tsd)); + } + /* Break arena prefork into stages to preserve lock order. */ + for (i = 0; i < 9; i++) { + for (j = 0; j < narenas; j++) { + if ((arena = arena_get(tsd_tsdn(tsd), j, false)) != + NULL) { + switch (i) { + case 0: + arena_prefork0(tsd_tsdn(tsd), arena); + break; + case 1: + arena_prefork1(tsd_tsdn(tsd), arena); + break; + case 2: + arena_prefork2(tsd_tsdn(tsd), arena); + break; + case 3: + arena_prefork3(tsd_tsdn(tsd), arena); + break; + case 4: + arena_prefork4(tsd_tsdn(tsd), arena); + break; + case 5: + arena_prefork5(tsd_tsdn(tsd), arena); + break; + case 6: + arena_prefork6(tsd_tsdn(tsd), arena); + break; + case 7: + arena_prefork7(tsd_tsdn(tsd), arena); + break; + case 8: + arena_prefork8(tsd_tsdn(tsd), arena); + break; + default: not_reached(); + } + } + } + + } + prof_prefork1(tsd_tsdn(tsd)); + stats_prefork(tsd_tsdn(tsd)); + tsd_prefork(tsd); +} + +#ifndef JEMALLOC_MUTEX_INIT_CB +void +jemalloc_postfork_parent(void) +#else +JEMALLOC_EXPORT void +_malloc_postfork(void) +#endif +{ + tsd_t *tsd; + unsigned i, narenas; + +#ifdef JEMALLOC_MUTEX_INIT_CB + if (!malloc_initialized()) { + return; + } +#endif + assert(malloc_initialized()); + + tsd = tsd_fetch(); + + tsd_postfork_parent(tsd); + + witness_postfork_parent(tsd_witness_tsdp_get(tsd)); + /* Release all mutexes, now that fork() has completed. */ + stats_postfork_parent(tsd_tsdn(tsd)); + for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { + arena_t *arena; + + if ((arena = arena_get(tsd_tsdn(tsd), i, false)) != NULL) { + arena_postfork_parent(tsd_tsdn(tsd), arena); + } + } + prof_postfork_parent(tsd_tsdn(tsd)); + if (have_background_thread) { + background_thread_postfork_parent(tsd_tsdn(tsd)); + } + malloc_mutex_postfork_parent(tsd_tsdn(tsd), &arenas_lock); + tcache_postfork_parent(tsd_tsdn(tsd)); + ctl_postfork_parent(tsd_tsdn(tsd)); +} + +void +jemalloc_postfork_child(void) { + tsd_t *tsd; + unsigned i, narenas; + + assert(malloc_initialized()); + + tsd = tsd_fetch(); + + tsd_postfork_child(tsd); + + witness_postfork_child(tsd_witness_tsdp_get(tsd)); + /* Release all mutexes, now that fork() has completed. */ + stats_postfork_child(tsd_tsdn(tsd)); + for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { + arena_t *arena; + + if ((arena = arena_get(tsd_tsdn(tsd), i, false)) != NULL) { + arena_postfork_child(tsd_tsdn(tsd), arena); + } + } + prof_postfork_child(tsd_tsdn(tsd)); + if (have_background_thread) { + background_thread_postfork_child(tsd_tsdn(tsd)); + } + malloc_mutex_postfork_child(tsd_tsdn(tsd), &arenas_lock); + tcache_postfork_child(tsd_tsdn(tsd)); + ctl_postfork_child(tsd_tsdn(tsd)); +} + +/******************************************************************************/ diff --git a/BeefRT/JEMalloc/src/jemalloc_cpp.cpp b/BeefRT/JEMalloc/src/jemalloc_cpp.cpp new file mode 100644 index 00000000..451655f1 --- /dev/null +++ b/BeefRT/JEMalloc/src/jemalloc_cpp.cpp @@ -0,0 +1,254 @@ +#include +#include + +#define JEMALLOC_CPP_CPP_ +#ifdef __cplusplus +extern "C" { +#endif + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#ifdef __cplusplus +} +#endif + +// All operators in this file are exported. + +// Possibly alias hidden versions of malloc and sdallocx to avoid an extra plt +// thunk? +// +// extern __typeof (sdallocx) sdallocx_int +// __attribute ((alias ("sdallocx"), +// visibility ("hidden"))); +// +// ... but it needs to work with jemalloc namespaces. + +void *operator new(std::size_t size); +void *operator new[](std::size_t size); +void *operator new(std::size_t size, const std::nothrow_t &) noexcept; +void *operator new[](std::size_t size, const std::nothrow_t &) noexcept; +void operator delete(void *ptr) noexcept; +void operator delete[](void *ptr) noexcept; +void operator delete(void *ptr, const std::nothrow_t &) noexcept; +void operator delete[](void *ptr, const std::nothrow_t &) noexcept; + +#if __cpp_sized_deallocation >= 201309 +/* C++14's sized-delete operators. */ +void operator delete(void *ptr, std::size_t size) noexcept; +void operator delete[](void *ptr, std::size_t size) noexcept; +#endif + +#if __cpp_aligned_new >= 201606 +/* C++17's over-aligned operators. */ +void *operator new(std::size_t size, std::align_val_t); +void *operator new(std::size_t size, std::align_val_t, const std::nothrow_t &) noexcept; +void *operator new[](std::size_t size, std::align_val_t); +void *operator new[](std::size_t size, std::align_val_t, const std::nothrow_t &) noexcept; +void operator delete(void* ptr, std::align_val_t) noexcept; +void operator delete(void* ptr, std::align_val_t, const std::nothrow_t &) noexcept; +void operator delete(void* ptr, std::size_t size, std::align_val_t al) noexcept; +void operator delete[](void* ptr, std::align_val_t) noexcept; +void operator delete[](void* ptr, std::align_val_t, const std::nothrow_t &) noexcept; +void operator delete[](void* ptr, std::size_t size, std::align_val_t al) noexcept; +#endif + +JEMALLOC_NOINLINE +static void * +handleOOM(std::size_t size, bool nothrow) { + if (opt_experimental_infallible_new) { + safety_check_fail(": Allocation failed and " + "opt.experimental_infallible_new is true. Aborting.\n"); + return nullptr; + } + + void *ptr = nullptr; + + while (ptr == nullptr) { + std::new_handler handler; + // GCC-4.8 and clang 4.0 do not have std::get_new_handler. + { + static std::mutex mtx; + std::lock_guard lock(mtx); + + handler = std::set_new_handler(nullptr); + std::set_new_handler(handler); + } + if (handler == nullptr) + break; + + try { + handler(); + } catch (const std::bad_alloc &) { + break; + } + + ptr = je_malloc(size); + } + + if (ptr == nullptr && !nothrow) + std::__throw_bad_alloc(); + return ptr; +} + +template +JEMALLOC_NOINLINE +static void * +fallback_impl(std::size_t size) noexcept(IsNoExcept) { + void *ptr = malloc_default(size); + if (likely(ptr != nullptr)) { + return ptr; + } + return handleOOM(size, IsNoExcept); +} + +template +JEMALLOC_ALWAYS_INLINE +void * +newImpl(std::size_t size) noexcept(IsNoExcept) { + return imalloc_fastpath(size, &fallback_impl); +} + +void * +operator new(std::size_t size) { + return newImpl(size); +} + +void * +operator new[](std::size_t size) { + return newImpl(size); +} + +void * +operator new(std::size_t size, const std::nothrow_t &) noexcept { + return newImpl(size); +} + +void * +operator new[](std::size_t size, const std::nothrow_t &) noexcept { + return newImpl(size); +} + +#if __cpp_aligned_new >= 201606 + +template +JEMALLOC_ALWAYS_INLINE +void * +alignedNewImpl(std::size_t size, std::align_val_t alignment) noexcept(IsNoExcept) { + void *ptr = je_aligned_alloc(static_cast(alignment), size); + if (likely(ptr != nullptr)) { + return ptr; + } + + return handleOOM(size, IsNoExcept); +} + +void * +operator new(std::size_t size, std::align_val_t alignment) { + return alignedNewImpl(size, alignment); +} + +void * +operator new[](std::size_t size, std::align_val_t alignment) { + return alignedNewImpl(size, alignment); +} + +void * +operator new(std::size_t size, std::align_val_t alignment, const std::nothrow_t &) noexcept { + return alignedNewImpl(size, alignment); +} + +void * +operator new[](std::size_t size, std::align_val_t alignment, const std::nothrow_t &) noexcept { + return alignedNewImpl(size, alignment); +} + +#endif // __cpp_aligned_new + +void +operator delete(void *ptr) noexcept { + je_free(ptr); +} + +void +operator delete[](void *ptr) noexcept { + je_free(ptr); +} + +void +operator delete(void *ptr, const std::nothrow_t &) noexcept { + je_free(ptr); +} + +void operator delete[](void *ptr, const std::nothrow_t &) noexcept { + je_free(ptr); +} + +#if __cpp_sized_deallocation >= 201309 + +JEMALLOC_ALWAYS_INLINE +void +sizedDeleteImpl(void* ptr, std::size_t size) noexcept { + if (unlikely(ptr == nullptr)) { + return; + } + je_sdallocx_noflags(ptr, size); +} + +void +operator delete(void *ptr, std::size_t size) noexcept { + sizedDeleteImpl(ptr, size); +} + +void +operator delete[](void *ptr, std::size_t size) noexcept { + sizedDeleteImpl(ptr, size); +} + +#endif // __cpp_sized_deallocation + +#if __cpp_aligned_new >= 201606 + +JEMALLOC_ALWAYS_INLINE +void +alignedSizedDeleteImpl(void* ptr, std::size_t size, std::align_val_t alignment) noexcept { + if (config_debug) { + assert(((size_t)alignment & ((size_t)alignment - 1)) == 0); + } + if (unlikely(ptr == nullptr)) { + return; + } + je_sdallocx(ptr, size, MALLOCX_ALIGN(alignment)); +} + +void +operator delete(void* ptr, std::align_val_t) noexcept { + je_free(ptr); +} + +void +operator delete[](void* ptr, std::align_val_t) noexcept { + je_free(ptr); +} + +void +operator delete(void* ptr, std::align_val_t, const std::nothrow_t&) noexcept { + je_free(ptr); +} + +void +operator delete[](void* ptr, std::align_val_t, const std::nothrow_t&) noexcept { + je_free(ptr); +} + +void +operator delete(void* ptr, std::size_t size, std::align_val_t alignment) noexcept { + alignedSizedDeleteImpl(ptr, size, alignment); +} + +void +operator delete[](void* ptr, std::size_t size, std::align_val_t alignment) noexcept { + alignedSizedDeleteImpl(ptr, size, alignment); +} + +#endif // __cpp_aligned_new diff --git a/BeefRT/JEMalloc/src/large.c b/BeefRT/JEMalloc/src/large.c new file mode 100644 index 00000000..5fc4bf58 --- /dev/null +++ b/BeefRT/JEMalloc/src/large.c @@ -0,0 +1,322 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/emap.h" +#include "jemalloc/internal/extent_mmap.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/prof_recent.h" +#include "jemalloc/internal/util.h" + +/******************************************************************************/ + +void * +large_malloc(tsdn_t *tsdn, arena_t *arena, size_t usize, bool zero) { + assert(usize == sz_s2u(usize)); + + return large_palloc(tsdn, arena, usize, CACHELINE, zero); +} + +void * +large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, + bool zero) { + size_t ausize; + edata_t *edata; + UNUSED bool idump JEMALLOC_CC_SILENCE_INIT(false); + + assert(!tsdn_null(tsdn) || arena != NULL); + + ausize = sz_sa2u(usize, alignment); + if (unlikely(ausize == 0 || ausize > SC_LARGE_MAXCLASS)) { + return NULL; + } + + if (likely(!tsdn_null(tsdn))) { + arena = arena_choose_maybe_huge(tsdn_tsd(tsdn), arena, usize); + } + if (unlikely(arena == NULL) || (edata = arena_extent_alloc_large(tsdn, + arena, usize, alignment, zero)) == NULL) { + return NULL; + } + + /* See comments in arena_bin_slabs_full_insert(). */ + if (!arena_is_auto(arena)) { + /* Insert edata into large. */ + malloc_mutex_lock(tsdn, &arena->large_mtx); + edata_list_active_append(&arena->large, edata); + malloc_mutex_unlock(tsdn, &arena->large_mtx); + } + + arena_decay_tick(tsdn, arena); + return edata_addr_get(edata); +} + +static bool +large_ralloc_no_move_shrink(tsdn_t *tsdn, edata_t *edata, size_t usize) { + arena_t *arena = arena_get_from_edata(edata); + ehooks_t *ehooks = arena_get_ehooks(arena); + size_t old_size = edata_size_get(edata); + size_t old_usize = edata_usize_get(edata); + + assert(old_usize > usize); + + if (ehooks_split_will_fail(ehooks)) { + return true; + } + + bool deferred_work_generated = false; + bool err = pa_shrink(tsdn, &arena->pa_shard, edata, old_size, + usize + sz_large_pad, sz_size2index(usize), + &deferred_work_generated); + if (err) { + return true; + } + if (deferred_work_generated) { + arena_handle_deferred_work(tsdn, arena); + } + arena_extent_ralloc_large_shrink(tsdn, arena, edata, old_usize); + + return false; +} + +static bool +large_ralloc_no_move_expand(tsdn_t *tsdn, edata_t *edata, size_t usize, + bool zero) { + arena_t *arena = arena_get_from_edata(edata); + + size_t old_size = edata_size_get(edata); + size_t old_usize = edata_usize_get(edata); + size_t new_size = usize + sz_large_pad; + + szind_t szind = sz_size2index(usize); + + bool deferred_work_generated = false; + bool err = pa_expand(tsdn, &arena->pa_shard, edata, old_size, new_size, + szind, zero, &deferred_work_generated); + + if (deferred_work_generated) { + arena_handle_deferred_work(tsdn, arena); + } + + if (err) { + return true; + } + + if (zero) { + if (opt_cache_oblivious) { + assert(sz_large_pad == PAGE); + /* + * Zero the trailing bytes of the original allocation's + * last page, since they are in an indeterminate state. + * There will always be trailing bytes, because ptr's + * offset from the beginning of the extent is a multiple + * of CACHELINE in [0 .. PAGE). + */ + void *zbase = (void *) + ((uintptr_t)edata_addr_get(edata) + old_usize); + void *zpast = PAGE_ADDR2BASE((void *)((uintptr_t)zbase + + PAGE)); + size_t nzero = (uintptr_t)zpast - (uintptr_t)zbase; + assert(nzero > 0); + memset(zbase, 0, nzero); + } + } + arena_extent_ralloc_large_expand(tsdn, arena, edata, old_usize); + + return false; +} + +bool +large_ralloc_no_move(tsdn_t *tsdn, edata_t *edata, size_t usize_min, + size_t usize_max, bool zero) { + size_t oldusize = edata_usize_get(edata); + + /* The following should have been caught by callers. */ + assert(usize_min > 0 && usize_max <= SC_LARGE_MAXCLASS); + /* Both allocation sizes must be large to avoid a move. */ + assert(oldusize >= SC_LARGE_MINCLASS + && usize_max >= SC_LARGE_MINCLASS); + + if (usize_max > oldusize) { + /* Attempt to expand the allocation in-place. */ + if (!large_ralloc_no_move_expand(tsdn, edata, usize_max, + zero)) { + arena_decay_tick(tsdn, arena_get_from_edata(edata)); + return false; + } + /* Try again, this time with usize_min. */ + if (usize_min < usize_max && usize_min > oldusize && + large_ralloc_no_move_expand(tsdn, edata, usize_min, zero)) { + arena_decay_tick(tsdn, arena_get_from_edata(edata)); + return false; + } + } + + /* + * Avoid moving the allocation if the existing extent size accommodates + * the new size. + */ + if (oldusize >= usize_min && oldusize <= usize_max) { + arena_decay_tick(tsdn, arena_get_from_edata(edata)); + return false; + } + + /* Attempt to shrink the allocation in-place. */ + if (oldusize > usize_max) { + if (!large_ralloc_no_move_shrink(tsdn, edata, usize_max)) { + arena_decay_tick(tsdn, arena_get_from_edata(edata)); + return false; + } + } + return true; +} + +static void * +large_ralloc_move_helper(tsdn_t *tsdn, arena_t *arena, size_t usize, + size_t alignment, bool zero) { + if (alignment <= CACHELINE) { + return large_malloc(tsdn, arena, usize, zero); + } + return large_palloc(tsdn, arena, usize, alignment, zero); +} + +void * +large_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t usize, + size_t alignment, bool zero, tcache_t *tcache, + hook_ralloc_args_t *hook_args) { + edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); + + size_t oldusize = edata_usize_get(edata); + /* The following should have been caught by callers. */ + assert(usize > 0 && usize <= SC_LARGE_MAXCLASS); + /* Both allocation sizes must be large to avoid a move. */ + assert(oldusize >= SC_LARGE_MINCLASS + && usize >= SC_LARGE_MINCLASS); + + /* Try to avoid moving the allocation. */ + if (!large_ralloc_no_move(tsdn, edata, usize, usize, zero)) { + hook_invoke_expand(hook_args->is_realloc + ? hook_expand_realloc : hook_expand_rallocx, ptr, oldusize, + usize, (uintptr_t)ptr, hook_args->args); + return edata_addr_get(edata); + } + + /* + * usize and old size are different enough that we need to use a + * different size class. In that case, fall back to allocating new + * space and copying. + */ + void *ret = large_ralloc_move_helper(tsdn, arena, usize, alignment, + zero); + if (ret == NULL) { + return NULL; + } + + hook_invoke_alloc(hook_args->is_realloc + ? hook_alloc_realloc : hook_alloc_rallocx, ret, (uintptr_t)ret, + hook_args->args); + hook_invoke_dalloc(hook_args->is_realloc + ? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args); + + size_t copysize = (usize < oldusize) ? usize : oldusize; + memcpy(ret, edata_addr_get(edata), copysize); + isdalloct(tsdn, edata_addr_get(edata), oldusize, tcache, NULL, true); + return ret; +} + +/* + * locked indicates whether the arena's large_mtx is currently held. + */ +static void +large_dalloc_prep_impl(tsdn_t *tsdn, arena_t *arena, edata_t *edata, + bool locked) { + if (!locked) { + /* See comments in arena_bin_slabs_full_insert(). */ + if (!arena_is_auto(arena)) { + malloc_mutex_lock(tsdn, &arena->large_mtx); + edata_list_active_remove(&arena->large, edata); + malloc_mutex_unlock(tsdn, &arena->large_mtx); + } + } else { + /* Only hold the large_mtx if necessary. */ + if (!arena_is_auto(arena)) { + malloc_mutex_assert_owner(tsdn, &arena->large_mtx); + edata_list_active_remove(&arena->large, edata); + } + } + arena_extent_dalloc_large_prep(tsdn, arena, edata); +} + +static void +large_dalloc_finish_impl(tsdn_t *tsdn, arena_t *arena, edata_t *edata) { + bool deferred_work_generated = false; + pa_dalloc(tsdn, &arena->pa_shard, edata, &deferred_work_generated); + if (deferred_work_generated) { + arena_handle_deferred_work(tsdn, arena); + } +} + +void +large_dalloc_prep_locked(tsdn_t *tsdn, edata_t *edata) { + large_dalloc_prep_impl(tsdn, arena_get_from_edata(edata), edata, true); +} + +void +large_dalloc_finish(tsdn_t *tsdn, edata_t *edata) { + large_dalloc_finish_impl(tsdn, arena_get_from_edata(edata), edata); +} + +void +large_dalloc(tsdn_t *tsdn, edata_t *edata) { + arena_t *arena = arena_get_from_edata(edata); + large_dalloc_prep_impl(tsdn, arena, edata, false); + large_dalloc_finish_impl(tsdn, arena, edata); + arena_decay_tick(tsdn, arena); +} + +size_t +large_salloc(tsdn_t *tsdn, const edata_t *edata) { + return edata_usize_get(edata); +} + +void +large_prof_info_get(tsd_t *tsd, edata_t *edata, prof_info_t *prof_info, + bool reset_recent) { + assert(prof_info != NULL); + + prof_tctx_t *alloc_tctx = edata_prof_tctx_get(edata); + prof_info->alloc_tctx = alloc_tctx; + + if ((uintptr_t)alloc_tctx > (uintptr_t)1U) { + nstime_copy(&prof_info->alloc_time, + edata_prof_alloc_time_get(edata)); + prof_info->alloc_size = edata_prof_alloc_size_get(edata); + if (reset_recent) { + /* + * Reset the pointer on the recent allocation record, + * so that this allocation is recorded as released. + */ + prof_recent_alloc_reset(tsd, edata); + } + } +} + +static void +large_prof_tctx_set(edata_t *edata, prof_tctx_t *tctx) { + edata_prof_tctx_set(edata, tctx); +} + +void +large_prof_tctx_reset(edata_t *edata) { + large_prof_tctx_set(edata, (prof_tctx_t *)(uintptr_t)1U); +} + +void +large_prof_info_set(edata_t *edata, prof_tctx_t *tctx, size_t size) { + nstime_t t; + nstime_prof_init_update(&t); + edata_prof_alloc_time_set(edata, &t); + edata_prof_alloc_size_set(edata, size); + edata_prof_recent_alloc_init(edata); + large_prof_tctx_set(edata, tctx); +} diff --git a/BeefRT/JEMalloc/src/log.c b/BeefRT/JEMalloc/src/log.c new file mode 100644 index 00000000..778902fb --- /dev/null +++ b/BeefRT/JEMalloc/src/log.c @@ -0,0 +1,78 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/log.h" + +char log_var_names[JEMALLOC_LOG_VAR_BUFSIZE]; +atomic_b_t log_init_done = ATOMIC_INIT(false); + +/* + * Returns true if we were able to pick out a segment. Fills in r_segment_end + * with a pointer to the first character after the end of the string. + */ +static const char * +log_var_extract_segment(const char* segment_begin) { + const char *end; + for (end = segment_begin; *end != '\0' && *end != '|'; end++) { + } + return end; +} + +static bool +log_var_matches_segment(const char *segment_begin, const char *segment_end, + const char *log_var_begin, const char *log_var_end) { + assert(segment_begin <= segment_end); + assert(log_var_begin < log_var_end); + + ptrdiff_t segment_len = segment_end - segment_begin; + ptrdiff_t log_var_len = log_var_end - log_var_begin; + /* The special '.' segment matches everything. */ + if (segment_len == 1 && *segment_begin == '.') { + return true; + } + if (segment_len == log_var_len) { + return strncmp(segment_begin, log_var_begin, segment_len) == 0; + } else if (segment_len < log_var_len) { + return strncmp(segment_begin, log_var_begin, segment_len) == 0 + && log_var_begin[segment_len] == '.'; + } else { + return false; + } +} + +unsigned +log_var_update_state(log_var_t *log_var) { + const char *log_var_begin = log_var->name; + const char *log_var_end = log_var->name + strlen(log_var->name); + + /* Pointer to one before the beginning of the current segment. */ + const char *segment_begin = log_var_names; + + /* + * If log_init done is false, we haven't parsed the malloc conf yet. To + * avoid log-spew, we default to not displaying anything. + */ + if (!atomic_load_b(&log_init_done, ATOMIC_ACQUIRE)) { + return LOG_INITIALIZED_NOT_ENABLED; + } + + while (true) { + const char *segment_end = log_var_extract_segment( + segment_begin); + assert(segment_end < log_var_names + JEMALLOC_LOG_VAR_BUFSIZE); + if (log_var_matches_segment(segment_begin, segment_end, + log_var_begin, log_var_end)) { + atomic_store_u(&log_var->state, LOG_ENABLED, + ATOMIC_RELAXED); + return LOG_ENABLED; + } + if (*segment_end == '\0') { + /* Hit the end of the segment string with no match. */ + atomic_store_u(&log_var->state, + LOG_INITIALIZED_NOT_ENABLED, ATOMIC_RELAXED); + return LOG_INITIALIZED_NOT_ENABLED; + } + /* Otherwise, skip the delimiter and continue. */ + segment_begin = segment_end + 1; + } +} diff --git a/BeefRT/JEMalloc/src/malloc_io.c b/BeefRT/JEMalloc/src/malloc_io.c new file mode 100644 index 00000000..b76885cb --- /dev/null +++ b/BeefRT/JEMalloc/src/malloc_io.c @@ -0,0 +1,697 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/util.h" + +#ifdef assert +# undef assert +#endif +#ifdef not_reached +# undef not_reached +#endif +#ifdef not_implemented +# undef not_implemented +#endif +#ifdef assert_not_implemented +# undef assert_not_implemented +#endif + +/* + * Define simple versions of assertion macros that won't recurse in case + * of assertion failures in malloc_*printf(). + */ +#define assert(e) do { \ + if (config_debug && !(e)) { \ + malloc_write(": Failed assertion\n"); \ + abort(); \ + } \ +} while (0) + +#define not_reached() do { \ + if (config_debug) { \ + malloc_write(": Unreachable code reached\n"); \ + abort(); \ + } \ + unreachable(); \ +} while (0) + +#define not_implemented() do { \ + if (config_debug) { \ + malloc_write(": Not implemented\n"); \ + abort(); \ + } \ +} while (0) + +#define assert_not_implemented(e) do { \ + if (unlikely(config_debug && !(e))) { \ + not_implemented(); \ + } \ +} while (0) + +/******************************************************************************/ +/* Function prototypes for non-inline static functions. */ + +#define U2S_BUFSIZE ((1U << (LG_SIZEOF_INTMAX_T + 3)) + 1) +static char *u2s(uintmax_t x, unsigned base, bool uppercase, char *s, + size_t *slen_p); +#define D2S_BUFSIZE (1 + U2S_BUFSIZE) +static char *d2s(intmax_t x, char sign, char *s, size_t *slen_p); +#define O2S_BUFSIZE (1 + U2S_BUFSIZE) +static char *o2s(uintmax_t x, bool alt_form, char *s, size_t *slen_p); +#define X2S_BUFSIZE (2 + U2S_BUFSIZE) +static char *x2s(uintmax_t x, bool alt_form, bool uppercase, char *s, + size_t *slen_p); + +/******************************************************************************/ + +/* malloc_message() setup. */ +void +wrtmessage(void *cbopaque, const char *s) { + malloc_write_fd(STDERR_FILENO, s, strlen(s)); +} + +JEMALLOC_EXPORT void (*je_malloc_message)(void *, const char *s); + +/* + * Wrapper around malloc_message() that avoids the need for + * je_malloc_message(...) throughout the code. + */ +void +malloc_write(const char *s) { + if (je_malloc_message != NULL) { + je_malloc_message(NULL, s); + } else { + wrtmessage(NULL, s); + } +} + +/* + * glibc provides a non-standard strerror_r() when _GNU_SOURCE is defined, so + * provide a wrapper. + */ +int +buferror(int err, char *buf, size_t buflen) { +#ifdef _WIN32 + FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, NULL, err, 0, + (LPSTR)buf, (DWORD)buflen, NULL); + return 0; +#elif defined(JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE) && defined(_GNU_SOURCE) + char *b = strerror_r(err, buf, buflen); + if (b != buf) { + strncpy(buf, b, buflen); + buf[buflen-1] = '\0'; + } + return 0; +#else + return strerror_r(err, buf, buflen); +#endif +} + +uintmax_t +malloc_strtoumax(const char *restrict nptr, char **restrict endptr, int base) { + uintmax_t ret, digit; + unsigned b; + bool neg; + const char *p, *ns; + + p = nptr; + if (base < 0 || base == 1 || base > 36) { + ns = p; + set_errno(EINVAL); + ret = UINTMAX_MAX; + goto label_return; + } + b = base; + + /* Swallow leading whitespace and get sign, if any. */ + neg = false; + while (true) { + switch (*p) { + case '\t': case '\n': case '\v': case '\f': case '\r': case ' ': + p++; + break; + case '-': + neg = true; + JEMALLOC_FALLTHROUGH; + case '+': + p++; + JEMALLOC_FALLTHROUGH; + default: + goto label_prefix; + } + } + + /* Get prefix, if any. */ + label_prefix: + /* + * Note where the first non-whitespace/sign character is so that it is + * possible to tell whether any digits are consumed (e.g., " 0" vs. + * " -x"). + */ + ns = p; + if (*p == '0') { + switch (p[1]) { + case '0': case '1': case '2': case '3': case '4': case '5': + case '6': case '7': + if (b == 0) { + b = 8; + } + if (b == 8) { + p++; + } + break; + case 'X': case 'x': + switch (p[2]) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': + if (b == 0) { + b = 16; + } + if (b == 16) { + p += 2; + } + break; + default: + break; + } + break; + default: + p++; + ret = 0; + goto label_return; + } + } + if (b == 0) { + b = 10; + } + + /* Convert. */ + ret = 0; + while ((*p >= '0' && *p <= '9' && (digit = *p - '0') < b) + || (*p >= 'A' && *p <= 'Z' && (digit = 10 + *p - 'A') < b) + || (*p >= 'a' && *p <= 'z' && (digit = 10 + *p - 'a') < b)) { + uintmax_t pret = ret; + ret *= b; + ret += digit; + if (ret < pret) { + /* Overflow. */ + set_errno(ERANGE); + ret = UINTMAX_MAX; + goto label_return; + } + p++; + } + if (neg) { + ret = (uintmax_t)(-((intmax_t)ret)); + } + + if (p == ns) { + /* No conversion performed. */ + set_errno(EINVAL); + ret = UINTMAX_MAX; + goto label_return; + } + +label_return: + if (endptr != NULL) { + if (p == ns) { + /* No characters were converted. */ + *endptr = (char *)nptr; + } else { + *endptr = (char *)p; + } + } + return ret; +} + +static char * +u2s(uintmax_t x, unsigned base, bool uppercase, char *s, size_t *slen_p) { + unsigned i; + + i = U2S_BUFSIZE - 1; + s[i] = '\0'; + switch (base) { + case 10: + do { + i--; + s[i] = "0123456789"[x % (uint64_t)10]; + x /= (uint64_t)10; + } while (x > 0); + break; + case 16: { + const char *digits = (uppercase) + ? "0123456789ABCDEF" + : "0123456789abcdef"; + + do { + i--; + s[i] = digits[x & 0xf]; + x >>= 4; + } while (x > 0); + break; + } default: { + const char *digits = (uppercase) + ? "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" + : "0123456789abcdefghijklmnopqrstuvwxyz"; + + assert(base >= 2 && base <= 36); + do { + i--; + s[i] = digits[x % (uint64_t)base]; + x /= (uint64_t)base; + } while (x > 0); + }} + + *slen_p = U2S_BUFSIZE - 1 - i; + return &s[i]; +} + +static char * +d2s(intmax_t x, char sign, char *s, size_t *slen_p) { + bool neg; + + if ((neg = (x < 0))) { + x = -x; + } + s = u2s(x, 10, false, s, slen_p); + if (neg) { + sign = '-'; + } + switch (sign) { + case '-': + if (!neg) { + break; + } + JEMALLOC_FALLTHROUGH; + case ' ': + case '+': + s--; + (*slen_p)++; + *s = sign; + break; + default: not_reached(); + } + return s; +} + +static char * +o2s(uintmax_t x, bool alt_form, char *s, size_t *slen_p) { + s = u2s(x, 8, false, s, slen_p); + if (alt_form && *s != '0') { + s--; + (*slen_p)++; + *s = '0'; + } + return s; +} + +static char * +x2s(uintmax_t x, bool alt_form, bool uppercase, char *s, size_t *slen_p) { + s = u2s(x, 16, uppercase, s, slen_p); + if (alt_form) { + s -= 2; + (*slen_p) += 2; + memcpy(s, uppercase ? "0X" : "0x", 2); + } + return s; +} + +JEMALLOC_COLD +size_t +malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) { + size_t i; + const char *f; + +#define APPEND_C(c) do { \ + if (i < size) { \ + str[i] = (c); \ + } \ + i++; \ +} while (0) +#define APPEND_S(s, slen) do { \ + if (i < size) { \ + size_t cpylen = (slen <= size - i) ? slen : size - i; \ + memcpy(&str[i], s, cpylen); \ + } \ + i += slen; \ +} while (0) +#define APPEND_PADDED_S(s, slen, width, left_justify) do { \ + /* Left padding. */ \ + size_t pad_len = (width == -1) ? 0 : ((slen < (size_t)width) ? \ + (size_t)width - slen : 0); \ + if (!left_justify && pad_len != 0) { \ + size_t j; \ + for (j = 0; j < pad_len; j++) { \ + if (pad_zero) { \ + APPEND_C('0'); \ + } else { \ + APPEND_C(' '); \ + } \ + } \ + } \ + /* Value. */ \ + APPEND_S(s, slen); \ + /* Right padding. */ \ + if (left_justify && pad_len != 0) { \ + size_t j; \ + for (j = 0; j < pad_len; j++) { \ + APPEND_C(' '); \ + } \ + } \ +} while (0) +#define GET_ARG_NUMERIC(val, len) do { \ + switch ((unsigned char)len) { \ + case '?': \ + val = va_arg(ap, int); \ + break; \ + case '?' | 0x80: \ + val = va_arg(ap, unsigned int); \ + break; \ + case 'l': \ + val = va_arg(ap, long); \ + break; \ + case 'l' | 0x80: \ + val = va_arg(ap, unsigned long); \ + break; \ + case 'q': \ + val = va_arg(ap, long long); \ + break; \ + case 'q' | 0x80: \ + val = va_arg(ap, unsigned long long); \ + break; \ + case 'j': \ + val = va_arg(ap, intmax_t); \ + break; \ + case 'j' | 0x80: \ + val = va_arg(ap, uintmax_t); \ + break; \ + case 't': \ + val = va_arg(ap, ptrdiff_t); \ + break; \ + case 'z': \ + val = va_arg(ap, ssize_t); \ + break; \ + case 'z' | 0x80: \ + val = va_arg(ap, size_t); \ + break; \ + case 'p': /* Synthetic; used for %p. */ \ + val = va_arg(ap, uintptr_t); \ + break; \ + default: \ + not_reached(); \ + val = 0; \ + } \ +} while (0) + + i = 0; + f = format; + while (true) { + switch (*f) { + case '\0': goto label_out; + case '%': { + bool alt_form = false; + bool left_justify = false; + bool plus_space = false; + bool plus_plus = false; + int prec = -1; + int width = -1; + unsigned char len = '?'; + char *s; + size_t slen; + bool first_width_digit = true; + bool pad_zero = false; + + f++; + /* Flags. */ + while (true) { + switch (*f) { + case '#': + assert(!alt_form); + alt_form = true; + break; + case '-': + assert(!left_justify); + left_justify = true; + break; + case ' ': + assert(!plus_space); + plus_space = true; + break; + case '+': + assert(!plus_plus); + plus_plus = true; + break; + default: goto label_width; + } + f++; + } + /* Width. */ + label_width: + switch (*f) { + case '*': + width = va_arg(ap, int); + f++; + if (width < 0) { + left_justify = true; + width = -width; + } + break; + case '0': + if (first_width_digit) { + pad_zero = true; + } + JEMALLOC_FALLTHROUGH; + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + uintmax_t uwidth; + set_errno(0); + uwidth = malloc_strtoumax(f, (char **)&f, 10); + assert(uwidth != UINTMAX_MAX || get_errno() != + ERANGE); + width = (int)uwidth; + first_width_digit = false; + break; + } default: + break; + } + /* Width/precision separator. */ + if (*f == '.') { + f++; + } else { + goto label_length; + } + /* Precision. */ + switch (*f) { + case '*': + prec = va_arg(ap, int); + f++; + break; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + uintmax_t uprec; + set_errno(0); + uprec = malloc_strtoumax(f, (char **)&f, 10); + assert(uprec != UINTMAX_MAX || get_errno() != + ERANGE); + prec = (int)uprec; + break; + } + default: break; + } + /* Length. */ + label_length: + switch (*f) { + case 'l': + f++; + if (*f == 'l') { + len = 'q'; + f++; + } else { + len = 'l'; + } + break; + case 'q': case 'j': case 't': case 'z': + len = *f; + f++; + break; + default: break; + } + /* Conversion specifier. */ + switch (*f) { + case '%': + /* %% */ + APPEND_C(*f); + f++; + break; + case 'd': case 'i': { + intmax_t val JEMALLOC_CC_SILENCE_INIT(0); + char buf[D2S_BUFSIZE]; + + /* + * Outputting negative, zero-padded numbers + * would require a nontrivial rework of the + * interaction between the width and padding + * (since 0 padding goes between the '-' and the + * number, while ' ' padding goes either before + * the - or after the number. Since we + * currently don't ever need 0-padded negative + * numbers, just don't bother supporting it. + */ + assert(!pad_zero); + + GET_ARG_NUMERIC(val, len); + s = d2s(val, (plus_plus ? '+' : (plus_space ? + ' ' : '-')), buf, &slen); + APPEND_PADDED_S(s, slen, width, left_justify); + f++; + break; + } case 'o': { + uintmax_t val JEMALLOC_CC_SILENCE_INIT(0); + char buf[O2S_BUFSIZE]; + + GET_ARG_NUMERIC(val, len | 0x80); + s = o2s(val, alt_form, buf, &slen); + APPEND_PADDED_S(s, slen, width, left_justify); + f++; + break; + } case 'u': { + uintmax_t val JEMALLOC_CC_SILENCE_INIT(0); + char buf[U2S_BUFSIZE]; + + GET_ARG_NUMERIC(val, len | 0x80); + s = u2s(val, 10, false, buf, &slen); + APPEND_PADDED_S(s, slen, width, left_justify); + f++; + break; + } case 'x': case 'X': { + uintmax_t val JEMALLOC_CC_SILENCE_INIT(0); + char buf[X2S_BUFSIZE]; + + GET_ARG_NUMERIC(val, len | 0x80); + s = x2s(val, alt_form, *f == 'X', buf, &slen); + APPEND_PADDED_S(s, slen, width, left_justify); + f++; + break; + } case 'c': { + unsigned char val; + char buf[2]; + + assert(len == '?' || len == 'l'); + assert_not_implemented(len != 'l'); + val = va_arg(ap, int); + buf[0] = val; + buf[1] = '\0'; + APPEND_PADDED_S(buf, 1, width, left_justify); + f++; + break; + } case 's': + assert(len == '?' || len == 'l'); + assert_not_implemented(len != 'l'); + s = va_arg(ap, char *); + slen = (prec < 0) ? strlen(s) : (size_t)prec; + APPEND_PADDED_S(s, slen, width, left_justify); + f++; + break; + case 'p': { + uintmax_t val; + char buf[X2S_BUFSIZE]; + + GET_ARG_NUMERIC(val, 'p'); + s = x2s(val, true, false, buf, &slen); + APPEND_PADDED_S(s, slen, width, left_justify); + f++; + break; + } default: not_reached(); + } + break; + } default: { + APPEND_C(*f); + f++; + break; + }} + } + label_out: + if (i < size) { + str[i] = '\0'; + } else { + str[size - 1] = '\0'; + } + +#undef APPEND_C +#undef APPEND_S +#undef APPEND_PADDED_S +#undef GET_ARG_NUMERIC + return i; +} + +JEMALLOC_FORMAT_PRINTF(3, 4) +size_t +malloc_snprintf(char *str, size_t size, const char *format, ...) { + size_t ret; + va_list ap; + + va_start(ap, format); + ret = malloc_vsnprintf(str, size, format, ap); + va_end(ap); + + return ret; +} + +void +malloc_vcprintf(write_cb_t *write_cb, void *cbopaque, const char *format, + va_list ap) { + char buf[MALLOC_PRINTF_BUFSIZE]; + + if (write_cb == NULL) { + /* + * The caller did not provide an alternate write_cb callback + * function, so use the default one. malloc_write() is an + * inline function, so use malloc_message() directly here. + */ + write_cb = (je_malloc_message != NULL) ? je_malloc_message : + wrtmessage; + } + + malloc_vsnprintf(buf, sizeof(buf), format, ap); + write_cb(cbopaque, buf); +} + +/* + * Print to a callback function in such a way as to (hopefully) avoid memory + * allocation. + */ +JEMALLOC_FORMAT_PRINTF(3, 4) +void +malloc_cprintf(write_cb_t *write_cb, void *cbopaque, const char *format, ...) { + va_list ap; + + va_start(ap, format); + malloc_vcprintf(write_cb, cbopaque, format, ap); + va_end(ap); +} + +/* Print to stderr in such a way as to avoid memory allocation. */ +JEMALLOC_FORMAT_PRINTF(1, 2) +void +malloc_printf(const char *format, ...) { + va_list ap; + + va_start(ap, format); + malloc_vcprintf(NULL, NULL, format, ap); + va_end(ap); +} + +/* + * Restore normal assertion macros, in order to make it possible to compile all + * C files as a single concatenation. + */ +#undef assert +#undef not_reached +#undef not_implemented +#undef assert_not_implemented +#include "jemalloc/internal/assert.h" diff --git a/BeefRT/JEMalloc/src/mutex.c b/BeefRT/JEMalloc/src/mutex.c new file mode 100644 index 00000000..0b3547a8 --- /dev/null +++ b/BeefRT/JEMalloc/src/mutex.c @@ -0,0 +1,228 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/spin.h" + +#ifndef _CRT_SPINCOUNT +#define _CRT_SPINCOUNT 4000 +#endif + +/* + * Based on benchmark results, a fixed spin with this amount of retries works + * well for our critical sections. + */ +int64_t opt_mutex_max_spin = 600; + +/******************************************************************************/ +/* Data. */ + +#ifdef JEMALLOC_LAZY_LOCK +bool isthreaded = false; +#endif +#ifdef JEMALLOC_MUTEX_INIT_CB +static bool postpone_init = true; +static malloc_mutex_t *postponed_mutexes = NULL; +#endif + +/******************************************************************************/ +/* + * We intercept pthread_create() calls in order to toggle isthreaded if the + * process goes multi-threaded. + */ + +#if defined(JEMALLOC_LAZY_LOCK) && !defined(_WIN32) +JEMALLOC_EXPORT int +pthread_create(pthread_t *__restrict thread, + const pthread_attr_t *__restrict attr, void *(*start_routine)(void *), + void *__restrict arg) { + return pthread_create_wrapper(thread, attr, start_routine, arg); +} +#endif + +/******************************************************************************/ + +#ifdef JEMALLOC_MUTEX_INIT_CB +JEMALLOC_EXPORT int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex, + void *(calloc_cb)(size_t, size_t)); +#endif + +void +malloc_mutex_lock_slow(malloc_mutex_t *mutex) { + mutex_prof_data_t *data = &mutex->prof_data; + nstime_t before; + + if (ncpus == 1) { + goto label_spin_done; + } + + int cnt = 0; + do { + spin_cpu_spinwait(); + if (!atomic_load_b(&mutex->locked, ATOMIC_RELAXED) + && !malloc_mutex_trylock_final(mutex)) { + data->n_spin_acquired++; + return; + } + } while (cnt++ < opt_mutex_max_spin || opt_mutex_max_spin == -1); + + if (!config_stats) { + /* Only spin is useful when stats is off. */ + malloc_mutex_lock_final(mutex); + return; + } +label_spin_done: + nstime_init_update(&before); + /* Copy before to after to avoid clock skews. */ + nstime_t after; + nstime_copy(&after, &before); + uint32_t n_thds = atomic_fetch_add_u32(&data->n_waiting_thds, 1, + ATOMIC_RELAXED) + 1; + /* One last try as above two calls may take quite some cycles. */ + if (!malloc_mutex_trylock_final(mutex)) { + atomic_fetch_sub_u32(&data->n_waiting_thds, 1, ATOMIC_RELAXED); + data->n_spin_acquired++; + return; + } + + /* True slow path. */ + malloc_mutex_lock_final(mutex); + /* Update more slow-path only counters. */ + atomic_fetch_sub_u32(&data->n_waiting_thds, 1, ATOMIC_RELAXED); + nstime_update(&after); + + nstime_t delta; + nstime_copy(&delta, &after); + nstime_subtract(&delta, &before); + + data->n_wait_times++; + nstime_add(&data->tot_wait_time, &delta); + if (nstime_compare(&data->max_wait_time, &delta) < 0) { + nstime_copy(&data->max_wait_time, &delta); + } + if (n_thds > data->max_n_thds) { + data->max_n_thds = n_thds; + } +} + +static void +mutex_prof_data_init(mutex_prof_data_t *data) { + memset(data, 0, sizeof(mutex_prof_data_t)); + nstime_init_zero(&data->max_wait_time); + nstime_init_zero(&data->tot_wait_time); + data->prev_owner = NULL; +} + +void +malloc_mutex_prof_data_reset(tsdn_t *tsdn, malloc_mutex_t *mutex) { + malloc_mutex_assert_owner(tsdn, mutex); + mutex_prof_data_init(&mutex->prof_data); +} + +static int +mutex_addr_comp(const witness_t *witness1, void *mutex1, + const witness_t *witness2, void *mutex2) { + assert(mutex1 != NULL); + assert(mutex2 != NULL); + uintptr_t mu1int = (uintptr_t)mutex1; + uintptr_t mu2int = (uintptr_t)mutex2; + if (mu1int < mu2int) { + return -1; + } else if (mu1int == mu2int) { + return 0; + } else { + return 1; + } +} + +bool +malloc_mutex_init(malloc_mutex_t *mutex, const char *name, + witness_rank_t rank, malloc_mutex_lock_order_t lock_order) { + mutex_prof_data_init(&mutex->prof_data); +#ifdef _WIN32 +# if _WIN32_WINNT >= 0x0600 + InitializeSRWLock(&mutex->lock); +# else + if (!InitializeCriticalSectionAndSpinCount(&mutex->lock, + _CRT_SPINCOUNT)) { + return true; + } +# endif +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) + mutex->lock = OS_UNFAIR_LOCK_INIT; +#elif (defined(JEMALLOC_MUTEX_INIT_CB)) + if (postpone_init) { + mutex->postponed_next = postponed_mutexes; + postponed_mutexes = mutex; + } else { + if (_pthread_mutex_init_calloc_cb(&mutex->lock, + bootstrap_calloc) != 0) { + return true; + } + } +#else + pthread_mutexattr_t attr; + + if (pthread_mutexattr_init(&attr) != 0) { + return true; + } + pthread_mutexattr_settype(&attr, MALLOC_MUTEX_TYPE); + if (pthread_mutex_init(&mutex->lock, &attr) != 0) { + pthread_mutexattr_destroy(&attr); + return true; + } + pthread_mutexattr_destroy(&attr); +#endif + if (config_debug) { + mutex->lock_order = lock_order; + if (lock_order == malloc_mutex_address_ordered) { + witness_init(&mutex->witness, name, rank, + mutex_addr_comp, mutex); + } else { + witness_init(&mutex->witness, name, rank, NULL, NULL); + } + } + return false; +} + +void +malloc_mutex_prefork(tsdn_t *tsdn, malloc_mutex_t *mutex) { + malloc_mutex_lock(tsdn, mutex); +} + +void +malloc_mutex_postfork_parent(tsdn_t *tsdn, malloc_mutex_t *mutex) { + malloc_mutex_unlock(tsdn, mutex); +} + +void +malloc_mutex_postfork_child(tsdn_t *tsdn, malloc_mutex_t *mutex) { +#ifdef JEMALLOC_MUTEX_INIT_CB + malloc_mutex_unlock(tsdn, mutex); +#else + if (malloc_mutex_init(mutex, mutex->witness.name, + mutex->witness.rank, mutex->lock_order)) { + malloc_printf(": Error re-initializing mutex in " + "child\n"); + if (opt_abort) { + abort(); + } + } +#endif +} + +bool +malloc_mutex_boot(void) { +#ifdef JEMALLOC_MUTEX_INIT_CB + postpone_init = false; + while (postponed_mutexes != NULL) { + if (_pthread_mutex_init_calloc_cb(&postponed_mutexes->lock, + bootstrap_calloc) != 0) { + return true; + } + postponed_mutexes = postponed_mutexes->postponed_next; + } +#endif + return false; +} diff --git a/BeefRT/JEMalloc/src/nstime.c b/BeefRT/JEMalloc/src/nstime.c new file mode 100644 index 00000000..a1a53777 --- /dev/null +++ b/BeefRT/JEMalloc/src/nstime.c @@ -0,0 +1,289 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/nstime.h" + +#include "jemalloc/internal/assert.h" + +#define BILLION UINT64_C(1000000000) +#define MILLION UINT64_C(1000000) + +static void +nstime_set_initialized(nstime_t *time) { +#ifdef JEMALLOC_DEBUG + time->magic = NSTIME_MAGIC; +#endif +} + +static void +nstime_assert_initialized(const nstime_t *time) { +#ifdef JEMALLOC_DEBUG + /* + * Some parts (e.g. stats) rely on memset to zero initialize. Treat + * these as valid initialization. + */ + assert(time->magic == NSTIME_MAGIC || + (time->magic == 0 && time->ns == 0)); +#endif +} + +static void +nstime_pair_assert_initialized(const nstime_t *t1, const nstime_t *t2) { + nstime_assert_initialized(t1); + nstime_assert_initialized(t2); +} + +static void +nstime_initialize_operand(nstime_t *time) { + /* + * Operations like nstime_add may have the initial operand being zero + * initialized (covered by the assert below). Full-initialize needed + * before changing it to non-zero. + */ + nstime_assert_initialized(time); + nstime_set_initialized(time); +} + +void +nstime_init(nstime_t *time, uint64_t ns) { + nstime_set_initialized(time); + time->ns = ns; +} + +void +nstime_init2(nstime_t *time, uint64_t sec, uint64_t nsec) { + nstime_set_initialized(time); + time->ns = sec * BILLION + nsec; +} + +uint64_t +nstime_ns(const nstime_t *time) { + nstime_assert_initialized(time); + return time->ns; +} + +uint64_t +nstime_msec(const nstime_t *time) { + nstime_assert_initialized(time); + return time->ns / MILLION; +} + +uint64_t +nstime_sec(const nstime_t *time) { + nstime_assert_initialized(time); + return time->ns / BILLION; +} + +uint64_t +nstime_nsec(const nstime_t *time) { + nstime_assert_initialized(time); + return time->ns % BILLION; +} + +void +nstime_copy(nstime_t *time, const nstime_t *source) { + /* Source is required to be initialized. */ + nstime_assert_initialized(source); + *time = *source; + nstime_assert_initialized(time); +} + +int +nstime_compare(const nstime_t *a, const nstime_t *b) { + nstime_pair_assert_initialized(a, b); + return (a->ns > b->ns) - (a->ns < b->ns); +} + +void +nstime_add(nstime_t *time, const nstime_t *addend) { + nstime_pair_assert_initialized(time, addend); + assert(UINT64_MAX - time->ns >= addend->ns); + + nstime_initialize_operand(time); + time->ns += addend->ns; +} + +void +nstime_iadd(nstime_t *time, uint64_t addend) { + nstime_assert_initialized(time); + assert(UINT64_MAX - time->ns >= addend); + + nstime_initialize_operand(time); + time->ns += addend; +} + +void +nstime_subtract(nstime_t *time, const nstime_t *subtrahend) { + nstime_pair_assert_initialized(time, subtrahend); + assert(nstime_compare(time, subtrahend) >= 0); + + /* No initialize operand -- subtraction must be initialized. */ + time->ns -= subtrahend->ns; +} + +void +nstime_isubtract(nstime_t *time, uint64_t subtrahend) { + nstime_assert_initialized(time); + assert(time->ns >= subtrahend); + + /* No initialize operand -- subtraction must be initialized. */ + time->ns -= subtrahend; +} + +void +nstime_imultiply(nstime_t *time, uint64_t multiplier) { + nstime_assert_initialized(time); + assert((((time->ns | multiplier) & (UINT64_MAX << (sizeof(uint64_t) << + 2))) == 0) || ((time->ns * multiplier) / multiplier == time->ns)); + + nstime_initialize_operand(time); + time->ns *= multiplier; +} + +void +nstime_idivide(nstime_t *time, uint64_t divisor) { + nstime_assert_initialized(time); + assert(divisor != 0); + + nstime_initialize_operand(time); + time->ns /= divisor; +} + +uint64_t +nstime_divide(const nstime_t *time, const nstime_t *divisor) { + nstime_pair_assert_initialized(time, divisor); + assert(divisor->ns != 0); + + /* No initialize operand -- *time itself remains unchanged. */ + return time->ns / divisor->ns; +} + +/* Returns time since *past, w/o updating *past. */ +uint64_t +nstime_ns_since(const nstime_t *past) { + nstime_assert_initialized(past); + + nstime_t now; + nstime_copy(&now, past); + nstime_update(&now); + + assert(nstime_compare(&now, past) >= 0); + return now.ns - past->ns; +} + +#ifdef _WIN32 +# define NSTIME_MONOTONIC true +static void +nstime_get(nstime_t *time) { + FILETIME ft; + uint64_t ticks_100ns; + + GetSystemTimeAsFileTime(&ft); + ticks_100ns = (((uint64_t)ft.dwHighDateTime) << 32) | ft.dwLowDateTime; + + nstime_init(time, ticks_100ns * 100); +} +#elif defined(JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE) +# define NSTIME_MONOTONIC true +static void +nstime_get(nstime_t *time) { + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC_COARSE, &ts); + nstime_init2(time, ts.tv_sec, ts.tv_nsec); +} +#elif defined(JEMALLOC_HAVE_CLOCK_MONOTONIC) +# define NSTIME_MONOTONIC true +static void +nstime_get(nstime_t *time) { + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + nstime_init2(time, ts.tv_sec, ts.tv_nsec); +} +#elif defined(JEMALLOC_HAVE_MACH_ABSOLUTE_TIME) +# define NSTIME_MONOTONIC true +static void +nstime_get(nstime_t *time) { + nstime_init(time, mach_absolute_time()); +} +#else +# define NSTIME_MONOTONIC false +static void +nstime_get(nstime_t *time) { + struct timeval tv; + + gettimeofday(&tv, NULL); + nstime_init2(time, tv.tv_sec, tv.tv_usec * 1000); +} +#endif + +static bool +nstime_monotonic_impl(void) { + return NSTIME_MONOTONIC; +#undef NSTIME_MONOTONIC +} +nstime_monotonic_t *JET_MUTABLE nstime_monotonic = nstime_monotonic_impl; + +prof_time_res_t opt_prof_time_res = + prof_time_res_default; + +const char *prof_time_res_mode_names[] = { + "default", + "high", +}; + + +static void +nstime_get_realtime(nstime_t *time) { +#if defined(JEMALLOC_HAVE_CLOCK_REALTIME) && !defined(_WIN32) + struct timespec ts; + + clock_gettime(CLOCK_REALTIME, &ts); + nstime_init2(time, ts.tv_sec, ts.tv_nsec); +#else + unreachable(); +#endif +} + +static void +nstime_prof_update_impl(nstime_t *time) { + nstime_t old_time; + + nstime_copy(&old_time, time); + + if (opt_prof_time_res == prof_time_res_high) { + nstime_get_realtime(time); + } else { + nstime_get(time); + } +} +nstime_prof_update_t *JET_MUTABLE nstime_prof_update = nstime_prof_update_impl; + +static void +nstime_update_impl(nstime_t *time) { + nstime_t old_time; + + nstime_copy(&old_time, time); + nstime_get(time); + + /* Handle non-monotonic clocks. */ + if (unlikely(nstime_compare(&old_time, time) > 0)) { + nstime_copy(time, &old_time); + } +} +nstime_update_t *JET_MUTABLE nstime_update = nstime_update_impl; + +void +nstime_init_update(nstime_t *time) { + nstime_init_zero(time); + nstime_update(time); +} + +void +nstime_prof_init_update(nstime_t *time) { + nstime_init_zero(time); + nstime_prof_update(time); +} + + diff --git a/BeefRT/JEMalloc/src/pa.c b/BeefRT/JEMalloc/src/pa.c new file mode 100644 index 00000000..eb7e4620 --- /dev/null +++ b/BeefRT/JEMalloc/src/pa.c @@ -0,0 +1,277 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/hpa.h" + +static void +pa_nactive_add(pa_shard_t *shard, size_t add_pages) { + atomic_fetch_add_zu(&shard->nactive, add_pages, ATOMIC_RELAXED); +} + +static void +pa_nactive_sub(pa_shard_t *shard, size_t sub_pages) { + assert(atomic_load_zu(&shard->nactive, ATOMIC_RELAXED) >= sub_pages); + atomic_fetch_sub_zu(&shard->nactive, sub_pages, ATOMIC_RELAXED); +} + +bool +pa_central_init(pa_central_t *central, base_t *base, bool hpa, + hpa_hooks_t *hpa_hooks) { + bool err; + if (hpa) { + err = hpa_central_init(¢ral->hpa, base, hpa_hooks); + if (err) { + return true; + } + } + return false; +} + +bool +pa_shard_init(tsdn_t *tsdn, pa_shard_t *shard, pa_central_t *central, + emap_t *emap, base_t *base, unsigned ind, pa_shard_stats_t *stats, + malloc_mutex_t *stats_mtx, nstime_t *cur_time, + size_t pac_oversize_threshold, ssize_t dirty_decay_ms, + ssize_t muzzy_decay_ms) { + /* This will change eventually, but for now it should hold. */ + assert(base_ind_get(base) == ind); + if (edata_cache_init(&shard->edata_cache, base)) { + return true; + } + + if (pac_init(tsdn, &shard->pac, base, emap, &shard->edata_cache, + cur_time, pac_oversize_threshold, dirty_decay_ms, muzzy_decay_ms, + &stats->pac_stats, stats_mtx)) { + return true; + } + + shard->ind = ind; + + shard->ever_used_hpa = false; + atomic_store_b(&shard->use_hpa, false, ATOMIC_RELAXED); + + atomic_store_zu(&shard->nactive, 0, ATOMIC_RELAXED); + + shard->stats_mtx = stats_mtx; + shard->stats = stats; + memset(shard->stats, 0, sizeof(*shard->stats)); + + shard->central = central; + shard->emap = emap; + shard->base = base; + + return false; +} + +bool +pa_shard_enable_hpa(tsdn_t *tsdn, pa_shard_t *shard, + const hpa_shard_opts_t *hpa_opts, const sec_opts_t *hpa_sec_opts) { + if (hpa_shard_init(&shard->hpa_shard, &shard->central->hpa, shard->emap, + shard->base, &shard->edata_cache, shard->ind, hpa_opts)) { + return true; + } + if (sec_init(tsdn, &shard->hpa_sec, shard->base, &shard->hpa_shard.pai, + hpa_sec_opts)) { + return true; + } + shard->ever_used_hpa = true; + atomic_store_b(&shard->use_hpa, true, ATOMIC_RELAXED); + + return false; +} + +void +pa_shard_disable_hpa(tsdn_t *tsdn, pa_shard_t *shard) { + atomic_store_b(&shard->use_hpa, false, ATOMIC_RELAXED); + if (shard->ever_used_hpa) { + sec_disable(tsdn, &shard->hpa_sec); + hpa_shard_disable(tsdn, &shard->hpa_shard); + } +} + +void +pa_shard_reset(tsdn_t *tsdn, pa_shard_t *shard) { + atomic_store_zu(&shard->nactive, 0, ATOMIC_RELAXED); + if (shard->ever_used_hpa) { + sec_flush(tsdn, &shard->hpa_sec); + } +} + +static bool +pa_shard_uses_hpa(pa_shard_t *shard) { + return atomic_load_b(&shard->use_hpa, ATOMIC_RELAXED); +} + +void +pa_shard_destroy(tsdn_t *tsdn, pa_shard_t *shard) { + pac_destroy(tsdn, &shard->pac); + if (shard->ever_used_hpa) { + sec_flush(tsdn, &shard->hpa_sec); + hpa_shard_disable(tsdn, &shard->hpa_shard); + } +} + +static pai_t * +pa_get_pai(pa_shard_t *shard, edata_t *edata) { + return (edata_pai_get(edata) == EXTENT_PAI_PAC + ? &shard->pac.pai : &shard->hpa_sec.pai); +} + +edata_t * +pa_alloc(tsdn_t *tsdn, pa_shard_t *shard, size_t size, size_t alignment, + bool slab, szind_t szind, bool zero, bool guarded, + bool *deferred_work_generated) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + assert(!guarded || alignment <= PAGE); + + edata_t *edata = NULL; + if (!guarded && pa_shard_uses_hpa(shard)) { + edata = pai_alloc(tsdn, &shard->hpa_sec.pai, size, alignment, + zero, /* guarded */ false, slab, deferred_work_generated); + } + /* + * Fall back to the PAC if the HPA is off or couldn't serve the given + * allocation request. + */ + if (edata == NULL) { + edata = pai_alloc(tsdn, &shard->pac.pai, size, alignment, zero, + guarded, slab, deferred_work_generated); + } + if (edata != NULL) { + assert(edata_size_get(edata) == size); + pa_nactive_add(shard, size >> LG_PAGE); + emap_remap(tsdn, shard->emap, edata, szind, slab); + edata_szind_set(edata, szind); + edata_slab_set(edata, slab); + if (slab && (size > 2 * PAGE)) { + emap_register_interior(tsdn, shard->emap, edata, szind); + } + assert(edata_arena_ind_get(edata) == shard->ind); + } + return edata; +} + +bool +pa_expand(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, size_t old_size, + size_t new_size, szind_t szind, bool zero, bool *deferred_work_generated) { + assert(new_size > old_size); + assert(edata_size_get(edata) == old_size); + assert((new_size & PAGE_MASK) == 0); + if (edata_guarded_get(edata)) { + return true; + } + size_t expand_amount = new_size - old_size; + + pai_t *pai = pa_get_pai(shard, edata); + + bool error = pai_expand(tsdn, pai, edata, old_size, new_size, zero, + deferred_work_generated); + if (error) { + return true; + } + + pa_nactive_add(shard, expand_amount >> LG_PAGE); + edata_szind_set(edata, szind); + emap_remap(tsdn, shard->emap, edata, szind, /* slab */ false); + return false; +} + +bool +pa_shrink(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, size_t old_size, + size_t new_size, szind_t szind, bool *deferred_work_generated) { + assert(new_size < old_size); + assert(edata_size_get(edata) == old_size); + assert((new_size & PAGE_MASK) == 0); + if (edata_guarded_get(edata)) { + return true; + } + size_t shrink_amount = old_size - new_size; + + pai_t *pai = pa_get_pai(shard, edata); + bool error = pai_shrink(tsdn, pai, edata, old_size, new_size, + deferred_work_generated); + if (error) { + return true; + } + pa_nactive_sub(shard, shrink_amount >> LG_PAGE); + + edata_szind_set(edata, szind); + emap_remap(tsdn, shard->emap, edata, szind, /* slab */ false); + return false; +} + +void +pa_dalloc(tsdn_t *tsdn, pa_shard_t *shard, edata_t *edata, + bool *deferred_work_generated) { + emap_remap(tsdn, shard->emap, edata, SC_NSIZES, /* slab */ false); + if (edata_slab_get(edata)) { + emap_deregister_interior(tsdn, shard->emap, edata); + /* + * The slab state of the extent isn't cleared. It may be used + * by the pai implementation, e.g. to make caching decisions. + */ + } + edata_addr_set(edata, edata_base_get(edata)); + edata_szind_set(edata, SC_NSIZES); + pa_nactive_sub(shard, edata_size_get(edata) >> LG_PAGE); + pai_t *pai = pa_get_pai(shard, edata); + pai_dalloc(tsdn, pai, edata, deferred_work_generated); +} + +bool +pa_shard_retain_grow_limit_get_set(tsdn_t *tsdn, pa_shard_t *shard, + size_t *old_limit, size_t *new_limit) { + return pac_retain_grow_limit_get_set(tsdn, &shard->pac, old_limit, + new_limit); +} + +bool +pa_decay_ms_set(tsdn_t *tsdn, pa_shard_t *shard, extent_state_t state, + ssize_t decay_ms, pac_purge_eagerness_t eagerness) { + return pac_decay_ms_set(tsdn, &shard->pac, state, decay_ms, eagerness); +} + +ssize_t +pa_decay_ms_get(pa_shard_t *shard, extent_state_t state) { + return pac_decay_ms_get(&shard->pac, state); +} + +void +pa_shard_set_deferral_allowed(tsdn_t *tsdn, pa_shard_t *shard, + bool deferral_allowed) { + if (pa_shard_uses_hpa(shard)) { + hpa_shard_set_deferral_allowed(tsdn, &shard->hpa_shard, + deferral_allowed); + } +} + +void +pa_shard_do_deferred_work(tsdn_t *tsdn, pa_shard_t *shard) { + if (pa_shard_uses_hpa(shard)) { + hpa_shard_do_deferred_work(tsdn, &shard->hpa_shard); + } +} + +/* + * Get time until next deferred work ought to happen. If there are multiple + * things that have been deferred, this function calculates the time until + * the soonest of those things. + */ +uint64_t +pa_shard_time_until_deferred_work(tsdn_t *tsdn, pa_shard_t *shard) { + uint64_t time = pai_time_until_deferred_work(tsdn, &shard->pac.pai); + if (time == BACKGROUND_THREAD_DEFERRED_MIN) { + return time; + } + + if (pa_shard_uses_hpa(shard)) { + uint64_t hpa = + pai_time_until_deferred_work(tsdn, &shard->hpa_shard.pai); + if (hpa < time) { + time = hpa; + } + } + return time; +} diff --git a/BeefRT/JEMalloc/src/pa_extra.c b/BeefRT/JEMalloc/src/pa_extra.c new file mode 100644 index 00000000..0f488be6 --- /dev/null +++ b/BeefRT/JEMalloc/src/pa_extra.c @@ -0,0 +1,191 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +/* + * This file is logically part of the PA module. While pa.c contains the core + * allocator functionality, this file contains boring integration functionality; + * things like the pre- and post- fork handlers, and stats merging for CTL + * refreshes. + */ + +void +pa_shard_prefork0(tsdn_t *tsdn, pa_shard_t *shard) { + malloc_mutex_prefork(tsdn, &shard->pac.decay_dirty.mtx); + malloc_mutex_prefork(tsdn, &shard->pac.decay_muzzy.mtx); +} + +void +pa_shard_prefork2(tsdn_t *tsdn, pa_shard_t *shard) { + if (shard->ever_used_hpa) { + sec_prefork2(tsdn, &shard->hpa_sec); + } +} + +void +pa_shard_prefork3(tsdn_t *tsdn, pa_shard_t *shard) { + malloc_mutex_prefork(tsdn, &shard->pac.grow_mtx); + if (shard->ever_used_hpa) { + hpa_shard_prefork3(tsdn, &shard->hpa_shard); + } +} + +void +pa_shard_prefork4(tsdn_t *tsdn, pa_shard_t *shard) { + ecache_prefork(tsdn, &shard->pac.ecache_dirty); + ecache_prefork(tsdn, &shard->pac.ecache_muzzy); + ecache_prefork(tsdn, &shard->pac.ecache_retained); + if (shard->ever_used_hpa) { + hpa_shard_prefork4(tsdn, &shard->hpa_shard); + } +} + +void +pa_shard_prefork5(tsdn_t *tsdn, pa_shard_t *shard) { + edata_cache_prefork(tsdn, &shard->edata_cache); +} + +void +pa_shard_postfork_parent(tsdn_t *tsdn, pa_shard_t *shard) { + edata_cache_postfork_parent(tsdn, &shard->edata_cache); + ecache_postfork_parent(tsdn, &shard->pac.ecache_dirty); + ecache_postfork_parent(tsdn, &shard->pac.ecache_muzzy); + ecache_postfork_parent(tsdn, &shard->pac.ecache_retained); + malloc_mutex_postfork_parent(tsdn, &shard->pac.grow_mtx); + malloc_mutex_postfork_parent(tsdn, &shard->pac.decay_dirty.mtx); + malloc_mutex_postfork_parent(tsdn, &shard->pac.decay_muzzy.mtx); + if (shard->ever_used_hpa) { + sec_postfork_parent(tsdn, &shard->hpa_sec); + hpa_shard_postfork_parent(tsdn, &shard->hpa_shard); + } +} + +void +pa_shard_postfork_child(tsdn_t *tsdn, pa_shard_t *shard) { + edata_cache_postfork_child(tsdn, &shard->edata_cache); + ecache_postfork_child(tsdn, &shard->pac.ecache_dirty); + ecache_postfork_child(tsdn, &shard->pac.ecache_muzzy); + ecache_postfork_child(tsdn, &shard->pac.ecache_retained); + malloc_mutex_postfork_child(tsdn, &shard->pac.grow_mtx); + malloc_mutex_postfork_child(tsdn, &shard->pac.decay_dirty.mtx); + malloc_mutex_postfork_child(tsdn, &shard->pac.decay_muzzy.mtx); + if (shard->ever_used_hpa) { + sec_postfork_child(tsdn, &shard->hpa_sec); + hpa_shard_postfork_child(tsdn, &shard->hpa_shard); + } +} + +void +pa_shard_basic_stats_merge(pa_shard_t *shard, size_t *nactive, size_t *ndirty, + size_t *nmuzzy) { + *nactive += atomic_load_zu(&shard->nactive, ATOMIC_RELAXED); + *ndirty += ecache_npages_get(&shard->pac.ecache_dirty); + *nmuzzy += ecache_npages_get(&shard->pac.ecache_muzzy); +} + +void +pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard, + pa_shard_stats_t *pa_shard_stats_out, pac_estats_t *estats_out, + hpa_shard_stats_t *hpa_stats_out, sec_stats_t *sec_stats_out, + size_t *resident) { + cassert(config_stats); + + pa_shard_stats_out->pac_stats.retained += + ecache_npages_get(&shard->pac.ecache_retained) << LG_PAGE; + pa_shard_stats_out->edata_avail += atomic_load_zu( + &shard->edata_cache.count, ATOMIC_RELAXED); + + size_t resident_pgs = 0; + resident_pgs += atomic_load_zu(&shard->nactive, ATOMIC_RELAXED); + resident_pgs += ecache_npages_get(&shard->pac.ecache_dirty); + *resident += (resident_pgs << LG_PAGE); + + /* Dirty decay stats */ + locked_inc_u64_unsynchronized( + &pa_shard_stats_out->pac_stats.decay_dirty.npurge, + locked_read_u64(tsdn, LOCKEDINT_MTX(*shard->stats_mtx), + &shard->pac.stats->decay_dirty.npurge)); + locked_inc_u64_unsynchronized( + &pa_shard_stats_out->pac_stats.decay_dirty.nmadvise, + locked_read_u64(tsdn, LOCKEDINT_MTX(*shard->stats_mtx), + &shard->pac.stats->decay_dirty.nmadvise)); + locked_inc_u64_unsynchronized( + &pa_shard_stats_out->pac_stats.decay_dirty.purged, + locked_read_u64(tsdn, LOCKEDINT_MTX(*shard->stats_mtx), + &shard->pac.stats->decay_dirty.purged)); + + /* Muzzy decay stats */ + locked_inc_u64_unsynchronized( + &pa_shard_stats_out->pac_stats.decay_muzzy.npurge, + locked_read_u64(tsdn, LOCKEDINT_MTX(*shard->stats_mtx), + &shard->pac.stats->decay_muzzy.npurge)); + locked_inc_u64_unsynchronized( + &pa_shard_stats_out->pac_stats.decay_muzzy.nmadvise, + locked_read_u64(tsdn, LOCKEDINT_MTX(*shard->stats_mtx), + &shard->pac.stats->decay_muzzy.nmadvise)); + locked_inc_u64_unsynchronized( + &pa_shard_stats_out->pac_stats.decay_muzzy.purged, + locked_read_u64(tsdn, LOCKEDINT_MTX(*shard->stats_mtx), + &shard->pac.stats->decay_muzzy.purged)); + + atomic_load_add_store_zu(&pa_shard_stats_out->pac_stats.abandoned_vm, + atomic_load_zu(&shard->pac.stats->abandoned_vm, ATOMIC_RELAXED)); + + for (pszind_t i = 0; i < SC_NPSIZES; i++) { + size_t dirty, muzzy, retained, dirty_bytes, muzzy_bytes, + retained_bytes; + dirty = ecache_nextents_get(&shard->pac.ecache_dirty, i); + muzzy = ecache_nextents_get(&shard->pac.ecache_muzzy, i); + retained = ecache_nextents_get(&shard->pac.ecache_retained, i); + dirty_bytes = ecache_nbytes_get(&shard->pac.ecache_dirty, i); + muzzy_bytes = ecache_nbytes_get(&shard->pac.ecache_muzzy, i); + retained_bytes = ecache_nbytes_get(&shard->pac.ecache_retained, + i); + + estats_out[i].ndirty = dirty; + estats_out[i].nmuzzy = muzzy; + estats_out[i].nretained = retained; + estats_out[i].dirty_bytes = dirty_bytes; + estats_out[i].muzzy_bytes = muzzy_bytes; + estats_out[i].retained_bytes = retained_bytes; + } + + if (shard->ever_used_hpa) { + hpa_shard_stats_merge(tsdn, &shard->hpa_shard, hpa_stats_out); + sec_stats_merge(tsdn, &shard->hpa_sec, sec_stats_out); + } +} + +static void +pa_shard_mtx_stats_read_single(tsdn_t *tsdn, mutex_prof_data_t *mutex_prof_data, + malloc_mutex_t *mtx, int ind) { + malloc_mutex_lock(tsdn, mtx); + malloc_mutex_prof_read(tsdn, &mutex_prof_data[ind], mtx); + malloc_mutex_unlock(tsdn, mtx); +} + +void +pa_shard_mtx_stats_read(tsdn_t *tsdn, pa_shard_t *shard, + mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes]) { + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->edata_cache.mtx, arena_prof_mutex_extent_avail); + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->pac.ecache_dirty.mtx, arena_prof_mutex_extents_dirty); + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->pac.ecache_muzzy.mtx, arena_prof_mutex_extents_muzzy); + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->pac.ecache_retained.mtx, arena_prof_mutex_extents_retained); + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->pac.decay_dirty.mtx, arena_prof_mutex_decay_dirty); + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->pac.decay_muzzy.mtx, arena_prof_mutex_decay_muzzy); + + if (shard->ever_used_hpa) { + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->hpa_shard.mtx, arena_prof_mutex_hpa_shard); + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->hpa_shard.grow_mtx, + arena_prof_mutex_hpa_shard_grow); + sec_mutex_stats_read(tsdn, &shard->hpa_sec, + &mutex_prof_data[arena_prof_mutex_hpa_sec]); + } +} diff --git a/BeefRT/JEMalloc/src/pac.c b/BeefRT/JEMalloc/src/pac.c new file mode 100644 index 00000000..53e3d823 --- /dev/null +++ b/BeefRT/JEMalloc/src/pac.c @@ -0,0 +1,587 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/pac.h" +#include "jemalloc/internal/san.h" + +static edata_t *pac_alloc_impl(tsdn_t *tsdn, pai_t *self, size_t size, + size_t alignment, bool zero, bool guarded, bool frequent_reuse, + bool *deferred_work_generated); +static bool pac_expand_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated); +static bool pac_shrink_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool *deferred_work_generated); +static void pac_dalloc_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated); +static uint64_t pac_time_until_deferred_work(tsdn_t *tsdn, pai_t *self); + +static inline void +pac_decay_data_get(pac_t *pac, extent_state_t state, + decay_t **r_decay, pac_decay_stats_t **r_decay_stats, ecache_t **r_ecache) { + switch(state) { + case extent_state_dirty: + *r_decay = &pac->decay_dirty; + *r_decay_stats = &pac->stats->decay_dirty; + *r_ecache = &pac->ecache_dirty; + return; + case extent_state_muzzy: + *r_decay = &pac->decay_muzzy; + *r_decay_stats = &pac->stats->decay_muzzy; + *r_ecache = &pac->ecache_muzzy; + return; + default: + unreachable(); + } +} + +bool +pac_init(tsdn_t *tsdn, pac_t *pac, base_t *base, emap_t *emap, + edata_cache_t *edata_cache, nstime_t *cur_time, + size_t pac_oversize_threshold, ssize_t dirty_decay_ms, + ssize_t muzzy_decay_ms, pac_stats_t *pac_stats, malloc_mutex_t *stats_mtx) { + unsigned ind = base_ind_get(base); + /* + * Delay coalescing for dirty extents despite the disruptive effect on + * memory layout for best-fit extent allocation, since cached extents + * are likely to be reused soon after deallocation, and the cost of + * merging/splitting extents is non-trivial. + */ + if (ecache_init(tsdn, &pac->ecache_dirty, extent_state_dirty, ind, + /* delay_coalesce */ true)) { + return true; + } + /* + * Coalesce muzzy extents immediately, because operations on them are in + * the critical path much less often than for dirty extents. + */ + if (ecache_init(tsdn, &pac->ecache_muzzy, extent_state_muzzy, ind, + /* delay_coalesce */ false)) { + return true; + } + /* + * Coalesce retained extents immediately, in part because they will + * never be evicted (and therefore there's no opportunity for delayed + * coalescing), but also because operations on retained extents are not + * in the critical path. + */ + if (ecache_init(tsdn, &pac->ecache_retained, extent_state_retained, + ind, /* delay_coalesce */ false)) { + return true; + } + exp_grow_init(&pac->exp_grow); + if (malloc_mutex_init(&pac->grow_mtx, "extent_grow", + WITNESS_RANK_EXTENT_GROW, malloc_mutex_rank_exclusive)) { + return true; + } + atomic_store_zu(&pac->oversize_threshold, pac_oversize_threshold, + ATOMIC_RELAXED); + if (decay_init(&pac->decay_dirty, cur_time, dirty_decay_ms)) { + return true; + } + if (decay_init(&pac->decay_muzzy, cur_time, muzzy_decay_ms)) { + return true; + } + if (san_bump_alloc_init(&pac->sba)) { + return true; + } + + pac->base = base; + pac->emap = emap; + pac->edata_cache = edata_cache; + pac->stats = pac_stats; + pac->stats_mtx = stats_mtx; + atomic_store_zu(&pac->extent_sn_next, 0, ATOMIC_RELAXED); + + pac->pai.alloc = &pac_alloc_impl; + pac->pai.alloc_batch = &pai_alloc_batch_default; + pac->pai.expand = &pac_expand_impl; + pac->pai.shrink = &pac_shrink_impl; + pac->pai.dalloc = &pac_dalloc_impl; + pac->pai.dalloc_batch = &pai_dalloc_batch_default; + pac->pai.time_until_deferred_work = &pac_time_until_deferred_work; + + return false; +} + +static inline bool +pac_may_have_muzzy(pac_t *pac) { + return pac_decay_ms_get(pac, extent_state_muzzy) != 0; +} + +static edata_t * +pac_alloc_real(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, + size_t alignment, bool zero, bool guarded) { + assert(!guarded || alignment <= PAGE); + + edata_t *edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_dirty, + NULL, size, alignment, zero, guarded); + + if (edata == NULL && pac_may_have_muzzy(pac)) { + edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_muzzy, + NULL, size, alignment, zero, guarded); + } + if (edata == NULL) { + edata = ecache_alloc_grow(tsdn, pac, ehooks, + &pac->ecache_retained, NULL, size, alignment, zero, + guarded); + if (config_stats && edata != NULL) { + atomic_fetch_add_zu(&pac->stats->pac_mapped, size, + ATOMIC_RELAXED); + } + } + + return edata; +} + +static edata_t * +pac_alloc_new_guarded(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, + size_t alignment, bool zero, bool frequent_reuse) { + assert(alignment <= PAGE); + + edata_t *edata; + if (san_bump_enabled() && frequent_reuse) { + edata = san_bump_alloc(tsdn, &pac->sba, pac, ehooks, size, + zero); + } else { + size_t size_with_guards = san_two_side_guarded_sz(size); + /* Alloc a non-guarded extent first.*/ + edata = pac_alloc_real(tsdn, pac, ehooks, size_with_guards, + /* alignment */ PAGE, zero, /* guarded */ false); + if (edata != NULL) { + /* Add guards around it. */ + assert(edata_size_get(edata) == size_with_guards); + san_guard_pages_two_sided(tsdn, ehooks, edata, + pac->emap, true); + } + } + assert(edata == NULL || (edata_guarded_get(edata) && + edata_size_get(edata) == size)); + + return edata; +} + +static edata_t * +pac_alloc_impl(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, + bool zero, bool guarded, bool frequent_reuse, + bool *deferred_work_generated) { + pac_t *pac = (pac_t *)self; + ehooks_t *ehooks = pac_ehooks_get(pac); + + edata_t *edata = NULL; + /* + * The condition is an optimization - not frequently reused guarded + * allocations are never put in the ecache. pac_alloc_real also + * doesn't grow retained for guarded allocations. So pac_alloc_real + * for such allocations would always return NULL. + * */ + if (!guarded || frequent_reuse) { + edata = pac_alloc_real(tsdn, pac, ehooks, size, alignment, + zero, guarded); + } + if (edata == NULL && guarded) { + /* No cached guarded extents; creating a new one. */ + edata = pac_alloc_new_guarded(tsdn, pac, ehooks, size, + alignment, zero, frequent_reuse); + } + + return edata; +} + +static bool +pac_expand_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, + size_t new_size, bool zero, bool *deferred_work_generated) { + pac_t *pac = (pac_t *)self; + ehooks_t *ehooks = pac_ehooks_get(pac); + + size_t mapped_add = 0; + size_t expand_amount = new_size - old_size; + + if (ehooks_merge_will_fail(ehooks)) { + return true; + } + edata_t *trail = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_dirty, + edata, expand_amount, PAGE, zero, /* guarded*/ false); + if (trail == NULL) { + trail = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_muzzy, + edata, expand_amount, PAGE, zero, /* guarded*/ false); + } + if (trail == NULL) { + trail = ecache_alloc_grow(tsdn, pac, ehooks, + &pac->ecache_retained, edata, expand_amount, PAGE, zero, + /* guarded */ false); + mapped_add = expand_amount; + } + if (trail == NULL) { + return true; + } + if (extent_merge_wrapper(tsdn, pac, ehooks, edata, trail)) { + extent_dalloc_wrapper(tsdn, pac, ehooks, trail); + return true; + } + if (config_stats && mapped_add > 0) { + atomic_fetch_add_zu(&pac->stats->pac_mapped, mapped_add, + ATOMIC_RELAXED); + } + return false; +} + +static bool +pac_shrink_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, + size_t new_size, bool *deferred_work_generated) { + pac_t *pac = (pac_t *)self; + ehooks_t *ehooks = pac_ehooks_get(pac); + + size_t shrink_amount = old_size - new_size; + + if (ehooks_split_will_fail(ehooks)) { + return true; + } + + edata_t *trail = extent_split_wrapper(tsdn, pac, ehooks, edata, + new_size, shrink_amount, /* holding_core_locks */ false); + if (trail == NULL) { + return true; + } + ecache_dalloc(tsdn, pac, ehooks, &pac->ecache_dirty, trail); + *deferred_work_generated = true; + return false; +} + +static void +pac_dalloc_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated) { + pac_t *pac = (pac_t *)self; + ehooks_t *ehooks = pac_ehooks_get(pac); + + if (edata_guarded_get(edata)) { + /* + * Because cached guarded extents do exact fit only, large + * guarded extents are restored on dalloc eagerly (otherwise + * they will not be reused efficiently). Slab sizes have a + * limited number of size classes, and tend to cycle faster. + * + * In the case where coalesce is restrained (VirtualFree on + * Windows), guarded extents are also not cached -- otherwise + * during arena destroy / reset, the retained extents would not + * be whole regions (i.e. they are split between regular and + * guarded). + */ + if (!edata_slab_get(edata) || !maps_coalesce) { + assert(edata_size_get(edata) >= SC_LARGE_MINCLASS || + !maps_coalesce); + san_unguard_pages_two_sided(tsdn, ehooks, edata, + pac->emap); + } + } + + ecache_dalloc(tsdn, pac, ehooks, &pac->ecache_dirty, edata); + /* Purging of deallocated pages is deferred */ + *deferred_work_generated = true; +} + +static inline uint64_t +pac_ns_until_purge(tsdn_t *tsdn, decay_t *decay, size_t npages) { + if (malloc_mutex_trylock(tsdn, &decay->mtx)) { + /* Use minimal interval if decay is contended. */ + return BACKGROUND_THREAD_DEFERRED_MIN; + } + uint64_t result = decay_ns_until_purge(decay, npages, + ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD); + + malloc_mutex_unlock(tsdn, &decay->mtx); + return result; +} + +static uint64_t +pac_time_until_deferred_work(tsdn_t *tsdn, pai_t *self) { + uint64_t time; + pac_t *pac = (pac_t *)self; + + time = pac_ns_until_purge(tsdn, + &pac->decay_dirty, + ecache_npages_get(&pac->ecache_dirty)); + if (time == BACKGROUND_THREAD_DEFERRED_MIN) { + return time; + } + + uint64_t muzzy = pac_ns_until_purge(tsdn, + &pac->decay_muzzy, + ecache_npages_get(&pac->ecache_muzzy)); + if (muzzy < time) { + time = muzzy; + } + return time; +} + +bool +pac_retain_grow_limit_get_set(tsdn_t *tsdn, pac_t *pac, size_t *old_limit, + size_t *new_limit) { + pszind_t new_ind JEMALLOC_CC_SILENCE_INIT(0); + if (new_limit != NULL) { + size_t limit = *new_limit; + /* Grow no more than the new limit. */ + if ((new_ind = sz_psz2ind(limit + 1) - 1) >= SC_NPSIZES) { + return true; + } + } + + malloc_mutex_lock(tsdn, &pac->grow_mtx); + if (old_limit != NULL) { + *old_limit = sz_pind2sz(pac->exp_grow.limit); + } + if (new_limit != NULL) { + pac->exp_grow.limit = new_ind; + } + malloc_mutex_unlock(tsdn, &pac->grow_mtx); + + return false; +} + +static size_t +pac_stash_decayed(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache, + size_t npages_limit, size_t npages_decay_max, + edata_list_inactive_t *result) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + ehooks_t *ehooks = pac_ehooks_get(pac); + + /* Stash extents according to npages_limit. */ + size_t nstashed = 0; + while (nstashed < npages_decay_max) { + edata_t *edata = ecache_evict(tsdn, pac, ehooks, ecache, + npages_limit); + if (edata == NULL) { + break; + } + edata_list_inactive_append(result, edata); + nstashed += edata_size_get(edata) >> LG_PAGE; + } + return nstashed; +} + +static size_t +pac_decay_stashed(tsdn_t *tsdn, pac_t *pac, decay_t *decay, + pac_decay_stats_t *decay_stats, ecache_t *ecache, bool fully_decay, + edata_list_inactive_t *decay_extents) { + bool err; + + size_t nmadvise = 0; + size_t nunmapped = 0; + size_t npurged = 0; + + ehooks_t *ehooks = pac_ehooks_get(pac); + + bool try_muzzy = !fully_decay + && pac_decay_ms_get(pac, extent_state_muzzy) != 0; + + for (edata_t *edata = edata_list_inactive_first(decay_extents); edata != + NULL; edata = edata_list_inactive_first(decay_extents)) { + edata_list_inactive_remove(decay_extents, edata); + + size_t size = edata_size_get(edata); + size_t npages = size >> LG_PAGE; + + nmadvise++; + npurged += npages; + + switch (ecache->state) { + case extent_state_active: + not_reached(); + case extent_state_dirty: + if (try_muzzy) { + err = extent_purge_lazy_wrapper(tsdn, ehooks, + edata, /* offset */ 0, size); + if (!err) { + ecache_dalloc(tsdn, pac, ehooks, + &pac->ecache_muzzy, edata); + break; + } + } + JEMALLOC_FALLTHROUGH; + case extent_state_muzzy: + extent_dalloc_wrapper(tsdn, pac, ehooks, edata); + nunmapped += npages; + break; + case extent_state_retained: + default: + not_reached(); + } + } + + if (config_stats) { + LOCKEDINT_MTX_LOCK(tsdn, *pac->stats_mtx); + locked_inc_u64(tsdn, LOCKEDINT_MTX(*pac->stats_mtx), + &decay_stats->npurge, 1); + locked_inc_u64(tsdn, LOCKEDINT_MTX(*pac->stats_mtx), + &decay_stats->nmadvise, nmadvise); + locked_inc_u64(tsdn, LOCKEDINT_MTX(*pac->stats_mtx), + &decay_stats->purged, npurged); + LOCKEDINT_MTX_UNLOCK(tsdn, *pac->stats_mtx); + atomic_fetch_sub_zu(&pac->stats->pac_mapped, + nunmapped << LG_PAGE, ATOMIC_RELAXED); + } + + return npurged; +} + +/* + * npages_limit: Decay at most npages_decay_max pages without violating the + * invariant: (ecache_npages_get(ecache) >= npages_limit). We need an upper + * bound on number of pages in order to prevent unbounded growth (namely in + * stashed), otherwise unbounded new pages could be added to extents during the + * current decay run, so that the purging thread never finishes. + */ +static void +pac_decay_to_limit(tsdn_t *tsdn, pac_t *pac, decay_t *decay, + pac_decay_stats_t *decay_stats, ecache_t *ecache, bool fully_decay, + size_t npages_limit, size_t npages_decay_max) { + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 1); + + if (decay->purging || npages_decay_max == 0) { + return; + } + decay->purging = true; + malloc_mutex_unlock(tsdn, &decay->mtx); + + edata_list_inactive_t decay_extents; + edata_list_inactive_init(&decay_extents); + size_t npurge = pac_stash_decayed(tsdn, pac, ecache, npages_limit, + npages_decay_max, &decay_extents); + if (npurge != 0) { + size_t npurged = pac_decay_stashed(tsdn, pac, decay, + decay_stats, ecache, fully_decay, &decay_extents); + assert(npurged == npurge); + } + + malloc_mutex_lock(tsdn, &decay->mtx); + decay->purging = false; +} + +void +pac_decay_all(tsdn_t *tsdn, pac_t *pac, decay_t *decay, + pac_decay_stats_t *decay_stats, ecache_t *ecache, bool fully_decay) { + malloc_mutex_assert_owner(tsdn, &decay->mtx); + pac_decay_to_limit(tsdn, pac, decay, decay_stats, ecache, fully_decay, + /* npages_limit */ 0, ecache_npages_get(ecache)); +} + +static void +pac_decay_try_purge(tsdn_t *tsdn, pac_t *pac, decay_t *decay, + pac_decay_stats_t *decay_stats, ecache_t *ecache, + size_t current_npages, size_t npages_limit) { + if (current_npages > npages_limit) { + pac_decay_to_limit(tsdn, pac, decay, decay_stats, ecache, + /* fully_decay */ false, npages_limit, + current_npages - npages_limit); + } +} + +bool +pac_maybe_decay_purge(tsdn_t *tsdn, pac_t *pac, decay_t *decay, + pac_decay_stats_t *decay_stats, ecache_t *ecache, + pac_purge_eagerness_t eagerness) { + malloc_mutex_assert_owner(tsdn, &decay->mtx); + + /* Purge all or nothing if the option is disabled. */ + ssize_t decay_ms = decay_ms_read(decay); + if (decay_ms <= 0) { + if (decay_ms == 0) { + pac_decay_to_limit(tsdn, pac, decay, decay_stats, + ecache, /* fully_decay */ false, + /* npages_limit */ 0, ecache_npages_get(ecache)); + } + return false; + } + + /* + * If the deadline has been reached, advance to the current epoch and + * purge to the new limit if necessary. Note that dirty pages created + * during the current epoch are not subject to purge until a future + * epoch, so as a result purging only happens during epoch advances, or + * being triggered by background threads (scheduled event). + */ + nstime_t time; + nstime_init_update(&time); + size_t npages_current = ecache_npages_get(ecache); + bool epoch_advanced = decay_maybe_advance_epoch(decay, &time, + npages_current); + if (eagerness == PAC_PURGE_ALWAYS + || (epoch_advanced && eagerness == PAC_PURGE_ON_EPOCH_ADVANCE)) { + size_t npages_limit = decay_npages_limit_get(decay); + pac_decay_try_purge(tsdn, pac, decay, decay_stats, ecache, + npages_current, npages_limit); + } + + return epoch_advanced; +} + +bool +pac_decay_ms_set(tsdn_t *tsdn, pac_t *pac, extent_state_t state, + ssize_t decay_ms, pac_purge_eagerness_t eagerness) { + decay_t *decay; + pac_decay_stats_t *decay_stats; + ecache_t *ecache; + pac_decay_data_get(pac, state, &decay, &decay_stats, &ecache); + + if (!decay_ms_valid(decay_ms)) { + return true; + } + + malloc_mutex_lock(tsdn, &decay->mtx); + /* + * Restart decay backlog from scratch, which may cause many dirty pages + * to be immediately purged. It would conceptually be possible to map + * the old backlog onto the new backlog, but there is no justification + * for such complexity since decay_ms changes are intended to be + * infrequent, either between the {-1, 0, >0} states, or a one-time + * arbitrary change during initial arena configuration. + */ + nstime_t cur_time; + nstime_init_update(&cur_time); + decay_reinit(decay, &cur_time, decay_ms); + pac_maybe_decay_purge(tsdn, pac, decay, decay_stats, ecache, eagerness); + malloc_mutex_unlock(tsdn, &decay->mtx); + + return false; +} + +ssize_t +pac_decay_ms_get(pac_t *pac, extent_state_t state) { + decay_t *decay; + pac_decay_stats_t *decay_stats; + ecache_t *ecache; + pac_decay_data_get(pac, state, &decay, &decay_stats, &ecache); + return decay_ms_read(decay); +} + +void +pac_reset(tsdn_t *tsdn, pac_t *pac) { + /* + * No-op for now; purging is still done at the arena-level. It should + * get moved in here, though. + */ + (void)tsdn; + (void)pac; +} + +void +pac_destroy(tsdn_t *tsdn, pac_t *pac) { + assert(ecache_npages_get(&pac->ecache_dirty) == 0); + assert(ecache_npages_get(&pac->ecache_muzzy) == 0); + /* + * Iterate over the retained extents and destroy them. This gives the + * extent allocator underlying the extent hooks an opportunity to unmap + * all retained memory without having to keep its own metadata + * structures. In practice, virtual memory for dss-allocated extents is + * leaked here, so best practice is to avoid dss for arenas to be + * destroyed, or provide custom extent hooks that track retained + * dss-based extents for later reuse. + */ + ehooks_t *ehooks = pac_ehooks_get(pac); + edata_t *edata; + while ((edata = ecache_evict(tsdn, pac, ehooks, + &pac->ecache_retained, 0)) != NULL) { + extent_destroy_wrapper(tsdn, pac, ehooks, edata); + } +} diff --git a/BeefRT/JEMalloc/src/pages.c b/BeefRT/JEMalloc/src/pages.c new file mode 100644 index 00000000..8c83a7de --- /dev/null +++ b/BeefRT/JEMalloc/src/pages.c @@ -0,0 +1,824 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +#include "jemalloc/internal/pages.h" + +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/malloc_io.h" + +#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT +#include +#ifdef __FreeBSD__ +#include +#endif +#endif +#ifdef __NetBSD__ +#include /* ilog2 */ +#endif +#ifdef JEMALLOC_HAVE_VM_MAKE_TAG +#define PAGES_FD_TAG VM_MAKE_TAG(101U) +#else +#define PAGES_FD_TAG -1 +#endif + +/******************************************************************************/ +/* Data. */ + +/* Actual operating system page size, detected during bootstrap, <= PAGE. */ +static size_t os_page; + +#ifndef _WIN32 +# define PAGES_PROT_COMMIT (PROT_READ | PROT_WRITE) +# define PAGES_PROT_DECOMMIT (PROT_NONE) +static int mmap_flags; +#endif +static bool os_overcommits; + +const char *thp_mode_names[] = { + "default", + "always", + "never", + "not supported" +}; +thp_mode_t opt_thp = THP_MODE_DEFAULT; +thp_mode_t init_system_thp_mode; + +/* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */ +static bool pages_can_purge_lazy_runtime = true; + +#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS +static int madvise_dont_need_zeros_is_faulty = -1; +/** + * Check that MADV_DONTNEED will actually zero pages on subsequent access. + * + * Since qemu does not support this, yet [1], and you can get very tricky + * assert if you will run program with jemalloc in use under qemu: + * + * : ../contrib/jemalloc/src/extent.c:1195: Failed assertion: "p[i] == 0" + * + * [1]: https://patchwork.kernel.org/patch/10576637/ + */ +static int madvise_MADV_DONTNEED_zeroes_pages() +{ + int works = -1; + size_t size = PAGE; + + void * addr = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + + if (addr == MAP_FAILED) { + malloc_write(": Cannot allocate memory for " + "MADV_DONTNEED check\n"); + if (opt_abort) { + abort(); + } + } + + memset(addr, 'A', size); + if (madvise(addr, size, MADV_DONTNEED) == 0) { + works = memchr(addr, 'A', size) == NULL; + } else { + /* + * If madvise() does not support MADV_DONTNEED, then we can + * call it anyway, and use it's return code. + */ + works = 1; + } + + if (munmap(addr, size) != 0) { + malloc_write(": Cannot deallocate memory for " + "MADV_DONTNEED check\n"); + if (opt_abort) { + abort(); + } + } + + return works; +} +#endif + +/******************************************************************************/ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + +static void os_pages_unmap(void *addr, size_t size); + +/******************************************************************************/ + +static void * +os_pages_map(void *addr, size_t size, size_t alignment, bool *commit) { + assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr); + assert(ALIGNMENT_CEILING(size, os_page) == size); + assert(size != 0); + + if (os_overcommits) { + *commit = true; + } + + void *ret; +#ifdef _WIN32 + /* + * If VirtualAlloc can't allocate at the given address when one is + * given, it fails and returns NULL. + */ + ret = VirtualAlloc(addr, size, MEM_RESERVE | (*commit ? MEM_COMMIT : 0), + PAGE_READWRITE); +#else + /* + * We don't use MAP_FIXED here, because it can cause the *replacement* + * of existing mappings, and we only want to create new mappings. + */ + { +#ifdef __NetBSD__ + /* + * On NetBSD PAGE for a platform is defined to the + * maximum page size of all machine architectures + * for that platform, so that we can use the same + * binaries across all machine architectures. + */ + if (alignment > os_page || PAGE > os_page) { + unsigned int a = ilog2(MAX(alignment, PAGE)); + mmap_flags |= MAP_ALIGNED(a); + } +#endif + int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT; + + ret = mmap(addr, size, prot, mmap_flags, PAGES_FD_TAG, 0); + } + assert(ret != NULL); + + if (ret == MAP_FAILED) { + ret = NULL; + } else if (addr != NULL && ret != addr) { + /* + * We succeeded in mapping memory, but not in the right place. + */ + os_pages_unmap(ret, size); + ret = NULL; + } +#endif + assert(ret == NULL || (addr == NULL && ret != addr) || (addr != NULL && + ret == addr)); + return ret; +} + +static void * +os_pages_trim(void *addr, size_t alloc_size, size_t leadsize, size_t size, + bool *commit) { + void *ret = (void *)((uintptr_t)addr + leadsize); + + assert(alloc_size >= leadsize + size); +#ifdef _WIN32 + os_pages_unmap(addr, alloc_size); + void *new_addr = os_pages_map(ret, size, PAGE, commit); + if (new_addr == ret) { + return ret; + } + if (new_addr != NULL) { + os_pages_unmap(new_addr, size); + } + return NULL; +#else + size_t trailsize = alloc_size - leadsize - size; + + if (leadsize != 0) { + os_pages_unmap(addr, leadsize); + } + if (trailsize != 0) { + os_pages_unmap((void *)((uintptr_t)ret + size), trailsize); + } + return ret; +#endif +} + +static void +os_pages_unmap(void *addr, size_t size) { + assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr); + assert(ALIGNMENT_CEILING(size, os_page) == size); + +#ifdef _WIN32 + if (VirtualFree(addr, 0, MEM_RELEASE) == 0) +#else + if (munmap(addr, size) == -1) +#endif + { + char buf[BUFERROR_BUF]; + + buferror(get_errno(), buf, sizeof(buf)); + malloc_printf(": Error in " +#ifdef _WIN32 + "VirtualFree" +#else + "munmap" +#endif + "(): %s\n", buf); + if (opt_abort) { + abort(); + } + } +} + +static void * +pages_map_slow(size_t size, size_t alignment, bool *commit) { + size_t alloc_size = size + alignment - os_page; + /* Beware size_t wrap-around. */ + if (alloc_size < size) { + return NULL; + } + + void *ret; + do { + void *pages = os_pages_map(NULL, alloc_size, alignment, commit); + if (pages == NULL) { + return NULL; + } + size_t leadsize = ALIGNMENT_CEILING((uintptr_t)pages, alignment) + - (uintptr_t)pages; + ret = os_pages_trim(pages, alloc_size, leadsize, size, commit); + } while (ret == NULL); + + assert(ret != NULL); + assert(PAGE_ADDR2BASE(ret) == ret); + return ret; +} + +void * +pages_map(void *addr, size_t size, size_t alignment, bool *commit) { + assert(alignment >= PAGE); + assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr); + +#if defined(__FreeBSD__) && defined(MAP_EXCL) + /* + * FreeBSD has mechanisms both to mmap at specific address without + * touching existing mappings, and to mmap with specific alignment. + */ + { + if (os_overcommits) { + *commit = true; + } + + int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT; + int flags = mmap_flags; + + if (addr != NULL) { + flags |= MAP_FIXED | MAP_EXCL; + } else { + unsigned alignment_bits = ffs_zu(alignment); + assert(alignment_bits > 0); + flags |= MAP_ALIGNED(alignment_bits); + } + + void *ret = mmap(addr, size, prot, flags, -1, 0); + if (ret == MAP_FAILED) { + ret = NULL; + } + + return ret; + } +#endif + /* + * Ideally, there would be a way to specify alignment to mmap() (like + * NetBSD has), but in the absence of such a feature, we have to work + * hard to efficiently create aligned mappings. The reliable, but + * slow method is to create a mapping that is over-sized, then trim the + * excess. However, that always results in one or two calls to + * os_pages_unmap(), and it can leave holes in the process's virtual + * memory map if memory grows downward. + * + * Optimistically try mapping precisely the right amount before falling + * back to the slow method, with the expectation that the optimistic + * approach works most of the time. + */ + + void *ret = os_pages_map(addr, size, os_page, commit); + if (ret == NULL || ret == addr) { + return ret; + } + assert(addr == NULL); + if (ALIGNMENT_ADDR2OFFSET(ret, alignment) != 0) { + os_pages_unmap(ret, size); + return pages_map_slow(size, alignment, commit); + } + + assert(PAGE_ADDR2BASE(ret) == ret); + return ret; +} + +void +pages_unmap(void *addr, size_t size) { + assert(PAGE_ADDR2BASE(addr) == addr); + assert(PAGE_CEILING(size) == size); + + os_pages_unmap(addr, size); +} + +static bool +os_pages_commit(void *addr, size_t size, bool commit) { + assert(PAGE_ADDR2BASE(addr) == addr); + assert(PAGE_CEILING(size) == size); + +#ifdef _WIN32 + return (commit ? (addr != VirtualAlloc(addr, size, MEM_COMMIT, + PAGE_READWRITE)) : (!VirtualFree(addr, size, MEM_DECOMMIT))); +#else + { + int prot = commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT; + void *result = mmap(addr, size, prot, mmap_flags | MAP_FIXED, + PAGES_FD_TAG, 0); + if (result == MAP_FAILED) { + return true; + } + if (result != addr) { + /* + * We succeeded in mapping memory, but not in the right + * place. + */ + os_pages_unmap(result, size); + return true; + } + return false; + } +#endif +} + +static bool +pages_commit_impl(void *addr, size_t size, bool commit) { + if (os_overcommits) { + return true; + } + + return os_pages_commit(addr, size, commit); +} + +bool +pages_commit(void *addr, size_t size) { + return pages_commit_impl(addr, size, true); +} + +bool +pages_decommit(void *addr, size_t size) { + return pages_commit_impl(addr, size, false); +} + +void +pages_mark_guards(void *head, void *tail) { + assert(head != NULL || tail != NULL); + assert(head == NULL || tail == NULL || + (uintptr_t)head < (uintptr_t)tail); +#ifdef JEMALLOC_HAVE_MPROTECT + if (head != NULL) { + mprotect(head, PAGE, PROT_NONE); + } + if (tail != NULL) { + mprotect(tail, PAGE, PROT_NONE); + } +#else + /* Decommit sets to PROT_NONE / MEM_DECOMMIT. */ + if (head != NULL) { + os_pages_commit(head, PAGE, false); + } + if (tail != NULL) { + os_pages_commit(tail, PAGE, false); + } +#endif +} + +void +pages_unmark_guards(void *head, void *tail) { + assert(head != NULL || tail != NULL); + assert(head == NULL || tail == NULL || + (uintptr_t)head < (uintptr_t)tail); +#ifdef JEMALLOC_HAVE_MPROTECT + bool head_and_tail = (head != NULL) && (tail != NULL); + size_t range = head_and_tail ? + (uintptr_t)tail - (uintptr_t)head + PAGE : + SIZE_T_MAX; + /* + * The amount of work that the kernel does in mprotect depends on the + * range argument. SC_LARGE_MINCLASS is an arbitrary threshold chosen + * to prevent kernel from doing too much work that would outweigh the + * savings of performing one less system call. + */ + bool ranged_mprotect = head_and_tail && range <= SC_LARGE_MINCLASS; + if (ranged_mprotect) { + mprotect(head, range, PROT_READ | PROT_WRITE); + } else { + if (head != NULL) { + mprotect(head, PAGE, PROT_READ | PROT_WRITE); + } + if (tail != NULL) { + mprotect(tail, PAGE, PROT_READ | PROT_WRITE); + } + } +#else + if (head != NULL) { + os_pages_commit(head, PAGE, true); + } + if (tail != NULL) { + os_pages_commit(tail, PAGE, true); + } +#endif +} + +bool +pages_purge_lazy(void *addr, size_t size) { + assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr); + assert(PAGE_CEILING(size) == size); + + if (!pages_can_purge_lazy) { + return true; + } + if (!pages_can_purge_lazy_runtime) { + /* + * Built with lazy purge enabled, but detected it was not + * supported on the current system. + */ + return true; + } + +#ifdef _WIN32 + VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE); + return false; +#elif defined(JEMALLOC_PURGE_MADVISE_FREE) + return (madvise(addr, size, +# ifdef MADV_FREE + MADV_FREE +# else + JEMALLOC_MADV_FREE +# endif + ) != 0); +#elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \ + !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS) + return (madvise(addr, size, MADV_DONTNEED) != 0); +#elif defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED) && \ + !defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS) + return (posix_madvise(addr, size, POSIX_MADV_DONTNEED) != 0); +#else + not_reached(); +#endif +} + +bool +pages_purge_forced(void *addr, size_t size) { + assert(PAGE_ADDR2BASE(addr) == addr); + assert(PAGE_CEILING(size) == size); + + if (!pages_can_purge_forced) { + return true; + } + +#if defined(JEMALLOC_PURGE_MADVISE_DONTNEED) && \ + defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS) + return (unlikely(madvise_dont_need_zeros_is_faulty) || + madvise(addr, size, MADV_DONTNEED) != 0); +#elif defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED) && \ + defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS) + return (unlikely(madvise_dont_need_zeros_is_faulty) || + posix_madvise(addr, size, POSIX_MADV_DONTNEED) != 0); +#elif defined(JEMALLOC_MAPS_COALESCE) + /* Try to overlay a new demand-zeroed mapping. */ + return pages_commit(addr, size); +#else + not_reached(); +#endif +} + +static bool +pages_huge_impl(void *addr, size_t size, bool aligned) { + if (aligned) { + assert(HUGEPAGE_ADDR2BASE(addr) == addr); + assert(HUGEPAGE_CEILING(size) == size); + } +#if defined(JEMALLOC_HAVE_MADVISE_HUGE) + return (madvise(addr, size, MADV_HUGEPAGE) != 0); +#elif defined(JEMALLOC_HAVE_MEMCNTL) + struct memcntl_mha m = {0}; + m.mha_cmd = MHA_MAPSIZE_VA; + m.mha_pagesize = HUGEPAGE; + return (memcntl(addr, size, MC_HAT_ADVISE, (caddr_t)&m, 0, 0) == 0); +#else + return true; +#endif +} + +bool +pages_huge(void *addr, size_t size) { + return pages_huge_impl(addr, size, true); +} + +static bool +pages_huge_unaligned(void *addr, size_t size) { + return pages_huge_impl(addr, size, false); +} + +static bool +pages_nohuge_impl(void *addr, size_t size, bool aligned) { + if (aligned) { + assert(HUGEPAGE_ADDR2BASE(addr) == addr); + assert(HUGEPAGE_CEILING(size) == size); + } + +#ifdef JEMALLOC_HAVE_MADVISE_HUGE + return (madvise(addr, size, MADV_NOHUGEPAGE) != 0); +#else + return false; +#endif +} + +bool +pages_nohuge(void *addr, size_t size) { + return pages_nohuge_impl(addr, size, true); +} + +static bool +pages_nohuge_unaligned(void *addr, size_t size) { + return pages_nohuge_impl(addr, size, false); +} + +bool +pages_dontdump(void *addr, size_t size) { + assert(PAGE_ADDR2BASE(addr) == addr); + assert(PAGE_CEILING(size) == size); +#if defined(JEMALLOC_MADVISE_DONTDUMP) + return madvise(addr, size, MADV_DONTDUMP) != 0; +#elif defined(JEMALLOC_MADVISE_NOCORE) + return madvise(addr, size, MADV_NOCORE) != 0; +#else + return false; +#endif +} + +bool +pages_dodump(void *addr, size_t size) { + assert(PAGE_ADDR2BASE(addr) == addr); + assert(PAGE_CEILING(size) == size); +#if defined(JEMALLOC_MADVISE_DONTDUMP) + return madvise(addr, size, MADV_DODUMP) != 0; +#elif defined(JEMALLOC_MADVISE_NOCORE) + return madvise(addr, size, MADV_CORE) != 0; +#else + return false; +#endif +} + + +static size_t +os_page_detect(void) { +#ifdef _WIN32 + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwPageSize; +#elif defined(__FreeBSD__) + /* + * This returns the value obtained from + * the auxv vector, avoiding a syscall. + */ + return getpagesize(); +#else + long result = sysconf(_SC_PAGESIZE); + if (result == -1) { + return LG_PAGE; + } + return (size_t)result; +#endif +} + +#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT +static bool +os_overcommits_sysctl(void) { + int vm_overcommit; + size_t sz; + + sz = sizeof(vm_overcommit); +#if defined(__FreeBSD__) && defined(VM_OVERCOMMIT) + int mib[2]; + + mib[0] = CTL_VM; + mib[1] = VM_OVERCOMMIT; + if (sysctl(mib, 2, &vm_overcommit, &sz, NULL, 0) != 0) { + return false; /* Error. */ + } +#else + if (sysctlbyname("vm.overcommit", &vm_overcommit, &sz, NULL, 0) != 0) { + return false; /* Error. */ + } +#endif + + return ((vm_overcommit & 0x3) == 0); +} +#endif + +#ifdef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY +/* + * Use syscall(2) rather than {open,read,close}(2) when possible to avoid + * reentry during bootstrapping if another library has interposed system call + * wrappers. + */ +static bool +os_overcommits_proc(void) { + int fd; + char buf[1]; + +#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open) + #if defined(O_CLOEXEC) + fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY | + O_CLOEXEC); + #else + fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY); + if (fd != -1) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + #endif +#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat) + #if defined(O_CLOEXEC) + fd = (int)syscall(SYS_openat, + AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC); + #else + fd = (int)syscall(SYS_openat, + AT_FDCWD, "/proc/sys/vm/overcommit_memory", O_RDONLY); + if (fd != -1) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + #endif +#else + #if defined(O_CLOEXEC) + fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY | O_CLOEXEC); + #else + fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY); + if (fd != -1) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + #endif +#endif + + if (fd == -1) { + return false; /* Error. */ + } + + ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf)); +#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close) + syscall(SYS_close, fd); +#else + close(fd); +#endif + + if (nread < 1) { + return false; /* Error. */ + } + /* + * /proc/sys/vm/overcommit_memory meanings: + * 0: Heuristic overcommit. + * 1: Always overcommit. + * 2: Never overcommit. + */ + return (buf[0] == '0' || buf[0] == '1'); +} +#endif + +void +pages_set_thp_state (void *ptr, size_t size) { + if (opt_thp == thp_mode_default || opt_thp == init_system_thp_mode) { + return; + } + assert(opt_thp != thp_mode_not_supported && + init_system_thp_mode != thp_mode_not_supported); + + if (opt_thp == thp_mode_always + && init_system_thp_mode != thp_mode_never) { + assert(init_system_thp_mode == thp_mode_default); + pages_huge_unaligned(ptr, size); + } else if (opt_thp == thp_mode_never) { + assert(init_system_thp_mode == thp_mode_default || + init_system_thp_mode == thp_mode_always); + pages_nohuge_unaligned(ptr, size); + } +} + +static void +init_thp_state(void) { + if (!have_madvise_huge && !have_memcntl) { + if (metadata_thp_enabled() && opt_abort) { + malloc_write(": no MADV_HUGEPAGE support\n"); + abort(); + } + goto label_error; + } +#if defined(JEMALLOC_HAVE_MADVISE_HUGE) + static const char sys_state_madvise[] = "always [madvise] never\n"; + static const char sys_state_always[] = "[always] madvise never\n"; + static const char sys_state_never[] = "always madvise [never]\n"; + char buf[sizeof(sys_state_madvise)]; + +#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open) + int fd = (int)syscall(SYS_open, + "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY); +#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat) + int fd = (int)syscall(SYS_openat, + AT_FDCWD, "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY); +#else + int fd = open("/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY); +#endif + if (fd == -1) { + goto label_error; + } + + ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf)); +#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close) + syscall(SYS_close, fd); +#else + close(fd); +#endif + + if (nread < 0) { + goto label_error; + } + + if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) { + init_system_thp_mode = thp_mode_default; + } else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) { + init_system_thp_mode = thp_mode_always; + } else if (strncmp(buf, sys_state_never, (size_t)nread) == 0) { + init_system_thp_mode = thp_mode_never; + } else { + goto label_error; + } + return; +#elif defined(JEMALLOC_HAVE_MEMCNTL) + init_system_thp_mode = thp_mode_default; + return; +#endif +label_error: + opt_thp = init_system_thp_mode = thp_mode_not_supported; +} + +bool +pages_boot(void) { + os_page = os_page_detect(); + if (os_page > PAGE) { + malloc_write(": Unsupported system page size\n"); + if (opt_abort) { + abort(); + } + return true; + } + +#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS + if (!opt_trust_madvise) { + madvise_dont_need_zeros_is_faulty = !madvise_MADV_DONTNEED_zeroes_pages(); + if (madvise_dont_need_zeros_is_faulty) { + malloc_write(": MADV_DONTNEED does not work (memset will be used instead)\n"); + malloc_write(": (This is the expected behaviour if you are running under QEMU)\n"); + } + } else { + /* In case opt_trust_madvise is disable, + * do not do runtime check */ + madvise_dont_need_zeros_is_faulty = 0; + } +#endif + +#ifndef _WIN32 + mmap_flags = MAP_PRIVATE | MAP_ANON; +#endif + +#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT + os_overcommits = os_overcommits_sysctl(); +#elif defined(JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY) + os_overcommits = os_overcommits_proc(); +# ifdef MAP_NORESERVE + if (os_overcommits) { + mmap_flags |= MAP_NORESERVE; + } +# endif +#elif defined(__NetBSD__) + os_overcommits = true; +#else + os_overcommits = false; +#endif + + init_thp_state(); + +#ifdef __FreeBSD__ + /* + * FreeBSD doesn't need the check; madvise(2) is known to work. + */ +#else + /* Detect lazy purge runtime support. */ + if (pages_can_purge_lazy) { + bool committed = false; + void *madv_free_page = os_pages_map(NULL, PAGE, PAGE, &committed); + if (madv_free_page == NULL) { + return true; + } + assert(pages_can_purge_lazy_runtime); + if (pages_purge_lazy(madv_free_page, PAGE)) { + pages_can_purge_lazy_runtime = false; + } + os_pages_unmap(madv_free_page, PAGE); + } +#endif + + return false; +} diff --git a/BeefRT/JEMalloc/src/pai.c b/BeefRT/JEMalloc/src/pai.c new file mode 100644 index 00000000..45c87729 --- /dev/null +++ b/BeefRT/JEMalloc/src/pai.c @@ -0,0 +1,31 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +size_t +pai_alloc_batch_default(tsdn_t *tsdn, pai_t *self, size_t size, size_t nallocs, + edata_list_active_t *results, bool *deferred_work_generated) { + for (size_t i = 0; i < nallocs; i++) { + bool deferred_by_alloc = false; + edata_t *edata = pai_alloc(tsdn, self, size, PAGE, + /* zero */ false, /* guarded */ false, + /* frequent_reuse */ false, &deferred_by_alloc); + *deferred_work_generated |= deferred_by_alloc; + if (edata == NULL) { + return i; + } + edata_list_active_append(results, edata); + } + return nallocs; +} + +void +pai_dalloc_batch_default(tsdn_t *tsdn, pai_t *self, + edata_list_active_t *list, bool *deferred_work_generated) { + edata_t *edata; + while ((edata = edata_list_active_first(list)) != NULL) { + bool deferred_by_dalloc = false; + edata_list_active_remove(list, edata); + pai_dalloc(tsdn, self, edata, &deferred_by_dalloc); + *deferred_work_generated |= deferred_by_dalloc; + } +} diff --git a/BeefRT/JEMalloc/src/peak_event.c b/BeefRT/JEMalloc/src/peak_event.c new file mode 100644 index 00000000..4093fbcc --- /dev/null +++ b/BeefRT/JEMalloc/src/peak_event.c @@ -0,0 +1,82 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/peak_event.h" + +#include "jemalloc/internal/activity_callback.h" +#include "jemalloc/internal/peak.h" + +/* + * Update every 64K by default. We're not exposing this as a configuration + * option for now; we don't want to bind ourselves too tightly to any particular + * performance requirements for small values, or guarantee that we'll even be + * able to provide fine-grained accuracy. + */ +#define PEAK_EVENT_WAIT (64 * 1024) + +/* Update the peak with current tsd state. */ +void +peak_event_update(tsd_t *tsd) { + uint64_t alloc = tsd_thread_allocated_get(tsd); + uint64_t dalloc = tsd_thread_deallocated_get(tsd); + peak_t *peak = tsd_peakp_get(tsd); + peak_update(peak, alloc, dalloc); +} + +static void +peak_event_activity_callback(tsd_t *tsd) { + activity_callback_thunk_t *thunk = tsd_activity_callback_thunkp_get( + tsd); + uint64_t alloc = tsd_thread_allocated_get(tsd); + uint64_t dalloc = tsd_thread_deallocated_get(tsd); + if (thunk->callback != NULL) { + thunk->callback(thunk->uctx, alloc, dalloc); + } +} + +/* Set current state to zero. */ +void +peak_event_zero(tsd_t *tsd) { + uint64_t alloc = tsd_thread_allocated_get(tsd); + uint64_t dalloc = tsd_thread_deallocated_get(tsd); + peak_t *peak = tsd_peakp_get(tsd); + peak_set_zero(peak, alloc, dalloc); +} + +uint64_t +peak_event_max(tsd_t *tsd) { + peak_t *peak = tsd_peakp_get(tsd); + return peak_max(peak); +} + +uint64_t +peak_alloc_new_event_wait(tsd_t *tsd) { + return PEAK_EVENT_WAIT; +} + +uint64_t +peak_alloc_postponed_event_wait(tsd_t *tsd) { + return TE_MIN_START_WAIT; +} + +void +peak_alloc_event_handler(tsd_t *tsd, uint64_t elapsed) { + peak_event_update(tsd); + peak_event_activity_callback(tsd); +} + +uint64_t +peak_dalloc_new_event_wait(tsd_t *tsd) { + return PEAK_EVENT_WAIT; +} + +uint64_t +peak_dalloc_postponed_event_wait(tsd_t *tsd) { + return TE_MIN_START_WAIT; +} + +void +peak_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed) { + peak_event_update(tsd); + peak_event_activity_callback(tsd); +} diff --git a/BeefRT/JEMalloc/src/prof.c b/BeefRT/JEMalloc/src/prof.c new file mode 100644 index 00000000..7a6d5d56 --- /dev/null +++ b/BeefRT/JEMalloc/src/prof.c @@ -0,0 +1,789 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/ctl.h" +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/counter.h" +#include "jemalloc/internal/prof_data.h" +#include "jemalloc/internal/prof_log.h" +#include "jemalloc/internal/prof_recent.h" +#include "jemalloc/internal/prof_stats.h" +#include "jemalloc/internal/prof_sys.h" +#include "jemalloc/internal/prof_hook.h" +#include "jemalloc/internal/thread_event.h" + +/* + * This file implements the profiling "APIs" needed by other parts of jemalloc, + * and also manages the relevant "operational" data, mainly options and mutexes; + * the core profiling data structures are encapsulated in prof_data.c. + */ + +/******************************************************************************/ + +/* Data. */ + +bool opt_prof = false; +bool opt_prof_active = true; +bool opt_prof_thread_active_init = true; +size_t opt_lg_prof_sample = LG_PROF_SAMPLE_DEFAULT; +ssize_t opt_lg_prof_interval = LG_PROF_INTERVAL_DEFAULT; +bool opt_prof_gdump = false; +bool opt_prof_final = false; +bool opt_prof_leak = false; +bool opt_prof_leak_error = false; +bool opt_prof_accum = false; +char opt_prof_prefix[PROF_DUMP_FILENAME_LEN]; +bool opt_prof_sys_thread_name = false; +bool opt_prof_unbias = true; + +/* Accessed via prof_sample_event_handler(). */ +static counter_accum_t prof_idump_accumulated; + +/* + * Initialized as opt_prof_active, and accessed via + * prof_active_[gs]et{_unlocked,}(). + */ +bool prof_active_state; +static malloc_mutex_t prof_active_mtx; + +/* + * Initialized as opt_prof_thread_active_init, and accessed via + * prof_thread_active_init_[gs]et(). + */ +static bool prof_thread_active_init; +static malloc_mutex_t prof_thread_active_init_mtx; + +/* + * Initialized as opt_prof_gdump, and accessed via + * prof_gdump_[gs]et{_unlocked,}(). + */ +bool prof_gdump_val; +static malloc_mutex_t prof_gdump_mtx; + +uint64_t prof_interval = 0; + +size_t lg_prof_sample; + +static uint64_t next_thr_uid; +static malloc_mutex_t next_thr_uid_mtx; + +/* Do not dump any profiles until bootstrapping is complete. */ +bool prof_booted = false; + +/* Logically a prof_backtrace_hook_t. */ +atomic_p_t prof_backtrace_hook; + +/* Logically a prof_dump_hook_t. */ +atomic_p_t prof_dump_hook; + +/******************************************************************************/ + +void +prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx) { + cassert(config_prof); + + if (tsd_reentrancy_level_get(tsd) > 0) { + assert((uintptr_t)tctx == (uintptr_t)1U); + return; + } + + if ((uintptr_t)tctx > (uintptr_t)1U) { + malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock); + tctx->prepared = false; + prof_tctx_try_destroy(tsd, tctx); + } +} + +void +prof_malloc_sample_object(tsd_t *tsd, const void *ptr, size_t size, + size_t usize, prof_tctx_t *tctx) { + cassert(config_prof); + + if (opt_prof_sys_thread_name) { + prof_sys_thread_name_fetch(tsd); + } + + edata_t *edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global, + ptr); + prof_info_set(tsd, edata, tctx, size); + + szind_t szind = sz_size2index(usize); + + malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock); + /* + * We need to do these map lookups while holding the lock, to avoid the + * possibility of races with prof_reset calls, which update the map and + * then acquire the lock. This actually still leaves a data race on the + * contents of the unbias map, but we have not yet gone through and + * atomic-ified the prof module, and compilers are not yet causing us + * issues. The key thing is to make sure that, if we read garbage data, + * the prof_reset call is about to mark our tctx as expired before any + * dumping of our corrupted output is attempted. + */ + size_t shifted_unbiased_cnt = prof_shifted_unbiased_cnt[szind]; + size_t unbiased_bytes = prof_unbiased_sz[szind]; + tctx->cnts.curobjs++; + tctx->cnts.curobjs_shifted_unbiased += shifted_unbiased_cnt; + tctx->cnts.curbytes += usize; + tctx->cnts.curbytes_unbiased += unbiased_bytes; + if (opt_prof_accum) { + tctx->cnts.accumobjs++; + tctx->cnts.accumobjs_shifted_unbiased += shifted_unbiased_cnt; + tctx->cnts.accumbytes += usize; + tctx->cnts.accumbytes_unbiased += unbiased_bytes; + } + bool record_recent = prof_recent_alloc_prepare(tsd, tctx); + tctx->prepared = false; + malloc_mutex_unlock(tsd_tsdn(tsd), tctx->tdata->lock); + if (record_recent) { + assert(tctx == edata_prof_tctx_get(edata)); + prof_recent_alloc(tsd, edata, size, usize); + } + + if (opt_prof_stats) { + prof_stats_inc(tsd, szind, size); + } +} + +void +prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_info_t *prof_info) { + cassert(config_prof); + + assert(prof_info != NULL); + prof_tctx_t *tctx = prof_info->alloc_tctx; + assert((uintptr_t)tctx > (uintptr_t)1U); + + szind_t szind = sz_size2index(usize); + malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock); + + assert(tctx->cnts.curobjs > 0); + assert(tctx->cnts.curbytes >= usize); + /* + * It's not correct to do equivalent asserts for unbiased bytes, because + * of the potential for races with prof.reset calls. The map contents + * should really be atomic, but we have not atomic-ified the prof module + * yet. + */ + tctx->cnts.curobjs--; + tctx->cnts.curobjs_shifted_unbiased -= prof_shifted_unbiased_cnt[szind]; + tctx->cnts.curbytes -= usize; + tctx->cnts.curbytes_unbiased -= prof_unbiased_sz[szind]; + + prof_try_log(tsd, usize, prof_info); + + prof_tctx_try_destroy(tsd, tctx); + + if (opt_prof_stats) { + prof_stats_dec(tsd, szind, prof_info->alloc_size); + } +} + +prof_tctx_t * +prof_tctx_create(tsd_t *tsd) { + if (!tsd_nominal(tsd) || tsd_reentrancy_level_get(tsd) > 0) { + return NULL; + } + + prof_tdata_t *tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return NULL; + } + + prof_bt_t bt; + bt_init(&bt, tdata->vec); + prof_backtrace(tsd, &bt); + return prof_lookup(tsd, &bt); +} + +/* + * The bodies of this function and prof_leakcheck() are compiled out unless heap + * profiling is enabled, so that it is possible to compile jemalloc with + * floating point support completely disabled. Avoiding floating point code is + * important on memory-constrained systems, but it also enables a workaround for + * versions of glibc that don't properly save/restore floating point registers + * during dynamic lazy symbol loading (which internally calls into whatever + * malloc implementation happens to be integrated into the application). Note + * that some compilers (e.g. gcc 4.8) may use floating point registers for fast + * memory moves, so jemalloc must be compiled with such optimizations disabled + * (e.g. + * -mno-sse) in order for the workaround to be complete. + */ +uint64_t +prof_sample_new_event_wait(tsd_t *tsd) { +#ifdef JEMALLOC_PROF + if (lg_prof_sample == 0) { + return TE_MIN_START_WAIT; + } + + /* + * Compute sample interval as a geometrically distributed random + * variable with mean (2^lg_prof_sample). + * + * __ __ + * | log(u) | 1 + * bytes_until_sample = | -------- |, where p = --------------- + * | log(1-p) | lg_prof_sample + * 2 + * + * For more information on the math, see: + * + * Non-Uniform Random Variate Generation + * Luc Devroye + * Springer-Verlag, New York, 1986 + * pp 500 + * (http://luc.devroye.org/rnbookindex.html) + * + * In the actual computation, there's a non-zero probability that our + * pseudo random number generator generates an exact 0, and to avoid + * log(0), we set u to 1.0 in case r is 0. Therefore u effectively is + * uniformly distributed in (0, 1] instead of [0, 1). Further, rather + * than taking the ceiling, we take the floor and then add 1, since + * otherwise bytes_until_sample would be 0 if u is exactly 1.0. + */ + uint64_t r = prng_lg_range_u64(tsd_prng_statep_get(tsd), 53); + double u = (r == 0U) ? 1.0 : (double)r * (1.0/9007199254740992.0L); + return (uint64_t)(log(u) / + log(1.0 - (1.0 / (double)((uint64_t)1U << lg_prof_sample)))) + + (uint64_t)1U; +#else + not_reached(); + return TE_MAX_START_WAIT; +#endif +} + +uint64_t +prof_sample_postponed_event_wait(tsd_t *tsd) { + /* + * The postponed wait time for prof sample event is computed as if we + * want a new wait time (i.e. as if the event were triggered). If we + * instead postpone to the immediate next allocation, like how we're + * handling the other events, then we can have sampling bias, if e.g. + * the allocation immediately following a reentrancy always comes from + * the same stack trace. + */ + return prof_sample_new_event_wait(tsd); +} + +void +prof_sample_event_handler(tsd_t *tsd, uint64_t elapsed) { + cassert(config_prof); + assert(elapsed > 0 && elapsed != TE_INVALID_ELAPSED); + if (prof_interval == 0 || !prof_active_get_unlocked()) { + return; + } + if (counter_accum(tsd_tsdn(tsd), &prof_idump_accumulated, elapsed)) { + prof_idump(tsd_tsdn(tsd)); + } +} + +static void +prof_fdump(void) { + tsd_t *tsd; + + cassert(config_prof); + assert(opt_prof_final); + + if (!prof_booted) { + return; + } + tsd = tsd_fetch(); + assert(tsd_reentrancy_level_get(tsd) == 0); + + prof_fdump_impl(tsd); +} + +static bool +prof_idump_accum_init(void) { + cassert(config_prof); + + return counter_accum_init(&prof_idump_accumulated, prof_interval); +} + +void +prof_idump(tsdn_t *tsdn) { + tsd_t *tsd; + prof_tdata_t *tdata; + + cassert(config_prof); + + if (!prof_booted || tsdn_null(tsdn) || !prof_active_get_unlocked()) { + return; + } + tsd = tsdn_tsd(tsdn); + if (tsd_reentrancy_level_get(tsd) > 0) { + return; + } + + tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return; + } + if (tdata->enq) { + tdata->enq_idump = true; + return; + } + + prof_idump_impl(tsd); +} + +bool +prof_mdump(tsd_t *tsd, const char *filename) { + cassert(config_prof); + assert(tsd_reentrancy_level_get(tsd) == 0); + + if (!opt_prof || !prof_booted) { + return true; + } + + return prof_mdump_impl(tsd, filename); +} + +void +prof_gdump(tsdn_t *tsdn) { + tsd_t *tsd; + prof_tdata_t *tdata; + + cassert(config_prof); + + if (!prof_booted || tsdn_null(tsdn) || !prof_active_get_unlocked()) { + return; + } + tsd = tsdn_tsd(tsdn); + if (tsd_reentrancy_level_get(tsd) > 0) { + return; + } + + tdata = prof_tdata_get(tsd, false); + if (tdata == NULL) { + return; + } + if (tdata->enq) { + tdata->enq_gdump = true; + return; + } + + prof_gdump_impl(tsd); +} + +static uint64_t +prof_thr_uid_alloc(tsdn_t *tsdn) { + uint64_t thr_uid; + + malloc_mutex_lock(tsdn, &next_thr_uid_mtx); + thr_uid = next_thr_uid; + next_thr_uid++; + malloc_mutex_unlock(tsdn, &next_thr_uid_mtx); + + return thr_uid; +} + +prof_tdata_t * +prof_tdata_init(tsd_t *tsd) { + return prof_tdata_init_impl(tsd, prof_thr_uid_alloc(tsd_tsdn(tsd)), 0, + NULL, prof_thread_active_init_get(tsd_tsdn(tsd))); +} + +prof_tdata_t * +prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata) { + uint64_t thr_uid = tdata->thr_uid; + uint64_t thr_discrim = tdata->thr_discrim + 1; + char *thread_name = (tdata->thread_name != NULL) ? + prof_thread_name_alloc(tsd, tdata->thread_name) : NULL; + bool active = tdata->active; + + prof_tdata_detach(tsd, tdata); + return prof_tdata_init_impl(tsd, thr_uid, thr_discrim, thread_name, + active); +} + +void +prof_tdata_cleanup(tsd_t *tsd) { + prof_tdata_t *tdata; + + if (!config_prof) { + return; + } + + tdata = tsd_prof_tdata_get(tsd); + if (tdata != NULL) { + prof_tdata_detach(tsd, tdata); + } +} + +bool +prof_active_get(tsdn_t *tsdn) { + bool prof_active_current; + + prof_active_assert(); + malloc_mutex_lock(tsdn, &prof_active_mtx); + prof_active_current = prof_active_state; + malloc_mutex_unlock(tsdn, &prof_active_mtx); + return prof_active_current; +} + +bool +prof_active_set(tsdn_t *tsdn, bool active) { + bool prof_active_old; + + prof_active_assert(); + malloc_mutex_lock(tsdn, &prof_active_mtx); + prof_active_old = prof_active_state; + prof_active_state = active; + malloc_mutex_unlock(tsdn, &prof_active_mtx); + prof_active_assert(); + return prof_active_old; +} + +const char * +prof_thread_name_get(tsd_t *tsd) { + assert(tsd_reentrancy_level_get(tsd) == 0); + + prof_tdata_t *tdata; + + tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return ""; + } + return (tdata->thread_name != NULL ? tdata->thread_name : ""); +} + +int +prof_thread_name_set(tsd_t *tsd, const char *thread_name) { + if (opt_prof_sys_thread_name) { + return ENOENT; + } else { + return prof_thread_name_set_impl(tsd, thread_name); + } +} + +bool +prof_thread_active_get(tsd_t *tsd) { + assert(tsd_reentrancy_level_get(tsd) == 0); + + prof_tdata_t *tdata; + + tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return false; + } + return tdata->active; +} + +bool +prof_thread_active_set(tsd_t *tsd, bool active) { + assert(tsd_reentrancy_level_get(tsd) == 0); + + prof_tdata_t *tdata; + + tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return true; + } + tdata->active = active; + return false; +} + +bool +prof_thread_active_init_get(tsdn_t *tsdn) { + bool active_init; + + malloc_mutex_lock(tsdn, &prof_thread_active_init_mtx); + active_init = prof_thread_active_init; + malloc_mutex_unlock(tsdn, &prof_thread_active_init_mtx); + return active_init; +} + +bool +prof_thread_active_init_set(tsdn_t *tsdn, bool active_init) { + bool active_init_old; + + malloc_mutex_lock(tsdn, &prof_thread_active_init_mtx); + active_init_old = prof_thread_active_init; + prof_thread_active_init = active_init; + malloc_mutex_unlock(tsdn, &prof_thread_active_init_mtx); + return active_init_old; +} + +bool +prof_gdump_get(tsdn_t *tsdn) { + bool prof_gdump_current; + + malloc_mutex_lock(tsdn, &prof_gdump_mtx); + prof_gdump_current = prof_gdump_val; + malloc_mutex_unlock(tsdn, &prof_gdump_mtx); + return prof_gdump_current; +} + +bool +prof_gdump_set(tsdn_t *tsdn, bool gdump) { + bool prof_gdump_old; + + malloc_mutex_lock(tsdn, &prof_gdump_mtx); + prof_gdump_old = prof_gdump_val; + prof_gdump_val = gdump; + malloc_mutex_unlock(tsdn, &prof_gdump_mtx); + return prof_gdump_old; +} + +void +prof_backtrace_hook_set(prof_backtrace_hook_t hook) { + atomic_store_p(&prof_backtrace_hook, hook, ATOMIC_RELEASE); +} + +prof_backtrace_hook_t +prof_backtrace_hook_get() { + return (prof_backtrace_hook_t)atomic_load_p(&prof_backtrace_hook, + ATOMIC_ACQUIRE); +} + +void +prof_dump_hook_set(prof_dump_hook_t hook) { + atomic_store_p(&prof_dump_hook, hook, ATOMIC_RELEASE); +} + +prof_dump_hook_t +prof_dump_hook_get() { + return (prof_dump_hook_t)atomic_load_p(&prof_dump_hook, + ATOMIC_ACQUIRE); +} + +void +prof_boot0(void) { + cassert(config_prof); + + memcpy(opt_prof_prefix, PROF_PREFIX_DEFAULT, + sizeof(PROF_PREFIX_DEFAULT)); +} + +void +prof_boot1(void) { + cassert(config_prof); + + /* + * opt_prof must be in its final state before any arenas are + * initialized, so this function must be executed early. + */ + if (opt_prof_leak_error && !opt_prof_leak) { + opt_prof_leak = true; + } + + if (opt_prof_leak && !opt_prof) { + /* + * Enable opt_prof, but in such a way that profiles are never + * automatically dumped. + */ + opt_prof = true; + opt_prof_gdump = false; + } else if (opt_prof) { + if (opt_lg_prof_interval >= 0) { + prof_interval = (((uint64_t)1U) << + opt_lg_prof_interval); + } + } +} + +bool +prof_boot2(tsd_t *tsd, base_t *base) { + cassert(config_prof); + + /* + * Initialize the global mutexes unconditionally to maintain correct + * stats when opt_prof is false. + */ + if (malloc_mutex_init(&prof_active_mtx, "prof_active", + WITNESS_RANK_PROF_ACTIVE, malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&prof_gdump_mtx, "prof_gdump", + WITNESS_RANK_PROF_GDUMP, malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&prof_thread_active_init_mtx, + "prof_thread_active_init", WITNESS_RANK_PROF_THREAD_ACTIVE_INIT, + malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&bt2gctx_mtx, "prof_bt2gctx", + WITNESS_RANK_PROF_BT2GCTX, malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&tdatas_mtx, "prof_tdatas", + WITNESS_RANK_PROF_TDATAS, malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&next_thr_uid_mtx, "prof_next_thr_uid", + WITNESS_RANK_PROF_NEXT_THR_UID, malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&prof_stats_mtx, "prof_stats", + WITNESS_RANK_PROF_STATS, malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&prof_dump_filename_mtx, + "prof_dump_filename", WITNESS_RANK_PROF_DUMP_FILENAME, + malloc_mutex_rank_exclusive)) { + return true; + } + if (malloc_mutex_init(&prof_dump_mtx, "prof_dump", + WITNESS_RANK_PROF_DUMP, malloc_mutex_rank_exclusive)) { + return true; + } + + if (opt_prof) { + lg_prof_sample = opt_lg_prof_sample; + prof_unbias_map_init(); + prof_active_state = opt_prof_active; + prof_gdump_val = opt_prof_gdump; + prof_thread_active_init = opt_prof_thread_active_init; + + if (prof_data_init(tsd)) { + return true; + } + + next_thr_uid = 0; + if (prof_idump_accum_init()) { + return true; + } + + if (opt_prof_final && opt_prof_prefix[0] != '\0' && + atexit(prof_fdump) != 0) { + malloc_write(": Error in atexit()\n"); + if (opt_abort) { + abort(); + } + } + + if (prof_log_init(tsd)) { + return true; + } + + if (prof_recent_init()) { + return true; + } + + prof_base = base; + + gctx_locks = (malloc_mutex_t *)base_alloc(tsd_tsdn(tsd), base, + PROF_NCTX_LOCKS * sizeof(malloc_mutex_t), CACHELINE); + if (gctx_locks == NULL) { + return true; + } + for (unsigned i = 0; i < PROF_NCTX_LOCKS; i++) { + if (malloc_mutex_init(&gctx_locks[i], "prof_gctx", + WITNESS_RANK_PROF_GCTX, + malloc_mutex_rank_exclusive)) { + return true; + } + } + + tdata_locks = (malloc_mutex_t *)base_alloc(tsd_tsdn(tsd), base, + PROF_NTDATA_LOCKS * sizeof(malloc_mutex_t), CACHELINE); + if (tdata_locks == NULL) { + return true; + } + for (unsigned i = 0; i < PROF_NTDATA_LOCKS; i++) { + if (malloc_mutex_init(&tdata_locks[i], "prof_tdata", + WITNESS_RANK_PROF_TDATA, + malloc_mutex_rank_exclusive)) { + return true; + } + } + + prof_unwind_init(); + prof_hooks_init(); + } + prof_booted = true; + + return false; +} + +void +prof_prefork0(tsdn_t *tsdn) { + if (config_prof && opt_prof) { + unsigned i; + + malloc_mutex_prefork(tsdn, &prof_dump_mtx); + malloc_mutex_prefork(tsdn, &bt2gctx_mtx); + malloc_mutex_prefork(tsdn, &tdatas_mtx); + for (i = 0; i < PROF_NTDATA_LOCKS; i++) { + malloc_mutex_prefork(tsdn, &tdata_locks[i]); + } + malloc_mutex_prefork(tsdn, &log_mtx); + for (i = 0; i < PROF_NCTX_LOCKS; i++) { + malloc_mutex_prefork(tsdn, &gctx_locks[i]); + } + malloc_mutex_prefork(tsdn, &prof_recent_dump_mtx); + } +} + +void +prof_prefork1(tsdn_t *tsdn) { + if (config_prof && opt_prof) { + counter_prefork(tsdn, &prof_idump_accumulated); + malloc_mutex_prefork(tsdn, &prof_active_mtx); + malloc_mutex_prefork(tsdn, &prof_dump_filename_mtx); + malloc_mutex_prefork(tsdn, &prof_gdump_mtx); + malloc_mutex_prefork(tsdn, &prof_recent_alloc_mtx); + malloc_mutex_prefork(tsdn, &prof_stats_mtx); + malloc_mutex_prefork(tsdn, &next_thr_uid_mtx); + malloc_mutex_prefork(tsdn, &prof_thread_active_init_mtx); + } +} + +void +prof_postfork_parent(tsdn_t *tsdn) { + if (config_prof && opt_prof) { + unsigned i; + + malloc_mutex_postfork_parent(tsdn, + &prof_thread_active_init_mtx); + malloc_mutex_postfork_parent(tsdn, &next_thr_uid_mtx); + malloc_mutex_postfork_parent(tsdn, &prof_stats_mtx); + malloc_mutex_postfork_parent(tsdn, &prof_recent_alloc_mtx); + malloc_mutex_postfork_parent(tsdn, &prof_gdump_mtx); + malloc_mutex_postfork_parent(tsdn, &prof_dump_filename_mtx); + malloc_mutex_postfork_parent(tsdn, &prof_active_mtx); + counter_postfork_parent(tsdn, &prof_idump_accumulated); + malloc_mutex_postfork_parent(tsdn, &prof_recent_dump_mtx); + for (i = 0; i < PROF_NCTX_LOCKS; i++) { + malloc_mutex_postfork_parent(tsdn, &gctx_locks[i]); + } + malloc_mutex_postfork_parent(tsdn, &log_mtx); + for (i = 0; i < PROF_NTDATA_LOCKS; i++) { + malloc_mutex_postfork_parent(tsdn, &tdata_locks[i]); + } + malloc_mutex_postfork_parent(tsdn, &tdatas_mtx); + malloc_mutex_postfork_parent(tsdn, &bt2gctx_mtx); + malloc_mutex_postfork_parent(tsdn, &prof_dump_mtx); + } +} + +void +prof_postfork_child(tsdn_t *tsdn) { + if (config_prof && opt_prof) { + unsigned i; + + malloc_mutex_postfork_child(tsdn, &prof_thread_active_init_mtx); + malloc_mutex_postfork_child(tsdn, &next_thr_uid_mtx); + malloc_mutex_postfork_child(tsdn, &prof_stats_mtx); + malloc_mutex_postfork_child(tsdn, &prof_recent_alloc_mtx); + malloc_mutex_postfork_child(tsdn, &prof_gdump_mtx); + malloc_mutex_postfork_child(tsdn, &prof_dump_filename_mtx); + malloc_mutex_postfork_child(tsdn, &prof_active_mtx); + counter_postfork_child(tsdn, &prof_idump_accumulated); + malloc_mutex_postfork_child(tsdn, &prof_recent_dump_mtx); + for (i = 0; i < PROF_NCTX_LOCKS; i++) { + malloc_mutex_postfork_child(tsdn, &gctx_locks[i]); + } + malloc_mutex_postfork_child(tsdn, &log_mtx); + for (i = 0; i < PROF_NTDATA_LOCKS; i++) { + malloc_mutex_postfork_child(tsdn, &tdata_locks[i]); + } + malloc_mutex_postfork_child(tsdn, &tdatas_mtx); + malloc_mutex_postfork_child(tsdn, &bt2gctx_mtx); + malloc_mutex_postfork_child(tsdn, &prof_dump_mtx); + } +} + +/******************************************************************************/ diff --git a/BeefRT/JEMalloc/src/prof_data.c b/BeefRT/JEMalloc/src/prof_data.c new file mode 100644 index 00000000..bfa55be1 --- /dev/null +++ b/BeefRT/JEMalloc/src/prof_data.c @@ -0,0 +1,1447 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/ckh.h" +#include "jemalloc/internal/hash.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/prof_data.h" + +/* + * This file defines and manages the core profiling data structures. + * + * Conceptually, profiling data can be imagined as a table with three columns: + * thread, stack trace, and current allocation size. (When prof_accum is on, + * there's one additional column which is the cumulative allocation size.) + * + * Implementation wise, each thread maintains a hash recording the stack trace + * to allocation size correspondences, which are basically the individual rows + * in the table. In addition, two global "indices" are built to make data + * aggregation efficient (for dumping): bt2gctx and tdatas, which are basically + * the "grouped by stack trace" and "grouped by thread" views of the same table, + * respectively. Note that the allocation size is only aggregated to the two + * indices at dumping time, so as to optimize for performance. + */ + +/******************************************************************************/ + +malloc_mutex_t bt2gctx_mtx; +malloc_mutex_t tdatas_mtx; +malloc_mutex_t prof_dump_mtx; + +/* + * Table of mutexes that are shared among gctx's. These are leaf locks, so + * there is no problem with using them for more than one gctx at the same time. + * The primary motivation for this sharing though is that gctx's are ephemeral, + * and destroying mutexes causes complications for systems that allocate when + * creating/destroying mutexes. + */ +malloc_mutex_t *gctx_locks; +static atomic_u_t cum_gctxs; /* Atomic counter. */ + +/* + * Table of mutexes that are shared among tdata's. No operations require + * holding multiple tdata locks, so there is no problem with using them for more + * than one tdata at the same time, even though a gctx lock may be acquired + * while holding a tdata lock. + */ +malloc_mutex_t *tdata_locks; + +/* + * Global hash of (prof_bt_t *)-->(prof_gctx_t *). This is the master data + * structure that knows about all backtraces currently captured. + */ +static ckh_t bt2gctx; + +/* + * Tree of all extant prof_tdata_t structures, regardless of state, + * {attached,detached,expired}. + */ +static prof_tdata_tree_t tdatas; + +size_t prof_unbiased_sz[PROF_SC_NSIZES]; +size_t prof_shifted_unbiased_cnt[PROF_SC_NSIZES]; + +/******************************************************************************/ +/* Red-black trees. */ + +static int +prof_tctx_comp(const prof_tctx_t *a, const prof_tctx_t *b) { + uint64_t a_thr_uid = a->thr_uid; + uint64_t b_thr_uid = b->thr_uid; + int ret = (a_thr_uid > b_thr_uid) - (a_thr_uid < b_thr_uid); + if (ret == 0) { + uint64_t a_thr_discrim = a->thr_discrim; + uint64_t b_thr_discrim = b->thr_discrim; + ret = (a_thr_discrim > b_thr_discrim) - (a_thr_discrim < + b_thr_discrim); + if (ret == 0) { + uint64_t a_tctx_uid = a->tctx_uid; + uint64_t b_tctx_uid = b->tctx_uid; + ret = (a_tctx_uid > b_tctx_uid) - (a_tctx_uid < + b_tctx_uid); + } + } + return ret; +} + +rb_gen(static UNUSED, tctx_tree_, prof_tctx_tree_t, prof_tctx_t, + tctx_link, prof_tctx_comp) + +static int +prof_gctx_comp(const prof_gctx_t *a, const prof_gctx_t *b) { + unsigned a_len = a->bt.len; + unsigned b_len = b->bt.len; + unsigned comp_len = (a_len < b_len) ? a_len : b_len; + int ret = memcmp(a->bt.vec, b->bt.vec, comp_len * sizeof(void *)); + if (ret == 0) { + ret = (a_len > b_len) - (a_len < b_len); + } + return ret; +} + +rb_gen(static UNUSED, gctx_tree_, prof_gctx_tree_t, prof_gctx_t, dump_link, + prof_gctx_comp) + +static int +prof_tdata_comp(const prof_tdata_t *a, const prof_tdata_t *b) { + int ret; + uint64_t a_uid = a->thr_uid; + uint64_t b_uid = b->thr_uid; + + ret = ((a_uid > b_uid) - (a_uid < b_uid)); + if (ret == 0) { + uint64_t a_discrim = a->thr_discrim; + uint64_t b_discrim = b->thr_discrim; + + ret = ((a_discrim > b_discrim) - (a_discrim < b_discrim)); + } + return ret; +} + +rb_gen(static UNUSED, tdata_tree_, prof_tdata_tree_t, prof_tdata_t, tdata_link, + prof_tdata_comp) + +/******************************************************************************/ + +static malloc_mutex_t * +prof_gctx_mutex_choose(void) { + unsigned ngctxs = atomic_fetch_add_u(&cum_gctxs, 1, ATOMIC_RELAXED); + + return &gctx_locks[(ngctxs - 1) % PROF_NCTX_LOCKS]; +} + +static malloc_mutex_t * +prof_tdata_mutex_choose(uint64_t thr_uid) { + return &tdata_locks[thr_uid % PROF_NTDATA_LOCKS]; +} + +bool +prof_data_init(tsd_t *tsd) { + tdata_tree_new(&tdatas); + return ckh_new(tsd, &bt2gctx, PROF_CKH_MINITEMS, + prof_bt_hash, prof_bt_keycomp); +} + +static void +prof_enter(tsd_t *tsd, prof_tdata_t *tdata) { + cassert(config_prof); + assert(tdata == prof_tdata_get(tsd, false)); + + if (tdata != NULL) { + assert(!tdata->enq); + tdata->enq = true; + } + + malloc_mutex_lock(tsd_tsdn(tsd), &bt2gctx_mtx); +} + +static void +prof_leave(tsd_t *tsd, prof_tdata_t *tdata) { + cassert(config_prof); + assert(tdata == prof_tdata_get(tsd, false)); + + malloc_mutex_unlock(tsd_tsdn(tsd), &bt2gctx_mtx); + + if (tdata != NULL) { + bool idump, gdump; + + assert(tdata->enq); + tdata->enq = false; + idump = tdata->enq_idump; + tdata->enq_idump = false; + gdump = tdata->enq_gdump; + tdata->enq_gdump = false; + + if (idump) { + prof_idump(tsd_tsdn(tsd)); + } + if (gdump) { + prof_gdump(tsd_tsdn(tsd)); + } + } +} + +static prof_gctx_t * +prof_gctx_create(tsdn_t *tsdn, prof_bt_t *bt) { + /* + * Create a single allocation that has space for vec of length bt->len. + */ + size_t size = offsetof(prof_gctx_t, vec) + (bt->len * sizeof(void *)); + prof_gctx_t *gctx = (prof_gctx_t *)iallocztm(tsdn, size, + sz_size2index(size), false, NULL, true, arena_get(TSDN_NULL, 0, true), + true); + if (gctx == NULL) { + return NULL; + } + gctx->lock = prof_gctx_mutex_choose(); + /* + * Set nlimbo to 1, in order to avoid a race condition with + * prof_tctx_destroy()/prof_gctx_try_destroy(). + */ + gctx->nlimbo = 1; + tctx_tree_new(&gctx->tctxs); + /* Duplicate bt. */ + memcpy(gctx->vec, bt->vec, bt->len * sizeof(void *)); + gctx->bt.vec = gctx->vec; + gctx->bt.len = bt->len; + return gctx; +} + +static void +prof_gctx_try_destroy(tsd_t *tsd, prof_tdata_t *tdata_self, + prof_gctx_t *gctx) { + cassert(config_prof); + + /* + * Check that gctx is still unused by any thread cache before destroying + * it. prof_lookup() increments gctx->nlimbo in order to avoid a race + * condition with this function, as does prof_tctx_destroy() in order to + * avoid a race between the main body of prof_tctx_destroy() and entry + * into this function. + */ + prof_enter(tsd, tdata_self); + malloc_mutex_lock(tsd_tsdn(tsd), gctx->lock); + assert(gctx->nlimbo != 0); + if (tctx_tree_empty(&gctx->tctxs) && gctx->nlimbo == 1) { + /* Remove gctx from bt2gctx. */ + if (ckh_remove(tsd, &bt2gctx, &gctx->bt, NULL, NULL)) { + not_reached(); + } + prof_leave(tsd, tdata_self); + /* Destroy gctx. */ + malloc_mutex_unlock(tsd_tsdn(tsd), gctx->lock); + idalloctm(tsd_tsdn(tsd), gctx, NULL, NULL, true, true); + } else { + /* + * Compensate for increment in prof_tctx_destroy() or + * prof_lookup(). + */ + gctx->nlimbo--; + malloc_mutex_unlock(tsd_tsdn(tsd), gctx->lock); + prof_leave(tsd, tdata_self); + } +} + +static bool +prof_gctx_should_destroy(prof_gctx_t *gctx) { + if (opt_prof_accum) { + return false; + } + if (!tctx_tree_empty(&gctx->tctxs)) { + return false; + } + if (gctx->nlimbo != 0) { + return false; + } + return true; +} + +static bool +prof_lookup_global(tsd_t *tsd, prof_bt_t *bt, prof_tdata_t *tdata, + void **p_btkey, prof_gctx_t **p_gctx, bool *p_new_gctx) { + union { + prof_gctx_t *p; + void *v; + } gctx, tgctx; + union { + prof_bt_t *p; + void *v; + } btkey; + bool new_gctx; + + prof_enter(tsd, tdata); + if (ckh_search(&bt2gctx, bt, &btkey.v, &gctx.v)) { + /* bt has never been seen before. Insert it. */ + prof_leave(tsd, tdata); + tgctx.p = prof_gctx_create(tsd_tsdn(tsd), bt); + if (tgctx.v == NULL) { + return true; + } + prof_enter(tsd, tdata); + if (ckh_search(&bt2gctx, bt, &btkey.v, &gctx.v)) { + gctx.p = tgctx.p; + btkey.p = &gctx.p->bt; + if (ckh_insert(tsd, &bt2gctx, btkey.v, gctx.v)) { + /* OOM. */ + prof_leave(tsd, tdata); + idalloctm(tsd_tsdn(tsd), gctx.v, NULL, NULL, + true, true); + return true; + } + new_gctx = true; + } else { + new_gctx = false; + } + } else { + tgctx.v = NULL; + new_gctx = false; + } + + if (!new_gctx) { + /* + * Increment nlimbo, in order to avoid a race condition with + * prof_tctx_destroy()/prof_gctx_try_destroy(). + */ + malloc_mutex_lock(tsd_tsdn(tsd), gctx.p->lock); + gctx.p->nlimbo++; + malloc_mutex_unlock(tsd_tsdn(tsd), gctx.p->lock); + new_gctx = false; + + if (tgctx.v != NULL) { + /* Lost race to insert. */ + idalloctm(tsd_tsdn(tsd), tgctx.v, NULL, NULL, true, + true); + } + } + prof_leave(tsd, tdata); + + *p_btkey = btkey.v; + *p_gctx = gctx.p; + *p_new_gctx = new_gctx; + return false; +} + +prof_tctx_t * +prof_lookup(tsd_t *tsd, prof_bt_t *bt) { + union { + prof_tctx_t *p; + void *v; + } ret; + prof_tdata_t *tdata; + bool not_found; + + cassert(config_prof); + + tdata = prof_tdata_get(tsd, false); + assert(tdata != NULL); + + malloc_mutex_lock(tsd_tsdn(tsd), tdata->lock); + not_found = ckh_search(&tdata->bt2tctx, bt, NULL, &ret.v); + if (!not_found) { /* Note double negative! */ + ret.p->prepared = true; + } + malloc_mutex_unlock(tsd_tsdn(tsd), tdata->lock); + if (not_found) { + void *btkey; + prof_gctx_t *gctx; + bool new_gctx, error; + + /* + * This thread's cache lacks bt. Look for it in the global + * cache. + */ + if (prof_lookup_global(tsd, bt, tdata, &btkey, &gctx, + &new_gctx)) { + return NULL; + } + + /* Link a prof_tctx_t into gctx for this thread. */ + ret.v = iallocztm(tsd_tsdn(tsd), sizeof(prof_tctx_t), + sz_size2index(sizeof(prof_tctx_t)), false, NULL, true, + arena_ichoose(tsd, NULL), true); + if (ret.p == NULL) { + if (new_gctx) { + prof_gctx_try_destroy(tsd, tdata, gctx); + } + return NULL; + } + ret.p->tdata = tdata; + ret.p->thr_uid = tdata->thr_uid; + ret.p->thr_discrim = tdata->thr_discrim; + ret.p->recent_count = 0; + memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); + ret.p->gctx = gctx; + ret.p->tctx_uid = tdata->tctx_uid_next++; + ret.p->prepared = true; + ret.p->state = prof_tctx_state_initializing; + malloc_mutex_lock(tsd_tsdn(tsd), tdata->lock); + error = ckh_insert(tsd, &tdata->bt2tctx, btkey, ret.v); + malloc_mutex_unlock(tsd_tsdn(tsd), tdata->lock); + if (error) { + if (new_gctx) { + prof_gctx_try_destroy(tsd, tdata, gctx); + } + idalloctm(tsd_tsdn(tsd), ret.v, NULL, NULL, true, true); + return NULL; + } + malloc_mutex_lock(tsd_tsdn(tsd), gctx->lock); + ret.p->state = prof_tctx_state_nominal; + tctx_tree_insert(&gctx->tctxs, ret.p); + gctx->nlimbo--; + malloc_mutex_unlock(tsd_tsdn(tsd), gctx->lock); + } + + return ret.p; +} + +/* Used in unit tests. */ +static prof_tdata_t * +prof_tdata_count_iter(prof_tdata_tree_t *tdatas_ptr, prof_tdata_t *tdata, + void *arg) { + size_t *tdata_count = (size_t *)arg; + + (*tdata_count)++; + + return NULL; +} + +/* Used in unit tests. */ +size_t +prof_tdata_count(void) { + size_t tdata_count = 0; + tsdn_t *tsdn; + + tsdn = tsdn_fetch(); + malloc_mutex_lock(tsdn, &tdatas_mtx); + tdata_tree_iter(&tdatas, NULL, prof_tdata_count_iter, + (void *)&tdata_count); + malloc_mutex_unlock(tsdn, &tdatas_mtx); + + return tdata_count; +} + +/* Used in unit tests. */ +size_t +prof_bt_count(void) { + size_t bt_count; + tsd_t *tsd; + prof_tdata_t *tdata; + + tsd = tsd_fetch(); + tdata = prof_tdata_get(tsd, false); + if (tdata == NULL) { + return 0; + } + + malloc_mutex_lock(tsd_tsdn(tsd), &bt2gctx_mtx); + bt_count = ckh_count(&bt2gctx); + malloc_mutex_unlock(tsd_tsdn(tsd), &bt2gctx_mtx); + + return bt_count; +} + +char * +prof_thread_name_alloc(tsd_t *tsd, const char *thread_name) { + char *ret; + size_t size; + + if (thread_name == NULL) { + return NULL; + } + + size = strlen(thread_name) + 1; + if (size == 1) { + return ""; + } + + ret = iallocztm(tsd_tsdn(tsd), size, sz_size2index(size), false, NULL, + true, arena_get(TSDN_NULL, 0, true), true); + if (ret == NULL) { + return NULL; + } + memcpy(ret, thread_name, size); + return ret; +} + +int +prof_thread_name_set_impl(tsd_t *tsd, const char *thread_name) { + assert(tsd_reentrancy_level_get(tsd) == 0); + + prof_tdata_t *tdata; + unsigned i; + char *s; + + tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return EAGAIN; + } + + /* Validate input. */ + if (thread_name == NULL) { + return EFAULT; + } + for (i = 0; thread_name[i] != '\0'; i++) { + char c = thread_name[i]; + if (!isgraph(c) && !isblank(c)) { + return EFAULT; + } + } + + s = prof_thread_name_alloc(tsd, thread_name); + if (s == NULL) { + return EAGAIN; + } + + if (tdata->thread_name != NULL) { + idalloctm(tsd_tsdn(tsd), tdata->thread_name, NULL, NULL, true, + true); + tdata->thread_name = NULL; + } + if (strlen(s) > 0) { + tdata->thread_name = s; + } + return 0; +} + +JEMALLOC_FORMAT_PRINTF(3, 4) +static void +prof_dump_printf(write_cb_t *prof_dump_write, void *cbopaque, + const char *format, ...) { + va_list ap; + char buf[PROF_PRINTF_BUFSIZE]; + + va_start(ap, format); + malloc_vsnprintf(buf, sizeof(buf), format, ap); + va_end(ap); + prof_dump_write(cbopaque, buf); +} + +/* + * Casting a double to a uint64_t may not necessarily be in range; this can be + * UB. I don't think this is practically possible with the cur counters, but + * plausibly could be with the accum counters. + */ +#ifdef JEMALLOC_PROF +static uint64_t +prof_double_uint64_cast(double d) { + /* + * Note: UINT64_MAX + 1 is exactly representable as a double on all + * reasonable platforms (certainly those we'll support). Writing this + * as !(a < b) instead of (a >= b) means that we're NaN-safe. + */ + double rounded = round(d); + if (!(rounded < (double)UINT64_MAX)) { + return UINT64_MAX; + } + return (uint64_t)rounded; +} +#endif + +void prof_unbias_map_init() { + /* See the comment in prof_sample_new_event_wait */ +#ifdef JEMALLOC_PROF + for (szind_t i = 0; i < SC_NSIZES; i++) { + double sz = (double)sz_index2size(i); + double rate = (double)(ZU(1) << lg_prof_sample); + double div_val = 1.0 - exp(-sz / rate); + double unbiased_sz = sz / div_val; + /* + * The "true" right value for the unbiased count is + * 1.0/(1 - exp(-sz/rate)). The problem is, we keep the counts + * as integers (for a variety of reasons -- rounding errors + * could trigger asserts, and not all libcs can properly handle + * floating point arithmetic during malloc calls inside libc). + * Rounding to an integer, though, can lead to rounding errors + * of over 30% for sizes close to the sampling rate. So + * instead, we multiply by a constant, dividing the maximum + * possible roundoff error by that constant. To avoid overflow + * in summing up size_t values, the largest safe constant we can + * pick is the size of the smallest allocation. + */ + double cnt_shift = (double)(ZU(1) << SC_LG_TINY_MIN); + double shifted_unbiased_cnt = cnt_shift / div_val; + prof_unbiased_sz[i] = (size_t)round(unbiased_sz); + prof_shifted_unbiased_cnt[i] = (size_t)round( + shifted_unbiased_cnt); + } +#else + unreachable(); +#endif +} + +/* + * The unbiasing story is long. The jeprof unbiasing logic was copied from + * pprof. Both shared an issue: they unbiased using the average size of the + * allocations at a particular stack trace. This can work out OK if allocations + * are mostly of the same size given some stack, but not otherwise. We now + * internally track what the unbiased results ought to be. We can't just report + * them as they are though; they'll still go through the jeprof unbiasing + * process. Instead, we figure out what values we can feed *into* jeprof's + * unbiasing mechanism that will lead to getting the right values out. + * + * It'll unbias count and aggregate size as: + * + * c_out = c_in * 1/(1-exp(-s_in/c_in/R) + * s_out = s_in * 1/(1-exp(-s_in/c_in/R) + * + * We want to solve for the values of c_in and s_in that will + * give the c_out and s_out that we've computed internally. + * + * Let's do a change of variables (both to make the math easier and to make it + * easier to write): + * x = s_in / c_in + * y = s_in + * k = 1/R. + * + * Then + * c_out = y/x * 1/(1-exp(-k*x)) + * s_out = y * 1/(1-exp(-k*x)) + * + * The first equation gives: + * y = x * c_out * (1-exp(-k*x)) + * The second gives: + * y = s_out * (1-exp(-k*x)) + * So we have + * x = s_out / c_out. + * And all the other values fall out from that. + * + * This is all a fair bit of work. The thing we get out of it is that we don't + * break backwards compatibility with jeprof (and the various tools that have + * copied its unbiasing logic). Eventually, we anticipate a v3 heap profile + * dump format based on JSON, at which point I think much of this logic can get + * cleaned up (since we'll be taking a compatibility break there anyways). + */ +static void +prof_do_unbias(uint64_t c_out_shifted_i, uint64_t s_out_i, uint64_t *r_c_in, + uint64_t *r_s_in) { +#ifdef JEMALLOC_PROF + if (c_out_shifted_i == 0 || s_out_i == 0) { + *r_c_in = 0; + *r_s_in = 0; + return; + } + /* + * See the note in prof_unbias_map_init() to see why we take c_out in a + * shifted form. + */ + double c_out = (double)c_out_shifted_i + / (double)(ZU(1) << SC_LG_TINY_MIN); + double s_out = (double)s_out_i; + double R = (double)(ZU(1) << lg_prof_sample); + + double x = s_out / c_out; + double y = s_out * (1.0 - exp(-x / R)); + + double c_in = y / x; + double s_in = y; + + *r_c_in = prof_double_uint64_cast(c_in); + *r_s_in = prof_double_uint64_cast(s_in); +#else + unreachable(); +#endif +} + +static void +prof_dump_print_cnts(write_cb_t *prof_dump_write, void *cbopaque, + const prof_cnt_t *cnts) { + uint64_t curobjs; + uint64_t curbytes; + uint64_t accumobjs; + uint64_t accumbytes; + if (opt_prof_unbias) { + prof_do_unbias(cnts->curobjs_shifted_unbiased, + cnts->curbytes_unbiased, &curobjs, &curbytes); + prof_do_unbias(cnts->accumobjs_shifted_unbiased, + cnts->accumbytes_unbiased, &accumobjs, &accumbytes); + } else { + curobjs = cnts->curobjs; + curbytes = cnts->curbytes; + accumobjs = cnts->accumobjs; + accumbytes = cnts->accumbytes; + } + prof_dump_printf(prof_dump_write, cbopaque, + "%"FMTu64": %"FMTu64" [%"FMTu64": %"FMTu64"]", + curobjs, curbytes, accumobjs, accumbytes); +} + +static void +prof_tctx_merge_tdata(tsdn_t *tsdn, prof_tctx_t *tctx, prof_tdata_t *tdata) { + malloc_mutex_assert_owner(tsdn, tctx->tdata->lock); + + malloc_mutex_lock(tsdn, tctx->gctx->lock); + + switch (tctx->state) { + case prof_tctx_state_initializing: + malloc_mutex_unlock(tsdn, tctx->gctx->lock); + return; + case prof_tctx_state_nominal: + tctx->state = prof_tctx_state_dumping; + malloc_mutex_unlock(tsdn, tctx->gctx->lock); + + memcpy(&tctx->dump_cnts, &tctx->cnts, sizeof(prof_cnt_t)); + + tdata->cnt_summed.curobjs += tctx->dump_cnts.curobjs; + tdata->cnt_summed.curobjs_shifted_unbiased + += tctx->dump_cnts.curobjs_shifted_unbiased; + tdata->cnt_summed.curbytes += tctx->dump_cnts.curbytes; + tdata->cnt_summed.curbytes_unbiased + += tctx->dump_cnts.curbytes_unbiased; + if (opt_prof_accum) { + tdata->cnt_summed.accumobjs += + tctx->dump_cnts.accumobjs; + tdata->cnt_summed.accumobjs_shifted_unbiased += + tctx->dump_cnts.accumobjs_shifted_unbiased; + tdata->cnt_summed.accumbytes += + tctx->dump_cnts.accumbytes; + tdata->cnt_summed.accumbytes_unbiased += + tctx->dump_cnts.accumbytes_unbiased; + } + break; + case prof_tctx_state_dumping: + case prof_tctx_state_purgatory: + not_reached(); + } +} + +static void +prof_tctx_merge_gctx(tsdn_t *tsdn, prof_tctx_t *tctx, prof_gctx_t *gctx) { + malloc_mutex_assert_owner(tsdn, gctx->lock); + + gctx->cnt_summed.curobjs += tctx->dump_cnts.curobjs; + gctx->cnt_summed.curobjs_shifted_unbiased + += tctx->dump_cnts.curobjs_shifted_unbiased; + gctx->cnt_summed.curbytes += tctx->dump_cnts.curbytes; + gctx->cnt_summed.curbytes_unbiased += tctx->dump_cnts.curbytes_unbiased; + if (opt_prof_accum) { + gctx->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs; + gctx->cnt_summed.accumobjs_shifted_unbiased + += tctx->dump_cnts.accumobjs_shifted_unbiased; + gctx->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes; + gctx->cnt_summed.accumbytes_unbiased + += tctx->dump_cnts.accumbytes_unbiased; + } +} + +static prof_tctx_t * +prof_tctx_merge_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) { + tsdn_t *tsdn = (tsdn_t *)arg; + + malloc_mutex_assert_owner(tsdn, tctx->gctx->lock); + + switch (tctx->state) { + case prof_tctx_state_nominal: + /* New since dumping started; ignore. */ + break; + case prof_tctx_state_dumping: + case prof_tctx_state_purgatory: + prof_tctx_merge_gctx(tsdn, tctx, tctx->gctx); + break; + default: + not_reached(); + } + + return NULL; +} + +typedef struct prof_dump_iter_arg_s prof_dump_iter_arg_t; +struct prof_dump_iter_arg_s { + tsdn_t *tsdn; + write_cb_t *prof_dump_write; + void *cbopaque; +}; + +static prof_tctx_t * +prof_tctx_dump_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *opaque) { + prof_dump_iter_arg_t *arg = (prof_dump_iter_arg_t *)opaque; + malloc_mutex_assert_owner(arg->tsdn, tctx->gctx->lock); + + switch (tctx->state) { + case prof_tctx_state_initializing: + case prof_tctx_state_nominal: + /* Not captured by this dump. */ + break; + case prof_tctx_state_dumping: + case prof_tctx_state_purgatory: + prof_dump_printf(arg->prof_dump_write, arg->cbopaque, + " t%"FMTu64": ", tctx->thr_uid); + prof_dump_print_cnts(arg->prof_dump_write, arg->cbopaque, + &tctx->dump_cnts); + arg->prof_dump_write(arg->cbopaque, "\n"); + break; + default: + not_reached(); + } + return NULL; +} + +static prof_tctx_t * +prof_tctx_finish_iter(prof_tctx_tree_t *tctxs, prof_tctx_t *tctx, void *arg) { + tsdn_t *tsdn = (tsdn_t *)arg; + prof_tctx_t *ret; + + malloc_mutex_assert_owner(tsdn, tctx->gctx->lock); + + switch (tctx->state) { + case prof_tctx_state_nominal: + /* New since dumping started; ignore. */ + break; + case prof_tctx_state_dumping: + tctx->state = prof_tctx_state_nominal; + break; + case prof_tctx_state_purgatory: + ret = tctx; + goto label_return; + default: + not_reached(); + } + + ret = NULL; +label_return: + return ret; +} + +static void +prof_dump_gctx_prep(tsdn_t *tsdn, prof_gctx_t *gctx, prof_gctx_tree_t *gctxs) { + cassert(config_prof); + + malloc_mutex_lock(tsdn, gctx->lock); + + /* + * Increment nlimbo so that gctx won't go away before dump. + * Additionally, link gctx into the dump list so that it is included in + * prof_dump()'s second pass. + */ + gctx->nlimbo++; + gctx_tree_insert(gctxs, gctx); + + memset(&gctx->cnt_summed, 0, sizeof(prof_cnt_t)); + + malloc_mutex_unlock(tsdn, gctx->lock); +} + +typedef struct prof_gctx_merge_iter_arg_s prof_gctx_merge_iter_arg_t; +struct prof_gctx_merge_iter_arg_s { + tsdn_t *tsdn; + size_t *leak_ngctx; +}; + +static prof_gctx_t * +prof_gctx_merge_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *opaque) { + prof_gctx_merge_iter_arg_t *arg = (prof_gctx_merge_iter_arg_t *)opaque; + + malloc_mutex_lock(arg->tsdn, gctx->lock); + tctx_tree_iter(&gctx->tctxs, NULL, prof_tctx_merge_iter, + (void *)arg->tsdn); + if (gctx->cnt_summed.curobjs != 0) { + (*arg->leak_ngctx)++; + } + malloc_mutex_unlock(arg->tsdn, gctx->lock); + + return NULL; +} + +static void +prof_gctx_finish(tsd_t *tsd, prof_gctx_tree_t *gctxs) { + prof_tdata_t *tdata = prof_tdata_get(tsd, false); + prof_gctx_t *gctx; + + /* + * Standard tree iteration won't work here, because as soon as we + * decrement gctx->nlimbo and unlock gctx, another thread can + * concurrently destroy it, which will corrupt the tree. Therefore, + * tear down the tree one node at a time during iteration. + */ + while ((gctx = gctx_tree_first(gctxs)) != NULL) { + gctx_tree_remove(gctxs, gctx); + malloc_mutex_lock(tsd_tsdn(tsd), gctx->lock); + { + prof_tctx_t *next; + + next = NULL; + do { + prof_tctx_t *to_destroy = + tctx_tree_iter(&gctx->tctxs, next, + prof_tctx_finish_iter, + (void *)tsd_tsdn(tsd)); + if (to_destroy != NULL) { + next = tctx_tree_next(&gctx->tctxs, + to_destroy); + tctx_tree_remove(&gctx->tctxs, + to_destroy); + idalloctm(tsd_tsdn(tsd), to_destroy, + NULL, NULL, true, true); + } else { + next = NULL; + } + } while (next != NULL); + } + gctx->nlimbo--; + if (prof_gctx_should_destroy(gctx)) { + gctx->nlimbo++; + malloc_mutex_unlock(tsd_tsdn(tsd), gctx->lock); + prof_gctx_try_destroy(tsd, tdata, gctx); + } else { + malloc_mutex_unlock(tsd_tsdn(tsd), gctx->lock); + } + } +} + +typedef struct prof_tdata_merge_iter_arg_s prof_tdata_merge_iter_arg_t; +struct prof_tdata_merge_iter_arg_s { + tsdn_t *tsdn; + prof_cnt_t *cnt_all; +}; + +static prof_tdata_t * +prof_tdata_merge_iter(prof_tdata_tree_t *tdatas_ptr, prof_tdata_t *tdata, + void *opaque) { + prof_tdata_merge_iter_arg_t *arg = + (prof_tdata_merge_iter_arg_t *)opaque; + + malloc_mutex_lock(arg->tsdn, tdata->lock); + if (!tdata->expired) { + size_t tabind; + union { + prof_tctx_t *p; + void *v; + } tctx; + + tdata->dumping = true; + memset(&tdata->cnt_summed, 0, sizeof(prof_cnt_t)); + for (tabind = 0; !ckh_iter(&tdata->bt2tctx, &tabind, NULL, + &tctx.v);) { + prof_tctx_merge_tdata(arg->tsdn, tctx.p, tdata); + } + + arg->cnt_all->curobjs += tdata->cnt_summed.curobjs; + arg->cnt_all->curobjs_shifted_unbiased + += tdata->cnt_summed.curobjs_shifted_unbiased; + arg->cnt_all->curbytes += tdata->cnt_summed.curbytes; + arg->cnt_all->curbytes_unbiased + += tdata->cnt_summed.curbytes_unbiased; + if (opt_prof_accum) { + arg->cnt_all->accumobjs += tdata->cnt_summed.accumobjs; + arg->cnt_all->accumobjs_shifted_unbiased + += tdata->cnt_summed.accumobjs_shifted_unbiased; + arg->cnt_all->accumbytes += + tdata->cnt_summed.accumbytes; + arg->cnt_all->accumbytes_unbiased += + tdata->cnt_summed.accumbytes_unbiased; + } + } else { + tdata->dumping = false; + } + malloc_mutex_unlock(arg->tsdn, tdata->lock); + + return NULL; +} + +static prof_tdata_t * +prof_tdata_dump_iter(prof_tdata_tree_t *tdatas_ptr, prof_tdata_t *tdata, + void *opaque) { + if (!tdata->dumping) { + return NULL; + } + + prof_dump_iter_arg_t *arg = (prof_dump_iter_arg_t *)opaque; + prof_dump_printf(arg->prof_dump_write, arg->cbopaque, " t%"FMTu64": ", + tdata->thr_uid); + prof_dump_print_cnts(arg->prof_dump_write, arg->cbopaque, + &tdata->cnt_summed); + if (tdata->thread_name != NULL) { + arg->prof_dump_write(arg->cbopaque, " "); + arg->prof_dump_write(arg->cbopaque, tdata->thread_name); + } + arg->prof_dump_write(arg->cbopaque, "\n"); + return NULL; +} + +static void +prof_dump_header(prof_dump_iter_arg_t *arg, const prof_cnt_t *cnt_all) { + prof_dump_printf(arg->prof_dump_write, arg->cbopaque, + "heap_v2/%"FMTu64"\n t*: ", ((uint64_t)1U << lg_prof_sample)); + prof_dump_print_cnts(arg->prof_dump_write, arg->cbopaque, cnt_all); + arg->prof_dump_write(arg->cbopaque, "\n"); + + malloc_mutex_lock(arg->tsdn, &tdatas_mtx); + tdata_tree_iter(&tdatas, NULL, prof_tdata_dump_iter, arg); + malloc_mutex_unlock(arg->tsdn, &tdatas_mtx); +} + +static void +prof_dump_gctx(prof_dump_iter_arg_t *arg, prof_gctx_t *gctx, + const prof_bt_t *bt, prof_gctx_tree_t *gctxs) { + cassert(config_prof); + malloc_mutex_assert_owner(arg->tsdn, gctx->lock); + + /* Avoid dumping such gctx's that have no useful data. */ + if ((!opt_prof_accum && gctx->cnt_summed.curobjs == 0) || + (opt_prof_accum && gctx->cnt_summed.accumobjs == 0)) { + assert(gctx->cnt_summed.curobjs == 0); + assert(gctx->cnt_summed.curbytes == 0); + /* + * These asserts would not be correct -- see the comment on races + * in prof.c + * assert(gctx->cnt_summed.curobjs_unbiased == 0); + * assert(gctx->cnt_summed.curbytes_unbiased == 0); + */ + assert(gctx->cnt_summed.accumobjs == 0); + assert(gctx->cnt_summed.accumobjs_shifted_unbiased == 0); + assert(gctx->cnt_summed.accumbytes == 0); + assert(gctx->cnt_summed.accumbytes_unbiased == 0); + return; + } + + arg->prof_dump_write(arg->cbopaque, "@"); + for (unsigned i = 0; i < bt->len; i++) { + prof_dump_printf(arg->prof_dump_write, arg->cbopaque, + " %#"FMTxPTR, (uintptr_t)bt->vec[i]); + } + + arg->prof_dump_write(arg->cbopaque, "\n t*: "); + prof_dump_print_cnts(arg->prof_dump_write, arg->cbopaque, + &gctx->cnt_summed); + arg->prof_dump_write(arg->cbopaque, "\n"); + + tctx_tree_iter(&gctx->tctxs, NULL, prof_tctx_dump_iter, arg); +} + +/* + * See prof_sample_new_event_wait() comment for why the body of this function + * is conditionally compiled. + */ +static void +prof_leakcheck(const prof_cnt_t *cnt_all, size_t leak_ngctx) { +#ifdef JEMALLOC_PROF + /* + * Scaling is equivalent AdjustSamples() in jeprof, but the result may + * differ slightly from what jeprof reports, because here we scale the + * summary values, whereas jeprof scales each context individually and + * reports the sums of the scaled values. + */ + if (cnt_all->curbytes != 0) { + double sample_period = (double)((uint64_t)1 << lg_prof_sample); + double ratio = (((double)cnt_all->curbytes) / + (double)cnt_all->curobjs) / sample_period; + double scale_factor = 1.0 / (1.0 - exp(-ratio)); + uint64_t curbytes = (uint64_t)round(((double)cnt_all->curbytes) + * scale_factor); + uint64_t curobjs = (uint64_t)round(((double)cnt_all->curobjs) * + scale_factor); + + malloc_printf(": Leak approximation summary: ~%"FMTu64 + " byte%s, ~%"FMTu64" object%s, >= %zu context%s\n", + curbytes, (curbytes != 1) ? "s" : "", curobjs, (curobjs != + 1) ? "s" : "", leak_ngctx, (leak_ngctx != 1) ? "s" : ""); + malloc_printf( + ": Run jeprof on dump output for leak detail\n"); + if (opt_prof_leak_error) { + malloc_printf( + ": Exiting with error code because memory" + " leaks were detected\n"); + /* + * Use _exit() with underscore to avoid calling atexit() + * and entering endless cycle. + */ + _exit(1); + } + } +#endif +} + +static prof_gctx_t * +prof_gctx_dump_iter(prof_gctx_tree_t *gctxs, prof_gctx_t *gctx, void *opaque) { + prof_dump_iter_arg_t *arg = (prof_dump_iter_arg_t *)opaque; + malloc_mutex_lock(arg->tsdn, gctx->lock); + prof_dump_gctx(arg, gctx, &gctx->bt, gctxs); + malloc_mutex_unlock(arg->tsdn, gctx->lock); + return NULL; +} + +static void +prof_dump_prep(tsd_t *tsd, prof_tdata_t *tdata, prof_cnt_t *cnt_all, + size_t *leak_ngctx, prof_gctx_tree_t *gctxs) { + size_t tabind; + union { + prof_gctx_t *p; + void *v; + } gctx; + + prof_enter(tsd, tdata); + + /* + * Put gctx's in limbo and clear their counters in preparation for + * summing. + */ + gctx_tree_new(gctxs); + for (tabind = 0; !ckh_iter(&bt2gctx, &tabind, NULL, &gctx.v);) { + prof_dump_gctx_prep(tsd_tsdn(tsd), gctx.p, gctxs); + } + + /* + * Iterate over tdatas, and for the non-expired ones snapshot their tctx + * stats and merge them into the associated gctx's. + */ + memset(cnt_all, 0, sizeof(prof_cnt_t)); + prof_tdata_merge_iter_arg_t prof_tdata_merge_iter_arg = {tsd_tsdn(tsd), + cnt_all}; + malloc_mutex_lock(tsd_tsdn(tsd), &tdatas_mtx); + tdata_tree_iter(&tdatas, NULL, prof_tdata_merge_iter, + &prof_tdata_merge_iter_arg); + malloc_mutex_unlock(tsd_tsdn(tsd), &tdatas_mtx); + + /* Merge tctx stats into gctx's. */ + *leak_ngctx = 0; + prof_gctx_merge_iter_arg_t prof_gctx_merge_iter_arg = {tsd_tsdn(tsd), + leak_ngctx}; + gctx_tree_iter(gctxs, NULL, prof_gctx_merge_iter, + &prof_gctx_merge_iter_arg); + + prof_leave(tsd, tdata); +} + +void +prof_dump_impl(tsd_t *tsd, write_cb_t *prof_dump_write, void *cbopaque, + prof_tdata_t *tdata, bool leakcheck) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_dump_mtx); + prof_cnt_t cnt_all; + size_t leak_ngctx; + prof_gctx_tree_t gctxs; + prof_dump_prep(tsd, tdata, &cnt_all, &leak_ngctx, &gctxs); + prof_dump_iter_arg_t prof_dump_iter_arg = {tsd_tsdn(tsd), + prof_dump_write, cbopaque}; + prof_dump_header(&prof_dump_iter_arg, &cnt_all); + gctx_tree_iter(&gctxs, NULL, prof_gctx_dump_iter, &prof_dump_iter_arg); + prof_gctx_finish(tsd, &gctxs); + if (leakcheck) { + prof_leakcheck(&cnt_all, leak_ngctx); + } +} + +/* Used in unit tests. */ +void +prof_cnt_all(prof_cnt_t *cnt_all) { + tsd_t *tsd = tsd_fetch(); + prof_tdata_t *tdata = prof_tdata_get(tsd, false); + if (tdata == NULL) { + memset(cnt_all, 0, sizeof(prof_cnt_t)); + } else { + size_t leak_ngctx; + prof_gctx_tree_t gctxs; + prof_dump_prep(tsd, tdata, cnt_all, &leak_ngctx, &gctxs); + prof_gctx_finish(tsd, &gctxs); + } +} + +void +prof_bt_hash(const void *key, size_t r_hash[2]) { + prof_bt_t *bt = (prof_bt_t *)key; + + cassert(config_prof); + + hash(bt->vec, bt->len * sizeof(void *), 0x94122f33U, r_hash); +} + +bool +prof_bt_keycomp(const void *k1, const void *k2) { + const prof_bt_t *bt1 = (prof_bt_t *)k1; + const prof_bt_t *bt2 = (prof_bt_t *)k2; + + cassert(config_prof); + + if (bt1->len != bt2->len) { + return false; + } + return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0); +} + +prof_tdata_t * +prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim, + char *thread_name, bool active) { + assert(tsd_reentrancy_level_get(tsd) == 0); + + prof_tdata_t *tdata; + + cassert(config_prof); + + /* Initialize an empty cache for this thread. */ + tdata = (prof_tdata_t *)iallocztm(tsd_tsdn(tsd), sizeof(prof_tdata_t), + sz_size2index(sizeof(prof_tdata_t)), false, NULL, true, + arena_get(TSDN_NULL, 0, true), true); + if (tdata == NULL) { + return NULL; + } + + tdata->lock = prof_tdata_mutex_choose(thr_uid); + tdata->thr_uid = thr_uid; + tdata->thr_discrim = thr_discrim; + tdata->thread_name = thread_name; + tdata->attached = true; + tdata->expired = false; + tdata->tctx_uid_next = 0; + + if (ckh_new(tsd, &tdata->bt2tctx, PROF_CKH_MINITEMS, prof_bt_hash, + prof_bt_keycomp)) { + idalloctm(tsd_tsdn(tsd), tdata, NULL, NULL, true, true); + return NULL; + } + + tdata->enq = false; + tdata->enq_idump = false; + tdata->enq_gdump = false; + + tdata->dumping = false; + tdata->active = active; + + malloc_mutex_lock(tsd_tsdn(tsd), &tdatas_mtx); + tdata_tree_insert(&tdatas, tdata); + malloc_mutex_unlock(tsd_tsdn(tsd), &tdatas_mtx); + + return tdata; +} + +static bool +prof_tdata_should_destroy_unlocked(prof_tdata_t *tdata, bool even_if_attached) { + if (tdata->attached && !even_if_attached) { + return false; + } + if (ckh_count(&tdata->bt2tctx) != 0) { + return false; + } + return true; +} + +static bool +prof_tdata_should_destroy(tsdn_t *tsdn, prof_tdata_t *tdata, + bool even_if_attached) { + malloc_mutex_assert_owner(tsdn, tdata->lock); + + return prof_tdata_should_destroy_unlocked(tdata, even_if_attached); +} + +static void +prof_tdata_destroy_locked(tsd_t *tsd, prof_tdata_t *tdata, + bool even_if_attached) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &tdatas_mtx); + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), tdata->lock); + + tdata_tree_remove(&tdatas, tdata); + + assert(prof_tdata_should_destroy_unlocked(tdata, even_if_attached)); + + if (tdata->thread_name != NULL) { + idalloctm(tsd_tsdn(tsd), tdata->thread_name, NULL, NULL, true, + true); + } + ckh_delete(tsd, &tdata->bt2tctx); + idalloctm(tsd_tsdn(tsd), tdata, NULL, NULL, true, true); +} + +static void +prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata, bool even_if_attached) { + malloc_mutex_lock(tsd_tsdn(tsd), &tdatas_mtx); + prof_tdata_destroy_locked(tsd, tdata, even_if_attached); + malloc_mutex_unlock(tsd_tsdn(tsd), &tdatas_mtx); +} + +void +prof_tdata_detach(tsd_t *tsd, prof_tdata_t *tdata) { + bool destroy_tdata; + + malloc_mutex_lock(tsd_tsdn(tsd), tdata->lock); + if (tdata->attached) { + destroy_tdata = prof_tdata_should_destroy(tsd_tsdn(tsd), tdata, + true); + /* + * Only detach if !destroy_tdata, because detaching would allow + * another thread to win the race to destroy tdata. + */ + if (!destroy_tdata) { + tdata->attached = false; + } + tsd_prof_tdata_set(tsd, NULL); + } else { + destroy_tdata = false; + } + malloc_mutex_unlock(tsd_tsdn(tsd), tdata->lock); + if (destroy_tdata) { + prof_tdata_destroy(tsd, tdata, true); + } +} + +static bool +prof_tdata_expire(tsdn_t *tsdn, prof_tdata_t *tdata) { + bool destroy_tdata; + + malloc_mutex_lock(tsdn, tdata->lock); + if (!tdata->expired) { + tdata->expired = true; + destroy_tdata = prof_tdata_should_destroy(tsdn, tdata, false); + } else { + destroy_tdata = false; + } + malloc_mutex_unlock(tsdn, tdata->lock); + + return destroy_tdata; +} + +static prof_tdata_t * +prof_tdata_reset_iter(prof_tdata_tree_t *tdatas_ptr, prof_tdata_t *tdata, + void *arg) { + tsdn_t *tsdn = (tsdn_t *)arg; + + return (prof_tdata_expire(tsdn, tdata) ? tdata : NULL); +} + +void +prof_reset(tsd_t *tsd, size_t lg_sample) { + prof_tdata_t *next; + + assert(lg_sample < (sizeof(uint64_t) << 3)); + + malloc_mutex_lock(tsd_tsdn(tsd), &prof_dump_mtx); + malloc_mutex_lock(tsd_tsdn(tsd), &tdatas_mtx); + + lg_prof_sample = lg_sample; + prof_unbias_map_init(); + + next = NULL; + do { + prof_tdata_t *to_destroy = tdata_tree_iter(&tdatas, next, + prof_tdata_reset_iter, (void *)tsd); + if (to_destroy != NULL) { + next = tdata_tree_next(&tdatas, to_destroy); + prof_tdata_destroy_locked(tsd, to_destroy, false); + } else { + next = NULL; + } + } while (next != NULL); + + malloc_mutex_unlock(tsd_tsdn(tsd), &tdatas_mtx); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_mtx); +} + +static bool +prof_tctx_should_destroy(tsd_t *tsd, prof_tctx_t *tctx) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock); + + if (opt_prof_accum) { + return false; + } + if (tctx->cnts.curobjs != 0) { + return false; + } + if (tctx->prepared) { + return false; + } + if (tctx->recent_count != 0) { + return false; + } + return true; +} + +static void +prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock); + + assert(tctx->cnts.curobjs == 0); + assert(tctx->cnts.curbytes == 0); + /* + * These asserts are not correct -- see the comment about races in + * prof.c + * + * assert(tctx->cnts.curobjs_shifted_unbiased == 0); + * assert(tctx->cnts.curbytes_unbiased == 0); + */ + assert(!opt_prof_accum); + assert(tctx->cnts.accumobjs == 0); + assert(tctx->cnts.accumbytes == 0); + /* + * These ones are, since accumbyte counts never go down. Either + * prof_accum is off (in which case these should never have changed from + * their initial value of zero), or it's on (in which case we shouldn't + * be destroying this tctx). + */ + assert(tctx->cnts.accumobjs_shifted_unbiased == 0); + assert(tctx->cnts.accumbytes_unbiased == 0); + + prof_gctx_t *gctx = tctx->gctx; + + { + prof_tdata_t *tdata = tctx->tdata; + tctx->tdata = NULL; + ckh_remove(tsd, &tdata->bt2tctx, &gctx->bt, NULL, NULL); + bool destroy_tdata = prof_tdata_should_destroy(tsd_tsdn(tsd), + tdata, false); + malloc_mutex_unlock(tsd_tsdn(tsd), tdata->lock); + if (destroy_tdata) { + prof_tdata_destroy(tsd, tdata, false); + } + } + + bool destroy_tctx, destroy_gctx; + + malloc_mutex_lock(tsd_tsdn(tsd), gctx->lock); + switch (tctx->state) { + case prof_tctx_state_nominal: + tctx_tree_remove(&gctx->tctxs, tctx); + destroy_tctx = true; + if (prof_gctx_should_destroy(gctx)) { + /* + * Increment gctx->nlimbo in order to keep another + * thread from winning the race to destroy gctx while + * this one has gctx->lock dropped. Without this, it + * would be possible for another thread to: + * + * 1) Sample an allocation associated with gctx. + * 2) Deallocate the sampled object. + * 3) Successfully prof_gctx_try_destroy(gctx). + * + * The result would be that gctx no longer exists by the + * time this thread accesses it in + * prof_gctx_try_destroy(). + */ + gctx->nlimbo++; + destroy_gctx = true; + } else { + destroy_gctx = false; + } + break; + case prof_tctx_state_dumping: + /* + * A dumping thread needs tctx to remain valid until dumping + * has finished. Change state such that the dumping thread will + * complete destruction during a late dump iteration phase. + */ + tctx->state = prof_tctx_state_purgatory; + destroy_tctx = false; + destroy_gctx = false; + break; + default: + not_reached(); + destroy_tctx = false; + destroy_gctx = false; + } + malloc_mutex_unlock(tsd_tsdn(tsd), gctx->lock); + if (destroy_gctx) { + prof_gctx_try_destroy(tsd, prof_tdata_get(tsd, false), gctx); + } + if (destroy_tctx) { + idalloctm(tsd_tsdn(tsd), tctx, NULL, NULL, true, true); + } +} + +void +prof_tctx_try_destroy(tsd_t *tsd, prof_tctx_t *tctx) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock); + if (prof_tctx_should_destroy(tsd, tctx)) { + /* tctx->tdata->lock will be released in prof_tctx_destroy(). */ + prof_tctx_destroy(tsd, tctx); + } else { + malloc_mutex_unlock(tsd_tsdn(tsd), tctx->tdata->lock); + } +} + +/******************************************************************************/ diff --git a/BeefRT/JEMalloc/src/prof_log.c b/BeefRT/JEMalloc/src/prof_log.c new file mode 100644 index 00000000..0632c3b3 --- /dev/null +++ b/BeefRT/JEMalloc/src/prof_log.c @@ -0,0 +1,717 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/buf_writer.h" +#include "jemalloc/internal/ckh.h" +#include "jemalloc/internal/emitter.h" +#include "jemalloc/internal/hash.h" +#include "jemalloc/internal/malloc_io.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/prof_data.h" +#include "jemalloc/internal/prof_log.h" +#include "jemalloc/internal/prof_sys.h" + +bool opt_prof_log = false; +typedef enum prof_logging_state_e prof_logging_state_t; +enum prof_logging_state_e { + prof_logging_state_stopped, + prof_logging_state_started, + prof_logging_state_dumping +}; + +/* + * - stopped: log_start never called, or previous log_stop has completed. + * - started: log_start called, log_stop not called yet. Allocations are logged. + * - dumping: log_stop called but not finished; samples are not logged anymore. + */ +prof_logging_state_t prof_logging_state = prof_logging_state_stopped; + +/* Used in unit tests. */ +static bool prof_log_dummy = false; + +/* Incremented for every log file that is output. */ +static uint64_t log_seq = 0; +static char log_filename[ + /* Minimize memory bloat for non-prof builds. */ +#ifdef JEMALLOC_PROF + PATH_MAX + +#endif + 1]; + +/* Timestamp for most recent call to log_start(). */ +static nstime_t log_start_timestamp; + +/* Increment these when adding to the log_bt and log_thr linked lists. */ +static size_t log_bt_index = 0; +static size_t log_thr_index = 0; + +/* Linked list node definitions. These are only used in this file. */ +typedef struct prof_bt_node_s prof_bt_node_t; + +struct prof_bt_node_s { + prof_bt_node_t *next; + size_t index; + prof_bt_t bt; + /* Variable size backtrace vector pointed to by bt. */ + void *vec[1]; +}; + +typedef struct prof_thr_node_s prof_thr_node_t; + +struct prof_thr_node_s { + prof_thr_node_t *next; + size_t index; + uint64_t thr_uid; + /* Variable size based on thr_name_sz. */ + char name[1]; +}; + +typedef struct prof_alloc_node_s prof_alloc_node_t; + +/* This is output when logging sampled allocations. */ +struct prof_alloc_node_s { + prof_alloc_node_t *next; + /* Indices into an array of thread data. */ + size_t alloc_thr_ind; + size_t free_thr_ind; + + /* Indices into an array of backtraces. */ + size_t alloc_bt_ind; + size_t free_bt_ind; + + uint64_t alloc_time_ns; + uint64_t free_time_ns; + + size_t usize; +}; + +/* + * Created on the first call to prof_try_log and deleted on prof_log_stop. + * These are the backtraces and threads that have already been logged by an + * allocation. + */ +static bool log_tables_initialized = false; +static ckh_t log_bt_node_set; +static ckh_t log_thr_node_set; + +/* Store linked lists for logged data. */ +static prof_bt_node_t *log_bt_first = NULL; +static prof_bt_node_t *log_bt_last = NULL; +static prof_thr_node_t *log_thr_first = NULL; +static prof_thr_node_t *log_thr_last = NULL; +static prof_alloc_node_t *log_alloc_first = NULL; +static prof_alloc_node_t *log_alloc_last = NULL; + +/* Protects the prof_logging_state and any log_{...} variable. */ +malloc_mutex_t log_mtx; + +/******************************************************************************/ +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + +/* Hashtable functions for log_bt_node_set and log_thr_node_set. */ +static void prof_thr_node_hash(const void *key, size_t r_hash[2]); +static bool prof_thr_node_keycomp(const void *k1, const void *k2); +static void prof_bt_node_hash(const void *key, size_t r_hash[2]); +static bool prof_bt_node_keycomp(const void *k1, const void *k2); + +/******************************************************************************/ + +static size_t +prof_log_bt_index(tsd_t *tsd, prof_bt_t *bt) { + assert(prof_logging_state == prof_logging_state_started); + malloc_mutex_assert_owner(tsd_tsdn(tsd), &log_mtx); + + prof_bt_node_t dummy_node; + dummy_node.bt = *bt; + prof_bt_node_t *node; + + /* See if this backtrace is already cached in the table. */ + if (ckh_search(&log_bt_node_set, (void *)(&dummy_node), + (void **)(&node), NULL)) { + size_t sz = offsetof(prof_bt_node_t, vec) + + (bt->len * sizeof(void *)); + prof_bt_node_t *new_node = (prof_bt_node_t *) + iallocztm(tsd_tsdn(tsd), sz, sz_size2index(sz), false, NULL, + true, arena_get(TSDN_NULL, 0, true), true); + if (log_bt_first == NULL) { + log_bt_first = new_node; + log_bt_last = new_node; + } else { + log_bt_last->next = new_node; + log_bt_last = new_node; + } + + new_node->next = NULL; + new_node->index = log_bt_index; + /* + * Copy the backtrace: bt is inside a tdata or gctx, which + * might die before prof_log_stop is called. + */ + new_node->bt.len = bt->len; + memcpy(new_node->vec, bt->vec, bt->len * sizeof(void *)); + new_node->bt.vec = new_node->vec; + + log_bt_index++; + ckh_insert(tsd, &log_bt_node_set, (void *)new_node, NULL); + return new_node->index; + } else { + return node->index; + } +} + +static size_t +prof_log_thr_index(tsd_t *tsd, uint64_t thr_uid, const char *name) { + assert(prof_logging_state == prof_logging_state_started); + malloc_mutex_assert_owner(tsd_tsdn(tsd), &log_mtx); + + prof_thr_node_t dummy_node; + dummy_node.thr_uid = thr_uid; + prof_thr_node_t *node; + + /* See if this thread is already cached in the table. */ + if (ckh_search(&log_thr_node_set, (void *)(&dummy_node), + (void **)(&node), NULL)) { + size_t sz = offsetof(prof_thr_node_t, name) + strlen(name) + 1; + prof_thr_node_t *new_node = (prof_thr_node_t *) + iallocztm(tsd_tsdn(tsd), sz, sz_size2index(sz), false, NULL, + true, arena_get(TSDN_NULL, 0, true), true); + if (log_thr_first == NULL) { + log_thr_first = new_node; + log_thr_last = new_node; + } else { + log_thr_last->next = new_node; + log_thr_last = new_node; + } + + new_node->next = NULL; + new_node->index = log_thr_index; + new_node->thr_uid = thr_uid; + strcpy(new_node->name, name); + + log_thr_index++; + ckh_insert(tsd, &log_thr_node_set, (void *)new_node, NULL); + return new_node->index; + } else { + return node->index; + } +} + +JEMALLOC_COLD +void +prof_try_log(tsd_t *tsd, size_t usize, prof_info_t *prof_info) { + cassert(config_prof); + prof_tctx_t *tctx = prof_info->alloc_tctx; + malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock); + + prof_tdata_t *cons_tdata = prof_tdata_get(tsd, false); + if (cons_tdata == NULL) { + /* + * We decide not to log these allocations. cons_tdata will be + * NULL only when the current thread is in a weird state (e.g. + * it's being destroyed). + */ + return; + } + + malloc_mutex_lock(tsd_tsdn(tsd), &log_mtx); + + if (prof_logging_state != prof_logging_state_started) { + goto label_done; + } + + if (!log_tables_initialized) { + bool err1 = ckh_new(tsd, &log_bt_node_set, PROF_CKH_MINITEMS, + prof_bt_node_hash, prof_bt_node_keycomp); + bool err2 = ckh_new(tsd, &log_thr_node_set, PROF_CKH_MINITEMS, + prof_thr_node_hash, prof_thr_node_keycomp); + if (err1 || err2) { + goto label_done; + } + log_tables_initialized = true; + } + + nstime_t alloc_time = prof_info->alloc_time; + nstime_t free_time; + nstime_prof_init_update(&free_time); + + size_t sz = sizeof(prof_alloc_node_t); + prof_alloc_node_t *new_node = (prof_alloc_node_t *) + iallocztm(tsd_tsdn(tsd), sz, sz_size2index(sz), false, NULL, true, + arena_get(TSDN_NULL, 0, true), true); + + const char *prod_thr_name = (tctx->tdata->thread_name == NULL)? + "" : tctx->tdata->thread_name; + const char *cons_thr_name = prof_thread_name_get(tsd); + + prof_bt_t bt; + /* Initialize the backtrace, using the buffer in tdata to store it. */ + bt_init(&bt, cons_tdata->vec); + prof_backtrace(tsd, &bt); + prof_bt_t *cons_bt = &bt; + + /* We haven't destroyed tctx yet, so gctx should be good to read. */ + prof_bt_t *prod_bt = &tctx->gctx->bt; + + new_node->next = NULL; + new_node->alloc_thr_ind = prof_log_thr_index(tsd, tctx->tdata->thr_uid, + prod_thr_name); + new_node->free_thr_ind = prof_log_thr_index(tsd, cons_tdata->thr_uid, + cons_thr_name); + new_node->alloc_bt_ind = prof_log_bt_index(tsd, prod_bt); + new_node->free_bt_ind = prof_log_bt_index(tsd, cons_bt); + new_node->alloc_time_ns = nstime_ns(&alloc_time); + new_node->free_time_ns = nstime_ns(&free_time); + new_node->usize = usize; + + if (log_alloc_first == NULL) { + log_alloc_first = new_node; + log_alloc_last = new_node; + } else { + log_alloc_last->next = new_node; + log_alloc_last = new_node; + } + +label_done: + malloc_mutex_unlock(tsd_tsdn(tsd), &log_mtx); +} + +static void +prof_bt_node_hash(const void *key, size_t r_hash[2]) { + const prof_bt_node_t *bt_node = (prof_bt_node_t *)key; + prof_bt_hash((void *)(&bt_node->bt), r_hash); +} + +static bool +prof_bt_node_keycomp(const void *k1, const void *k2) { + const prof_bt_node_t *bt_node1 = (prof_bt_node_t *)k1; + const prof_bt_node_t *bt_node2 = (prof_bt_node_t *)k2; + return prof_bt_keycomp((void *)(&bt_node1->bt), + (void *)(&bt_node2->bt)); +} + +static void +prof_thr_node_hash(const void *key, size_t r_hash[2]) { + const prof_thr_node_t *thr_node = (prof_thr_node_t *)key; + hash(&thr_node->thr_uid, sizeof(uint64_t), 0x94122f35U, r_hash); +} + +static bool +prof_thr_node_keycomp(const void *k1, const void *k2) { + const prof_thr_node_t *thr_node1 = (prof_thr_node_t *)k1; + const prof_thr_node_t *thr_node2 = (prof_thr_node_t *)k2; + return thr_node1->thr_uid == thr_node2->thr_uid; +} + +/* Used in unit tests. */ +size_t +prof_log_bt_count(void) { + cassert(config_prof); + size_t cnt = 0; + prof_bt_node_t *node = log_bt_first; + while (node != NULL) { + cnt++; + node = node->next; + } + return cnt; +} + +/* Used in unit tests. */ +size_t +prof_log_alloc_count(void) { + cassert(config_prof); + size_t cnt = 0; + prof_alloc_node_t *node = log_alloc_first; + while (node != NULL) { + cnt++; + node = node->next; + } + return cnt; +} + +/* Used in unit tests. */ +size_t +prof_log_thr_count(void) { + cassert(config_prof); + size_t cnt = 0; + prof_thr_node_t *node = log_thr_first; + while (node != NULL) { + cnt++; + node = node->next; + } + return cnt; +} + +/* Used in unit tests. */ +bool +prof_log_is_logging(void) { + cassert(config_prof); + return prof_logging_state == prof_logging_state_started; +} + +/* Used in unit tests. */ +bool +prof_log_rep_check(void) { + cassert(config_prof); + if (prof_logging_state == prof_logging_state_stopped + && log_tables_initialized) { + return true; + } + + if (log_bt_last != NULL && log_bt_last->next != NULL) { + return true; + } + if (log_thr_last != NULL && log_thr_last->next != NULL) { + return true; + } + if (log_alloc_last != NULL && log_alloc_last->next != NULL) { + return true; + } + + size_t bt_count = prof_log_bt_count(); + size_t thr_count = prof_log_thr_count(); + size_t alloc_count = prof_log_alloc_count(); + + + if (prof_logging_state == prof_logging_state_stopped) { + if (bt_count != 0 || thr_count != 0 || alloc_count || 0) { + return true; + } + } + + prof_alloc_node_t *node = log_alloc_first; + while (node != NULL) { + if (node->alloc_bt_ind >= bt_count) { + return true; + } + if (node->free_bt_ind >= bt_count) { + return true; + } + if (node->alloc_thr_ind >= thr_count) { + return true; + } + if (node->free_thr_ind >= thr_count) { + return true; + } + if (node->alloc_time_ns > node->free_time_ns) { + return true; + } + node = node->next; + } + + return false; +} + +/* Used in unit tests. */ +void +prof_log_dummy_set(bool new_value) { + cassert(config_prof); + prof_log_dummy = new_value; +} + +/* Used as an atexit function to stop logging on exit. */ +static void +prof_log_stop_final(void) { + tsd_t *tsd = tsd_fetch(); + prof_log_stop(tsd_tsdn(tsd)); +} + +JEMALLOC_COLD +bool +prof_log_start(tsdn_t *tsdn, const char *filename) { + cassert(config_prof); + + if (!opt_prof) { + return true; + } + + bool ret = false; + + malloc_mutex_lock(tsdn, &log_mtx); + + static bool prof_log_atexit_called = false; + if (!prof_log_atexit_called) { + prof_log_atexit_called = true; + if (atexit(prof_log_stop_final) != 0) { + malloc_write(": Error in atexit() " + "for logging\n"); + if (opt_abort) { + abort(); + } + ret = true; + goto label_done; + } + } + + if (prof_logging_state != prof_logging_state_stopped) { + ret = true; + } else if (filename == NULL) { + /* Make default name. */ + prof_get_default_filename(tsdn, log_filename, log_seq); + log_seq++; + prof_logging_state = prof_logging_state_started; + } else if (strlen(filename) >= PROF_DUMP_FILENAME_LEN) { + ret = true; + } else { + strcpy(log_filename, filename); + prof_logging_state = prof_logging_state_started; + } + + if (!ret) { + nstime_prof_init_update(&log_start_timestamp); + } +label_done: + malloc_mutex_unlock(tsdn, &log_mtx); + + return ret; +} + +struct prof_emitter_cb_arg_s { + int fd; + ssize_t ret; +}; + +static void +prof_emitter_write_cb(void *opaque, const char *to_write) { + struct prof_emitter_cb_arg_s *arg = + (struct prof_emitter_cb_arg_s *)opaque; + size_t bytes = strlen(to_write); + if (prof_log_dummy) { + return; + } + arg->ret = malloc_write_fd(arg->fd, to_write, bytes); +} + +/* + * prof_log_emit_{...} goes through the appropriate linked list, emitting each + * node to the json and deallocating it. + */ +static void +prof_log_emit_threads(tsd_t *tsd, emitter_t *emitter) { + emitter_json_array_kv_begin(emitter, "threads"); + prof_thr_node_t *thr_node = log_thr_first; + prof_thr_node_t *thr_old_node; + while (thr_node != NULL) { + emitter_json_object_begin(emitter); + + emitter_json_kv(emitter, "thr_uid", emitter_type_uint64, + &thr_node->thr_uid); + + char *thr_name = thr_node->name; + + emitter_json_kv(emitter, "thr_name", emitter_type_string, + &thr_name); + + emitter_json_object_end(emitter); + thr_old_node = thr_node; + thr_node = thr_node->next; + idalloctm(tsd_tsdn(tsd), thr_old_node, NULL, NULL, true, true); + } + emitter_json_array_end(emitter); +} + +static void +prof_log_emit_traces(tsd_t *tsd, emitter_t *emitter) { + emitter_json_array_kv_begin(emitter, "stack_traces"); + prof_bt_node_t *bt_node = log_bt_first; + prof_bt_node_t *bt_old_node; + /* + * Calculate how many hex digits we need: twice number of bytes, two for + * "0x", and then one more for terminating '\0'. + */ + char buf[2 * sizeof(intptr_t) + 3]; + size_t buf_sz = sizeof(buf); + while (bt_node != NULL) { + emitter_json_array_begin(emitter); + size_t i; + for (i = 0; i < bt_node->bt.len; i++) { + malloc_snprintf(buf, buf_sz, "%p", bt_node->bt.vec[i]); + char *trace_str = buf; + emitter_json_value(emitter, emitter_type_string, + &trace_str); + } + emitter_json_array_end(emitter); + + bt_old_node = bt_node; + bt_node = bt_node->next; + idalloctm(tsd_tsdn(tsd), bt_old_node, NULL, NULL, true, true); + } + emitter_json_array_end(emitter); +} + +static void +prof_log_emit_allocs(tsd_t *tsd, emitter_t *emitter) { + emitter_json_array_kv_begin(emitter, "allocations"); + prof_alloc_node_t *alloc_node = log_alloc_first; + prof_alloc_node_t *alloc_old_node; + while (alloc_node != NULL) { + emitter_json_object_begin(emitter); + + emitter_json_kv(emitter, "alloc_thread", emitter_type_size, + &alloc_node->alloc_thr_ind); + + emitter_json_kv(emitter, "free_thread", emitter_type_size, + &alloc_node->free_thr_ind); + + emitter_json_kv(emitter, "alloc_trace", emitter_type_size, + &alloc_node->alloc_bt_ind); + + emitter_json_kv(emitter, "free_trace", emitter_type_size, + &alloc_node->free_bt_ind); + + emitter_json_kv(emitter, "alloc_timestamp", + emitter_type_uint64, &alloc_node->alloc_time_ns); + + emitter_json_kv(emitter, "free_timestamp", emitter_type_uint64, + &alloc_node->free_time_ns); + + emitter_json_kv(emitter, "usize", emitter_type_uint64, + &alloc_node->usize); + + emitter_json_object_end(emitter); + + alloc_old_node = alloc_node; + alloc_node = alloc_node->next; + idalloctm(tsd_tsdn(tsd), alloc_old_node, NULL, NULL, true, + true); + } + emitter_json_array_end(emitter); +} + +static void +prof_log_emit_metadata(emitter_t *emitter) { + emitter_json_object_kv_begin(emitter, "info"); + + nstime_t now; + + nstime_prof_init_update(&now); + uint64_t ns = nstime_ns(&now) - nstime_ns(&log_start_timestamp); + emitter_json_kv(emitter, "duration", emitter_type_uint64, &ns); + + char *vers = JEMALLOC_VERSION; + emitter_json_kv(emitter, "version", + emitter_type_string, &vers); + + emitter_json_kv(emitter, "lg_sample_rate", + emitter_type_int, &lg_prof_sample); + + const char *res_type = prof_time_res_mode_names[opt_prof_time_res]; + emitter_json_kv(emitter, "prof_time_resolution", emitter_type_string, + &res_type); + + int pid = prof_getpid(); + emitter_json_kv(emitter, "pid", emitter_type_int, &pid); + + emitter_json_object_end(emitter); +} + +#define PROF_LOG_STOP_BUFSIZE PROF_DUMP_BUFSIZE +JEMALLOC_COLD +bool +prof_log_stop(tsdn_t *tsdn) { + cassert(config_prof); + if (!opt_prof || !prof_booted) { + return true; + } + + tsd_t *tsd = tsdn_tsd(tsdn); + malloc_mutex_lock(tsdn, &log_mtx); + + if (prof_logging_state != prof_logging_state_started) { + malloc_mutex_unlock(tsdn, &log_mtx); + return true; + } + + /* + * Set the state to dumping. We'll set it to stopped when we're done. + * Since other threads won't be able to start/stop/log when the state is + * dumping, we don't have to hold the lock during the whole method. + */ + prof_logging_state = prof_logging_state_dumping; + malloc_mutex_unlock(tsdn, &log_mtx); + + + emitter_t emitter; + + /* Create a file. */ + + int fd; + if (prof_log_dummy) { + fd = 0; + } else { + fd = creat(log_filename, 0644); + } + + if (fd == -1) { + malloc_printf(": creat() for log file \"%s\" " + " failed with %d\n", log_filename, errno); + if (opt_abort) { + abort(); + } + return true; + } + + struct prof_emitter_cb_arg_s arg; + arg.fd = fd; + + buf_writer_t buf_writer; + buf_writer_init(tsdn, &buf_writer, prof_emitter_write_cb, &arg, NULL, + PROF_LOG_STOP_BUFSIZE); + emitter_init(&emitter, emitter_output_json_compact, buf_writer_cb, + &buf_writer); + + emitter_begin(&emitter); + prof_log_emit_metadata(&emitter); + prof_log_emit_threads(tsd, &emitter); + prof_log_emit_traces(tsd, &emitter); + prof_log_emit_allocs(tsd, &emitter); + emitter_end(&emitter); + + buf_writer_terminate(tsdn, &buf_writer); + + /* Reset global state. */ + if (log_tables_initialized) { + ckh_delete(tsd, &log_bt_node_set); + ckh_delete(tsd, &log_thr_node_set); + } + log_tables_initialized = false; + log_bt_index = 0; + log_thr_index = 0; + log_bt_first = NULL; + log_bt_last = NULL; + log_thr_first = NULL; + log_thr_last = NULL; + log_alloc_first = NULL; + log_alloc_last = NULL; + + malloc_mutex_lock(tsdn, &log_mtx); + prof_logging_state = prof_logging_state_stopped; + malloc_mutex_unlock(tsdn, &log_mtx); + + if (prof_log_dummy) { + return false; + } + return close(fd) || arg.ret == -1; +} +#undef PROF_LOG_STOP_BUFSIZE + +JEMALLOC_COLD +bool +prof_log_init(tsd_t *tsd) { + cassert(config_prof); + if (malloc_mutex_init(&log_mtx, "prof_log", + WITNESS_RANK_PROF_LOG, malloc_mutex_rank_exclusive)) { + return true; + } + + if (opt_prof_log) { + prof_log_start(tsd_tsdn(tsd), NULL); + } + + return false; +} + +/******************************************************************************/ diff --git a/BeefRT/JEMalloc/src/prof_recent.c b/BeefRT/JEMalloc/src/prof_recent.c new file mode 100644 index 00000000..834a9446 --- /dev/null +++ b/BeefRT/JEMalloc/src/prof_recent.c @@ -0,0 +1,600 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/buf_writer.h" +#include "jemalloc/internal/emitter.h" +#include "jemalloc/internal/prof_data.h" +#include "jemalloc/internal/prof_recent.h" + +ssize_t opt_prof_recent_alloc_max = PROF_RECENT_ALLOC_MAX_DEFAULT; +malloc_mutex_t prof_recent_alloc_mtx; /* Protects the fields below */ +static atomic_zd_t prof_recent_alloc_max; +static ssize_t prof_recent_alloc_count = 0; +prof_recent_list_t prof_recent_alloc_list; + +malloc_mutex_t prof_recent_dump_mtx; /* Protects dumping. */ + +static void +prof_recent_alloc_max_init() { + atomic_store_zd(&prof_recent_alloc_max, opt_prof_recent_alloc_max, + ATOMIC_RELAXED); +} + +static inline ssize_t +prof_recent_alloc_max_get_no_lock() { + return atomic_load_zd(&prof_recent_alloc_max, ATOMIC_RELAXED); +} + +static inline ssize_t +prof_recent_alloc_max_get(tsd_t *tsd) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + return prof_recent_alloc_max_get_no_lock(); +} + +static inline ssize_t +prof_recent_alloc_max_update(tsd_t *tsd, ssize_t max) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + ssize_t old_max = prof_recent_alloc_max_get(tsd); + atomic_store_zd(&prof_recent_alloc_max, max, ATOMIC_RELAXED); + return old_max; +} + +static prof_recent_t * +prof_recent_allocate_node(tsdn_t *tsdn) { + return (prof_recent_t *)iallocztm(tsdn, sizeof(prof_recent_t), + sz_size2index(sizeof(prof_recent_t)), false, NULL, true, + arena_get(tsdn, 0, false), true); +} + +static void +prof_recent_free_node(tsdn_t *tsdn, prof_recent_t *node) { + assert(node != NULL); + assert(isalloc(tsdn, node) == sz_s2u(sizeof(prof_recent_t))); + idalloctm(tsdn, node, NULL, NULL, true, true); +} + +static inline void +increment_recent_count(tsd_t *tsd, prof_tctx_t *tctx) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock); + ++tctx->recent_count; + assert(tctx->recent_count > 0); +} + +bool +prof_recent_alloc_prepare(tsd_t *tsd, prof_tctx_t *tctx) { + cassert(config_prof); + assert(opt_prof && prof_booted); + malloc_mutex_assert_owner(tsd_tsdn(tsd), tctx->tdata->lock); + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + /* + * Check whether last-N mode is turned on without trying to acquire the + * lock, so as to optimize for the following two scenarios: + * (1) Last-N mode is switched off; + * (2) Dumping, during which last-N mode is temporarily turned off so + * as not to block sampled allocations. + */ + if (prof_recent_alloc_max_get_no_lock() == 0) { + return false; + } + + /* + * Increment recent_count to hold the tctx so that it won't be gone + * even after tctx->tdata->lock is released. This acts as a + * "placeholder"; the real recording of the allocation requires a lock + * on prof_recent_alloc_mtx and is done in prof_recent_alloc (when + * tctx->tdata->lock has been released). + */ + increment_recent_count(tsd, tctx); + return true; +} + +static void +decrement_recent_count(tsd_t *tsd, prof_tctx_t *tctx) { + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert(tctx != NULL); + malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock); + assert(tctx->recent_count > 0); + --tctx->recent_count; + prof_tctx_try_destroy(tsd, tctx); +} + +static inline edata_t * +prof_recent_alloc_edata_get_no_lock(const prof_recent_t *n) { + return (edata_t *)atomic_load_p(&n->alloc_edata, ATOMIC_ACQUIRE); +} + +edata_t * +prof_recent_alloc_edata_get_no_lock_test(const prof_recent_t *n) { + cassert(config_prof); + return prof_recent_alloc_edata_get_no_lock(n); +} + +static inline edata_t * +prof_recent_alloc_edata_get(tsd_t *tsd, const prof_recent_t *n) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + return prof_recent_alloc_edata_get_no_lock(n); +} + +static void +prof_recent_alloc_edata_set(tsd_t *tsd, prof_recent_t *n, edata_t *edata) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + atomic_store_p(&n->alloc_edata, edata, ATOMIC_RELEASE); +} + +void +edata_prof_recent_alloc_init(edata_t *edata) { + cassert(config_prof); + edata_prof_recent_alloc_set_dont_call_directly(edata, NULL); +} + +static inline prof_recent_t * +edata_prof_recent_alloc_get_no_lock(const edata_t *edata) { + cassert(config_prof); + return edata_prof_recent_alloc_get_dont_call_directly(edata); +} + +prof_recent_t * +edata_prof_recent_alloc_get_no_lock_test(const edata_t *edata) { + cassert(config_prof); + return edata_prof_recent_alloc_get_no_lock(edata); +} + +static inline prof_recent_t * +edata_prof_recent_alloc_get(tsd_t *tsd, const edata_t *edata) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_t *recent_alloc = + edata_prof_recent_alloc_get_no_lock(edata); + assert(recent_alloc == NULL || + prof_recent_alloc_edata_get(tsd, recent_alloc) == edata); + return recent_alloc; +} + +static prof_recent_t * +edata_prof_recent_alloc_update_internal(tsd_t *tsd, edata_t *edata, + prof_recent_t *recent_alloc) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_t *old_recent_alloc = + edata_prof_recent_alloc_get(tsd, edata); + edata_prof_recent_alloc_set_dont_call_directly(edata, recent_alloc); + return old_recent_alloc; +} + +static void +edata_prof_recent_alloc_set(tsd_t *tsd, edata_t *edata, + prof_recent_t *recent_alloc) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert(recent_alloc != NULL); + prof_recent_t *old_recent_alloc = + edata_prof_recent_alloc_update_internal(tsd, edata, recent_alloc); + assert(old_recent_alloc == NULL); + prof_recent_alloc_edata_set(tsd, recent_alloc, edata); +} + +static void +edata_prof_recent_alloc_reset(tsd_t *tsd, edata_t *edata, + prof_recent_t *recent_alloc) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + assert(recent_alloc != NULL); + prof_recent_t *old_recent_alloc = + edata_prof_recent_alloc_update_internal(tsd, edata, NULL); + assert(old_recent_alloc == recent_alloc); + assert(edata == prof_recent_alloc_edata_get(tsd, recent_alloc)); + prof_recent_alloc_edata_set(tsd, recent_alloc, NULL); +} + +/* + * This function should be called right before an allocation is released, so + * that the associated recent allocation record can contain the following + * information: + * (1) The allocation is released; + * (2) The time of the deallocation; and + * (3) The prof_tctx associated with the deallocation. + */ +void +prof_recent_alloc_reset(tsd_t *tsd, edata_t *edata) { + cassert(config_prof); + /* + * Check whether the recent allocation record still exists without + * trying to acquire the lock. + */ + if (edata_prof_recent_alloc_get_no_lock(edata) == NULL) { + return; + } + + prof_tctx_t *dalloc_tctx = prof_tctx_create(tsd); + /* + * In case dalloc_tctx is NULL, e.g. due to OOM, we will not record the + * deallocation time / tctx, which is handled later, after we check + * again when holding the lock. + */ + + if (dalloc_tctx != NULL) { + malloc_mutex_lock(tsd_tsdn(tsd), dalloc_tctx->tdata->lock); + increment_recent_count(tsd, dalloc_tctx); + dalloc_tctx->prepared = false; + malloc_mutex_unlock(tsd_tsdn(tsd), dalloc_tctx->tdata->lock); + } + + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + /* Check again after acquiring the lock. */ + prof_recent_t *recent = edata_prof_recent_alloc_get(tsd, edata); + if (recent != NULL) { + assert(nstime_equals_zero(&recent->dalloc_time)); + assert(recent->dalloc_tctx == NULL); + if (dalloc_tctx != NULL) { + nstime_prof_update(&recent->dalloc_time); + recent->dalloc_tctx = dalloc_tctx; + dalloc_tctx = NULL; + } + edata_prof_recent_alloc_reset(tsd, edata, recent); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + if (dalloc_tctx != NULL) { + /* We lost the rase - the allocation record was just gone. */ + decrement_recent_count(tsd, dalloc_tctx); + } +} + +static void +prof_recent_alloc_evict_edata(tsd_t *tsd, prof_recent_t *recent_alloc) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + edata_t *edata = prof_recent_alloc_edata_get(tsd, recent_alloc); + if (edata != NULL) { + edata_prof_recent_alloc_reset(tsd, edata, recent_alloc); + } +} + +static bool +prof_recent_alloc_is_empty(tsd_t *tsd) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + if (ql_empty(&prof_recent_alloc_list)) { + assert(prof_recent_alloc_count == 0); + return true; + } else { + assert(prof_recent_alloc_count > 0); + return false; + } +} + +static void +prof_recent_alloc_assert_count(tsd_t *tsd) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + if (!config_debug) { + return; + } + ssize_t count = 0; + prof_recent_t *n; + ql_foreach(n, &prof_recent_alloc_list, link) { + ++count; + } + assert(count == prof_recent_alloc_count); + assert(prof_recent_alloc_max_get(tsd) == -1 || + count <= prof_recent_alloc_max_get(tsd)); +} + +void +prof_recent_alloc(tsd_t *tsd, edata_t *edata, size_t size, size_t usize) { + cassert(config_prof); + assert(edata != NULL); + prof_tctx_t *tctx = edata_prof_tctx_get(edata); + + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), tctx->tdata->lock); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_assert_count(tsd); + + /* + * Reserve a new prof_recent_t node if needed. If needed, we release + * the prof_recent_alloc_mtx lock and allocate. Then, rather than + * immediately checking for OOM, we regain the lock and try to make use + * of the reserve node if needed. There are six scenarios: + * + * \ now | no need | need but OOMed | need and allocated + * later \ | | | + * ------------------------------------------------------------ + * no need | (1) | (2) | (3) + * ------------------------------------------------------------ + * need | (4) | (5) | (6) + * + * First, "(4)" never happens, because we don't release the lock in the + * middle if there's no need for a new node; in such cases "(1)" always + * takes place, which is trivial. + * + * Out of the remaining four scenarios, "(6)" is the common case and is + * trivial. "(5)" is also trivial, in which case we'll rollback the + * effect of prof_recent_alloc_prepare() as expected. + * + * "(2)" / "(3)" occurs when the need for a new node is gone after we + * regain the lock. If the new node is successfully allocated, i.e. in + * the case of "(3)", we'll release it in the end; otherwise, i.e. in + * the case of "(2)", we do nothing - we're lucky that the OOM ends up + * doing no harm at all. + * + * Therefore, the only performance cost of the "release lock" -> + * "allocate" -> "regain lock" design is the "(3)" case, but it happens + * very rarely, so the cost is relatively small compared to the gain of + * not having to have the lock order of prof_recent_alloc_mtx above all + * the allocation locks. + */ + prof_recent_t *reserve = NULL; + if (prof_recent_alloc_max_get(tsd) == -1 || + prof_recent_alloc_count < prof_recent_alloc_max_get(tsd)) { + assert(prof_recent_alloc_max_get(tsd) != 0); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + reserve = prof_recent_allocate_node(tsd_tsdn(tsd)); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_assert_count(tsd); + } + + if (prof_recent_alloc_max_get(tsd) == 0) { + assert(prof_recent_alloc_is_empty(tsd)); + goto label_rollback; + } + + prof_tctx_t *old_alloc_tctx, *old_dalloc_tctx; + if (prof_recent_alloc_count == prof_recent_alloc_max_get(tsd)) { + /* If upper limit is reached, rotate the head. */ + assert(prof_recent_alloc_max_get(tsd) != -1); + assert(!prof_recent_alloc_is_empty(tsd)); + prof_recent_t *head = ql_first(&prof_recent_alloc_list); + old_alloc_tctx = head->alloc_tctx; + assert(old_alloc_tctx != NULL); + old_dalloc_tctx = head->dalloc_tctx; + prof_recent_alloc_evict_edata(tsd, head); + ql_rotate(&prof_recent_alloc_list, link); + } else { + /* Otherwise make use of the new node. */ + assert(prof_recent_alloc_max_get(tsd) == -1 || + prof_recent_alloc_count < prof_recent_alloc_max_get(tsd)); + if (reserve == NULL) { + goto label_rollback; + } + ql_elm_new(reserve, link); + ql_tail_insert(&prof_recent_alloc_list, reserve, link); + reserve = NULL; + old_alloc_tctx = NULL; + old_dalloc_tctx = NULL; + ++prof_recent_alloc_count; + } + + /* Fill content into the tail node. */ + prof_recent_t *tail = ql_last(&prof_recent_alloc_list, link); + assert(tail != NULL); + tail->size = size; + tail->usize = usize; + nstime_copy(&tail->alloc_time, edata_prof_alloc_time_get(edata)); + tail->alloc_tctx = tctx; + nstime_init_zero(&tail->dalloc_time); + tail->dalloc_tctx = NULL; + edata_prof_recent_alloc_set(tsd, edata, tail); + + assert(!prof_recent_alloc_is_empty(tsd)); + prof_recent_alloc_assert_count(tsd); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + if (reserve != NULL) { + prof_recent_free_node(tsd_tsdn(tsd), reserve); + } + + /* + * Asynchronously handle the tctx of the old node, so that there's no + * simultaneous holdings of prof_recent_alloc_mtx and tdata->lock. + * In the worst case this may delay the tctx release but it's better + * than holding prof_recent_alloc_mtx for longer. + */ + if (old_alloc_tctx != NULL) { + decrement_recent_count(tsd, old_alloc_tctx); + } + if (old_dalloc_tctx != NULL) { + decrement_recent_count(tsd, old_dalloc_tctx); + } + return; + +label_rollback: + assert(edata_prof_recent_alloc_get(tsd, edata) == NULL); + prof_recent_alloc_assert_count(tsd); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + if (reserve != NULL) { + prof_recent_free_node(tsd_tsdn(tsd), reserve); + } + decrement_recent_count(tsd, tctx); +} + +ssize_t +prof_recent_alloc_max_ctl_read() { + cassert(config_prof); + /* Don't bother to acquire the lock. */ + return prof_recent_alloc_max_get_no_lock(); +} + +static void +prof_recent_alloc_restore_locked(tsd_t *tsd, prof_recent_list_t *to_delete) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + ssize_t max = prof_recent_alloc_max_get(tsd); + if (max == -1 || prof_recent_alloc_count <= max) { + /* Easy case - no need to alter the list. */ + ql_new(to_delete); + prof_recent_alloc_assert_count(tsd); + return; + } + + prof_recent_t *node; + ql_foreach(node, &prof_recent_alloc_list, link) { + if (prof_recent_alloc_count == max) { + break; + } + prof_recent_alloc_evict_edata(tsd, node); + --prof_recent_alloc_count; + } + assert(prof_recent_alloc_count == max); + + ql_move(to_delete, &prof_recent_alloc_list); + if (max == 0) { + assert(node == NULL); + } else { + assert(node != NULL); + ql_split(to_delete, node, &prof_recent_alloc_list, link); + } + assert(!ql_empty(to_delete)); + prof_recent_alloc_assert_count(tsd); +} + +static void +prof_recent_alloc_async_cleanup(tsd_t *tsd, prof_recent_list_t *to_delete) { + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), &prof_recent_dump_mtx); + malloc_mutex_assert_not_owner(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + while (!ql_empty(to_delete)) { + prof_recent_t *node = ql_first(to_delete); + ql_remove(to_delete, node, link); + decrement_recent_count(tsd, node->alloc_tctx); + if (node->dalloc_tctx != NULL) { + decrement_recent_count(tsd, node->dalloc_tctx); + } + prof_recent_free_node(tsd_tsdn(tsd), node); + } +} + +ssize_t +prof_recent_alloc_max_ctl_write(tsd_t *tsd, ssize_t max) { + cassert(config_prof); + assert(max >= -1); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_assert_count(tsd); + const ssize_t old_max = prof_recent_alloc_max_update(tsd, max); + prof_recent_list_t to_delete; + prof_recent_alloc_restore_locked(tsd, &to_delete); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_async_cleanup(tsd, &to_delete); + return old_max; +} + +static void +prof_recent_alloc_dump_bt(emitter_t *emitter, prof_tctx_t *tctx) { + char bt_buf[2 * sizeof(intptr_t) + 3]; + char *s = bt_buf; + assert(tctx != NULL); + prof_bt_t *bt = &tctx->gctx->bt; + for (size_t i = 0; i < bt->len; ++i) { + malloc_snprintf(bt_buf, sizeof(bt_buf), "%p", bt->vec[i]); + emitter_json_value(emitter, emitter_type_string, &s); + } +} + +static void +prof_recent_alloc_dump_node(emitter_t *emitter, prof_recent_t *node) { + emitter_json_object_begin(emitter); + + emitter_json_kv(emitter, "size", emitter_type_size, &node->size); + emitter_json_kv(emitter, "usize", emitter_type_size, &node->usize); + bool released = prof_recent_alloc_edata_get_no_lock(node) == NULL; + emitter_json_kv(emitter, "released", emitter_type_bool, &released); + + emitter_json_kv(emitter, "alloc_thread_uid", emitter_type_uint64, + &node->alloc_tctx->thr_uid); + prof_tdata_t *alloc_tdata = node->alloc_tctx->tdata; + assert(alloc_tdata != NULL); + if (alloc_tdata->thread_name != NULL) { + emitter_json_kv(emitter, "alloc_thread_name", + emitter_type_string, &alloc_tdata->thread_name); + } + uint64_t alloc_time_ns = nstime_ns(&node->alloc_time); + emitter_json_kv(emitter, "alloc_time", emitter_type_uint64, + &alloc_time_ns); + emitter_json_array_kv_begin(emitter, "alloc_trace"); + prof_recent_alloc_dump_bt(emitter, node->alloc_tctx); + emitter_json_array_end(emitter); + + if (released && node->dalloc_tctx != NULL) { + emitter_json_kv(emitter, "dalloc_thread_uid", + emitter_type_uint64, &node->dalloc_tctx->thr_uid); + prof_tdata_t *dalloc_tdata = node->dalloc_tctx->tdata; + assert(dalloc_tdata != NULL); + if (dalloc_tdata->thread_name != NULL) { + emitter_json_kv(emitter, "dalloc_thread_name", + emitter_type_string, &dalloc_tdata->thread_name); + } + assert(!nstime_equals_zero(&node->dalloc_time)); + uint64_t dalloc_time_ns = nstime_ns(&node->dalloc_time); + emitter_json_kv(emitter, "dalloc_time", emitter_type_uint64, + &dalloc_time_ns); + emitter_json_array_kv_begin(emitter, "dalloc_trace"); + prof_recent_alloc_dump_bt(emitter, node->dalloc_tctx); + emitter_json_array_end(emitter); + } + + emitter_json_object_end(emitter); +} + +#define PROF_RECENT_PRINT_BUFSIZE 65536 +JEMALLOC_COLD +void +prof_recent_alloc_dump(tsd_t *tsd, write_cb_t *write_cb, void *cbopaque) { + cassert(config_prof); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_dump_mtx); + buf_writer_t buf_writer; + buf_writer_init(tsd_tsdn(tsd), &buf_writer, write_cb, cbopaque, NULL, + PROF_RECENT_PRINT_BUFSIZE); + emitter_t emitter; + emitter_init(&emitter, emitter_output_json_compact, buf_writer_cb, + &buf_writer); + prof_recent_list_t temp_list; + + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_assert_count(tsd); + ssize_t dump_max = prof_recent_alloc_max_get(tsd); + ql_move(&temp_list, &prof_recent_alloc_list); + ssize_t dump_count = prof_recent_alloc_count; + prof_recent_alloc_count = 0; + prof_recent_alloc_assert_count(tsd); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + emitter_begin(&emitter); + uint64_t sample_interval = (uint64_t)1U << lg_prof_sample; + emitter_json_kv(&emitter, "sample_interval", emitter_type_uint64, + &sample_interval); + emitter_json_kv(&emitter, "recent_alloc_max", emitter_type_ssize, + &dump_max); + emitter_json_array_kv_begin(&emitter, "recent_alloc"); + prof_recent_t *node; + ql_foreach(node, &temp_list, link) { + prof_recent_alloc_dump_node(&emitter, node); + } + emitter_json_array_end(&emitter); + emitter_end(&emitter); + + malloc_mutex_lock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + prof_recent_alloc_assert_count(tsd); + ql_concat(&temp_list, &prof_recent_alloc_list, link); + ql_move(&prof_recent_alloc_list, &temp_list); + prof_recent_alloc_count += dump_count; + prof_recent_alloc_restore_locked(tsd, &temp_list); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_alloc_mtx); + + buf_writer_terminate(tsd_tsdn(tsd), &buf_writer); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_recent_dump_mtx); + + prof_recent_alloc_async_cleanup(tsd, &temp_list); +} +#undef PROF_RECENT_PRINT_BUFSIZE + +bool +prof_recent_init() { + cassert(config_prof); + prof_recent_alloc_max_init(); + + if (malloc_mutex_init(&prof_recent_alloc_mtx, "prof_recent_alloc", + WITNESS_RANK_PROF_RECENT_ALLOC, malloc_mutex_rank_exclusive)) { + return true; + } + + if (malloc_mutex_init(&prof_recent_dump_mtx, "prof_recent_dump", + WITNESS_RANK_PROF_RECENT_DUMP, malloc_mutex_rank_exclusive)) { + return true; + } + + ql_new(&prof_recent_alloc_list); + + return false; +} diff --git a/BeefRT/JEMalloc/src/prof_stats.c b/BeefRT/JEMalloc/src/prof_stats.c new file mode 100644 index 00000000..5d1a506b --- /dev/null +++ b/BeefRT/JEMalloc/src/prof_stats.c @@ -0,0 +1,57 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/prof_stats.h" + +bool opt_prof_stats = false; +malloc_mutex_t prof_stats_mtx; +static prof_stats_t prof_stats_live[PROF_SC_NSIZES]; +static prof_stats_t prof_stats_accum[PROF_SC_NSIZES]; + +static void +prof_stats_enter(tsd_t *tsd, szind_t ind) { + assert(opt_prof && opt_prof_stats); + assert(ind < SC_NSIZES); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_stats_mtx); +} + +static void +prof_stats_leave(tsd_t *tsd) { + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_stats_mtx); +} + +void +prof_stats_inc(tsd_t *tsd, szind_t ind, size_t size) { + cassert(config_prof); + prof_stats_enter(tsd, ind); + prof_stats_live[ind].req_sum += size; + prof_stats_live[ind].count++; + prof_stats_accum[ind].req_sum += size; + prof_stats_accum[ind].count++; + prof_stats_leave(tsd); +} + +void +prof_stats_dec(tsd_t *tsd, szind_t ind, size_t size) { + cassert(config_prof); + prof_stats_enter(tsd, ind); + prof_stats_live[ind].req_sum -= size; + prof_stats_live[ind].count--; + prof_stats_leave(tsd); +} + +void +prof_stats_get_live(tsd_t *tsd, szind_t ind, prof_stats_t *stats) { + cassert(config_prof); + prof_stats_enter(tsd, ind); + memcpy(stats, &prof_stats_live[ind], sizeof(prof_stats_t)); + prof_stats_leave(tsd); +} + +void +prof_stats_get_accum(tsd_t *tsd, szind_t ind, prof_stats_t *stats) { + cassert(config_prof); + prof_stats_enter(tsd, ind); + memcpy(stats, &prof_stats_accum[ind], sizeof(prof_stats_t)); + prof_stats_leave(tsd); +} diff --git a/BeefRT/JEMalloc/src/prof_sys.c b/BeefRT/JEMalloc/src/prof_sys.c new file mode 100644 index 00000000..b5f1f5b2 --- /dev/null +++ b/BeefRT/JEMalloc/src/prof_sys.c @@ -0,0 +1,669 @@ +#define JEMALLOC_PROF_SYS_C_ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/buf_writer.h" +#include "jemalloc/internal/ctl.h" +#include "jemalloc/internal/prof_data.h" +#include "jemalloc/internal/prof_sys.h" + +#ifdef JEMALLOC_PROF_LIBUNWIND +#define UNW_LOCAL_ONLY +#include +#endif + +#ifdef JEMALLOC_PROF_LIBGCC +/* + * We have a circular dependency -- jemalloc_internal.h tells us if we should + * use libgcc's unwinding functionality, but after we've included that, we've + * already hooked _Unwind_Backtrace. We'll temporarily disable hooking. + */ +#undef _Unwind_Backtrace +#include +#define _Unwind_Backtrace JEMALLOC_TEST_HOOK(_Unwind_Backtrace, test_hooks_libc_hook) +#endif + +/******************************************************************************/ + +malloc_mutex_t prof_dump_filename_mtx; + +bool prof_do_mock = false; + +static uint64_t prof_dump_seq; +static uint64_t prof_dump_iseq; +static uint64_t prof_dump_mseq; +static uint64_t prof_dump_useq; + +static char *prof_prefix = NULL; + +/* The fallback allocator profiling functionality will use. */ +base_t *prof_base; + +void +bt_init(prof_bt_t *bt, void **vec) { + cassert(config_prof); + + bt->vec = vec; + bt->len = 0; +} + +#ifdef JEMALLOC_PROF_LIBUNWIND +static void +prof_backtrace_impl(void **vec, unsigned *len, unsigned max_len) { + int nframes; + + cassert(config_prof); + assert(*len == 0); + assert(vec != NULL); + assert(max_len == PROF_BT_MAX); + + nframes = unw_backtrace(vec, PROF_BT_MAX); + if (nframes <= 0) { + return; + } + *len = nframes; +} +#elif (defined(JEMALLOC_PROF_LIBGCC)) +static _Unwind_Reason_Code +prof_unwind_init_callback(struct _Unwind_Context *context, void *arg) { + cassert(config_prof); + + return _URC_NO_REASON; +} + +static _Unwind_Reason_Code +prof_unwind_callback(struct _Unwind_Context *context, void *arg) { + prof_unwind_data_t *data = (prof_unwind_data_t *)arg; + void *ip; + + cassert(config_prof); + + ip = (void *)_Unwind_GetIP(context); + if (ip == NULL) { + return _URC_END_OF_STACK; + } + data->vec[*data->len] = ip; + (*data->len)++; + if (*data->len == data->max) { + return _URC_END_OF_STACK; + } + + return _URC_NO_REASON; +} + +static void +prof_backtrace_impl(void **vec, unsigned *len, unsigned max_len) { + prof_unwind_data_t data = {vec, len, max_len}; + + cassert(config_prof); + assert(vec != NULL); + assert(max_len == PROF_BT_MAX); + + _Unwind_Backtrace(prof_unwind_callback, &data); +} +#elif (defined(JEMALLOC_PROF_GCC)) +static void +prof_backtrace_impl(void **vec, unsigned *len, unsigned max_len) { +#define BT_FRAME(i) \ + if ((i) < max_len) { \ + void *p; \ + if (__builtin_frame_address(i) == 0) { \ + return; \ + } \ + p = __builtin_return_address(i); \ + if (p == NULL) { \ + return; \ + } \ + vec[(i)] = p; \ + *len = (i) + 1; \ + } else { \ + return; \ + } + + cassert(config_prof); + assert(vec != NULL); + assert(max_len == PROF_BT_MAX); + + BT_FRAME(0) + BT_FRAME(1) + BT_FRAME(2) + BT_FRAME(3) + BT_FRAME(4) + BT_FRAME(5) + BT_FRAME(6) + BT_FRAME(7) + BT_FRAME(8) + BT_FRAME(9) + + BT_FRAME(10) + BT_FRAME(11) + BT_FRAME(12) + BT_FRAME(13) + BT_FRAME(14) + BT_FRAME(15) + BT_FRAME(16) + BT_FRAME(17) + BT_FRAME(18) + BT_FRAME(19) + + BT_FRAME(20) + BT_FRAME(21) + BT_FRAME(22) + BT_FRAME(23) + BT_FRAME(24) + BT_FRAME(25) + BT_FRAME(26) + BT_FRAME(27) + BT_FRAME(28) + BT_FRAME(29) + + BT_FRAME(30) + BT_FRAME(31) + BT_FRAME(32) + BT_FRAME(33) + BT_FRAME(34) + BT_FRAME(35) + BT_FRAME(36) + BT_FRAME(37) + BT_FRAME(38) + BT_FRAME(39) + + BT_FRAME(40) + BT_FRAME(41) + BT_FRAME(42) + BT_FRAME(43) + BT_FRAME(44) + BT_FRAME(45) + BT_FRAME(46) + BT_FRAME(47) + BT_FRAME(48) + BT_FRAME(49) + + BT_FRAME(50) + BT_FRAME(51) + BT_FRAME(52) + BT_FRAME(53) + BT_FRAME(54) + BT_FRAME(55) + BT_FRAME(56) + BT_FRAME(57) + BT_FRAME(58) + BT_FRAME(59) + + BT_FRAME(60) + BT_FRAME(61) + BT_FRAME(62) + BT_FRAME(63) + BT_FRAME(64) + BT_FRAME(65) + BT_FRAME(66) + BT_FRAME(67) + BT_FRAME(68) + BT_FRAME(69) + + BT_FRAME(70) + BT_FRAME(71) + BT_FRAME(72) + BT_FRAME(73) + BT_FRAME(74) + BT_FRAME(75) + BT_FRAME(76) + BT_FRAME(77) + BT_FRAME(78) + BT_FRAME(79) + + BT_FRAME(80) + BT_FRAME(81) + BT_FRAME(82) + BT_FRAME(83) + BT_FRAME(84) + BT_FRAME(85) + BT_FRAME(86) + BT_FRAME(87) + BT_FRAME(88) + BT_FRAME(89) + + BT_FRAME(90) + BT_FRAME(91) + BT_FRAME(92) + BT_FRAME(93) + BT_FRAME(94) + BT_FRAME(95) + BT_FRAME(96) + BT_FRAME(97) + BT_FRAME(98) + BT_FRAME(99) + + BT_FRAME(100) + BT_FRAME(101) + BT_FRAME(102) + BT_FRAME(103) + BT_FRAME(104) + BT_FRAME(105) + BT_FRAME(106) + BT_FRAME(107) + BT_FRAME(108) + BT_FRAME(109) + + BT_FRAME(110) + BT_FRAME(111) + BT_FRAME(112) + BT_FRAME(113) + BT_FRAME(114) + BT_FRAME(115) + BT_FRAME(116) + BT_FRAME(117) + BT_FRAME(118) + BT_FRAME(119) + + BT_FRAME(120) + BT_FRAME(121) + BT_FRAME(122) + BT_FRAME(123) + BT_FRAME(124) + BT_FRAME(125) + BT_FRAME(126) + BT_FRAME(127) +#undef BT_FRAME +} +#else +static void +prof_backtrace_impl(void **vec, unsigned *len, unsigned max_len) { + cassert(config_prof); + not_reached(); +} +#endif + +void +prof_backtrace(tsd_t *tsd, prof_bt_t *bt) { + cassert(config_prof); + prof_backtrace_hook_t prof_backtrace_hook = prof_backtrace_hook_get(); + assert(prof_backtrace_hook != NULL); + + pre_reentrancy(tsd, NULL); + prof_backtrace_hook(bt->vec, &bt->len, PROF_BT_MAX); + post_reentrancy(tsd); +} + +void +prof_hooks_init() { + prof_backtrace_hook_set(&prof_backtrace_impl); + prof_dump_hook_set(NULL); +} + +void +prof_unwind_init() { +#ifdef JEMALLOC_PROF_LIBGCC + /* + * Cause the backtracing machinery to allocate its internal + * state before enabling profiling. + */ + _Unwind_Backtrace(prof_unwind_init_callback, NULL); +#endif +} + +static int +prof_sys_thread_name_read_impl(char *buf, size_t limit) { +#if defined(JEMALLOC_HAVE_PTHREAD_GETNAME_NP) + return pthread_getname_np(pthread_self(), buf, limit); +#elif defined(JEMALLOC_HAVE_PTHREAD_GET_NAME_NP) + pthread_get_name_np(pthread_self(), buf, limit); + return 0; +#else + return ENOSYS; +#endif +} +prof_sys_thread_name_read_t *JET_MUTABLE prof_sys_thread_name_read = + prof_sys_thread_name_read_impl; + +void +prof_sys_thread_name_fetch(tsd_t *tsd) { +#define THREAD_NAME_MAX_LEN 16 + char buf[THREAD_NAME_MAX_LEN]; + if (!prof_sys_thread_name_read(buf, THREAD_NAME_MAX_LEN)) { + prof_thread_name_set_impl(tsd, buf); + } +#undef THREAD_NAME_MAX_LEN +} + +int +prof_getpid(void) { +#ifdef _WIN32 + return GetCurrentProcessId(); +#else + return getpid(); +#endif +} + +/* + * This buffer is rather large for stack allocation, so use a single buffer for + * all profile dumps; protected by prof_dump_mtx. + */ +static char prof_dump_buf[PROF_DUMP_BUFSIZE]; + +typedef struct prof_dump_arg_s prof_dump_arg_t; +struct prof_dump_arg_s { + /* + * Whether error should be handled locally: if true, then we print out + * error message as well as abort (if opt_abort is true) when an error + * occurred, and we also report the error back to the caller in the end; + * if false, then we only report the error back to the caller in the + * end. + */ + const bool handle_error_locally; + /* + * Whether there has been an error in the dumping process, which could + * have happened either in file opening or in file writing. When an + * error has already occurred, we will stop further writing to the file. + */ + bool error; + /* File descriptor of the dump file. */ + int prof_dump_fd; +}; + +static void +prof_dump_check_possible_error(prof_dump_arg_t *arg, bool err_cond, + const char *format, ...) { + assert(!arg->error); + if (!err_cond) { + return; + } + + arg->error = true; + if (!arg->handle_error_locally) { + return; + } + + va_list ap; + char buf[PROF_PRINTF_BUFSIZE]; + va_start(ap, format); + malloc_vsnprintf(buf, sizeof(buf), format, ap); + va_end(ap); + malloc_write(buf); + + if (opt_abort) { + abort(); + } +} + +static int +prof_dump_open_file_impl(const char *filename, int mode) { + return creat(filename, mode); +} +prof_dump_open_file_t *JET_MUTABLE prof_dump_open_file = + prof_dump_open_file_impl; + +static void +prof_dump_open(prof_dump_arg_t *arg, const char *filename) { + arg->prof_dump_fd = prof_dump_open_file(filename, 0644); + prof_dump_check_possible_error(arg, arg->prof_dump_fd == -1, + ": failed to open \"%s\"\n", filename); +} + +prof_dump_write_file_t *JET_MUTABLE prof_dump_write_file = malloc_write_fd; + +static void +prof_dump_flush(void *opaque, const char *s) { + cassert(config_prof); + prof_dump_arg_t *arg = (prof_dump_arg_t *)opaque; + if (!arg->error) { + ssize_t err = prof_dump_write_file(arg->prof_dump_fd, s, + strlen(s)); + prof_dump_check_possible_error(arg, err == -1, + ": failed to write during heap profile flush\n"); + } +} + +static void +prof_dump_close(prof_dump_arg_t *arg) { + if (arg->prof_dump_fd != -1) { + close(arg->prof_dump_fd); + } +} + +#ifndef _WIN32 +JEMALLOC_FORMAT_PRINTF(1, 2) +static int +prof_open_maps_internal(const char *format, ...) { + int mfd; + va_list ap; + char filename[PATH_MAX + 1]; + + va_start(ap, format); + malloc_vsnprintf(filename, sizeof(filename), format, ap); + va_end(ap); + +#if defined(O_CLOEXEC) + mfd = open(filename, O_RDONLY | O_CLOEXEC); +#else + mfd = open(filename, O_RDONLY); + if (mfd != -1) { + fcntl(mfd, F_SETFD, fcntl(mfd, F_GETFD) | FD_CLOEXEC); + } +#endif + + return mfd; +} +#endif + +static int +prof_dump_open_maps_impl() { + int mfd; + + cassert(config_prof); +#if defined(__FreeBSD__) || defined(__DragonFly__) + mfd = prof_open_maps_internal("/proc/curproc/map"); +#elif defined(_WIN32) + mfd = -1; // Not implemented +#else + int pid = prof_getpid(); + + mfd = prof_open_maps_internal("/proc/%d/task/%d/maps", pid, pid); + if (mfd == -1) { + mfd = prof_open_maps_internal("/proc/%d/maps", pid); + } +#endif + return mfd; +} +prof_dump_open_maps_t *JET_MUTABLE prof_dump_open_maps = + prof_dump_open_maps_impl; + +static ssize_t +prof_dump_read_maps_cb(void *read_cbopaque, void *buf, size_t limit) { + int mfd = *(int *)read_cbopaque; + assert(mfd != -1); + return malloc_read_fd(mfd, buf, limit); +} + +static void +prof_dump_maps(buf_writer_t *buf_writer) { + int mfd = prof_dump_open_maps(); + if (mfd == -1) { + return; + } + + buf_writer_cb(buf_writer, "\nMAPPED_LIBRARIES:\n"); + buf_writer_pipe(buf_writer, prof_dump_read_maps_cb, &mfd); + close(mfd); +} + +static bool +prof_dump(tsd_t *tsd, bool propagate_err, const char *filename, + bool leakcheck) { + cassert(config_prof); + assert(tsd_reentrancy_level_get(tsd) == 0); + + prof_tdata_t * tdata = prof_tdata_get(tsd, true); + if (tdata == NULL) { + return true; + } + + prof_dump_arg_t arg = {/* handle_error_locally */ !propagate_err, + /* error */ false, /* prof_dump_fd */ -1}; + + pre_reentrancy(tsd, NULL); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_dump_mtx); + + prof_dump_open(&arg, filename); + buf_writer_t buf_writer; + bool err = buf_writer_init(tsd_tsdn(tsd), &buf_writer, prof_dump_flush, + &arg, prof_dump_buf, PROF_DUMP_BUFSIZE); + assert(!err); + prof_dump_impl(tsd, buf_writer_cb, &buf_writer, tdata, leakcheck); + prof_dump_maps(&buf_writer); + buf_writer_terminate(tsd_tsdn(tsd), &buf_writer); + prof_dump_close(&arg); + + prof_dump_hook_t dump_hook = prof_dump_hook_get(); + if (dump_hook != NULL) { + dump_hook(filename); + } + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_mtx); + post_reentrancy(tsd); + + return arg.error; +} + +/* + * If profiling is off, then PROF_DUMP_FILENAME_LEN is 1, so we'll end up + * calling strncpy with a size of 0, which triggers a -Wstringop-truncation + * warning (strncpy can never actually be called in this case, since we bail out + * much earlier when config_prof is false). This function works around the + * warning to let us leave the warning on. + */ +static inline void +prof_strncpy(char *UNUSED dest, const char *UNUSED src, size_t UNUSED size) { + cassert(config_prof); +#ifdef JEMALLOC_PROF + strncpy(dest, src, size); +#endif +} + +static const char * +prof_prefix_get(tsdn_t* tsdn) { + malloc_mutex_assert_owner(tsdn, &prof_dump_filename_mtx); + + return prof_prefix == NULL ? opt_prof_prefix : prof_prefix; +} + +static bool +prof_prefix_is_empty(tsdn_t *tsdn) { + malloc_mutex_lock(tsdn, &prof_dump_filename_mtx); + bool ret = (prof_prefix_get(tsdn)[0] == '\0'); + malloc_mutex_unlock(tsdn, &prof_dump_filename_mtx); + return ret; +} + +#define DUMP_FILENAME_BUFSIZE (PATH_MAX + 1) +#define VSEQ_INVALID UINT64_C(0xffffffffffffffff) +static void +prof_dump_filename(tsd_t *tsd, char *filename, char v, uint64_t vseq) { + cassert(config_prof); + + assert(tsd_reentrancy_level_get(tsd) == 0); + const char *prefix = prof_prefix_get(tsd_tsdn(tsd)); + + if (vseq != VSEQ_INVALID) { + /* "...v.heap" */ + malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE, + "%s.%d.%"FMTu64".%c%"FMTu64".heap", prefix, prof_getpid(), + prof_dump_seq, v, vseq); + } else { + /* "....heap" */ + malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE, + "%s.%d.%"FMTu64".%c.heap", prefix, prof_getpid(), + prof_dump_seq, v); + } + prof_dump_seq++; +} + +void +prof_get_default_filename(tsdn_t *tsdn, char *filename, uint64_t ind) { + malloc_mutex_lock(tsdn, &prof_dump_filename_mtx); + malloc_snprintf(filename, PROF_DUMP_FILENAME_LEN, + "%s.%d.%"FMTu64".json", prof_prefix_get(tsdn), prof_getpid(), ind); + malloc_mutex_unlock(tsdn, &prof_dump_filename_mtx); +} + +void +prof_fdump_impl(tsd_t *tsd) { + char filename[DUMP_FILENAME_BUFSIZE]; + + assert(!prof_prefix_is_empty(tsd_tsdn(tsd))); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + prof_dump_filename(tsd, filename, 'f', VSEQ_INVALID); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + prof_dump(tsd, false, filename, opt_prof_leak); +} + +bool +prof_prefix_set(tsdn_t *tsdn, const char *prefix) { + cassert(config_prof); + ctl_mtx_assert_held(tsdn); + malloc_mutex_lock(tsdn, &prof_dump_filename_mtx); + if (prof_prefix == NULL) { + malloc_mutex_unlock(tsdn, &prof_dump_filename_mtx); + /* Everything is still guarded by ctl_mtx. */ + char *buffer = base_alloc(tsdn, prof_base, + PROF_DUMP_FILENAME_LEN, QUANTUM); + if (buffer == NULL) { + return true; + } + malloc_mutex_lock(tsdn, &prof_dump_filename_mtx); + prof_prefix = buffer; + } + assert(prof_prefix != NULL); + + prof_strncpy(prof_prefix, prefix, PROF_DUMP_FILENAME_LEN - 1); + prof_prefix[PROF_DUMP_FILENAME_LEN - 1] = '\0'; + malloc_mutex_unlock(tsdn, &prof_dump_filename_mtx); + + return false; +} + +void +prof_idump_impl(tsd_t *tsd) { + malloc_mutex_lock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + if (prof_prefix_get(tsd_tsdn(tsd))[0] == '\0') { + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + return; + } + char filename[PATH_MAX + 1]; + prof_dump_filename(tsd, filename, 'i', prof_dump_iseq); + prof_dump_iseq++; + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + prof_dump(tsd, false, filename, false); +} + +bool +prof_mdump_impl(tsd_t *tsd, const char *filename) { + char filename_buf[DUMP_FILENAME_BUFSIZE]; + if (filename == NULL) { + /* No filename specified, so automatically generate one. */ + malloc_mutex_lock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + if (prof_prefix_get(tsd_tsdn(tsd))[0] == '\0') { + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + return true; + } + prof_dump_filename(tsd, filename_buf, 'm', prof_dump_mseq); + prof_dump_mseq++; + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_filename_mtx); + filename = filename_buf; + } + return prof_dump(tsd, true, filename, false); +} + +void +prof_gdump_impl(tsd_t *tsd) { + tsdn_t *tsdn = tsd_tsdn(tsd); + malloc_mutex_lock(tsdn, &prof_dump_filename_mtx); + if (prof_prefix_get(tsdn)[0] == '\0') { + malloc_mutex_unlock(tsdn, &prof_dump_filename_mtx); + return; + } + char filename[DUMP_FILENAME_BUFSIZE]; + prof_dump_filename(tsd, filename, 'u', prof_dump_useq); + prof_dump_useq++; + malloc_mutex_unlock(tsdn, &prof_dump_filename_mtx); + prof_dump(tsd, false, filename, false); +} diff --git a/BeefRT/JEMalloc/src/psset.c b/BeefRT/JEMalloc/src/psset.c new file mode 100644 index 00000000..9a8f054f --- /dev/null +++ b/BeefRT/JEMalloc/src/psset.c @@ -0,0 +1,385 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/psset.h" + +#include "jemalloc/internal/fb.h" + +void +psset_init(psset_t *psset) { + for (unsigned i = 0; i < PSSET_NPSIZES; i++) { + hpdata_age_heap_new(&psset->pageslabs[i]); + } + fb_init(psset->pageslab_bitmap, PSSET_NPSIZES); + memset(&psset->merged_stats, 0, sizeof(psset->merged_stats)); + memset(&psset->stats, 0, sizeof(psset->stats)); + hpdata_empty_list_init(&psset->empty); + for (int i = 0; i < PSSET_NPURGE_LISTS; i++) { + hpdata_purge_list_init(&psset->to_purge[i]); + } + fb_init(psset->purge_bitmap, PSSET_NPURGE_LISTS); + hpdata_hugify_list_init(&psset->to_hugify); +} + +static void +psset_bin_stats_accum(psset_bin_stats_t *dst, psset_bin_stats_t *src) { + dst->npageslabs += src->npageslabs; + dst->nactive += src->nactive; + dst->ndirty += src->ndirty; +} + +void +psset_stats_accum(psset_stats_t *dst, psset_stats_t *src) { + psset_bin_stats_accum(&dst->full_slabs[0], &src->full_slabs[0]); + psset_bin_stats_accum(&dst->full_slabs[1], &src->full_slabs[1]); + psset_bin_stats_accum(&dst->empty_slabs[0], &src->empty_slabs[0]); + psset_bin_stats_accum(&dst->empty_slabs[1], &src->empty_slabs[1]); + for (pszind_t i = 0; i < PSSET_NPSIZES; i++) { + psset_bin_stats_accum(&dst->nonfull_slabs[i][0], + &src->nonfull_slabs[i][0]); + psset_bin_stats_accum(&dst->nonfull_slabs[i][1], + &src->nonfull_slabs[i][1]); + } +} + +/* + * The stats maintenance strategy is to remove a pageslab's contribution to the + * stats when we call psset_update_begin, and re-add it (to a potentially new + * bin) when we call psset_update_end. + */ +JEMALLOC_ALWAYS_INLINE void +psset_bin_stats_insert_remove(psset_t *psset, psset_bin_stats_t *binstats, + hpdata_t *ps, bool insert) { + size_t mul = insert ? (size_t)1 : (size_t)-1; + size_t huge_idx = (size_t)hpdata_huge_get(ps); + + binstats[huge_idx].npageslabs += mul * 1; + binstats[huge_idx].nactive += mul * hpdata_nactive_get(ps); + binstats[huge_idx].ndirty += mul * hpdata_ndirty_get(ps); + + psset->merged_stats.npageslabs += mul * 1; + psset->merged_stats.nactive += mul * hpdata_nactive_get(ps); + psset->merged_stats.ndirty += mul * hpdata_ndirty_get(ps); + + if (config_debug) { + psset_bin_stats_t check_stats = {0}; + for (size_t huge = 0; huge <= 1; huge++) { + psset_bin_stats_accum(&check_stats, + &psset->stats.full_slabs[huge]); + psset_bin_stats_accum(&check_stats, + &psset->stats.empty_slabs[huge]); + for (pszind_t pind = 0; pind < PSSET_NPSIZES; pind++) { + psset_bin_stats_accum(&check_stats, + &psset->stats.nonfull_slabs[pind][huge]); + } + } + assert(psset->merged_stats.npageslabs + == check_stats.npageslabs); + assert(psset->merged_stats.nactive == check_stats.nactive); + assert(psset->merged_stats.ndirty == check_stats.ndirty); + } +} + +static void +psset_bin_stats_insert(psset_t *psset, psset_bin_stats_t *binstats, + hpdata_t *ps) { + psset_bin_stats_insert_remove(psset, binstats, ps, true); +} + +static void +psset_bin_stats_remove(psset_t *psset, psset_bin_stats_t *binstats, + hpdata_t *ps) { + psset_bin_stats_insert_remove(psset, binstats, ps, false); +} + +static void +psset_hpdata_heap_remove(psset_t *psset, pszind_t pind, hpdata_t *ps) { + hpdata_age_heap_remove(&psset->pageslabs[pind], ps); + if (hpdata_age_heap_empty(&psset->pageslabs[pind])) { + fb_unset(psset->pageslab_bitmap, PSSET_NPSIZES, (size_t)pind); + } +} + +static void +psset_hpdata_heap_insert(psset_t *psset, pszind_t pind, hpdata_t *ps) { + if (hpdata_age_heap_empty(&psset->pageslabs[pind])) { + fb_set(psset->pageslab_bitmap, PSSET_NPSIZES, (size_t)pind); + } + hpdata_age_heap_insert(&psset->pageslabs[pind], ps); +} + +static void +psset_stats_insert(psset_t* psset, hpdata_t *ps) { + if (hpdata_empty(ps)) { + psset_bin_stats_insert(psset, psset->stats.empty_slabs, ps); + } else if (hpdata_full(ps)) { + psset_bin_stats_insert(psset, psset->stats.full_slabs, ps); + } else { + size_t longest_free_range = hpdata_longest_free_range_get(ps); + + pszind_t pind = sz_psz2ind(sz_psz_quantize_floor( + longest_free_range << LG_PAGE)); + assert(pind < PSSET_NPSIZES); + + psset_bin_stats_insert(psset, psset->stats.nonfull_slabs[pind], + ps); + } +} + +static void +psset_stats_remove(psset_t *psset, hpdata_t *ps) { + if (hpdata_empty(ps)) { + psset_bin_stats_remove(psset, psset->stats.empty_slabs, ps); + } else if (hpdata_full(ps)) { + psset_bin_stats_remove(psset, psset->stats.full_slabs, ps); + } else { + size_t longest_free_range = hpdata_longest_free_range_get(ps); + + pszind_t pind = sz_psz2ind(sz_psz_quantize_floor( + longest_free_range << LG_PAGE)); + assert(pind < PSSET_NPSIZES); + + psset_bin_stats_remove(psset, psset->stats.nonfull_slabs[pind], + ps); + } +} + +/* + * Put ps into some container so that it can be found during future allocation + * requests. + */ +static void +psset_alloc_container_insert(psset_t *psset, hpdata_t *ps) { + assert(!hpdata_in_psset_alloc_container_get(ps)); + hpdata_in_psset_alloc_container_set(ps, true); + if (hpdata_empty(ps)) { + /* + * This prepend, paired with popping the head in psset_fit, + * means we implement LIFO ordering for the empty slabs set, + * which seems reasonable. + */ + hpdata_empty_list_prepend(&psset->empty, ps); + } else if (hpdata_full(ps)) { + /* + * We don't need to keep track of the full slabs; we're never + * going to return them from a psset_pick_alloc call. + */ + } else { + size_t longest_free_range = hpdata_longest_free_range_get(ps); + + pszind_t pind = sz_psz2ind(sz_psz_quantize_floor( + longest_free_range << LG_PAGE)); + assert(pind < PSSET_NPSIZES); + + psset_hpdata_heap_insert(psset, pind, ps); + } +} + +/* Remove ps from those collections. */ +static void +psset_alloc_container_remove(psset_t *psset, hpdata_t *ps) { + assert(hpdata_in_psset_alloc_container_get(ps)); + hpdata_in_psset_alloc_container_set(ps, false); + + if (hpdata_empty(ps)) { + hpdata_empty_list_remove(&psset->empty, ps); + } else if (hpdata_full(ps)) { + /* Same as above -- do nothing in this case. */ + } else { + size_t longest_free_range = hpdata_longest_free_range_get(ps); + + pszind_t pind = sz_psz2ind(sz_psz_quantize_floor( + longest_free_range << LG_PAGE)); + assert(pind < PSSET_NPSIZES); + + psset_hpdata_heap_remove(psset, pind, ps); + } +} + +static size_t +psset_purge_list_ind(hpdata_t *ps) { + size_t ndirty = hpdata_ndirty_get(ps); + /* Shouldn't have something with no dirty pages purgeable. */ + assert(ndirty > 0); + /* + * Higher indices correspond to lists we'd like to purge earlier; make + * the two highest indices correspond to empty lists, which we attempt + * to purge before purging any non-empty list. This has two advantages: + * - Empty page slabs are the least likely to get reused (we'll only + * pick them for an allocation if we have no other choice). + * - Empty page slabs can purge every dirty page they contain in a + * single call, which is not usually the case. + * + * We purge hugeified empty slabs before nonhugeified ones, on the basis + * that they are fully dirty, while nonhugified slabs might not be, so + * we free up more pages more easily. + */ + if (hpdata_nactive_get(ps) == 0) { + if (hpdata_huge_get(ps)) { + return PSSET_NPURGE_LISTS - 1; + } else { + return PSSET_NPURGE_LISTS - 2; + } + } + + pszind_t pind = sz_psz2ind(sz_psz_quantize_floor(ndirty << LG_PAGE)); + /* + * For non-empty slabs, we may reuse them again. Prefer purging + * non-hugeified slabs before hugeified ones then, among pages of + * similar dirtiness. We still get some benefit from the hugification. + */ + return (size_t)pind * 2 + (hpdata_huge_get(ps) ? 0 : 1); +} + +static void +psset_maybe_remove_purge_list(psset_t *psset, hpdata_t *ps) { + /* + * Remove the hpdata from its purge list (if it's in one). Even if it's + * going to stay in the same one, by appending it during + * psset_update_end, we move it to the end of its queue, so that we + * purge LRU within a given dirtiness bucket. + */ + if (hpdata_purge_allowed_get(ps)) { + size_t ind = psset_purge_list_ind(ps); + hpdata_purge_list_t *purge_list = &psset->to_purge[ind]; + hpdata_purge_list_remove(purge_list, ps); + if (hpdata_purge_list_empty(purge_list)) { + fb_unset(psset->purge_bitmap, PSSET_NPURGE_LISTS, ind); + } + } +} + +static void +psset_maybe_insert_purge_list(psset_t *psset, hpdata_t *ps) { + if (hpdata_purge_allowed_get(ps)) { + size_t ind = psset_purge_list_ind(ps); + hpdata_purge_list_t *purge_list = &psset->to_purge[ind]; + if (hpdata_purge_list_empty(purge_list)) { + fb_set(psset->purge_bitmap, PSSET_NPURGE_LISTS, ind); + } + hpdata_purge_list_append(purge_list, ps); + } + +} + +void +psset_update_begin(psset_t *psset, hpdata_t *ps) { + hpdata_assert_consistent(ps); + assert(hpdata_in_psset_get(ps)); + hpdata_updating_set(ps, true); + psset_stats_remove(psset, ps); + if (hpdata_in_psset_alloc_container_get(ps)) { + /* + * Some metadata updates can break alloc container invariants + * (e.g. the longest free range determines the hpdata_heap_t the + * pageslab lives in). + */ + assert(hpdata_alloc_allowed_get(ps)); + psset_alloc_container_remove(psset, ps); + } + psset_maybe_remove_purge_list(psset, ps); + /* + * We don't update presence in the hugify list; we try to keep it FIFO, + * even in the presence of other metadata updates. We'll update + * presence at the end of the metadata update if necessary. + */ +} + +void +psset_update_end(psset_t *psset, hpdata_t *ps) { + assert(hpdata_in_psset_get(ps)); + hpdata_updating_set(ps, false); + psset_stats_insert(psset, ps); + + /* + * The update begin should have removed ps from whatever alloc container + * it was in. + */ + assert(!hpdata_in_psset_alloc_container_get(ps)); + if (hpdata_alloc_allowed_get(ps)) { + psset_alloc_container_insert(psset, ps); + } + psset_maybe_insert_purge_list(psset, ps); + + if (hpdata_hugify_allowed_get(ps) + && !hpdata_in_psset_hugify_container_get(ps)) { + hpdata_in_psset_hugify_container_set(ps, true); + hpdata_hugify_list_append(&psset->to_hugify, ps); + } else if (!hpdata_hugify_allowed_get(ps) + && hpdata_in_psset_hugify_container_get(ps)) { + hpdata_in_psset_hugify_container_set(ps, false); + hpdata_hugify_list_remove(&psset->to_hugify, ps); + } + hpdata_assert_consistent(ps); +} + +hpdata_t * +psset_pick_alloc(psset_t *psset, size_t size) { + assert((size & PAGE_MASK) == 0); + assert(size <= HUGEPAGE); + + pszind_t min_pind = sz_psz2ind(sz_psz_quantize_ceil(size)); + pszind_t pind = (pszind_t)fb_ffs(psset->pageslab_bitmap, PSSET_NPSIZES, + (size_t)min_pind); + if (pind == PSSET_NPSIZES) { + return hpdata_empty_list_first(&psset->empty); + } + hpdata_t *ps = hpdata_age_heap_first(&psset->pageslabs[pind]); + if (ps == NULL) { + return NULL; + } + + hpdata_assert_consistent(ps); + + return ps; +} + +hpdata_t * +psset_pick_purge(psset_t *psset) { + ssize_t ind_ssz = fb_fls(psset->purge_bitmap, PSSET_NPURGE_LISTS, + PSSET_NPURGE_LISTS - 1); + if (ind_ssz < 0) { + return NULL; + } + pszind_t ind = (pszind_t)ind_ssz; + assert(ind < PSSET_NPURGE_LISTS); + hpdata_t *ps = hpdata_purge_list_first(&psset->to_purge[ind]); + assert(ps != NULL); + return ps; +} + +hpdata_t * +psset_pick_hugify(psset_t *psset) { + return hpdata_hugify_list_first(&psset->to_hugify); +} + +void +psset_insert(psset_t *psset, hpdata_t *ps) { + hpdata_in_psset_set(ps, true); + + psset_stats_insert(psset, ps); + if (hpdata_alloc_allowed_get(ps)) { + psset_alloc_container_insert(psset, ps); + } + psset_maybe_insert_purge_list(psset, ps); + + if (hpdata_hugify_allowed_get(ps)) { + hpdata_in_psset_hugify_container_set(ps, true); + hpdata_hugify_list_append(&psset->to_hugify, ps); + } +} + +void +psset_remove(psset_t *psset, hpdata_t *ps) { + hpdata_in_psset_set(ps, false); + + psset_stats_remove(psset, ps); + if (hpdata_in_psset_alloc_container_get(ps)) { + psset_alloc_container_remove(psset, ps); + } + psset_maybe_remove_purge_list(psset, ps); + if (hpdata_in_psset_hugify_container_get(ps)) { + hpdata_in_psset_hugify_container_set(ps, false); + hpdata_hugify_list_remove(&psset->to_hugify, ps); + } +} diff --git a/BeefRT/JEMalloc/src/rtree.c b/BeefRT/JEMalloc/src/rtree.c new file mode 100644 index 00000000..6496b5af --- /dev/null +++ b/BeefRT/JEMalloc/src/rtree.c @@ -0,0 +1,261 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/mutex.h" + +/* + * Only the most significant bits of keys passed to rtree_{read,write}() are + * used. + */ +bool +rtree_new(rtree_t *rtree, base_t *base, bool zeroed) { +#ifdef JEMALLOC_JET + if (!zeroed) { + memset(rtree, 0, sizeof(rtree_t)); /* Clear root. */ + } +#else + assert(zeroed); +#endif + rtree->base = base; + + if (malloc_mutex_init(&rtree->init_lock, "rtree", WITNESS_RANK_RTREE, + malloc_mutex_rank_exclusive)) { + return true; + } + + return false; +} + +static rtree_node_elm_t * +rtree_node_alloc(tsdn_t *tsdn, rtree_t *rtree, size_t nelms) { + return (rtree_node_elm_t *)base_alloc(tsdn, rtree->base, + nelms * sizeof(rtree_node_elm_t), CACHELINE); +} + +static rtree_leaf_elm_t * +rtree_leaf_alloc(tsdn_t *tsdn, rtree_t *rtree, size_t nelms) { + return (rtree_leaf_elm_t *)base_alloc(tsdn, rtree->base, + nelms * sizeof(rtree_leaf_elm_t), CACHELINE); +} + +static rtree_node_elm_t * +rtree_node_init(tsdn_t *tsdn, rtree_t *rtree, unsigned level, + atomic_p_t *elmp) { + malloc_mutex_lock(tsdn, &rtree->init_lock); + /* + * If *elmp is non-null, then it was initialized with the init lock + * held, so we can get by with 'relaxed' here. + */ + rtree_node_elm_t *node = atomic_load_p(elmp, ATOMIC_RELAXED); + if (node == NULL) { + node = rtree_node_alloc(tsdn, rtree, ZU(1) << + rtree_levels[level].bits); + if (node == NULL) { + malloc_mutex_unlock(tsdn, &rtree->init_lock); + return NULL; + } + /* + * Even though we hold the lock, a later reader might not; we + * need release semantics. + */ + atomic_store_p(elmp, node, ATOMIC_RELEASE); + } + malloc_mutex_unlock(tsdn, &rtree->init_lock); + + return node; +} + +static rtree_leaf_elm_t * +rtree_leaf_init(tsdn_t *tsdn, rtree_t *rtree, atomic_p_t *elmp) { + malloc_mutex_lock(tsdn, &rtree->init_lock); + /* + * If *elmp is non-null, then it was initialized with the init lock + * held, so we can get by with 'relaxed' here. + */ + rtree_leaf_elm_t *leaf = atomic_load_p(elmp, ATOMIC_RELAXED); + if (leaf == NULL) { + leaf = rtree_leaf_alloc(tsdn, rtree, ZU(1) << + rtree_levels[RTREE_HEIGHT-1].bits); + if (leaf == NULL) { + malloc_mutex_unlock(tsdn, &rtree->init_lock); + return NULL; + } + /* + * Even though we hold the lock, a later reader might not; we + * need release semantics. + */ + atomic_store_p(elmp, leaf, ATOMIC_RELEASE); + } + malloc_mutex_unlock(tsdn, &rtree->init_lock); + + return leaf; +} + +static bool +rtree_node_valid(rtree_node_elm_t *node) { + return ((uintptr_t)node != (uintptr_t)0); +} + +static bool +rtree_leaf_valid(rtree_leaf_elm_t *leaf) { + return ((uintptr_t)leaf != (uintptr_t)0); +} + +static rtree_node_elm_t * +rtree_child_node_tryread(rtree_node_elm_t *elm, bool dependent) { + rtree_node_elm_t *node; + + if (dependent) { + node = (rtree_node_elm_t *)atomic_load_p(&elm->child, + ATOMIC_RELAXED); + } else { + node = (rtree_node_elm_t *)atomic_load_p(&elm->child, + ATOMIC_ACQUIRE); + } + + assert(!dependent || node != NULL); + return node; +} + +static rtree_node_elm_t * +rtree_child_node_read(tsdn_t *tsdn, rtree_t *rtree, rtree_node_elm_t *elm, + unsigned level, bool dependent) { + rtree_node_elm_t *node; + + node = rtree_child_node_tryread(elm, dependent); + if (!dependent && unlikely(!rtree_node_valid(node))) { + node = rtree_node_init(tsdn, rtree, level + 1, &elm->child); + } + assert(!dependent || node != NULL); + return node; +} + +static rtree_leaf_elm_t * +rtree_child_leaf_tryread(rtree_node_elm_t *elm, bool dependent) { + rtree_leaf_elm_t *leaf; + + if (dependent) { + leaf = (rtree_leaf_elm_t *)atomic_load_p(&elm->child, + ATOMIC_RELAXED); + } else { + leaf = (rtree_leaf_elm_t *)atomic_load_p(&elm->child, + ATOMIC_ACQUIRE); + } + + assert(!dependent || leaf != NULL); + return leaf; +} + +static rtree_leaf_elm_t * +rtree_child_leaf_read(tsdn_t *tsdn, rtree_t *rtree, rtree_node_elm_t *elm, + unsigned level, bool dependent) { + rtree_leaf_elm_t *leaf; + + leaf = rtree_child_leaf_tryread(elm, dependent); + if (!dependent && unlikely(!rtree_leaf_valid(leaf))) { + leaf = rtree_leaf_init(tsdn, rtree, &elm->child); + } + assert(!dependent || leaf != NULL); + return leaf; +} + +rtree_leaf_elm_t * +rtree_leaf_elm_lookup_hard(tsdn_t *tsdn, rtree_t *rtree, rtree_ctx_t *rtree_ctx, + uintptr_t key, bool dependent, bool init_missing) { + rtree_node_elm_t *node; + rtree_leaf_elm_t *leaf; +#if RTREE_HEIGHT > 1 + node = rtree->root; +#else + leaf = rtree->root; +#endif + + if (config_debug) { + uintptr_t leafkey = rtree_leafkey(key); + for (unsigned i = 0; i < RTREE_CTX_NCACHE; i++) { + assert(rtree_ctx->cache[i].leafkey != leafkey); + } + for (unsigned i = 0; i < RTREE_CTX_NCACHE_L2; i++) { + assert(rtree_ctx->l2_cache[i].leafkey != leafkey); + } + } + +#define RTREE_GET_CHILD(level) { \ + assert(level < RTREE_HEIGHT-1); \ + if (level != 0 && !dependent && \ + unlikely(!rtree_node_valid(node))) { \ + return NULL; \ + } \ + uintptr_t subkey = rtree_subkey(key, level); \ + if (level + 2 < RTREE_HEIGHT) { \ + node = init_missing ? \ + rtree_child_node_read(tsdn, rtree, \ + &node[subkey], level, dependent) : \ + rtree_child_node_tryread(&node[subkey], \ + dependent); \ + } else { \ + leaf = init_missing ? \ + rtree_child_leaf_read(tsdn, rtree, \ + &node[subkey], level, dependent) : \ + rtree_child_leaf_tryread(&node[subkey], \ + dependent); \ + } \ + } + /* + * Cache replacement upon hard lookup (i.e. L1 & L2 rtree cache miss): + * (1) evict last entry in L2 cache; (2) move the collision slot from L1 + * cache down to L2; and 3) fill L1. + */ +#define RTREE_GET_LEAF(level) { \ + assert(level == RTREE_HEIGHT-1); \ + if (!dependent && unlikely(!rtree_leaf_valid(leaf))) { \ + return NULL; \ + } \ + if (RTREE_CTX_NCACHE_L2 > 1) { \ + memmove(&rtree_ctx->l2_cache[1], \ + &rtree_ctx->l2_cache[0], \ + sizeof(rtree_ctx_cache_elm_t) * \ + (RTREE_CTX_NCACHE_L2 - 1)); \ + } \ + size_t slot = rtree_cache_direct_map(key); \ + rtree_ctx->l2_cache[0].leafkey = \ + rtree_ctx->cache[slot].leafkey; \ + rtree_ctx->l2_cache[0].leaf = \ + rtree_ctx->cache[slot].leaf; \ + uintptr_t leafkey = rtree_leafkey(key); \ + rtree_ctx->cache[slot].leafkey = leafkey; \ + rtree_ctx->cache[slot].leaf = leaf; \ + uintptr_t subkey = rtree_subkey(key, level); \ + return &leaf[subkey]; \ + } + if (RTREE_HEIGHT > 1) { + RTREE_GET_CHILD(0) + } + if (RTREE_HEIGHT > 2) { + RTREE_GET_CHILD(1) + } + if (RTREE_HEIGHT > 3) { + for (unsigned i = 2; i < RTREE_HEIGHT-1; i++) { + RTREE_GET_CHILD(i) + } + } + RTREE_GET_LEAF(RTREE_HEIGHT-1) +#undef RTREE_GET_CHILD +#undef RTREE_GET_LEAF + not_reached(); +} + +void +rtree_ctx_data_init(rtree_ctx_t *ctx) { + for (unsigned i = 0; i < RTREE_CTX_NCACHE; i++) { + rtree_ctx_cache_elm_t *cache = &ctx->cache[i]; + cache->leafkey = RTREE_LEAFKEY_INVALID; + cache->leaf = NULL; + } + for (unsigned i = 0; i < RTREE_CTX_NCACHE_L2; i++) { + rtree_ctx_cache_elm_t *cache = &ctx->l2_cache[i]; + cache->leafkey = RTREE_LEAFKEY_INVALID; + cache->leaf = NULL; + } +} diff --git a/BeefRT/JEMalloc/src/safety_check.c b/BeefRT/JEMalloc/src/safety_check.c new file mode 100644 index 00000000..209fdda9 --- /dev/null +++ b/BeefRT/JEMalloc/src/safety_check.c @@ -0,0 +1,36 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +static safety_check_abort_hook_t safety_check_abort; + +void safety_check_fail_sized_dealloc(bool current_dealloc, const void *ptr, + size_t true_size, size_t input_size) { + char *src = current_dealloc ? "the current pointer being freed" : + "in thread cache, possibly from previous deallocations"; + + safety_check_fail(": size mismatch detected (true size %zu " + "vs input size %zu), likely caused by application sized " + "deallocation bugs (source address: %p, %s). Suggest building with " + "--enable-debug or address sanitizer for debugging. Abort.\n", + true_size, input_size, ptr, src); +} + +void safety_check_set_abort(safety_check_abort_hook_t abort_fn) { + safety_check_abort = abort_fn; +} + +void safety_check_fail(const char *format, ...) { + char buf[MALLOC_PRINTF_BUFSIZE]; + + va_list ap; + va_start(ap, format); + malloc_vsnprintf(buf, MALLOC_PRINTF_BUFSIZE, format, ap); + va_end(ap); + + if (safety_check_abort == NULL) { + malloc_write(buf); + abort(); + } else { + safety_check_abort(buf); + } +} diff --git a/BeefRT/JEMalloc/src/san.c b/BeefRT/JEMalloc/src/san.c new file mode 100644 index 00000000..6e512911 --- /dev/null +++ b/BeefRT/JEMalloc/src/san.c @@ -0,0 +1,208 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/ehooks.h" +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/tsd.h" + +/* The sanitizer options. */ +size_t opt_san_guard_large = SAN_GUARD_LARGE_EVERY_N_EXTENTS_DEFAULT; +size_t opt_san_guard_small = SAN_GUARD_SMALL_EVERY_N_EXTENTS_DEFAULT; + +/* Aligned (-1 is off) ptrs will be junked & stashed on dealloc. */ +ssize_t opt_lg_san_uaf_align = SAN_LG_UAF_ALIGN_DEFAULT; + +/* + * Initialized in san_init(). When disabled, the mask is set to (uintptr_t)-1 + * to always fail the nonfast_align check. + */ +uintptr_t san_cache_bin_nonfast_mask = SAN_CACHE_BIN_NONFAST_MASK_DEFAULT; + +static inline void +san_find_guarded_addr(edata_t *edata, uintptr_t *guard1, uintptr_t *guard2, + uintptr_t *addr, size_t size, bool left, bool right) { + assert(!edata_guarded_get(edata)); + assert(size % PAGE == 0); + *addr = (uintptr_t)edata_base_get(edata); + if (left) { + *guard1 = *addr; + *addr += SAN_PAGE_GUARD; + } else { + *guard1 = 0; + } + + if (right) { + *guard2 = *addr + size; + } else { + *guard2 = 0; + } +} + +static inline void +san_find_unguarded_addr(edata_t *edata, uintptr_t *guard1, uintptr_t *guard2, + uintptr_t *addr, size_t size, bool left, bool right) { + assert(edata_guarded_get(edata)); + assert(size % PAGE == 0); + *addr = (uintptr_t)edata_base_get(edata); + if (right) { + *guard2 = *addr + size; + } else { + *guard2 = 0; + } + + if (left) { + *guard1 = *addr - SAN_PAGE_GUARD; + assert(*guard1 != 0); + *addr = *guard1; + } else { + *guard1 = 0; + } +} + +void +san_guard_pages(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, emap_t *emap, + bool left, bool right, bool remap) { + assert(left || right); + if (remap) { + emap_deregister_boundary(tsdn, emap, edata); + } + + size_t size_with_guards = edata_size_get(edata); + size_t usize = (left && right) + ? san_two_side_unguarded_sz(size_with_guards) + : san_one_side_unguarded_sz(size_with_guards); + + uintptr_t guard1, guard2, addr; + san_find_guarded_addr(edata, &guard1, &guard2, &addr, usize, left, + right); + + assert(edata_state_get(edata) == extent_state_active); + ehooks_guard(tsdn, ehooks, (void *)guard1, (void *)guard2); + + /* Update the guarded addr and usable size of the edata. */ + edata_size_set(edata, usize); + edata_addr_set(edata, (void *)addr); + edata_guarded_set(edata, true); + + if (remap) { + emap_register_boundary(tsdn, emap, edata, SC_NSIZES, + /* slab */ false); + } +} + +static void +san_unguard_pages_impl(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + emap_t *emap, bool left, bool right, bool remap) { + assert(left || right); + /* Remove the inner boundary which no longer exists. */ + if (remap) { + assert(edata_state_get(edata) == extent_state_active); + emap_deregister_boundary(tsdn, emap, edata); + } else { + assert(edata_state_get(edata) == extent_state_retained); + } + + size_t size = edata_size_get(edata); + size_t size_with_guards = (left && right) + ? san_two_side_guarded_sz(size) + : san_one_side_guarded_sz(size); + + uintptr_t guard1, guard2, addr; + san_find_unguarded_addr(edata, &guard1, &guard2, &addr, size, left, + right); + + ehooks_unguard(tsdn, ehooks, (void *)guard1, (void *)guard2); + + /* Update the true addr and usable size of the edata. */ + edata_size_set(edata, size_with_guards); + edata_addr_set(edata, (void *)addr); + edata_guarded_set(edata, false); + + /* + * Then re-register the outer boundary including the guards, if + * requested. + */ + if (remap) { + emap_register_boundary(tsdn, emap, edata, SC_NSIZES, + /* slab */ false); + } +} + +void +san_unguard_pages(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + emap_t *emap, bool left, bool right) { + san_unguard_pages_impl(tsdn, ehooks, edata, emap, left, right, + /* remap */ true); +} + +void +san_unguard_pages_pre_destroy(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, + emap_t *emap) { + emap_assert_not_mapped(tsdn, emap, edata); + /* + * We don't want to touch the emap of about to be destroyed extents, as + * they have been unmapped upon eviction from the retained ecache. Also, + * we unguard the extents to the right, because retained extents only + * own their right guard page per san_bump_alloc's logic. + */ + san_unguard_pages_impl(tsdn, ehooks, edata, emap, /* left */ false, + /* right */ true, /* remap */ false); +} + +static bool +san_stashed_corrupted(void *ptr, size_t size) { + if (san_junk_ptr_should_slow()) { + for (size_t i = 0; i < size; i++) { + if (((char *)ptr)[i] != (char)uaf_detect_junk) { + return true; + } + } + return false; + } + + void *first, *mid, *last; + san_junk_ptr_locations(ptr, size, &first, &mid, &last); + if (*(uintptr_t *)first != uaf_detect_junk || + *(uintptr_t *)mid != uaf_detect_junk || + *(uintptr_t *)last != uaf_detect_junk) { + return true; + } + + return false; +} + +void +san_check_stashed_ptrs(void **ptrs, size_t nstashed, size_t usize) { + /* + * Verify that the junked-filled & stashed pointers remain unchanged, to + * detect write-after-free. + */ + for (size_t n = 0; n < nstashed; n++) { + void *stashed = ptrs[n]; + assert(stashed != NULL); + assert(cache_bin_nonfast_aligned(stashed)); + if (unlikely(san_stashed_corrupted(stashed, usize))) { + safety_check_fail(": Write-after-free " + "detected on deallocated pointer %p (size %zu).\n", + stashed, usize); + } + } +} + +void +tsd_san_init(tsd_t *tsd) { + *tsd_san_extents_until_guard_smallp_get(tsd) = opt_san_guard_small; + *tsd_san_extents_until_guard_largep_get(tsd) = opt_san_guard_large; +} + +void +san_init(ssize_t lg_san_uaf_align) { + assert(lg_san_uaf_align == -1 || lg_san_uaf_align >= LG_PAGE); + if (lg_san_uaf_align == -1) { + san_cache_bin_nonfast_mask = (uintptr_t)-1; + return; + } + + san_cache_bin_nonfast_mask = ((uintptr_t)1 << lg_san_uaf_align) - 1; +} diff --git a/BeefRT/JEMalloc/src/san_bump.c b/BeefRT/JEMalloc/src/san_bump.c new file mode 100644 index 00000000..88897455 --- /dev/null +++ b/BeefRT/JEMalloc/src/san_bump.c @@ -0,0 +1,104 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/san_bump.h" +#include "jemalloc/internal/pac.h" +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/ehooks.h" +#include "jemalloc/internal/edata_cache.h" + +static bool +san_bump_grow_locked(tsdn_t *tsdn, san_bump_alloc_t *sba, pac_t *pac, + ehooks_t *ehooks, size_t size); + +edata_t * +san_bump_alloc(tsdn_t *tsdn, san_bump_alloc_t* sba, pac_t *pac, + ehooks_t *ehooks, size_t size, bool zero) { + assert(san_bump_enabled()); + + edata_t* to_destroy; + size_t guarded_size = san_one_side_guarded_sz(size); + + malloc_mutex_lock(tsdn, &sba->mtx); + + if (sba->curr_reg == NULL || + edata_size_get(sba->curr_reg) < guarded_size) { + /* + * If the current region can't accommodate the allocation, + * try replacing it with a larger one and destroy current if the + * replacement succeeds. + */ + to_destroy = sba->curr_reg; + bool err = san_bump_grow_locked(tsdn, sba, pac, ehooks, + guarded_size); + if (err) { + goto label_err; + } + } else { + to_destroy = NULL; + } + assert(guarded_size <= edata_size_get(sba->curr_reg)); + size_t trail_size = edata_size_get(sba->curr_reg) - guarded_size; + + edata_t* edata; + if (trail_size != 0) { + edata_t* curr_reg_trail = extent_split_wrapper(tsdn, pac, + ehooks, sba->curr_reg, guarded_size, trail_size, + /* holding_core_locks */ true); + if (curr_reg_trail == NULL) { + goto label_err; + } + edata = sba->curr_reg; + sba->curr_reg = curr_reg_trail; + } else { + edata = sba->curr_reg; + sba->curr_reg = NULL; + } + + malloc_mutex_unlock(tsdn, &sba->mtx); + + assert(!edata_guarded_get(edata)); + assert(sba->curr_reg == NULL || !edata_guarded_get(sba->curr_reg)); + assert(to_destroy == NULL || !edata_guarded_get(to_destroy)); + + if (to_destroy != NULL) { + extent_destroy_wrapper(tsdn, pac, ehooks, to_destroy); + } + + san_guard_pages(tsdn, ehooks, edata, pac->emap, /* left */ false, + /* right */ true, /* remap */ true); + + if (extent_commit_zero(tsdn, ehooks, edata, /* commit */ true, zero, + /* growing_retained */ false)) { + extent_record(tsdn, pac, ehooks, &pac->ecache_retained, + edata); + return NULL; + } + + if (config_prof) { + extent_gdump_add(tsdn, edata); + } + + return edata; +label_err: + malloc_mutex_unlock(tsdn, &sba->mtx); + return NULL; +} + +static bool +san_bump_grow_locked(tsdn_t *tsdn, san_bump_alloc_t *sba, pac_t *pac, + ehooks_t *ehooks, size_t size) { + malloc_mutex_assert_owner(tsdn, &sba->mtx); + + bool committed = false, zeroed = false; + size_t alloc_size = size > SBA_RETAINED_ALLOC_SIZE ? size : + SBA_RETAINED_ALLOC_SIZE; + assert((alloc_size & PAGE_MASK) == 0); + sba->curr_reg = extent_alloc_wrapper(tsdn, pac, ehooks, NULL, + alloc_size, PAGE, zeroed, &committed, + /* growing_retained */ true); + if (sba->curr_reg == NULL) { + return true; + } + return false; +} diff --git a/BeefRT/JEMalloc/src/sc.c b/BeefRT/JEMalloc/src/sc.c new file mode 100644 index 00000000..e4a94d89 --- /dev/null +++ b/BeefRT/JEMalloc/src/sc.c @@ -0,0 +1,306 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/bit_util.h" +#include "jemalloc/internal/bitmap.h" +#include "jemalloc/internal/pages.h" +#include "jemalloc/internal/sc.h" + +/* + * This module computes the size classes used to satisfy allocations. The logic + * here was ported more or less line-by-line from a shell script, and because of + * that is not the most idiomatic C. Eventually we should fix this, but for now + * at least the damage is compartmentalized to this file. + */ + +size_t +reg_size_compute(int lg_base, int lg_delta, int ndelta) { + return (ZU(1) << lg_base) + (ZU(ndelta) << lg_delta); +} + +/* Returns the number of pages in the slab. */ +static int +slab_size(int lg_page, int lg_base, int lg_delta, int ndelta) { + size_t page = (ZU(1) << lg_page); + size_t reg_size = reg_size_compute(lg_base, lg_delta, ndelta); + + size_t try_slab_size = page; + size_t try_nregs = try_slab_size / reg_size; + size_t perfect_slab_size = 0; + bool perfect = false; + /* + * This loop continues until we find the least common multiple of the + * page size and size class size. Size classes are all of the form + * base + ndelta * delta == (ndelta + base/ndelta) * delta, which is + * (ndelta + ngroup) * delta. The way we choose slabbing strategies + * means that delta is at most the page size and ndelta < ngroup. So + * the loop executes for at most 2 * ngroup - 1 iterations, which is + * also the bound on the number of pages in a slab chosen by default. + * With the current default settings, this is at most 7. + */ + while (!perfect) { + perfect_slab_size = try_slab_size; + size_t perfect_nregs = try_nregs; + try_slab_size += page; + try_nregs = try_slab_size / reg_size; + if (perfect_slab_size == perfect_nregs * reg_size) { + perfect = true; + } + } + return (int)(perfect_slab_size / page); +} + +static void +size_class( + /* Output. */ + sc_t *sc, + /* Configuration decisions. */ + int lg_max_lookup, int lg_page, int lg_ngroup, + /* Inputs specific to the size class. */ + int index, int lg_base, int lg_delta, int ndelta) { + sc->index = index; + sc->lg_base = lg_base; + sc->lg_delta = lg_delta; + sc->ndelta = ndelta; + size_t size = reg_size_compute(lg_base, lg_delta, ndelta); + sc->psz = (size % (ZU(1) << lg_page) == 0); + if (index == 0) { + assert(!sc->psz); + } + if (size < (ZU(1) << (lg_page + lg_ngroup))) { + sc->bin = true; + sc->pgs = slab_size(lg_page, lg_base, lg_delta, ndelta); + } else { + sc->bin = false; + sc->pgs = 0; + } + if (size <= (ZU(1) << lg_max_lookup)) { + sc->lg_delta_lookup = lg_delta; + } else { + sc->lg_delta_lookup = 0; + } +} + +static void +size_classes( + /* Output. */ + sc_data_t *sc_data, + /* Determined by the system. */ + size_t lg_ptr_size, int lg_quantum, + /* Configuration decisions. */ + int lg_tiny_min, int lg_max_lookup, int lg_page, int lg_ngroup) { + int ptr_bits = (1 << lg_ptr_size) * 8; + int ngroup = (1 << lg_ngroup); + int ntiny = 0; + int nlbins = 0; + int lg_tiny_maxclass = (unsigned)-1; + int nbins = 0; + int npsizes = 0; + + int index = 0; + + int ndelta = 0; + int lg_base = lg_tiny_min; + int lg_delta = lg_base; + + /* Outputs that we update as we go. */ + size_t lookup_maxclass = 0; + size_t small_maxclass = 0; + int lg_large_minclass = 0; + size_t large_maxclass = 0; + + /* Tiny size classes. */ + while (lg_base < lg_quantum) { + sc_t *sc = &sc_data->sc[index]; + size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index, + lg_base, lg_delta, ndelta); + if (sc->lg_delta_lookup != 0) { + nlbins = index + 1; + } + if (sc->psz) { + npsizes++; + } + if (sc->bin) { + nbins++; + } + ntiny++; + /* Final written value is correct. */ + lg_tiny_maxclass = lg_base; + index++; + lg_delta = lg_base; + lg_base++; + } + + /* First non-tiny (pseudo) group. */ + if (ntiny != 0) { + sc_t *sc = &sc_data->sc[index]; + /* + * See the note in sc.h; the first non-tiny size class has an + * unusual encoding. + */ + lg_base--; + ndelta = 1; + size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index, + lg_base, lg_delta, ndelta); + index++; + lg_base++; + lg_delta++; + if (sc->psz) { + npsizes++; + } + if (sc->bin) { + nbins++; + } + } + while (ndelta < ngroup) { + sc_t *sc = &sc_data->sc[index]; + size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index, + lg_base, lg_delta, ndelta); + index++; + ndelta++; + if (sc->psz) { + npsizes++; + } + if (sc->bin) { + nbins++; + } + } + + /* All remaining groups. */ + lg_base = lg_base + lg_ngroup; + while (lg_base < ptr_bits - 1) { + ndelta = 1; + int ndelta_limit; + if (lg_base == ptr_bits - 2) { + ndelta_limit = ngroup - 1; + } else { + ndelta_limit = ngroup; + } + while (ndelta <= ndelta_limit) { + sc_t *sc = &sc_data->sc[index]; + size_class(sc, lg_max_lookup, lg_page, lg_ngroup, index, + lg_base, lg_delta, ndelta); + if (sc->lg_delta_lookup != 0) { + nlbins = index + 1; + /* Final written value is correct. */ + lookup_maxclass = (ZU(1) << lg_base) + + (ZU(ndelta) << lg_delta); + } + if (sc->psz) { + npsizes++; + } + if (sc->bin) { + nbins++; + /* Final written value is correct. */ + small_maxclass = (ZU(1) << lg_base) + + (ZU(ndelta) << lg_delta); + if (lg_ngroup > 0) { + lg_large_minclass = lg_base + 1; + } else { + lg_large_minclass = lg_base + 2; + } + } + large_maxclass = (ZU(1) << lg_base) + + (ZU(ndelta) << lg_delta); + index++; + ndelta++; + } + lg_base++; + lg_delta++; + } + /* Additional outputs. */ + int nsizes = index; + unsigned lg_ceil_nsizes = lg_ceil(nsizes); + + /* Fill in the output data. */ + sc_data->ntiny = ntiny; + sc_data->nlbins = nlbins; + sc_data->nbins = nbins; + sc_data->nsizes = nsizes; + sc_data->lg_ceil_nsizes = lg_ceil_nsizes; + sc_data->npsizes = npsizes; + sc_data->lg_tiny_maxclass = lg_tiny_maxclass; + sc_data->lookup_maxclass = lookup_maxclass; + sc_data->small_maxclass = small_maxclass; + sc_data->lg_large_minclass = lg_large_minclass; + sc_data->large_minclass = (ZU(1) << lg_large_minclass); + sc_data->large_maxclass = large_maxclass; + + /* + * We compute these values in two ways: + * - Incrementally, as above. + * - In macros, in sc.h. + * The computation is easier when done incrementally, but putting it in + * a constant makes it available to the fast paths without having to + * touch the extra global cacheline. We assert, however, that the two + * computations are equivalent. + */ + assert(sc_data->npsizes == SC_NPSIZES); + assert(sc_data->lg_tiny_maxclass == SC_LG_TINY_MAXCLASS); + assert(sc_data->small_maxclass == SC_SMALL_MAXCLASS); + assert(sc_data->large_minclass == SC_LARGE_MINCLASS); + assert(sc_data->lg_large_minclass == SC_LG_LARGE_MINCLASS); + assert(sc_data->large_maxclass == SC_LARGE_MAXCLASS); + + /* + * In the allocation fastpath, we want to assume that we can + * unconditionally subtract the requested allocation size from + * a ssize_t, and detect passing through 0 correctly. This + * results in optimal generated code. For this to work, the + * maximum allocation size must be less than SSIZE_MAX. + */ + assert(SC_LARGE_MAXCLASS < SSIZE_MAX); +} + +void +sc_data_init(sc_data_t *sc_data) { + size_classes(sc_data, LG_SIZEOF_PTR, LG_QUANTUM, SC_LG_TINY_MIN, + SC_LG_MAX_LOOKUP, LG_PAGE, SC_LG_NGROUP); + + sc_data->initialized = true; +} + +static void +sc_data_update_sc_slab_size(sc_t *sc, size_t reg_size, size_t pgs_guess) { + size_t min_pgs = reg_size / PAGE; + if (reg_size % PAGE != 0) { + min_pgs++; + } + /* + * BITMAP_MAXBITS is actually determined by putting the smallest + * possible size-class on one page, so this can never be 0. + */ + size_t max_pgs = BITMAP_MAXBITS * reg_size / PAGE; + + assert(min_pgs <= max_pgs); + assert(min_pgs > 0); + assert(max_pgs >= 1); + if (pgs_guess < min_pgs) { + sc->pgs = (int)min_pgs; + } else if (pgs_guess > max_pgs) { + sc->pgs = (int)max_pgs; + } else { + sc->pgs = (int)pgs_guess; + } +} + +void +sc_data_update_slab_size(sc_data_t *data, size_t begin, size_t end, int pgs) { + assert(data->initialized); + for (int i = 0; i < data->nsizes; i++) { + sc_t *sc = &data->sc[i]; + if (!sc->bin) { + break; + } + size_t reg_size = reg_size_compute(sc->lg_base, sc->lg_delta, + sc->ndelta); + if (begin <= reg_size && reg_size <= end) { + sc_data_update_sc_slab_size(sc, reg_size, pgs); + } + } +} + +void +sc_boot(sc_data_t *data) { + sc_data_init(data); +} diff --git a/BeefRT/JEMalloc/src/sec.c b/BeefRT/JEMalloc/src/sec.c new file mode 100644 index 00000000..df675590 --- /dev/null +++ b/BeefRT/JEMalloc/src/sec.c @@ -0,0 +1,422 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/sec.h" + +static edata_t *sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size, + size_t alignment, bool zero, bool guarded, bool frequent_reuse, + bool *deferred_work_generated); +static bool sec_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool zero, bool *deferred_work_generated); +static bool sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, + size_t old_size, size_t new_size, bool *deferred_work_generated); +static void sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated); + +static void +sec_bin_init(sec_bin_t *bin) { + bin->being_batch_filled = false; + bin->bytes_cur = 0; + edata_list_active_init(&bin->freelist); +} + +bool +sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, pai_t *fallback, + const sec_opts_t *opts) { + assert(opts->max_alloc >= PAGE); + + size_t max_alloc = PAGE_FLOOR(opts->max_alloc); + pszind_t npsizes = sz_psz2ind(max_alloc) + 1; + + size_t sz_shards = opts->nshards * sizeof(sec_shard_t); + size_t sz_bins = opts->nshards * (size_t)npsizes * sizeof(sec_bin_t); + size_t sz_alloc = sz_shards + sz_bins; + void *dynalloc = base_alloc(tsdn, base, sz_alloc, CACHELINE); + if (dynalloc == NULL) { + return true; + } + sec_shard_t *shard_cur = (sec_shard_t *)dynalloc; + sec->shards = shard_cur; + sec_bin_t *bin_cur = (sec_bin_t *)&shard_cur[opts->nshards]; + /* Just for asserts, below. */ + sec_bin_t *bin_start = bin_cur; + + for (size_t i = 0; i < opts->nshards; i++) { + sec_shard_t *shard = shard_cur; + shard_cur++; + bool err = malloc_mutex_init(&shard->mtx, "sec_shard", + WITNESS_RANK_SEC_SHARD, malloc_mutex_rank_exclusive); + if (err) { + return true; + } + shard->enabled = true; + shard->bins = bin_cur; + for (pszind_t j = 0; j < npsizes; j++) { + sec_bin_init(&shard->bins[j]); + bin_cur++; + } + shard->bytes_cur = 0; + shard->to_flush_next = 0; + } + /* + * Should have exactly matched the bin_start to the first unused byte + * after the shards. + */ + assert((void *)shard_cur == (void *)bin_start); + /* And the last bin to use up the last bytes of the allocation. */ + assert((char *)bin_cur == ((char *)dynalloc + sz_alloc)); + sec->fallback = fallback; + + + sec->opts = *opts; + sec->npsizes = npsizes; + + /* + * Initialize these last so that an improper use of an SEC whose + * initialization failed will segfault in an easy-to-spot way. + */ + sec->pai.alloc = &sec_alloc; + sec->pai.alloc_batch = &pai_alloc_batch_default; + sec->pai.expand = &sec_expand; + sec->pai.shrink = &sec_shrink; + sec->pai.dalloc = &sec_dalloc; + sec->pai.dalloc_batch = &pai_dalloc_batch_default; + + return false; +} + +static sec_shard_t * +sec_shard_pick(tsdn_t *tsdn, sec_t *sec) { + /* + * Eventually, we should implement affinity, tracking source shard using + * the edata_t's newly freed up fields. For now, just randomly + * distribute across all shards. + */ + if (tsdn_null(tsdn)) { + return &sec->shards[0]; + } + tsd_t *tsd = tsdn_tsd(tsdn); + uint8_t *idxp = tsd_sec_shardp_get(tsd); + if (*idxp == (uint8_t)-1) { + /* + * First use; initialize using the trick from Daniel Lemire's + * "A fast alternative to the modulo reduction. Use a 64 bit + * number to store 32 bits, since we'll deliberately overflow + * when we multiply by the number of shards. + */ + uint64_t rand32 = prng_lg_range_u64(tsd_prng_statep_get(tsd), 32); + uint32_t idx = + (uint32_t)((rand32 * (uint64_t)sec->opts.nshards) >> 32); + assert(idx < (uint32_t)sec->opts.nshards); + *idxp = (uint8_t)idx; + } + return &sec->shards[*idxp]; +} + +/* + * Perhaps surprisingly, this can be called on the alloc pathways; if we hit an + * empty cache, we'll try to fill it, which can push the shard over it's limit. + */ +static void +sec_flush_some_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + edata_list_active_t to_flush; + edata_list_active_init(&to_flush); + while (shard->bytes_cur > sec->opts.bytes_after_flush) { + /* Pick a victim. */ + sec_bin_t *bin = &shard->bins[shard->to_flush_next]; + + /* Update our victim-picking state. */ + shard->to_flush_next++; + if (shard->to_flush_next == sec->npsizes) { + shard->to_flush_next = 0; + } + + assert(shard->bytes_cur >= bin->bytes_cur); + if (bin->bytes_cur != 0) { + shard->bytes_cur -= bin->bytes_cur; + bin->bytes_cur = 0; + edata_list_active_concat(&to_flush, &bin->freelist); + } + /* + * Either bin->bytes_cur was 0, in which case we didn't touch + * the bin list but it should be empty anyways (or else we + * missed a bytes_cur update on a list modification), or it + * *was* 0 and we emptied it ourselves. Either way, it should + * be empty now. + */ + assert(edata_list_active_empty(&bin->freelist)); + } + + malloc_mutex_unlock(tsdn, &shard->mtx); + bool deferred_work_generated = false; + pai_dalloc_batch(tsdn, sec->fallback, &to_flush, + &deferred_work_generated); +} + +static edata_t * +sec_shard_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard, + sec_bin_t *bin) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + if (!shard->enabled) { + return NULL; + } + edata_t *edata = edata_list_active_first(&bin->freelist); + if (edata != NULL) { + edata_list_active_remove(&bin->freelist, edata); + assert(edata_size_get(edata) <= bin->bytes_cur); + bin->bytes_cur -= edata_size_get(edata); + assert(edata_size_get(edata) <= shard->bytes_cur); + shard->bytes_cur -= edata_size_get(edata); + } + return edata; +} + +static edata_t * +sec_batch_fill_and_alloc(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard, + sec_bin_t *bin, size_t size) { + malloc_mutex_assert_not_owner(tsdn, &shard->mtx); + + edata_list_active_t result; + edata_list_active_init(&result); + bool deferred_work_generated = false; + size_t nalloc = pai_alloc_batch(tsdn, sec->fallback, size, + 1 + sec->opts.batch_fill_extra, &result, &deferred_work_generated); + + edata_t *ret = edata_list_active_first(&result); + if (ret != NULL) { + edata_list_active_remove(&result, ret); + } + + malloc_mutex_lock(tsdn, &shard->mtx); + bin->being_batch_filled = false; + /* + * Handle the easy case first: nothing to cache. Note that this can + * only happen in case of OOM, since sec_alloc checks the expected + * number of allocs, and doesn't bother going down the batch_fill + * pathway if there won't be anything left to cache. So to be in this + * code path, we must have asked for > 1 alloc, but only gotten 1 back. + */ + if (nalloc <= 1) { + malloc_mutex_unlock(tsdn, &shard->mtx); + return ret; + } + + size_t new_cached_bytes = (nalloc - 1) * size; + + edata_list_active_concat(&bin->freelist, &result); + bin->bytes_cur += new_cached_bytes; + shard->bytes_cur += new_cached_bytes; + + if (shard->bytes_cur > sec->opts.max_bytes) { + sec_flush_some_and_unlock(tsdn, sec, shard); + } else { + malloc_mutex_unlock(tsdn, &shard->mtx); + } + + return ret; +} + +static edata_t * +sec_alloc(tsdn_t *tsdn, pai_t *self, size_t size, size_t alignment, bool zero, + bool guarded, bool frequent_reuse, bool *deferred_work_generated) { + assert((size & PAGE_MASK) == 0); + assert(!guarded); + + sec_t *sec = (sec_t *)self; + + if (zero || alignment > PAGE || sec->opts.nshards == 0 + || size > sec->opts.max_alloc) { + return pai_alloc(tsdn, sec->fallback, size, alignment, zero, + /* guarded */ false, frequent_reuse, + deferred_work_generated); + } + pszind_t pszind = sz_psz2ind(size); + assert(pszind < sec->npsizes); + + sec_shard_t *shard = sec_shard_pick(tsdn, sec); + sec_bin_t *bin = &shard->bins[pszind]; + bool do_batch_fill = false; + + malloc_mutex_lock(tsdn, &shard->mtx); + edata_t *edata = sec_shard_alloc_locked(tsdn, sec, shard, bin); + if (edata == NULL) { + if (!bin->being_batch_filled + && sec->opts.batch_fill_extra > 0) { + bin->being_batch_filled = true; + do_batch_fill = true; + } + } + malloc_mutex_unlock(tsdn, &shard->mtx); + if (edata == NULL) { + if (do_batch_fill) { + edata = sec_batch_fill_and_alloc(tsdn, sec, shard, bin, + size); + } else { + edata = pai_alloc(tsdn, sec->fallback, size, alignment, + zero, /* guarded */ false, frequent_reuse, + deferred_work_generated); + } + } + return edata; +} + +static bool +sec_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, + size_t new_size, bool zero, bool *deferred_work_generated) { + sec_t *sec = (sec_t *)self; + return pai_expand(tsdn, sec->fallback, edata, old_size, new_size, zero, + deferred_work_generated); +} + +static bool +sec_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, + size_t new_size, bool *deferred_work_generated) { + sec_t *sec = (sec_t *)self; + return pai_shrink(tsdn, sec->fallback, edata, old_size, new_size, + deferred_work_generated); +} + +static void +sec_flush_all_locked(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + shard->bytes_cur = 0; + edata_list_active_t to_flush; + edata_list_active_init(&to_flush); + for (pszind_t i = 0; i < sec->npsizes; i++) { + sec_bin_t *bin = &shard->bins[i]; + bin->bytes_cur = 0; + edata_list_active_concat(&to_flush, &bin->freelist); + } + + /* + * Ordinarily we would try to avoid doing the batch deallocation while + * holding the shard mutex, but the flush_all pathways only happen when + * we're disabling the HPA or resetting the arena, both of which are + * rare pathways. + */ + bool deferred_work_generated = false; + pai_dalloc_batch(tsdn, sec->fallback, &to_flush, + &deferred_work_generated); +} + +static void +sec_shard_dalloc_and_unlock(tsdn_t *tsdn, sec_t *sec, sec_shard_t *shard, + edata_t *edata) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + assert(shard->bytes_cur <= sec->opts.max_bytes); + size_t size = edata_size_get(edata); + pszind_t pszind = sz_psz2ind(size); + assert(pszind < sec->npsizes); + /* + * Prepending here results in LIFO allocation per bin, which seems + * reasonable. + */ + sec_bin_t *bin = &shard->bins[pszind]; + edata_list_active_prepend(&bin->freelist, edata); + bin->bytes_cur += size; + shard->bytes_cur += size; + if (shard->bytes_cur > sec->opts.max_bytes) { + /* + * We've exceeded the shard limit. We make two nods in the + * direction of fragmentation avoidance: we flush everything in + * the shard, rather than one particular bin, and we hold the + * lock while flushing (in case one of the extents we flush is + * highly preferred from a fragmentation-avoidance perspective + * in the backing allocator). This has the extra advantage of + * not requiring advanced cache balancing strategies. + */ + sec_flush_some_and_unlock(tsdn, sec, shard); + malloc_mutex_assert_not_owner(tsdn, &shard->mtx); + } else { + malloc_mutex_unlock(tsdn, &shard->mtx); + } +} + +static void +sec_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata, + bool *deferred_work_generated) { + sec_t *sec = (sec_t *)self; + if (sec->opts.nshards == 0 + || edata_size_get(edata) > sec->opts.max_alloc) { + pai_dalloc(tsdn, sec->fallback, edata, + deferred_work_generated); + return; + } + sec_shard_t *shard = sec_shard_pick(tsdn, sec); + malloc_mutex_lock(tsdn, &shard->mtx); + if (shard->enabled) { + sec_shard_dalloc_and_unlock(tsdn, sec, shard, edata); + } else { + malloc_mutex_unlock(tsdn, &shard->mtx); + pai_dalloc(tsdn, sec->fallback, edata, + deferred_work_generated); + } +} + +void +sec_flush(tsdn_t *tsdn, sec_t *sec) { + for (size_t i = 0; i < sec->opts.nshards; i++) { + malloc_mutex_lock(tsdn, &sec->shards[i].mtx); + sec_flush_all_locked(tsdn, sec, &sec->shards[i]); + malloc_mutex_unlock(tsdn, &sec->shards[i].mtx); + } +} + +void +sec_disable(tsdn_t *tsdn, sec_t *sec) { + for (size_t i = 0; i < sec->opts.nshards; i++) { + malloc_mutex_lock(tsdn, &sec->shards[i].mtx); + sec->shards[i].enabled = false; + sec_flush_all_locked(tsdn, sec, &sec->shards[i]); + malloc_mutex_unlock(tsdn, &sec->shards[i].mtx); + } +} + +void +sec_stats_merge(tsdn_t *tsdn, sec_t *sec, sec_stats_t *stats) { + size_t sum = 0; + for (size_t i = 0; i < sec->opts.nshards; i++) { + /* + * We could save these lock acquisitions by making bytes_cur + * atomic, but stats collection is rare anyways and we expect + * the number and type of stats to get more interesting. + */ + malloc_mutex_lock(tsdn, &sec->shards[i].mtx); + sum += sec->shards[i].bytes_cur; + malloc_mutex_unlock(tsdn, &sec->shards[i].mtx); + } + stats->bytes += sum; +} + +void +sec_mutex_stats_read(tsdn_t *tsdn, sec_t *sec, + mutex_prof_data_t *mutex_prof_data) { + for (size_t i = 0; i < sec->opts.nshards; i++) { + malloc_mutex_lock(tsdn, &sec->shards[i].mtx); + malloc_mutex_prof_accum(tsdn, mutex_prof_data, + &sec->shards[i].mtx); + malloc_mutex_unlock(tsdn, &sec->shards[i].mtx); + } +} + +void +sec_prefork2(tsdn_t *tsdn, sec_t *sec) { + for (size_t i = 0; i < sec->opts.nshards; i++) { + malloc_mutex_prefork(tsdn, &sec->shards[i].mtx); + } +} + +void +sec_postfork_parent(tsdn_t *tsdn, sec_t *sec) { + for (size_t i = 0; i < sec->opts.nshards; i++) { + malloc_mutex_postfork_parent(tsdn, &sec->shards[i].mtx); + } +} + +void +sec_postfork_child(tsdn_t *tsdn, sec_t *sec) { + for (size_t i = 0; i < sec->opts.nshards; i++) { + malloc_mutex_postfork_child(tsdn, &sec->shards[i].mtx); + } +} diff --git a/BeefRT/JEMalloc/src/stats.c b/BeefRT/JEMalloc/src/stats.c new file mode 100644 index 00000000..efc70fd3 --- /dev/null +++ b/BeefRT/JEMalloc/src/stats.c @@ -0,0 +1,1973 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/ctl.h" +#include "jemalloc/internal/emitter.h" +#include "jemalloc/internal/fxp.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/mutex_prof.h" +#include "jemalloc/internal/prof_stats.h" + +const char *global_mutex_names[mutex_prof_num_global_mutexes] = { +#define OP(mtx) #mtx, + MUTEX_PROF_GLOBAL_MUTEXES +#undef OP +}; + +const char *arena_mutex_names[mutex_prof_num_arena_mutexes] = { +#define OP(mtx) #mtx, + MUTEX_PROF_ARENA_MUTEXES +#undef OP +}; + +#define CTL_GET(n, v, t) do { \ + size_t sz = sizeof(t); \ + xmallctl(n, (void *)v, &sz, NULL, 0); \ +} while (0) + +#define CTL_LEAF_PREPARE(mib, miblen, name) do { \ + assert(miblen < CTL_MAX_DEPTH); \ + size_t miblen_new = CTL_MAX_DEPTH; \ + xmallctlmibnametomib(mib, miblen, name, &miblen_new); \ + assert(miblen_new > miblen); \ +} while (0) + +#define CTL_LEAF(mib, miblen, leaf, v, t) do { \ + assert(miblen < CTL_MAX_DEPTH); \ + size_t miblen_new = CTL_MAX_DEPTH; \ + size_t sz = sizeof(t); \ + xmallctlbymibname(mib, miblen, leaf, &miblen_new, (void *)v, \ + &sz, NULL, 0); \ + assert(miblen_new == miblen + 1); \ +} while (0) + +#define CTL_M2_GET(n, i, v, t) do { \ + size_t mib[CTL_MAX_DEPTH]; \ + size_t miblen = sizeof(mib) / sizeof(size_t); \ + size_t sz = sizeof(t); \ + xmallctlnametomib(n, mib, &miblen); \ + mib[2] = (i); \ + xmallctlbymib(mib, miblen, (void *)v, &sz, NULL, 0); \ +} while (0) + +/******************************************************************************/ +/* Data. */ + +bool opt_stats_print = false; +char opt_stats_print_opts[stats_print_tot_num_options+1] = ""; + +int64_t opt_stats_interval = STATS_INTERVAL_DEFAULT; +char opt_stats_interval_opts[stats_print_tot_num_options+1] = ""; + +static counter_accum_t stats_interval_accumulated; +/* Per thread batch accum size for stats_interval. */ +static uint64_t stats_interval_accum_batch; + +/******************************************************************************/ + +static uint64_t +rate_per_second(uint64_t value, uint64_t uptime_ns) { + uint64_t billion = 1000000000; + if (uptime_ns == 0 || value == 0) { + return 0; + } + if (uptime_ns < billion) { + return value; + } else { + uint64_t uptime_s = uptime_ns / billion; + return value / uptime_s; + } +} + +/* Calculate x.yyy and output a string (takes a fixed sized char array). */ +static bool +get_rate_str(uint64_t dividend, uint64_t divisor, char str[6]) { + if (divisor == 0 || dividend > divisor) { + /* The rate is not supposed to be greater than 1. */ + return true; + } + if (dividend > 0) { + assert(UINT64_MAX / dividend >= 1000); + } + + unsigned n = (unsigned)((dividend * 1000) / divisor); + if (n < 10) { + malloc_snprintf(str, 6, "0.00%u", n); + } else if (n < 100) { + malloc_snprintf(str, 6, "0.0%u", n); + } else if (n < 1000) { + malloc_snprintf(str, 6, "0.%u", n); + } else { + malloc_snprintf(str, 6, "1"); + } + + return false; +} + +static void +mutex_stats_init_cols(emitter_row_t *row, const char *table_name, + emitter_col_t *name, + emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters], + emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters]) { + mutex_prof_uint64_t_counter_ind_t k_uint64_t = 0; + mutex_prof_uint32_t_counter_ind_t k_uint32_t = 0; + + emitter_col_t *col; + + if (name != NULL) { + emitter_col_init(name, row); + name->justify = emitter_justify_left; + name->width = 21; + name->type = emitter_type_title; + name->str_val = table_name; + } + +#define WIDTH_uint32_t 12 +#define WIDTH_uint64_t 16 +#define OP(counter, counter_type, human, derived, base_counter) \ + col = &col_##counter_type[k_##counter_type]; \ + ++k_##counter_type; \ + emitter_col_init(col, row); \ + col->justify = emitter_justify_right; \ + col->width = derived ? 8 : WIDTH_##counter_type; \ + col->type = emitter_type_title; \ + col->str_val = human; + MUTEX_PROF_COUNTERS +#undef OP +#undef WIDTH_uint32_t +#undef WIDTH_uint64_t + col_uint64_t[mutex_counter_total_wait_time_ps].width = 10; +} + +static void +mutex_stats_read_global(size_t mib[], size_t miblen, const char *name, + emitter_col_t *col_name, + emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters], + emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters], + uint64_t uptime) { + CTL_LEAF_PREPARE(mib, miblen, name); + size_t miblen_name = miblen + 1; + + col_name->str_val = name; + + emitter_col_t *dst; +#define EMITTER_TYPE_uint32_t emitter_type_uint32 +#define EMITTER_TYPE_uint64_t emitter_type_uint64 +#define OP(counter, counter_type, human, derived, base_counter) \ + dst = &col_##counter_type[mutex_counter_##counter]; \ + dst->type = EMITTER_TYPE_##counter_type; \ + if (!derived) { \ + CTL_LEAF(mib, miblen_name, #counter, \ + (counter_type *)&dst->bool_val, counter_type); \ + } else { \ + emitter_col_t *base = \ + &col_##counter_type[mutex_counter_##base_counter]; \ + dst->counter_type##_val = \ + (counter_type)rate_per_second( \ + base->counter_type##_val, uptime); \ + } + MUTEX_PROF_COUNTERS +#undef OP +#undef EMITTER_TYPE_uint32_t +#undef EMITTER_TYPE_uint64_t +} + +static void +mutex_stats_read_arena(size_t mib[], size_t miblen, const char *name, + emitter_col_t *col_name, + emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters], + emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters], + uint64_t uptime) { + CTL_LEAF_PREPARE(mib, miblen, name); + size_t miblen_name = miblen + 1; + + col_name->str_val = name; + + emitter_col_t *dst; +#define EMITTER_TYPE_uint32_t emitter_type_uint32 +#define EMITTER_TYPE_uint64_t emitter_type_uint64 +#define OP(counter, counter_type, human, derived, base_counter) \ + dst = &col_##counter_type[mutex_counter_##counter]; \ + dst->type = EMITTER_TYPE_##counter_type; \ + if (!derived) { \ + CTL_LEAF(mib, miblen_name, #counter, \ + (counter_type *)&dst->bool_val, counter_type); \ + } else { \ + emitter_col_t *base = \ + &col_##counter_type[mutex_counter_##base_counter]; \ + dst->counter_type##_val = \ + (counter_type)rate_per_second( \ + base->counter_type##_val, uptime); \ + } + MUTEX_PROF_COUNTERS +#undef OP +#undef EMITTER_TYPE_uint32_t +#undef EMITTER_TYPE_uint64_t +} + +static void +mutex_stats_read_arena_bin(size_t mib[], size_t miblen, + emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters], + emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters], + uint64_t uptime) { + CTL_LEAF_PREPARE(mib, miblen, "mutex"); + size_t miblen_mutex = miblen + 1; + + emitter_col_t *dst; + +#define EMITTER_TYPE_uint32_t emitter_type_uint32 +#define EMITTER_TYPE_uint64_t emitter_type_uint64 +#define OP(counter, counter_type, human, derived, base_counter) \ + dst = &col_##counter_type[mutex_counter_##counter]; \ + dst->type = EMITTER_TYPE_##counter_type; \ + if (!derived) { \ + CTL_LEAF(mib, miblen_mutex, #counter, \ + (counter_type *)&dst->bool_val, counter_type); \ + } else { \ + emitter_col_t *base = \ + &col_##counter_type[mutex_counter_##base_counter]; \ + dst->counter_type##_val = \ + (counter_type)rate_per_second( \ + base->counter_type##_val, uptime); \ + } + MUTEX_PROF_COUNTERS +#undef OP +#undef EMITTER_TYPE_uint32_t +#undef EMITTER_TYPE_uint64_t +} + +/* "row" can be NULL to avoid emitting in table mode. */ +static void +mutex_stats_emit(emitter_t *emitter, emitter_row_t *row, + emitter_col_t col_uint64_t[mutex_prof_num_uint64_t_counters], + emitter_col_t col_uint32_t[mutex_prof_num_uint32_t_counters]) { + if (row != NULL) { + emitter_table_row(emitter, row); + } + + mutex_prof_uint64_t_counter_ind_t k_uint64_t = 0; + mutex_prof_uint32_t_counter_ind_t k_uint32_t = 0; + + emitter_col_t *col; + +#define EMITTER_TYPE_uint32_t emitter_type_uint32 +#define EMITTER_TYPE_uint64_t emitter_type_uint64 +#define OP(counter, type, human, derived, base_counter) \ + if (!derived) { \ + col = &col_##type[k_##type]; \ + ++k_##type; \ + emitter_json_kv(emitter, #counter, EMITTER_TYPE_##type, \ + (const void *)&col->bool_val); \ + } + MUTEX_PROF_COUNTERS; +#undef OP +#undef EMITTER_TYPE_uint32_t +#undef EMITTER_TYPE_uint64_t +} + +#define COL_DECLARE(column_name) \ + emitter_col_t col_##column_name; + +#define COL_INIT(row_name, column_name, left_or_right, col_width, etype)\ + emitter_col_init(&col_##column_name, &row_name); \ + col_##column_name.justify = emitter_justify_##left_or_right; \ + col_##column_name.width = col_width; \ + col_##column_name.type = emitter_type_##etype; + +#define COL(row_name, column_name, left_or_right, col_width, etype) \ + COL_DECLARE(column_name); \ + COL_INIT(row_name, column_name, left_or_right, col_width, etype) + +#define COL_HDR_DECLARE(column_name) \ + COL_DECLARE(column_name); \ + emitter_col_t header_##column_name; + +#define COL_HDR_INIT(row_name, column_name, human, left_or_right, \ + col_width, etype) \ + COL_INIT(row_name, column_name, left_or_right, col_width, etype)\ + emitter_col_init(&header_##column_name, &header_##row_name); \ + header_##column_name.justify = emitter_justify_##left_or_right; \ + header_##column_name.width = col_width; \ + header_##column_name.type = emitter_type_title; \ + header_##column_name.str_val = human ? human : #column_name; + +#define COL_HDR(row_name, column_name, human, left_or_right, col_width, \ + etype) \ + COL_HDR_DECLARE(column_name) \ + COL_HDR_INIT(row_name, column_name, human, left_or_right, \ + col_width, etype) + +JEMALLOC_COLD +static void +stats_arena_bins_print(emitter_t *emitter, bool mutex, unsigned i, + uint64_t uptime) { + size_t page; + bool in_gap, in_gap_prev; + unsigned nbins, j; + + CTL_GET("arenas.page", &page, size_t); + + CTL_GET("arenas.nbins", &nbins, unsigned); + + emitter_row_t header_row; + emitter_row_init(&header_row); + + emitter_row_t row; + emitter_row_init(&row); + + bool prof_stats_on = config_prof && opt_prof && opt_prof_stats + && i == MALLCTL_ARENAS_ALL; + + COL_HDR(row, size, NULL, right, 20, size) + COL_HDR(row, ind, NULL, right, 4, unsigned) + COL_HDR(row, allocated, NULL, right, 13, uint64) + COL_HDR(row, nmalloc, NULL, right, 13, uint64) + COL_HDR(row, nmalloc_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, ndalloc, NULL, right, 13, uint64) + COL_HDR(row, ndalloc_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, nrequests, NULL, right, 13, uint64) + COL_HDR(row, nrequests_ps, "(#/sec)", right, 10, uint64) + COL_HDR_DECLARE(prof_live_requested); + COL_HDR_DECLARE(prof_live_count); + COL_HDR_DECLARE(prof_accum_requested); + COL_HDR_DECLARE(prof_accum_count); + if (prof_stats_on) { + COL_HDR_INIT(row, prof_live_requested, NULL, right, 21, uint64) + COL_HDR_INIT(row, prof_live_count, NULL, right, 17, uint64) + COL_HDR_INIT(row, prof_accum_requested, NULL, right, 21, uint64) + COL_HDR_INIT(row, prof_accum_count, NULL, right, 17, uint64) + } + COL_HDR(row, nshards, NULL, right, 9, unsigned) + COL_HDR(row, curregs, NULL, right, 13, size) + COL_HDR(row, curslabs, NULL, right, 13, size) + COL_HDR(row, nonfull_slabs, NULL, right, 15, size) + COL_HDR(row, regs, NULL, right, 5, unsigned) + COL_HDR(row, pgs, NULL, right, 4, size) + /* To buffer a right- and left-justified column. */ + COL_HDR(row, justify_spacer, NULL, right, 1, title) + COL_HDR(row, util, NULL, right, 6, title) + COL_HDR(row, nfills, NULL, right, 13, uint64) + COL_HDR(row, nfills_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, nflushes, NULL, right, 13, uint64) + COL_HDR(row, nflushes_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, nslabs, NULL, right, 13, uint64) + COL_HDR(row, nreslabs, NULL, right, 13, uint64) + COL_HDR(row, nreslabs_ps, "(#/sec)", right, 8, uint64) + + /* Don't want to actually print the name. */ + header_justify_spacer.str_val = " "; + col_justify_spacer.str_val = " "; + + emitter_col_t col_mutex64[mutex_prof_num_uint64_t_counters]; + emitter_col_t col_mutex32[mutex_prof_num_uint32_t_counters]; + + emitter_col_t header_mutex64[mutex_prof_num_uint64_t_counters]; + emitter_col_t header_mutex32[mutex_prof_num_uint32_t_counters]; + + if (mutex) { + mutex_stats_init_cols(&row, NULL, NULL, col_mutex64, + col_mutex32); + mutex_stats_init_cols(&header_row, NULL, NULL, header_mutex64, + header_mutex32); + } + + /* + * We print a "bins:" header as part of the table row; we need to adjust + * the header size column to compensate. + */ + header_size.width -=5; + emitter_table_printf(emitter, "bins:"); + emitter_table_row(emitter, &header_row); + emitter_json_array_kv_begin(emitter, "bins"); + + size_t stats_arenas_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(stats_arenas_mib, 0, "stats.arenas"); + stats_arenas_mib[2] = i; + CTL_LEAF_PREPARE(stats_arenas_mib, 3, "bins"); + + size_t arenas_bin_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(arenas_bin_mib, 0, "arenas.bin"); + + size_t prof_stats_mib[CTL_MAX_DEPTH]; + if (prof_stats_on) { + CTL_LEAF_PREPARE(prof_stats_mib, 0, "prof.stats.bins"); + } + + for (j = 0, in_gap = false; j < nbins; j++) { + uint64_t nslabs; + size_t reg_size, slab_size, curregs; + size_t curslabs; + size_t nonfull_slabs; + uint32_t nregs, nshards; + uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes; + uint64_t nreslabs; + prof_stats_t prof_live; + prof_stats_t prof_accum; + + stats_arenas_mib[4] = j; + arenas_bin_mib[2] = j; + + CTL_LEAF(stats_arenas_mib, 5, "nslabs", &nslabs, uint64_t); + + if (prof_stats_on) { + prof_stats_mib[3] = j; + CTL_LEAF(prof_stats_mib, 4, "live", &prof_live, + prof_stats_t); + CTL_LEAF(prof_stats_mib, 4, "accum", &prof_accum, + prof_stats_t); + } + + in_gap_prev = in_gap; + if (prof_stats_on) { + in_gap = (nslabs == 0 && prof_accum.count == 0); + } else { + in_gap = (nslabs == 0); + } + + if (in_gap_prev && !in_gap) { + emitter_table_printf(emitter, + " ---\n"); + } + + if (in_gap && !emitter_outputs_json(emitter)) { + continue; + } + + CTL_LEAF(arenas_bin_mib, 3, "size", ®_size, size_t); + CTL_LEAF(arenas_bin_mib, 3, "nregs", &nregs, uint32_t); + CTL_LEAF(arenas_bin_mib, 3, "slab_size", &slab_size, size_t); + CTL_LEAF(arenas_bin_mib, 3, "nshards", &nshards, uint32_t); + CTL_LEAF(stats_arenas_mib, 5, "nmalloc", &nmalloc, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "ndalloc", &ndalloc, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "curregs", &curregs, size_t); + CTL_LEAF(stats_arenas_mib, 5, "nrequests", &nrequests, + uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "nfills", &nfills, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "nflushes", &nflushes, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "nreslabs", &nreslabs, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "curslabs", &curslabs, size_t); + CTL_LEAF(stats_arenas_mib, 5, "nonfull_slabs", &nonfull_slabs, + size_t); + + if (mutex) { + mutex_stats_read_arena_bin(stats_arenas_mib, 5, + col_mutex64, col_mutex32, uptime); + } + + emitter_json_object_begin(emitter); + emitter_json_kv(emitter, "nmalloc", emitter_type_uint64, + &nmalloc); + emitter_json_kv(emitter, "ndalloc", emitter_type_uint64, + &ndalloc); + emitter_json_kv(emitter, "curregs", emitter_type_size, + &curregs); + emitter_json_kv(emitter, "nrequests", emitter_type_uint64, + &nrequests); + if (prof_stats_on) { + emitter_json_kv(emitter, "prof_live_requested", + emitter_type_uint64, &prof_live.req_sum); + emitter_json_kv(emitter, "prof_live_count", + emitter_type_uint64, &prof_live.count); + emitter_json_kv(emitter, "prof_accum_requested", + emitter_type_uint64, &prof_accum.req_sum); + emitter_json_kv(emitter, "prof_accum_count", + emitter_type_uint64, &prof_accum.count); + } + emitter_json_kv(emitter, "nfills", emitter_type_uint64, + &nfills); + emitter_json_kv(emitter, "nflushes", emitter_type_uint64, + &nflushes); + emitter_json_kv(emitter, "nreslabs", emitter_type_uint64, + &nreslabs); + emitter_json_kv(emitter, "curslabs", emitter_type_size, + &curslabs); + emitter_json_kv(emitter, "nonfull_slabs", emitter_type_size, + &nonfull_slabs); + if (mutex) { + emitter_json_object_kv_begin(emitter, "mutex"); + mutex_stats_emit(emitter, NULL, col_mutex64, + col_mutex32); + emitter_json_object_end(emitter); + } + emitter_json_object_end(emitter); + + size_t availregs = nregs * curslabs; + char util[6]; + if (get_rate_str((uint64_t)curregs, (uint64_t)availregs, util)) + { + if (availregs == 0) { + malloc_snprintf(util, sizeof(util), "1"); + } else if (curregs > availregs) { + /* + * Race detected: the counters were read in + * separate mallctl calls and concurrent + * operations happened in between. In this case + * no meaningful utilization can be computed. + */ + malloc_snprintf(util, sizeof(util), " race"); + } else { + not_reached(); + } + } + + col_size.size_val = reg_size; + col_ind.unsigned_val = j; + col_allocated.size_val = curregs * reg_size; + col_nmalloc.uint64_val = nmalloc; + col_nmalloc_ps.uint64_val = rate_per_second(nmalloc, uptime); + col_ndalloc.uint64_val = ndalloc; + col_ndalloc_ps.uint64_val = rate_per_second(ndalloc, uptime); + col_nrequests.uint64_val = nrequests; + col_nrequests_ps.uint64_val = rate_per_second(nrequests, uptime); + if (prof_stats_on) { + col_prof_live_requested.uint64_val = prof_live.req_sum; + col_prof_live_count.uint64_val = prof_live.count; + col_prof_accum_requested.uint64_val = + prof_accum.req_sum; + col_prof_accum_count.uint64_val = prof_accum.count; + } + col_nshards.unsigned_val = nshards; + col_curregs.size_val = curregs; + col_curslabs.size_val = curslabs; + col_nonfull_slabs.size_val = nonfull_slabs; + col_regs.unsigned_val = nregs; + col_pgs.size_val = slab_size / page; + col_util.str_val = util; + col_nfills.uint64_val = nfills; + col_nfills_ps.uint64_val = rate_per_second(nfills, uptime); + col_nflushes.uint64_val = nflushes; + col_nflushes_ps.uint64_val = rate_per_second(nflushes, uptime); + col_nslabs.uint64_val = nslabs; + col_nreslabs.uint64_val = nreslabs; + col_nreslabs_ps.uint64_val = rate_per_second(nreslabs, uptime); + + /* + * Note that mutex columns were initialized above, if mutex == + * true. + */ + + emitter_table_row(emitter, &row); + } + emitter_json_array_end(emitter); /* Close "bins". */ + + if (in_gap) { + emitter_table_printf(emitter, " ---\n"); + } +} + +JEMALLOC_COLD +static void +stats_arena_lextents_print(emitter_t *emitter, unsigned i, uint64_t uptime) { + unsigned nbins, nlextents, j; + bool in_gap, in_gap_prev; + + CTL_GET("arenas.nbins", &nbins, unsigned); + CTL_GET("arenas.nlextents", &nlextents, unsigned); + + emitter_row_t header_row; + emitter_row_init(&header_row); + emitter_row_t row; + emitter_row_init(&row); + + bool prof_stats_on = config_prof && opt_prof && opt_prof_stats + && i == MALLCTL_ARENAS_ALL; + + COL_HDR(row, size, NULL, right, 20, size) + COL_HDR(row, ind, NULL, right, 4, unsigned) + COL_HDR(row, allocated, NULL, right, 13, size) + COL_HDR(row, nmalloc, NULL, right, 13, uint64) + COL_HDR(row, nmalloc_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, ndalloc, NULL, right, 13, uint64) + COL_HDR(row, ndalloc_ps, "(#/sec)", right, 8, uint64) + COL_HDR(row, nrequests, NULL, right, 13, uint64) + COL_HDR(row, nrequests_ps, "(#/sec)", right, 8, uint64) + COL_HDR_DECLARE(prof_live_requested) + COL_HDR_DECLARE(prof_live_count) + COL_HDR_DECLARE(prof_accum_requested) + COL_HDR_DECLARE(prof_accum_count) + if (prof_stats_on) { + COL_HDR_INIT(row, prof_live_requested, NULL, right, 21, uint64) + COL_HDR_INIT(row, prof_live_count, NULL, right, 17, uint64) + COL_HDR_INIT(row, prof_accum_requested, NULL, right, 21, uint64) + COL_HDR_INIT(row, prof_accum_count, NULL, right, 17, uint64) + } + COL_HDR(row, curlextents, NULL, right, 13, size) + + /* As with bins, we label the large extents table. */ + header_size.width -= 6; + emitter_table_printf(emitter, "large:"); + emitter_table_row(emitter, &header_row); + emitter_json_array_kv_begin(emitter, "lextents"); + + size_t stats_arenas_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(stats_arenas_mib, 0, "stats.arenas"); + stats_arenas_mib[2] = i; + CTL_LEAF_PREPARE(stats_arenas_mib, 3, "lextents"); + + size_t arenas_lextent_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(arenas_lextent_mib, 0, "arenas.lextent"); + + size_t prof_stats_mib[CTL_MAX_DEPTH]; + if (prof_stats_on) { + CTL_LEAF_PREPARE(prof_stats_mib, 0, "prof.stats.lextents"); + } + + for (j = 0, in_gap = false; j < nlextents; j++) { + uint64_t nmalloc, ndalloc, nrequests; + size_t lextent_size, curlextents; + prof_stats_t prof_live; + prof_stats_t prof_accum; + + stats_arenas_mib[4] = j; + arenas_lextent_mib[2] = j; + + CTL_LEAF(stats_arenas_mib, 5, "nmalloc", &nmalloc, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "ndalloc", &ndalloc, uint64_t); + CTL_LEAF(stats_arenas_mib, 5, "nrequests", &nrequests, + uint64_t); + + in_gap_prev = in_gap; + in_gap = (nrequests == 0); + + if (in_gap_prev && !in_gap) { + emitter_table_printf(emitter, + " ---\n"); + } + + CTL_LEAF(arenas_lextent_mib, 3, "size", &lextent_size, size_t); + CTL_LEAF(stats_arenas_mib, 5, "curlextents", &curlextents, + size_t); + + if (prof_stats_on) { + prof_stats_mib[3] = j; + CTL_LEAF(prof_stats_mib, 4, "live", &prof_live, + prof_stats_t); + CTL_LEAF(prof_stats_mib, 4, "accum", &prof_accum, + prof_stats_t); + } + + emitter_json_object_begin(emitter); + if (prof_stats_on) { + emitter_json_kv(emitter, "prof_live_requested", + emitter_type_uint64, &prof_live.req_sum); + emitter_json_kv(emitter, "prof_live_count", + emitter_type_uint64, &prof_live.count); + emitter_json_kv(emitter, "prof_accum_requested", + emitter_type_uint64, &prof_accum.req_sum); + emitter_json_kv(emitter, "prof_accum_count", + emitter_type_uint64, &prof_accum.count); + } + emitter_json_kv(emitter, "curlextents", emitter_type_size, + &curlextents); + emitter_json_object_end(emitter); + + col_size.size_val = lextent_size; + col_ind.unsigned_val = nbins + j; + col_allocated.size_val = curlextents * lextent_size; + col_nmalloc.uint64_val = nmalloc; + col_nmalloc_ps.uint64_val = rate_per_second(nmalloc, uptime); + col_ndalloc.uint64_val = ndalloc; + col_ndalloc_ps.uint64_val = rate_per_second(ndalloc, uptime); + col_nrequests.uint64_val = nrequests; + col_nrequests_ps.uint64_val = rate_per_second(nrequests, uptime); + if (prof_stats_on) { + col_prof_live_requested.uint64_val = prof_live.req_sum; + col_prof_live_count.uint64_val = prof_live.count; + col_prof_accum_requested.uint64_val = + prof_accum.req_sum; + col_prof_accum_count.uint64_val = prof_accum.count; + } + col_curlextents.size_val = curlextents; + + if (!in_gap) { + emitter_table_row(emitter, &row); + } + } + emitter_json_array_end(emitter); /* Close "lextents". */ + if (in_gap) { + emitter_table_printf(emitter, " ---\n"); + } +} + +JEMALLOC_COLD +static void +stats_arena_extents_print(emitter_t *emitter, unsigned i) { + unsigned j; + bool in_gap, in_gap_prev; + emitter_row_t header_row; + emitter_row_init(&header_row); + emitter_row_t row; + emitter_row_init(&row); + + COL_HDR(row, size, NULL, right, 20, size) + COL_HDR(row, ind, NULL, right, 4, unsigned) + COL_HDR(row, ndirty, NULL, right, 13, size) + COL_HDR(row, dirty, NULL, right, 13, size) + COL_HDR(row, nmuzzy, NULL, right, 13, size) + COL_HDR(row, muzzy, NULL, right, 13, size) + COL_HDR(row, nretained, NULL, right, 13, size) + COL_HDR(row, retained, NULL, right, 13, size) + COL_HDR(row, ntotal, NULL, right, 13, size) + COL_HDR(row, total, NULL, right, 13, size) + + /* Label this section. */ + header_size.width -= 8; + emitter_table_printf(emitter, "extents:"); + emitter_table_row(emitter, &header_row); + emitter_json_array_kv_begin(emitter, "extents"); + + size_t stats_arenas_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(stats_arenas_mib, 0, "stats.arenas"); + stats_arenas_mib[2] = i; + CTL_LEAF_PREPARE(stats_arenas_mib, 3, "extents"); + + in_gap = false; + for (j = 0; j < SC_NPSIZES; j++) { + size_t ndirty, nmuzzy, nretained, total, dirty_bytes, + muzzy_bytes, retained_bytes, total_bytes; + stats_arenas_mib[4] = j; + + CTL_LEAF(stats_arenas_mib, 5, "ndirty", &ndirty, size_t); + CTL_LEAF(stats_arenas_mib, 5, "nmuzzy", &nmuzzy, size_t); + CTL_LEAF(stats_arenas_mib, 5, "nretained", &nretained, size_t); + CTL_LEAF(stats_arenas_mib, 5, "dirty_bytes", &dirty_bytes, + size_t); + CTL_LEAF(stats_arenas_mib, 5, "muzzy_bytes", &muzzy_bytes, + size_t); + CTL_LEAF(stats_arenas_mib, 5, "retained_bytes", + &retained_bytes, size_t); + + total = ndirty + nmuzzy + nretained; + total_bytes = dirty_bytes + muzzy_bytes + retained_bytes; + + in_gap_prev = in_gap; + in_gap = (total == 0); + + if (in_gap_prev && !in_gap) { + emitter_table_printf(emitter, + " ---\n"); + } + + emitter_json_object_begin(emitter); + emitter_json_kv(emitter, "ndirty", emitter_type_size, &ndirty); + emitter_json_kv(emitter, "nmuzzy", emitter_type_size, &nmuzzy); + emitter_json_kv(emitter, "nretained", emitter_type_size, + &nretained); + + emitter_json_kv(emitter, "dirty_bytes", emitter_type_size, + &dirty_bytes); + emitter_json_kv(emitter, "muzzy_bytes", emitter_type_size, + &muzzy_bytes); + emitter_json_kv(emitter, "retained_bytes", emitter_type_size, + &retained_bytes); + emitter_json_object_end(emitter); + + col_size.size_val = sz_pind2sz(j); + col_ind.size_val = j; + col_ndirty.size_val = ndirty; + col_dirty.size_val = dirty_bytes; + col_nmuzzy.size_val = nmuzzy; + col_muzzy.size_val = muzzy_bytes; + col_nretained.size_val = nretained; + col_retained.size_val = retained_bytes; + col_ntotal.size_val = total; + col_total.size_val = total_bytes; + + if (!in_gap) { + emitter_table_row(emitter, &row); + } + } + emitter_json_array_end(emitter); /* Close "extents". */ + if (in_gap) { + emitter_table_printf(emitter, " ---\n"); + } +} + +static void +stats_arena_hpa_shard_print(emitter_t *emitter, unsigned i, uint64_t uptime) { + emitter_row_t header_row; + emitter_row_init(&header_row); + emitter_row_t row; + emitter_row_init(&row); + + uint64_t npurge_passes; + uint64_t npurges; + uint64_t nhugifies; + uint64_t ndehugifies; + + CTL_M2_GET("stats.arenas.0.hpa_shard.npurge_passes", + i, &npurge_passes, uint64_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.npurges", + i, &npurges, uint64_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.nhugifies", + i, &nhugifies, uint64_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.ndehugifies", + i, &ndehugifies, uint64_t); + + size_t npageslabs_huge; + size_t nactive_huge; + size_t ndirty_huge; + + size_t npageslabs_nonhuge; + size_t nactive_nonhuge; + size_t ndirty_nonhuge; + size_t nretained_nonhuge; + + size_t sec_bytes; + CTL_M2_GET("stats.arenas.0.hpa_sec_bytes", i, &sec_bytes, size_t); + emitter_kv(emitter, "sec_bytes", "Bytes in small extent cache", + emitter_type_size, &sec_bytes); + + /* First, global stats. */ + emitter_table_printf(emitter, + "HPA shard stats:\n" + " Purge passes: %" FMTu64 " (%" FMTu64 " / sec)\n" + " Purges: %" FMTu64 " (%" FMTu64 " / sec)\n" + " Hugeifies: %" FMTu64 " (%" FMTu64 " / sec)\n" + " Dehugifies: %" FMTu64 " (%" FMTu64 " / sec)\n" + "\n", + npurge_passes, rate_per_second(npurge_passes, uptime), + npurges, rate_per_second(npurges, uptime), + nhugifies, rate_per_second(nhugifies, uptime), + ndehugifies, rate_per_second(ndehugifies, uptime)); + + emitter_json_object_kv_begin(emitter, "hpa_shard"); + emitter_json_kv(emitter, "npurge_passes", emitter_type_uint64, + &npurge_passes); + emitter_json_kv(emitter, "npurges", emitter_type_uint64, + &npurges); + emitter_json_kv(emitter, "nhugifies", emitter_type_uint64, + &nhugifies); + emitter_json_kv(emitter, "ndehugifies", emitter_type_uint64, + &ndehugifies); + + /* Next, full slab stats. */ + CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.npageslabs_huge", + i, &npageslabs_huge, size_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.nactive_huge", + i, &nactive_huge, size_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.ndirty_huge", + i, &ndirty_huge, size_t); + + CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.npageslabs_nonhuge", + i, &npageslabs_nonhuge, size_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.nactive_nonhuge", + i, &nactive_nonhuge, size_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.full_slabs.ndirty_nonhuge", + i, &ndirty_nonhuge, size_t); + nretained_nonhuge = npageslabs_nonhuge * HUGEPAGE_PAGES + - nactive_nonhuge - ndirty_nonhuge; + + emitter_table_printf(emitter, + " In full slabs:\n" + " npageslabs: %zu huge, %zu nonhuge\n" + " nactive: %zu huge, %zu nonhuge \n" + " ndirty: %zu huge, %zu nonhuge \n" + " nretained: 0 huge, %zu nonhuge \n", + npageslabs_huge, npageslabs_nonhuge, + nactive_huge, nactive_nonhuge, + ndirty_huge, ndirty_nonhuge, + nretained_nonhuge); + + emitter_json_object_kv_begin(emitter, "full_slabs"); + emitter_json_kv(emitter, "npageslabs_huge", emitter_type_size, + &npageslabs_huge); + emitter_json_kv(emitter, "nactive_huge", emitter_type_size, + &nactive_huge); + emitter_json_kv(emitter, "nactive_huge", emitter_type_size, + &nactive_huge); + emitter_json_kv(emitter, "npageslabs_nonhuge", emitter_type_size, + &npageslabs_nonhuge); + emitter_json_kv(emitter, "nactive_nonhuge", emitter_type_size, + &nactive_nonhuge); + emitter_json_kv(emitter, "ndirty_nonhuge", emitter_type_size, + &ndirty_nonhuge); + emitter_json_object_end(emitter); /* End "full_slabs" */ + + /* Next, empty slab stats. */ + CTL_M2_GET("stats.arenas.0.hpa_shard.empty_slabs.npageslabs_huge", + i, &npageslabs_huge, size_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.empty_slabs.nactive_huge", + i, &nactive_huge, size_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.empty_slabs.ndirty_huge", + i, &ndirty_huge, size_t); + + CTL_M2_GET("stats.arenas.0.hpa_shard.empty_slabs.npageslabs_nonhuge", + i, &npageslabs_nonhuge, size_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.empty_slabs.nactive_nonhuge", + i, &nactive_nonhuge, size_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.empty_slabs.ndirty_nonhuge", + i, &ndirty_nonhuge, size_t); + nretained_nonhuge = npageslabs_nonhuge * HUGEPAGE_PAGES + - nactive_nonhuge - ndirty_nonhuge; + + emitter_table_printf(emitter, + " In empty slabs:\n" + " npageslabs: %zu huge, %zu nonhuge\n" + " nactive: %zu huge, %zu nonhuge \n" + " ndirty: %zu huge, %zu nonhuge \n" + " nretained: 0 huge, %zu nonhuge \n" + "\n", + npageslabs_huge, npageslabs_nonhuge, + nactive_huge, nactive_nonhuge, + ndirty_huge, ndirty_nonhuge, + nretained_nonhuge); + + emitter_json_object_kv_begin(emitter, "empty_slabs"); + emitter_json_kv(emitter, "npageslabs_huge", emitter_type_size, + &npageslabs_huge); + emitter_json_kv(emitter, "nactive_huge", emitter_type_size, + &nactive_huge); + emitter_json_kv(emitter, "nactive_huge", emitter_type_size, + &nactive_huge); + emitter_json_kv(emitter, "npageslabs_nonhuge", emitter_type_size, + &npageslabs_nonhuge); + emitter_json_kv(emitter, "nactive_nonhuge", emitter_type_size, + &nactive_nonhuge); + emitter_json_kv(emitter, "ndirty_nonhuge", emitter_type_size, + &ndirty_nonhuge); + emitter_json_object_end(emitter); /* End "empty_slabs" */ + + COL_HDR(row, size, NULL, right, 20, size) + COL_HDR(row, ind, NULL, right, 4, unsigned) + COL_HDR(row, npageslabs_huge, NULL, right, 16, size) + COL_HDR(row, nactive_huge, NULL, right, 16, size) + COL_HDR(row, ndirty_huge, NULL, right, 16, size) + COL_HDR(row, npageslabs_nonhuge, NULL, right, 20, size) + COL_HDR(row, nactive_nonhuge, NULL, right, 20, size) + COL_HDR(row, ndirty_nonhuge, NULL, right, 20, size) + COL_HDR(row, nretained_nonhuge, NULL, right, 20, size) + + size_t stats_arenas_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(stats_arenas_mib, 0, "stats.arenas"); + stats_arenas_mib[2] = i; + CTL_LEAF_PREPARE(stats_arenas_mib, 3, "hpa_shard.nonfull_slabs"); + + emitter_table_row(emitter, &header_row); + emitter_json_array_kv_begin(emitter, "nonfull_slabs"); + bool in_gap = false; + for (pszind_t j = 0; j < PSSET_NPSIZES && j < SC_NPSIZES; j++) { + stats_arenas_mib[5] = j; + + CTL_LEAF(stats_arenas_mib, 6, "npageslabs_huge", + &npageslabs_huge, size_t); + CTL_LEAF(stats_arenas_mib, 6, "nactive_huge", + &nactive_huge, size_t); + CTL_LEAF(stats_arenas_mib, 6, "ndirty_huge", + &ndirty_huge, size_t); + + CTL_LEAF(stats_arenas_mib, 6, "npageslabs_nonhuge", + &npageslabs_nonhuge, size_t); + CTL_LEAF(stats_arenas_mib, 6, "nactive_nonhuge", + &nactive_nonhuge, size_t); + CTL_LEAF(stats_arenas_mib, 6, "ndirty_nonhuge", + &ndirty_nonhuge, size_t); + nretained_nonhuge = npageslabs_nonhuge * HUGEPAGE_PAGES + - nactive_nonhuge - ndirty_nonhuge; + + bool in_gap_prev = in_gap; + in_gap = (npageslabs_huge == 0 && npageslabs_nonhuge == 0); + if (in_gap_prev && !in_gap) { + emitter_table_printf(emitter, + " ---\n"); + } + + col_size.size_val = sz_pind2sz(j); + col_ind.size_val = j; + col_npageslabs_huge.size_val = npageslabs_huge; + col_nactive_huge.size_val = nactive_huge; + col_ndirty_huge.size_val = ndirty_huge; + col_npageslabs_nonhuge.size_val = npageslabs_nonhuge; + col_nactive_nonhuge.size_val = nactive_nonhuge; + col_ndirty_nonhuge.size_val = ndirty_nonhuge; + col_nretained_nonhuge.size_val = nretained_nonhuge; + if (!in_gap) { + emitter_table_row(emitter, &row); + } + + emitter_json_object_begin(emitter); + emitter_json_kv(emitter, "npageslabs_huge", emitter_type_size, + &npageslabs_huge); + emitter_json_kv(emitter, "nactive_huge", emitter_type_size, + &nactive_huge); + emitter_json_kv(emitter, "ndirty_huge", emitter_type_size, + &ndirty_huge); + emitter_json_kv(emitter, "npageslabs_nonhuge", emitter_type_size, + &npageslabs_nonhuge); + emitter_json_kv(emitter, "nactive_nonhuge", emitter_type_size, + &nactive_nonhuge); + emitter_json_kv(emitter, "ndirty_nonhuge", emitter_type_size, + &ndirty_nonhuge); + emitter_json_object_end(emitter); + } + emitter_json_array_end(emitter); /* End "nonfull_slabs" */ + emitter_json_object_end(emitter); /* End "hpa_shard" */ + if (in_gap) { + emitter_table_printf(emitter, " ---\n"); + } +} + +static void +stats_arena_mutexes_print(emitter_t *emitter, unsigned arena_ind, uint64_t uptime) { + emitter_row_t row; + emitter_col_t col_name; + emitter_col_t col64[mutex_prof_num_uint64_t_counters]; + emitter_col_t col32[mutex_prof_num_uint32_t_counters]; + + emitter_row_init(&row); + mutex_stats_init_cols(&row, "", &col_name, col64, col32); + + emitter_json_object_kv_begin(emitter, "mutexes"); + emitter_table_row(emitter, &row); + + size_t stats_arenas_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(stats_arenas_mib, 0, "stats.arenas"); + stats_arenas_mib[2] = arena_ind; + CTL_LEAF_PREPARE(stats_arenas_mib, 3, "mutexes"); + + for (mutex_prof_arena_ind_t i = 0; i < mutex_prof_num_arena_mutexes; + i++) { + const char *name = arena_mutex_names[i]; + emitter_json_object_kv_begin(emitter, name); + mutex_stats_read_arena(stats_arenas_mib, 4, name, &col_name, + col64, col32, uptime); + mutex_stats_emit(emitter, &row, col64, col32); + emitter_json_object_end(emitter); /* Close the mutex dict. */ + } + emitter_json_object_end(emitter); /* End "mutexes". */ +} + +JEMALLOC_COLD +static void +stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, + bool mutex, bool extents, bool hpa) { + unsigned nthreads; + const char *dss; + ssize_t dirty_decay_ms, muzzy_decay_ms; + size_t page, pactive, pdirty, pmuzzy, mapped, retained; + size_t base, internal, resident, metadata_thp, extent_avail; + uint64_t dirty_npurge, dirty_nmadvise, dirty_purged; + uint64_t muzzy_npurge, muzzy_nmadvise, muzzy_purged; + size_t small_allocated; + uint64_t small_nmalloc, small_ndalloc, small_nrequests, small_nfills, + small_nflushes; + size_t large_allocated; + uint64_t large_nmalloc, large_ndalloc, large_nrequests, large_nfills, + large_nflushes; + size_t tcache_bytes, tcache_stashed_bytes, abandoned_vm; + uint64_t uptime; + + CTL_GET("arenas.page", &page, size_t); + + CTL_M2_GET("stats.arenas.0.nthreads", i, &nthreads, unsigned); + emitter_kv(emitter, "nthreads", "assigned threads", + emitter_type_unsigned, &nthreads); + + CTL_M2_GET("stats.arenas.0.uptime", i, &uptime, uint64_t); + emitter_kv(emitter, "uptime_ns", "uptime", emitter_type_uint64, + &uptime); + + CTL_M2_GET("stats.arenas.0.dss", i, &dss, const char *); + emitter_kv(emitter, "dss", "dss allocation precedence", + emitter_type_string, &dss); + + CTL_M2_GET("stats.arenas.0.dirty_decay_ms", i, &dirty_decay_ms, + ssize_t); + CTL_M2_GET("stats.arenas.0.muzzy_decay_ms", i, &muzzy_decay_ms, + ssize_t); + CTL_M2_GET("stats.arenas.0.pactive", i, &pactive, size_t); + CTL_M2_GET("stats.arenas.0.pdirty", i, &pdirty, size_t); + CTL_M2_GET("stats.arenas.0.pmuzzy", i, &pmuzzy, size_t); + CTL_M2_GET("stats.arenas.0.dirty_npurge", i, &dirty_npurge, uint64_t); + CTL_M2_GET("stats.arenas.0.dirty_nmadvise", i, &dirty_nmadvise, + uint64_t); + CTL_M2_GET("stats.arenas.0.dirty_purged", i, &dirty_purged, uint64_t); + CTL_M2_GET("stats.arenas.0.muzzy_npurge", i, &muzzy_npurge, uint64_t); + CTL_M2_GET("stats.arenas.0.muzzy_nmadvise", i, &muzzy_nmadvise, + uint64_t); + CTL_M2_GET("stats.arenas.0.muzzy_purged", i, &muzzy_purged, uint64_t); + + emitter_row_t decay_row; + emitter_row_init(&decay_row); + + /* JSON-style emission. */ + emitter_json_kv(emitter, "dirty_decay_ms", emitter_type_ssize, + &dirty_decay_ms); + emitter_json_kv(emitter, "muzzy_decay_ms", emitter_type_ssize, + &muzzy_decay_ms); + + emitter_json_kv(emitter, "pactive", emitter_type_size, &pactive); + emitter_json_kv(emitter, "pdirty", emitter_type_size, &pdirty); + emitter_json_kv(emitter, "pmuzzy", emitter_type_size, &pmuzzy); + + emitter_json_kv(emitter, "dirty_npurge", emitter_type_uint64, + &dirty_npurge); + emitter_json_kv(emitter, "dirty_nmadvise", emitter_type_uint64, + &dirty_nmadvise); + emitter_json_kv(emitter, "dirty_purged", emitter_type_uint64, + &dirty_purged); + + emitter_json_kv(emitter, "muzzy_npurge", emitter_type_uint64, + &muzzy_npurge); + emitter_json_kv(emitter, "muzzy_nmadvise", emitter_type_uint64, + &muzzy_nmadvise); + emitter_json_kv(emitter, "muzzy_purged", emitter_type_uint64, + &muzzy_purged); + + /* Table-style emission. */ + COL(decay_row, decay_type, right, 9, title); + col_decay_type.str_val = "decaying:"; + + COL(decay_row, decay_time, right, 6, title); + col_decay_time.str_val = "time"; + + COL(decay_row, decay_npages, right, 13, title); + col_decay_npages.str_val = "npages"; + + COL(decay_row, decay_sweeps, right, 13, title); + col_decay_sweeps.str_val = "sweeps"; + + COL(decay_row, decay_madvises, right, 13, title); + col_decay_madvises.str_val = "madvises"; + + COL(decay_row, decay_purged, right, 13, title); + col_decay_purged.str_val = "purged"; + + /* Title row. */ + emitter_table_row(emitter, &decay_row); + + /* Dirty row. */ + col_decay_type.str_val = "dirty:"; + + if (dirty_decay_ms >= 0) { + col_decay_time.type = emitter_type_ssize; + col_decay_time.ssize_val = dirty_decay_ms; + } else { + col_decay_time.type = emitter_type_title; + col_decay_time.str_val = "N/A"; + } + + col_decay_npages.type = emitter_type_size; + col_decay_npages.size_val = pdirty; + + col_decay_sweeps.type = emitter_type_uint64; + col_decay_sweeps.uint64_val = dirty_npurge; + + col_decay_madvises.type = emitter_type_uint64; + col_decay_madvises.uint64_val = dirty_nmadvise; + + col_decay_purged.type = emitter_type_uint64; + col_decay_purged.uint64_val = dirty_purged; + + emitter_table_row(emitter, &decay_row); + + /* Muzzy row. */ + col_decay_type.str_val = "muzzy:"; + + if (muzzy_decay_ms >= 0) { + col_decay_time.type = emitter_type_ssize; + col_decay_time.ssize_val = muzzy_decay_ms; + } else { + col_decay_time.type = emitter_type_title; + col_decay_time.str_val = "N/A"; + } + + col_decay_npages.type = emitter_type_size; + col_decay_npages.size_val = pmuzzy; + + col_decay_sweeps.type = emitter_type_uint64; + col_decay_sweeps.uint64_val = muzzy_npurge; + + col_decay_madvises.type = emitter_type_uint64; + col_decay_madvises.uint64_val = muzzy_nmadvise; + + col_decay_purged.type = emitter_type_uint64; + col_decay_purged.uint64_val = muzzy_purged; + + emitter_table_row(emitter, &decay_row); + + /* Small / large / total allocation counts. */ + emitter_row_t alloc_count_row; + emitter_row_init(&alloc_count_row); + + COL(alloc_count_row, count_title, left, 21, title); + col_count_title.str_val = ""; + + COL(alloc_count_row, count_allocated, right, 16, title); + col_count_allocated.str_val = "allocated"; + + COL(alloc_count_row, count_nmalloc, right, 16, title); + col_count_nmalloc.str_val = "nmalloc"; + COL(alloc_count_row, count_nmalloc_ps, right, 10, title); + col_count_nmalloc_ps.str_val = "(#/sec)"; + + COL(alloc_count_row, count_ndalloc, right, 16, title); + col_count_ndalloc.str_val = "ndalloc"; + COL(alloc_count_row, count_ndalloc_ps, right, 10, title); + col_count_ndalloc_ps.str_val = "(#/sec)"; + + COL(alloc_count_row, count_nrequests, right, 16, title); + col_count_nrequests.str_val = "nrequests"; + COL(alloc_count_row, count_nrequests_ps, right, 10, title); + col_count_nrequests_ps.str_val = "(#/sec)"; + + COL(alloc_count_row, count_nfills, right, 16, title); + col_count_nfills.str_val = "nfill"; + COL(alloc_count_row, count_nfills_ps, right, 10, title); + col_count_nfills_ps.str_val = "(#/sec)"; + + COL(alloc_count_row, count_nflushes, right, 16, title); + col_count_nflushes.str_val = "nflush"; + COL(alloc_count_row, count_nflushes_ps, right, 10, title); + col_count_nflushes_ps.str_val = "(#/sec)"; + + emitter_table_row(emitter, &alloc_count_row); + + col_count_nmalloc_ps.type = emitter_type_uint64; + col_count_ndalloc_ps.type = emitter_type_uint64; + col_count_nrequests_ps.type = emitter_type_uint64; + col_count_nfills_ps.type = emitter_type_uint64; + col_count_nflushes_ps.type = emitter_type_uint64; + +#define GET_AND_EMIT_ALLOC_STAT(small_or_large, name, valtype) \ + CTL_M2_GET("stats.arenas.0." #small_or_large "." #name, i, \ + &small_or_large##_##name, valtype##_t); \ + emitter_json_kv(emitter, #name, emitter_type_##valtype, \ + &small_or_large##_##name); \ + col_count_##name.type = emitter_type_##valtype; \ + col_count_##name.valtype##_val = small_or_large##_##name; + + emitter_json_object_kv_begin(emitter, "small"); + col_count_title.str_val = "small:"; + + GET_AND_EMIT_ALLOC_STAT(small, allocated, size) + GET_AND_EMIT_ALLOC_STAT(small, nmalloc, uint64) + col_count_nmalloc_ps.uint64_val = + rate_per_second(col_count_nmalloc.uint64_val, uptime); + GET_AND_EMIT_ALLOC_STAT(small, ndalloc, uint64) + col_count_ndalloc_ps.uint64_val = + rate_per_second(col_count_ndalloc.uint64_val, uptime); + GET_AND_EMIT_ALLOC_STAT(small, nrequests, uint64) + col_count_nrequests_ps.uint64_val = + rate_per_second(col_count_nrequests.uint64_val, uptime); + GET_AND_EMIT_ALLOC_STAT(small, nfills, uint64) + col_count_nfills_ps.uint64_val = + rate_per_second(col_count_nfills.uint64_val, uptime); + GET_AND_EMIT_ALLOC_STAT(small, nflushes, uint64) + col_count_nflushes_ps.uint64_val = + rate_per_second(col_count_nflushes.uint64_val, uptime); + + emitter_table_row(emitter, &alloc_count_row); + emitter_json_object_end(emitter); /* Close "small". */ + + emitter_json_object_kv_begin(emitter, "large"); + col_count_title.str_val = "large:"; + + GET_AND_EMIT_ALLOC_STAT(large, allocated, size) + GET_AND_EMIT_ALLOC_STAT(large, nmalloc, uint64) + col_count_nmalloc_ps.uint64_val = + rate_per_second(col_count_nmalloc.uint64_val, uptime); + GET_AND_EMIT_ALLOC_STAT(large, ndalloc, uint64) + col_count_ndalloc_ps.uint64_val = + rate_per_second(col_count_ndalloc.uint64_val, uptime); + GET_AND_EMIT_ALLOC_STAT(large, nrequests, uint64) + col_count_nrequests_ps.uint64_val = + rate_per_second(col_count_nrequests.uint64_val, uptime); + GET_AND_EMIT_ALLOC_STAT(large, nfills, uint64) + col_count_nfills_ps.uint64_val = + rate_per_second(col_count_nfills.uint64_val, uptime); + GET_AND_EMIT_ALLOC_STAT(large, nflushes, uint64) + col_count_nflushes_ps.uint64_val = + rate_per_second(col_count_nflushes.uint64_val, uptime); + + emitter_table_row(emitter, &alloc_count_row); + emitter_json_object_end(emitter); /* Close "large". */ + +#undef GET_AND_EMIT_ALLOC_STAT + + /* Aggregated small + large stats are emitter only in table mode. */ + col_count_title.str_val = "total:"; + col_count_allocated.size_val = small_allocated + large_allocated; + col_count_nmalloc.uint64_val = small_nmalloc + large_nmalloc; + col_count_ndalloc.uint64_val = small_ndalloc + large_ndalloc; + col_count_nrequests.uint64_val = small_nrequests + large_nrequests; + col_count_nfills.uint64_val = small_nfills + large_nfills; + col_count_nflushes.uint64_val = small_nflushes + large_nflushes; + col_count_nmalloc_ps.uint64_val = + rate_per_second(col_count_nmalloc.uint64_val, uptime); + col_count_ndalloc_ps.uint64_val = + rate_per_second(col_count_ndalloc.uint64_val, uptime); + col_count_nrequests_ps.uint64_val = + rate_per_second(col_count_nrequests.uint64_val, uptime); + col_count_nfills_ps.uint64_val = + rate_per_second(col_count_nfills.uint64_val, uptime); + col_count_nflushes_ps.uint64_val = + rate_per_second(col_count_nflushes.uint64_val, uptime); + emitter_table_row(emitter, &alloc_count_row); + + emitter_row_t mem_count_row; + emitter_row_init(&mem_count_row); + + emitter_col_t mem_count_title; + emitter_col_init(&mem_count_title, &mem_count_row); + mem_count_title.justify = emitter_justify_left; + mem_count_title.width = 21; + mem_count_title.type = emitter_type_title; + mem_count_title.str_val = ""; + + emitter_col_t mem_count_val; + emitter_col_init(&mem_count_val, &mem_count_row); + mem_count_val.justify = emitter_justify_right; + mem_count_val.width = 16; + mem_count_val.type = emitter_type_title; + mem_count_val.str_val = ""; + + emitter_table_row(emitter, &mem_count_row); + mem_count_val.type = emitter_type_size; + + /* Active count in bytes is emitted only in table mode. */ + mem_count_title.str_val = "active:"; + mem_count_val.size_val = pactive * page; + emitter_table_row(emitter, &mem_count_row); + +#define GET_AND_EMIT_MEM_STAT(stat) \ + CTL_M2_GET("stats.arenas.0."#stat, i, &stat, size_t); \ + emitter_json_kv(emitter, #stat, emitter_type_size, &stat); \ + mem_count_title.str_val = #stat":"; \ + mem_count_val.size_val = stat; \ + emitter_table_row(emitter, &mem_count_row); + + GET_AND_EMIT_MEM_STAT(mapped) + GET_AND_EMIT_MEM_STAT(retained) + GET_AND_EMIT_MEM_STAT(base) + GET_AND_EMIT_MEM_STAT(internal) + GET_AND_EMIT_MEM_STAT(metadata_thp) + GET_AND_EMIT_MEM_STAT(tcache_bytes) + GET_AND_EMIT_MEM_STAT(tcache_stashed_bytes) + GET_AND_EMIT_MEM_STAT(resident) + GET_AND_EMIT_MEM_STAT(abandoned_vm) + GET_AND_EMIT_MEM_STAT(extent_avail) +#undef GET_AND_EMIT_MEM_STAT + + if (mutex) { + stats_arena_mutexes_print(emitter, i, uptime); + } + if (bins) { + stats_arena_bins_print(emitter, mutex, i, uptime); + } + if (large) { + stats_arena_lextents_print(emitter, i, uptime); + } + if (extents) { + stats_arena_extents_print(emitter, i); + } + if (hpa) { + stats_arena_hpa_shard_print(emitter, i, uptime); + } +} + +JEMALLOC_COLD +static void +stats_general_print(emitter_t *emitter) { + const char *cpv; + bool bv, bv2; + unsigned uv; + uint32_t u32v; + uint64_t u64v; + int64_t i64v; + ssize_t ssv, ssv2; + size_t sv, bsz, usz, u32sz, u64sz, i64sz, ssz, sssz, cpsz; + + bsz = sizeof(bool); + usz = sizeof(unsigned); + ssz = sizeof(size_t); + sssz = sizeof(ssize_t); + cpsz = sizeof(const char *); + u32sz = sizeof(uint32_t); + i64sz = sizeof(int64_t); + u64sz = sizeof(uint64_t); + + CTL_GET("version", &cpv, const char *); + emitter_kv(emitter, "version", "Version", emitter_type_string, &cpv); + + /* config. */ + emitter_dict_begin(emitter, "config", "Build-time option settings"); +#define CONFIG_WRITE_BOOL(name) \ + do { \ + CTL_GET("config."#name, &bv, bool); \ + emitter_kv(emitter, #name, "config."#name, \ + emitter_type_bool, &bv); \ + } while (0) + + CONFIG_WRITE_BOOL(cache_oblivious); + CONFIG_WRITE_BOOL(debug); + CONFIG_WRITE_BOOL(fill); + CONFIG_WRITE_BOOL(lazy_lock); + emitter_kv(emitter, "malloc_conf", "config.malloc_conf", + emitter_type_string, &config_malloc_conf); + + CONFIG_WRITE_BOOL(opt_safety_checks); + CONFIG_WRITE_BOOL(prof); + CONFIG_WRITE_BOOL(prof_libgcc); + CONFIG_WRITE_BOOL(prof_libunwind); + CONFIG_WRITE_BOOL(stats); + CONFIG_WRITE_BOOL(utrace); + CONFIG_WRITE_BOOL(xmalloc); +#undef CONFIG_WRITE_BOOL + emitter_dict_end(emitter); /* Close "config" dict. */ + + /* opt. */ +#define OPT_WRITE(name, var, size, emitter_type) \ + if (je_mallctl("opt."name, (void *)&var, &size, NULL, 0) == \ + 0) { \ + emitter_kv(emitter, name, "opt."name, emitter_type, \ + &var); \ + } + +#define OPT_WRITE_MUTABLE(name, var1, var2, size, emitter_type, \ + altname) \ + if (je_mallctl("opt."name, (void *)&var1, &size, NULL, 0) == \ + 0 && je_mallctl(altname, (void *)&var2, &size, NULL, 0) \ + == 0) { \ + emitter_kv_note(emitter, name, "opt."name, \ + emitter_type, &var1, altname, emitter_type, \ + &var2); \ + } + +#define OPT_WRITE_BOOL(name) OPT_WRITE(name, bv, bsz, emitter_type_bool) +#define OPT_WRITE_BOOL_MUTABLE(name, altname) \ + OPT_WRITE_MUTABLE(name, bv, bv2, bsz, emitter_type_bool, altname) + +#define OPT_WRITE_UNSIGNED(name) \ + OPT_WRITE(name, uv, usz, emitter_type_unsigned) + +#define OPT_WRITE_INT64(name) \ + OPT_WRITE(name, i64v, i64sz, emitter_type_int64) +#define OPT_WRITE_UINT64(name) \ + OPT_WRITE(name, u64v, u64sz, emitter_type_uint64) + +#define OPT_WRITE_SIZE_T(name) \ + OPT_WRITE(name, sv, ssz, emitter_type_size) +#define OPT_WRITE_SSIZE_T(name) \ + OPT_WRITE(name, ssv, sssz, emitter_type_ssize) +#define OPT_WRITE_SSIZE_T_MUTABLE(name, altname) \ + OPT_WRITE_MUTABLE(name, ssv, ssv2, sssz, emitter_type_ssize, \ + altname) + +#define OPT_WRITE_CHAR_P(name) \ + OPT_WRITE(name, cpv, cpsz, emitter_type_string) + + emitter_dict_begin(emitter, "opt", "Run-time option settings"); + + OPT_WRITE_BOOL("abort") + OPT_WRITE_BOOL("abort_conf") + OPT_WRITE_BOOL("cache_oblivious") + OPT_WRITE_BOOL("confirm_conf") + OPT_WRITE_BOOL("retain") + OPT_WRITE_CHAR_P("dss") + OPT_WRITE_UNSIGNED("narenas") + OPT_WRITE_CHAR_P("percpu_arena") + OPT_WRITE_SIZE_T("oversize_threshold") + OPT_WRITE_BOOL("hpa") + OPT_WRITE_SIZE_T("hpa_slab_max_alloc") + OPT_WRITE_SIZE_T("hpa_hugification_threshold") + OPT_WRITE_UINT64("hpa_hugify_delay_ms") + OPT_WRITE_UINT64("hpa_min_purge_interval_ms") + if (je_mallctl("opt.hpa_dirty_mult", (void *)&u32v, &u32sz, NULL, 0) + == 0) { + /* + * We cheat a little and "know" the secret meaning of this + * representation. + */ + if (u32v == (uint32_t)-1) { + const char *neg1 = "-1"; + emitter_kv(emitter, "hpa_dirty_mult", + "opt.hpa_dirty_mult", emitter_type_string, &neg1); + } else { + char buf[FXP_BUF_SIZE]; + fxp_print(u32v, buf); + const char *bufp = buf; + emitter_kv(emitter, "hpa_dirty_mult", + "opt.hpa_dirty_mult", emitter_type_string, &bufp); + } + } + OPT_WRITE_SIZE_T("hpa_sec_nshards") + OPT_WRITE_SIZE_T("hpa_sec_max_alloc") + OPT_WRITE_SIZE_T("hpa_sec_max_bytes") + OPT_WRITE_SIZE_T("hpa_sec_bytes_after_flush") + OPT_WRITE_SIZE_T("hpa_sec_batch_fill_extra") + OPT_WRITE_CHAR_P("metadata_thp") + OPT_WRITE_INT64("mutex_max_spin") + OPT_WRITE_BOOL_MUTABLE("background_thread", "background_thread") + OPT_WRITE_SSIZE_T_MUTABLE("dirty_decay_ms", "arenas.dirty_decay_ms") + OPT_WRITE_SSIZE_T_MUTABLE("muzzy_decay_ms", "arenas.muzzy_decay_ms") + OPT_WRITE_SIZE_T("lg_extent_max_active_fit") + OPT_WRITE_CHAR_P("junk") + OPT_WRITE_BOOL("zero") + OPT_WRITE_BOOL("utrace") + OPT_WRITE_BOOL("xmalloc") + OPT_WRITE_BOOL("experimental_infallible_new") + OPT_WRITE_BOOL("tcache") + OPT_WRITE_SIZE_T("tcache_max") + OPT_WRITE_UNSIGNED("tcache_nslots_small_min") + OPT_WRITE_UNSIGNED("tcache_nslots_small_max") + OPT_WRITE_UNSIGNED("tcache_nslots_large") + OPT_WRITE_SSIZE_T("lg_tcache_nslots_mul") + OPT_WRITE_SIZE_T("tcache_gc_incr_bytes") + OPT_WRITE_SIZE_T("tcache_gc_delay_bytes") + OPT_WRITE_UNSIGNED("lg_tcache_flush_small_div") + OPT_WRITE_UNSIGNED("lg_tcache_flush_large_div") + OPT_WRITE_CHAR_P("thp") + OPT_WRITE_BOOL("prof") + OPT_WRITE_CHAR_P("prof_prefix") + OPT_WRITE_BOOL_MUTABLE("prof_active", "prof.active") + OPT_WRITE_BOOL_MUTABLE("prof_thread_active_init", + "prof.thread_active_init") + OPT_WRITE_SSIZE_T_MUTABLE("lg_prof_sample", "prof.lg_sample") + OPT_WRITE_BOOL("prof_accum") + OPT_WRITE_SSIZE_T("lg_prof_interval") + OPT_WRITE_BOOL("prof_gdump") + OPT_WRITE_BOOL("prof_final") + OPT_WRITE_BOOL("prof_leak") + OPT_WRITE_BOOL("prof_leak_error") + OPT_WRITE_BOOL("stats_print") + OPT_WRITE_CHAR_P("stats_print_opts") + OPT_WRITE_BOOL("stats_print") + OPT_WRITE_CHAR_P("stats_print_opts") + OPT_WRITE_INT64("stats_interval") + OPT_WRITE_CHAR_P("stats_interval_opts") + OPT_WRITE_CHAR_P("zero_realloc") + + emitter_dict_end(emitter); + +#undef OPT_WRITE +#undef OPT_WRITE_MUTABLE +#undef OPT_WRITE_BOOL +#undef OPT_WRITE_BOOL_MUTABLE +#undef OPT_WRITE_UNSIGNED +#undef OPT_WRITE_SSIZE_T +#undef OPT_WRITE_SSIZE_T_MUTABLE +#undef OPT_WRITE_CHAR_P + + /* prof. */ + if (config_prof) { + emitter_dict_begin(emitter, "prof", "Profiling settings"); + + CTL_GET("prof.thread_active_init", &bv, bool); + emitter_kv(emitter, "thread_active_init", + "prof.thread_active_init", emitter_type_bool, &bv); + + CTL_GET("prof.active", &bv, bool); + emitter_kv(emitter, "active", "prof.active", emitter_type_bool, + &bv); + + CTL_GET("prof.gdump", &bv, bool); + emitter_kv(emitter, "gdump", "prof.gdump", emitter_type_bool, + &bv); + + CTL_GET("prof.interval", &u64v, uint64_t); + emitter_kv(emitter, "interval", "prof.interval", + emitter_type_uint64, &u64v); + + CTL_GET("prof.lg_sample", &ssv, ssize_t); + emitter_kv(emitter, "lg_sample", "prof.lg_sample", + emitter_type_ssize, &ssv); + + emitter_dict_end(emitter); /* Close "prof". */ + } + + /* arenas. */ + /* + * The json output sticks arena info into an "arenas" dict; the table + * output puts them at the top-level. + */ + emitter_json_object_kv_begin(emitter, "arenas"); + + CTL_GET("arenas.narenas", &uv, unsigned); + emitter_kv(emitter, "narenas", "Arenas", emitter_type_unsigned, &uv); + + /* + * Decay settings are emitted only in json mode; in table mode, they're + * emitted as notes with the opt output, above. + */ + CTL_GET("arenas.dirty_decay_ms", &ssv, ssize_t); + emitter_json_kv(emitter, "dirty_decay_ms", emitter_type_ssize, &ssv); + + CTL_GET("arenas.muzzy_decay_ms", &ssv, ssize_t); + emitter_json_kv(emitter, "muzzy_decay_ms", emitter_type_ssize, &ssv); + + CTL_GET("arenas.quantum", &sv, size_t); + emitter_kv(emitter, "quantum", "Quantum size", emitter_type_size, &sv); + + CTL_GET("arenas.page", &sv, size_t); + emitter_kv(emitter, "page", "Page size", emitter_type_size, &sv); + + if (je_mallctl("arenas.tcache_max", (void *)&sv, &ssz, NULL, 0) == 0) { + emitter_kv(emitter, "tcache_max", + "Maximum thread-cached size class", emitter_type_size, &sv); + } + + unsigned arenas_nbins; + CTL_GET("arenas.nbins", &arenas_nbins, unsigned); + emitter_kv(emitter, "nbins", "Number of bin size classes", + emitter_type_unsigned, &arenas_nbins); + + unsigned arenas_nhbins; + CTL_GET("arenas.nhbins", &arenas_nhbins, unsigned); + emitter_kv(emitter, "nhbins", "Number of thread-cache bin size classes", + emitter_type_unsigned, &arenas_nhbins); + + /* + * We do enough mallctls in a loop that we actually want to omit them + * (not just omit the printing). + */ + if (emitter_outputs_json(emitter)) { + emitter_json_array_kv_begin(emitter, "bin"); + size_t arenas_bin_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(arenas_bin_mib, 0, "arenas.bin"); + for (unsigned i = 0; i < arenas_nbins; i++) { + arenas_bin_mib[2] = i; + emitter_json_object_begin(emitter); + + CTL_LEAF(arenas_bin_mib, 3, "size", &sv, size_t); + emitter_json_kv(emitter, "size", emitter_type_size, + &sv); + + CTL_LEAF(arenas_bin_mib, 3, "nregs", &u32v, uint32_t); + emitter_json_kv(emitter, "nregs", emitter_type_uint32, + &u32v); + + CTL_LEAF(arenas_bin_mib, 3, "slab_size", &sv, size_t); + emitter_json_kv(emitter, "slab_size", emitter_type_size, + &sv); + + CTL_LEAF(arenas_bin_mib, 3, "nshards", &u32v, uint32_t); + emitter_json_kv(emitter, "nshards", emitter_type_uint32, + &u32v); + + emitter_json_object_end(emitter); + } + emitter_json_array_end(emitter); /* Close "bin". */ + } + + unsigned nlextents; + CTL_GET("arenas.nlextents", &nlextents, unsigned); + emitter_kv(emitter, "nlextents", "Number of large size classes", + emitter_type_unsigned, &nlextents); + + if (emitter_outputs_json(emitter)) { + emitter_json_array_kv_begin(emitter, "lextent"); + size_t arenas_lextent_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(arenas_lextent_mib, 0, "arenas.lextent"); + for (unsigned i = 0; i < nlextents; i++) { + arenas_lextent_mib[2] = i; + emitter_json_object_begin(emitter); + + CTL_LEAF(arenas_lextent_mib, 3, "size", &sv, size_t); + emitter_json_kv(emitter, "size", emitter_type_size, + &sv); + + emitter_json_object_end(emitter); + } + emitter_json_array_end(emitter); /* Close "lextent". */ + } + + emitter_json_object_end(emitter); /* Close "arenas" */ +} + +JEMALLOC_COLD +static void +stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, + bool unmerged, bool bins, bool large, bool mutex, bool extents, bool hpa) { + /* + * These should be deleted. We keep them around for a while, to aid in + * the transition to the emitter code. + */ + size_t allocated, active, metadata, metadata_thp, resident, mapped, + retained; + size_t num_background_threads; + size_t zero_reallocs; + uint64_t background_thread_num_runs, background_thread_run_interval; + + CTL_GET("stats.allocated", &allocated, size_t); + CTL_GET("stats.active", &active, size_t); + CTL_GET("stats.metadata", &metadata, size_t); + CTL_GET("stats.metadata_thp", &metadata_thp, size_t); + CTL_GET("stats.resident", &resident, size_t); + CTL_GET("stats.mapped", &mapped, size_t); + CTL_GET("stats.retained", &retained, size_t); + + CTL_GET("stats.zero_reallocs", &zero_reallocs, size_t); + + if (have_background_thread) { + CTL_GET("stats.background_thread.num_threads", + &num_background_threads, size_t); + CTL_GET("stats.background_thread.num_runs", + &background_thread_num_runs, uint64_t); + CTL_GET("stats.background_thread.run_interval", + &background_thread_run_interval, uint64_t); + } else { + num_background_threads = 0; + background_thread_num_runs = 0; + background_thread_run_interval = 0; + } + + /* Generic global stats. */ + emitter_json_object_kv_begin(emitter, "stats"); + emitter_json_kv(emitter, "allocated", emitter_type_size, &allocated); + emitter_json_kv(emitter, "active", emitter_type_size, &active); + emitter_json_kv(emitter, "metadata", emitter_type_size, &metadata); + emitter_json_kv(emitter, "metadata_thp", emitter_type_size, + &metadata_thp); + emitter_json_kv(emitter, "resident", emitter_type_size, &resident); + emitter_json_kv(emitter, "mapped", emitter_type_size, &mapped); + emitter_json_kv(emitter, "retained", emitter_type_size, &retained); + emitter_json_kv(emitter, "zero_reallocs", emitter_type_size, + &zero_reallocs); + + emitter_table_printf(emitter, "Allocated: %zu, active: %zu, " + "metadata: %zu (n_thp %zu), resident: %zu, mapped: %zu, " + "retained: %zu\n", allocated, active, metadata, metadata_thp, + resident, mapped, retained); + + /* Strange behaviors */ + emitter_table_printf(emitter, + "Count of realloc(non-null-ptr, 0) calls: %zu\n", zero_reallocs); + + /* Background thread stats. */ + emitter_json_object_kv_begin(emitter, "background_thread"); + emitter_json_kv(emitter, "num_threads", emitter_type_size, + &num_background_threads); + emitter_json_kv(emitter, "num_runs", emitter_type_uint64, + &background_thread_num_runs); + emitter_json_kv(emitter, "run_interval", emitter_type_uint64, + &background_thread_run_interval); + emitter_json_object_end(emitter); /* Close "background_thread". */ + + emitter_table_printf(emitter, "Background threads: %zu, " + "num_runs: %"FMTu64", run_interval: %"FMTu64" ns\n", + num_background_threads, background_thread_num_runs, + background_thread_run_interval); + + if (mutex) { + emitter_row_t row; + emitter_col_t name; + emitter_col_t col64[mutex_prof_num_uint64_t_counters]; + emitter_col_t col32[mutex_prof_num_uint32_t_counters]; + uint64_t uptime; + + emitter_row_init(&row); + mutex_stats_init_cols(&row, "", &name, col64, col32); + + emitter_table_row(emitter, &row); + emitter_json_object_kv_begin(emitter, "mutexes"); + + CTL_M2_GET("stats.arenas.0.uptime", 0, &uptime, uint64_t); + + size_t stats_mutexes_mib[CTL_MAX_DEPTH]; + CTL_LEAF_PREPARE(stats_mutexes_mib, 0, "stats.mutexes"); + for (int i = 0; i < mutex_prof_num_global_mutexes; i++) { + mutex_stats_read_global(stats_mutexes_mib, 2, + global_mutex_names[i], &name, col64, col32, uptime); + emitter_json_object_kv_begin(emitter, global_mutex_names[i]); + mutex_stats_emit(emitter, &row, col64, col32); + emitter_json_object_end(emitter); + } + + emitter_json_object_end(emitter); /* Close "mutexes". */ + } + + emitter_json_object_end(emitter); /* Close "stats". */ + + if (merged || destroyed || unmerged) { + unsigned narenas; + + emitter_json_object_kv_begin(emitter, "stats.arenas"); + + CTL_GET("arenas.narenas", &narenas, unsigned); + size_t mib[3]; + size_t miblen = sizeof(mib) / sizeof(size_t); + size_t sz; + VARIABLE_ARRAY(bool, initialized, narenas); + bool destroyed_initialized; + unsigned i, j, ninitialized; + + xmallctlnametomib("arena.0.initialized", mib, &miblen); + for (i = ninitialized = 0; i < narenas; i++) { + mib[1] = i; + sz = sizeof(bool); + xmallctlbymib(mib, miblen, &initialized[i], &sz, + NULL, 0); + if (initialized[i]) { + ninitialized++; + } + } + mib[1] = MALLCTL_ARENAS_DESTROYED; + sz = sizeof(bool); + xmallctlbymib(mib, miblen, &destroyed_initialized, &sz, + NULL, 0); + + /* Merged stats. */ + if (merged && (ninitialized > 1 || !unmerged)) { + /* Print merged arena stats. */ + emitter_table_printf(emitter, "Merged arenas stats:\n"); + emitter_json_object_kv_begin(emitter, "merged"); + stats_arena_print(emitter, MALLCTL_ARENAS_ALL, bins, + large, mutex, extents, hpa); + emitter_json_object_end(emitter); /* Close "merged". */ + } + + /* Destroyed stats. */ + if (destroyed_initialized && destroyed) { + /* Print destroyed arena stats. */ + emitter_table_printf(emitter, + "Destroyed arenas stats:\n"); + emitter_json_object_kv_begin(emitter, "destroyed"); + stats_arena_print(emitter, MALLCTL_ARENAS_DESTROYED, + bins, large, mutex, extents, hpa); + emitter_json_object_end(emitter); /* Close "destroyed". */ + } + + /* Unmerged stats. */ + if (unmerged) { + for (i = j = 0; i < narenas; i++) { + if (initialized[i]) { + char arena_ind_str[20]; + malloc_snprintf(arena_ind_str, + sizeof(arena_ind_str), "%u", i); + emitter_json_object_kv_begin(emitter, + arena_ind_str); + emitter_table_printf(emitter, + "arenas[%s]:\n", arena_ind_str); + stats_arena_print(emitter, i, bins, + large, mutex, extents, hpa); + /* Close "". */ + emitter_json_object_end(emitter); + } + } + } + emitter_json_object_end(emitter); /* Close "stats.arenas". */ + } +} + +void +stats_print(write_cb_t *write_cb, void *cbopaque, const char *opts) { + int err; + uint64_t epoch; + size_t u64sz; +#define OPTION(o, v, d, s) bool v = d; + STATS_PRINT_OPTIONS +#undef OPTION + + /* + * Refresh stats, in case mallctl() was called by the application. + * + * Check for OOM here, since refreshing the ctl cache can trigger + * allocation. In practice, none of the subsequent mallctl()-related + * calls in this function will cause OOM if this one succeeds. + * */ + epoch = 1; + u64sz = sizeof(uint64_t); + err = je_mallctl("epoch", (void *)&epoch, &u64sz, (void *)&epoch, + sizeof(uint64_t)); + if (err != 0) { + if (err == EAGAIN) { + malloc_write(": Memory allocation failure in " + "mallctl(\"epoch\", ...)\n"); + return; + } + malloc_write(": Failure in mallctl(\"epoch\", " + "...)\n"); + abort(); + } + + if (opts != NULL) { + for (unsigned i = 0; opts[i] != '\0'; i++) { + switch (opts[i]) { +#define OPTION(o, v, d, s) case o: v = s; break; + STATS_PRINT_OPTIONS +#undef OPTION + default:; + } + } + } + + emitter_t emitter; + emitter_init(&emitter, + json ? emitter_output_json_compact : emitter_output_table, + write_cb, cbopaque); + emitter_begin(&emitter); + emitter_table_printf(&emitter, "___ Begin jemalloc statistics ___\n"); + emitter_json_object_kv_begin(&emitter, "jemalloc"); + + if (general) { + stats_general_print(&emitter); + } + if (config_stats) { + stats_print_helper(&emitter, merged, destroyed, unmerged, + bins, large, mutex, extents, hpa); + } + + emitter_json_object_end(&emitter); /* Closes the "jemalloc" dict. */ + emitter_table_printf(&emitter, "--- End jemalloc statistics ---\n"); + emitter_end(&emitter); +} + +uint64_t +stats_interval_new_event_wait(tsd_t *tsd) { + return stats_interval_accum_batch; +} + +uint64_t +stats_interval_postponed_event_wait(tsd_t *tsd) { + return TE_MIN_START_WAIT; +} + +void +stats_interval_event_handler(tsd_t *tsd, uint64_t elapsed) { + assert(elapsed > 0 && elapsed != TE_INVALID_ELAPSED); + if (counter_accum(tsd_tsdn(tsd), &stats_interval_accumulated, + elapsed)) { + je_malloc_stats_print(NULL, NULL, opt_stats_interval_opts); + } +} + +bool +stats_boot(void) { + uint64_t stats_interval; + if (opt_stats_interval < 0) { + assert(opt_stats_interval == -1); + stats_interval = 0; + stats_interval_accum_batch = 0; + } else{ + /* See comments in stats.h */ + stats_interval = (opt_stats_interval > 0) ? + opt_stats_interval : 1; + uint64_t batch = stats_interval >> + STATS_INTERVAL_ACCUM_LG_BATCH_SIZE; + if (batch > STATS_INTERVAL_ACCUM_BATCH_MAX) { + batch = STATS_INTERVAL_ACCUM_BATCH_MAX; + } else if (batch == 0) { + batch = 1; + } + stats_interval_accum_batch = batch; + } + + return counter_accum_init(&stats_interval_accumulated, stats_interval); +} + +void +stats_prefork(tsdn_t *tsdn) { + counter_prefork(tsdn, &stats_interval_accumulated); +} + +void +stats_postfork_parent(tsdn_t *tsdn) { + counter_postfork_parent(tsdn, &stats_interval_accumulated); +} + +void +stats_postfork_child(tsdn_t *tsdn) { + counter_postfork_child(tsdn, &stats_interval_accumulated); +} diff --git a/BeefRT/JEMalloc/src/sz.c b/BeefRT/JEMalloc/src/sz.c new file mode 100644 index 00000000..d3115dda --- /dev/null +++ b/BeefRT/JEMalloc/src/sz.c @@ -0,0 +1,114 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" +#include "jemalloc/internal/sz.h" + +JEMALLOC_ALIGNED(CACHELINE) +size_t sz_pind2sz_tab[SC_NPSIZES+1]; +size_t sz_large_pad; + +size_t +sz_psz_quantize_floor(size_t size) { + size_t ret; + pszind_t pind; + + assert(size > 0); + assert((size & PAGE_MASK) == 0); + + pind = sz_psz2ind(size - sz_large_pad + 1); + if (pind == 0) { + /* + * Avoid underflow. This short-circuit would also do the right + * thing for all sizes in the range for which there are + * PAGE-spaced size classes, but it's simplest to just handle + * the one case that would cause erroneous results. + */ + return size; + } + ret = sz_pind2sz(pind - 1) + sz_large_pad; + assert(ret <= size); + return ret; +} + +size_t +sz_psz_quantize_ceil(size_t size) { + size_t ret; + + assert(size > 0); + assert(size - sz_large_pad <= SC_LARGE_MAXCLASS); + assert((size & PAGE_MASK) == 0); + + ret = sz_psz_quantize_floor(size); + if (ret < size) { + /* + * Skip a quantization that may have an adequately large extent, + * because under-sized extents may be mixed in. This only + * happens when an unusual size is requested, i.e. for aligned + * allocation, and is just one of several places where linear + * search would potentially find sufficiently aligned available + * memory somewhere lower. + */ + ret = sz_pind2sz(sz_psz2ind(ret - sz_large_pad + 1)) + + sz_large_pad; + } + return ret; +} + +static void +sz_boot_pind2sz_tab(const sc_data_t *sc_data) { + int pind = 0; + for (unsigned i = 0; i < SC_NSIZES; i++) { + const sc_t *sc = &sc_data->sc[i]; + if (sc->psz) { + sz_pind2sz_tab[pind] = (ZU(1) << sc->lg_base) + + (ZU(sc->ndelta) << sc->lg_delta); + pind++; + } + } + for (int i = pind; i <= (int)SC_NPSIZES; i++) { + sz_pind2sz_tab[pind] = sc_data->large_maxclass + PAGE; + } +} + +JEMALLOC_ALIGNED(CACHELINE) +size_t sz_index2size_tab[SC_NSIZES]; + +static void +sz_boot_index2size_tab(const sc_data_t *sc_data) { + for (unsigned i = 0; i < SC_NSIZES; i++) { + const sc_t *sc = &sc_data->sc[i]; + sz_index2size_tab[i] = (ZU(1) << sc->lg_base) + + (ZU(sc->ndelta) << (sc->lg_delta)); + } +} + +/* + * To keep this table small, we divide sizes by the tiny min size, which gives + * the smallest interval for which the result can change. + */ +JEMALLOC_ALIGNED(CACHELINE) +uint8_t sz_size2index_tab[(SC_LOOKUP_MAXCLASS >> SC_LG_TINY_MIN) + 1]; + +static void +sz_boot_size2index_tab(const sc_data_t *sc_data) { + size_t dst_max = (SC_LOOKUP_MAXCLASS >> SC_LG_TINY_MIN) + 1; + size_t dst_ind = 0; + for (unsigned sc_ind = 0; sc_ind < SC_NSIZES && dst_ind < dst_max; + sc_ind++) { + const sc_t *sc = &sc_data->sc[sc_ind]; + size_t sz = (ZU(1) << sc->lg_base) + + (ZU(sc->ndelta) << sc->lg_delta); + size_t max_ind = ((sz + (ZU(1) << SC_LG_TINY_MIN) - 1) + >> SC_LG_TINY_MIN); + for (; dst_ind <= max_ind && dst_ind < dst_max; dst_ind++) { + sz_size2index_tab[dst_ind] = sc_ind; + } + } +} + +void +sz_boot(const sc_data_t *sc_data, bool cache_oblivious) { + sz_large_pad = cache_oblivious ? PAGE : 0; + sz_boot_pind2sz_tab(sc_data); + sz_boot_index2size_tab(sc_data); + sz_boot_size2index_tab(sc_data); +} diff --git a/BeefRT/JEMalloc/src/tcache.c b/BeefRT/JEMalloc/src/tcache.c new file mode 100644 index 00000000..fa16732e --- /dev/null +++ b/BeefRT/JEMalloc/src/tcache.c @@ -0,0 +1,1101 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/safety_check.h" +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/sc.h" + +/******************************************************************************/ +/* Data. */ + +bool opt_tcache = true; + +/* tcache_maxclass is set to 32KB by default. */ +size_t opt_tcache_max = ((size_t)1) << 15; + +/* Reasonable defaults for min and max values. */ +unsigned opt_tcache_nslots_small_min = 20; +unsigned opt_tcache_nslots_small_max = 200; +unsigned opt_tcache_nslots_large = 20; + +/* + * We attempt to make the number of slots in a tcache bin for a given size class + * equal to the number of objects in a slab times some multiplier. By default, + * the multiplier is 2 (i.e. we set the maximum number of objects in the tcache + * to twice the number of objects in a slab). + * This is bounded by some other constraints as well, like the fact that it + * must be even, must be less than opt_tcache_nslots_small_max, etc.. + */ +ssize_t opt_lg_tcache_nslots_mul = 1; + +/* + * Number of allocation bytes between tcache incremental GCs. Again, this + * default just seems to work well; more tuning is possible. + */ +size_t opt_tcache_gc_incr_bytes = 65536; + +/* + * With default settings, we may end up flushing small bins frequently with + * small flush amounts. To limit this tendency, we can set a number of bytes to + * "delay" by. If we try to flush N M-byte items, we decrease that size-class's + * delay by N * M. So, if delay is 1024 and we're looking at the 64-byte size + * class, we won't do any flushing until we've been asked to flush 1024/64 == 16 + * items. This can happen in any configuration (i.e. being asked to flush 16 + * items once, or 4 items 4 times). + * + * Practically, this is stored as a count of items in a uint8_t, so the + * effective maximum value for a size class is 255 * sz. + */ +size_t opt_tcache_gc_delay_bytes = 0; + +/* + * When a cache bin is flushed because it's full, how much of it do we flush? + * By default, we flush half the maximum number of items. + */ +unsigned opt_lg_tcache_flush_small_div = 1; +unsigned opt_lg_tcache_flush_large_div = 1; + +cache_bin_info_t *tcache_bin_info; + +/* Total stack size required (per tcache). Include the padding above. */ +static size_t tcache_bin_alloc_size; +static size_t tcache_bin_alloc_alignment; + +/* Number of cache bins enabled, including both large and small. */ +unsigned nhbins; +/* Max size class to be cached (can be small or large). */ +size_t tcache_maxclass; + +tcaches_t *tcaches; + +/* Index of first element within tcaches that has never been used. */ +static unsigned tcaches_past; + +/* Head of singly linked list tracking available tcaches elements. */ +static tcaches_t *tcaches_avail; + +/* Protects tcaches{,_past,_avail}. */ +static malloc_mutex_t tcaches_mtx; + +/******************************************************************************/ + +size_t +tcache_salloc(tsdn_t *tsdn, const void *ptr) { + return arena_salloc(tsdn, ptr); +} + +uint64_t +tcache_gc_new_event_wait(tsd_t *tsd) { + return opt_tcache_gc_incr_bytes; +} + +uint64_t +tcache_gc_postponed_event_wait(tsd_t *tsd) { + return TE_MIN_START_WAIT; +} + +uint64_t +tcache_gc_dalloc_new_event_wait(tsd_t *tsd) { + return opt_tcache_gc_incr_bytes; +} + +uint64_t +tcache_gc_dalloc_postponed_event_wait(tsd_t *tsd) { + return TE_MIN_START_WAIT; +} + +static uint8_t +tcache_gc_item_delay_compute(szind_t szind) { + assert(szind < SC_NBINS); + size_t sz = sz_index2size(szind); + size_t item_delay = opt_tcache_gc_delay_bytes / sz; + size_t delay_max = ZU(1) + << (sizeof(((tcache_slow_t *)NULL)->bin_flush_delay_items[0]) * 8); + if (item_delay >= delay_max) { + item_delay = delay_max - 1; + } + return (uint8_t)item_delay; +} + +static void +tcache_gc_small(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache, + szind_t szind) { + /* Aim to flush 3/4 of items below low-water. */ + assert(szind < SC_NBINS); + + cache_bin_t *cache_bin = &tcache->bins[szind]; + cache_bin_sz_t ncached = cache_bin_ncached_get_local(cache_bin, + &tcache_bin_info[szind]); + cache_bin_sz_t low_water = cache_bin_low_water_get(cache_bin, + &tcache_bin_info[szind]); + assert(!tcache_slow->bin_refilled[szind]); + + size_t nflush = low_water - (low_water >> 2); + if (nflush < tcache_slow->bin_flush_delay_items[szind]) { + /* Workaround for a conversion warning. */ + uint8_t nflush_uint8 = (uint8_t)nflush; + assert(sizeof(tcache_slow->bin_flush_delay_items[0]) == + sizeof(nflush_uint8)); + tcache_slow->bin_flush_delay_items[szind] -= nflush_uint8; + return; + } else { + tcache_slow->bin_flush_delay_items[szind] + = tcache_gc_item_delay_compute(szind); + } + + tcache_bin_flush_small(tsd, tcache, cache_bin, szind, + (unsigned)(ncached - nflush)); + + /* + * Reduce fill count by 2X. Limit lg_fill_div such that + * the fill count is always at least 1. + */ + if ((cache_bin_info_ncached_max(&tcache_bin_info[szind]) + >> (tcache_slow->lg_fill_div[szind] + 1)) >= 1) { + tcache_slow->lg_fill_div[szind]++; + } +} + +static void +tcache_gc_large(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache, + szind_t szind) { + /* Like the small GC; flush 3/4 of untouched items. */ + assert(szind >= SC_NBINS); + cache_bin_t *cache_bin = &tcache->bins[szind]; + cache_bin_sz_t ncached = cache_bin_ncached_get_local(cache_bin, + &tcache_bin_info[szind]); + cache_bin_sz_t low_water = cache_bin_low_water_get(cache_bin, + &tcache_bin_info[szind]); + tcache_bin_flush_large(tsd, tcache, cache_bin, szind, + (unsigned)(ncached - low_water + (low_water >> 2))); +} + +static void +tcache_event(tsd_t *tsd) { + tcache_t *tcache = tcache_get(tsd); + if (tcache == NULL) { + return; + } + + tcache_slow_t *tcache_slow = tsd_tcache_slowp_get(tsd); + szind_t szind = tcache_slow->next_gc_bin; + bool is_small = (szind < SC_NBINS); + cache_bin_t *cache_bin = &tcache->bins[szind]; + + tcache_bin_flush_stashed(tsd, tcache, cache_bin, szind, is_small); + + cache_bin_sz_t low_water = cache_bin_low_water_get(cache_bin, + &tcache_bin_info[szind]); + if (low_water > 0) { + if (is_small) { + tcache_gc_small(tsd, tcache_slow, tcache, szind); + } else { + tcache_gc_large(tsd, tcache_slow, tcache, szind); + } + } else if (is_small && tcache_slow->bin_refilled[szind]) { + assert(low_water == 0); + /* + * Increase fill count by 2X for small bins. Make sure + * lg_fill_div stays greater than 0. + */ + if (tcache_slow->lg_fill_div[szind] > 1) { + tcache_slow->lg_fill_div[szind]--; + } + tcache_slow->bin_refilled[szind] = false; + } + cache_bin_low_water_set(cache_bin); + + tcache_slow->next_gc_bin++; + if (tcache_slow->next_gc_bin == nhbins) { + tcache_slow->next_gc_bin = 0; + } +} + +void +tcache_gc_event_handler(tsd_t *tsd, uint64_t elapsed) { + assert(elapsed == TE_INVALID_ELAPSED); + tcache_event(tsd); +} + +void +tcache_gc_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed) { + assert(elapsed == TE_INVALID_ELAPSED); + tcache_event(tsd); +} + +void * +tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, + tcache_t *tcache, cache_bin_t *cache_bin, szind_t binind, + bool *tcache_success) { + tcache_slow_t *tcache_slow = tcache->tcache_slow; + void *ret; + + assert(tcache_slow->arena != NULL); + unsigned nfill = cache_bin_info_ncached_max(&tcache_bin_info[binind]) + >> tcache_slow->lg_fill_div[binind]; + arena_cache_bin_fill_small(tsdn, arena, cache_bin, + &tcache_bin_info[binind], binind, nfill); + tcache_slow->bin_refilled[binind] = true; + ret = cache_bin_alloc(cache_bin, tcache_success); + + return ret; +} + +static const void * +tcache_bin_flush_ptr_getter(void *arr_ctx, size_t ind) { + cache_bin_ptr_array_t *arr = (cache_bin_ptr_array_t *)arr_ctx; + return arr->ptr[ind]; +} + +static void +tcache_bin_flush_metadata_visitor(void *szind_sum_ctx, + emap_full_alloc_ctx_t *alloc_ctx) { + size_t *szind_sum = (size_t *)szind_sum_ctx; + *szind_sum -= alloc_ctx->szind; + util_prefetch_write_range(alloc_ctx->edata, sizeof(edata_t)); +} + +JEMALLOC_NOINLINE static void +tcache_bin_flush_size_check_fail(cache_bin_ptr_array_t *arr, szind_t szind, + size_t nptrs, emap_batch_lookup_result_t *edatas) { + bool found_mismatch = false; + for (size_t i = 0; i < nptrs; i++) { + szind_t true_szind = edata_szind_get(edatas[i].edata); + if (true_szind != szind) { + found_mismatch = true; + safety_check_fail_sized_dealloc( + /* current_dealloc */ false, + /* ptr */ tcache_bin_flush_ptr_getter(arr, i), + /* true_size */ sz_index2size(true_szind), + /* input_size */ sz_index2size(szind)); + } + } + assert(found_mismatch); +} + +static void +tcache_bin_flush_edatas_lookup(tsd_t *tsd, cache_bin_ptr_array_t *arr, + szind_t binind, size_t nflush, emap_batch_lookup_result_t *edatas) { + + /* + * This gets compiled away when config_opt_safety_checks is false. + * Checks for sized deallocation bugs, failing early rather than + * corrupting metadata. + */ + size_t szind_sum = binind * nflush; + emap_edata_lookup_batch(tsd, &arena_emap_global, nflush, + &tcache_bin_flush_ptr_getter, (void *)arr, + &tcache_bin_flush_metadata_visitor, (void *)&szind_sum, + edatas); + if (config_opt_safety_checks && unlikely(szind_sum != 0)) { + tcache_bin_flush_size_check_fail(arr, binind, nflush, edatas); + } +} + +JEMALLOC_ALWAYS_INLINE bool +tcache_bin_flush_match(edata_t *edata, unsigned cur_arena_ind, + unsigned cur_binshard, bool small) { + if (small) { + return edata_arena_ind_get(edata) == cur_arena_ind + && edata_binshard_get(edata) == cur_binshard; + } else { + return edata_arena_ind_get(edata) == cur_arena_ind; + } +} + +JEMALLOC_ALWAYS_INLINE void +tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, + szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush, bool small) { + tcache_slow_t *tcache_slow = tcache->tcache_slow; + /* + * A couple lookup calls take tsdn; declare it once for convenience + * instead of calling tsd_tsdn(tsd) all the time. + */ + tsdn_t *tsdn = tsd_tsdn(tsd); + + if (small) { + assert(binind < SC_NBINS); + } else { + assert(binind < nhbins); + } + arena_t *tcache_arena = tcache_slow->arena; + assert(tcache_arena != NULL); + + /* + * Variable length array must have > 0 length; the last element is never + * touched (it's just included to satisfy the no-zero-length rule). + */ + VARIABLE_ARRAY(emap_batch_lookup_result_t, item_edata, nflush + 1); + tcache_bin_flush_edatas_lookup(tsd, ptrs, binind, nflush, item_edata); + + /* + * The slabs where we freed the last remaining object in the slab (and + * so need to free the slab itself). + * Used only if small == true. + */ + unsigned dalloc_count = 0; + VARIABLE_ARRAY(edata_t *, dalloc_slabs, nflush + 1); + + /* + * We're about to grab a bunch of locks. If one of them happens to be + * the one guarding the arena-level stats counters we flush our + * thread-local ones to, we do so under one critical section. + */ + bool merged_stats = false; + while (nflush > 0) { + /* Lock the arena, or bin, associated with the first object. */ + edata_t *edata = item_edata[0].edata; + unsigned cur_arena_ind = edata_arena_ind_get(edata); + arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false); + + /* + * These assignments are always overwritten when small is true, + * and their values are always ignored when small is false, but + * to avoid the technical UB when we pass them as parameters, we + * need to intialize them. + */ + unsigned cur_binshard = 0; + bin_t *cur_bin = NULL; + if (small) { + cur_binshard = edata_binshard_get(edata); + cur_bin = arena_get_bin(cur_arena, binind, + cur_binshard); + assert(cur_binshard < bin_infos[binind].n_shards); + /* + * If you're looking at profiles, you might think this + * is a good place to prefetch the bin stats, which are + * often a cache miss. This turns out not to be + * helpful on the workloads we've looked at, with moving + * the bin stats next to the lock seeming to do better. + */ + } + + if (small) { + malloc_mutex_lock(tsdn, &cur_bin->lock); + } + if (!small && !arena_is_auto(cur_arena)) { + malloc_mutex_lock(tsdn, &cur_arena->large_mtx); + } + + /* + * If we acquired the right lock and have some stats to flush, + * flush them. + */ + if (config_stats && tcache_arena == cur_arena + && !merged_stats) { + merged_stats = true; + if (small) { + cur_bin->stats.nflushes++; + cur_bin->stats.nrequests += + cache_bin->tstats.nrequests; + cache_bin->tstats.nrequests = 0; + } else { + arena_stats_large_flush_nrequests_add(tsdn, + &tcache_arena->stats, binind, + cache_bin->tstats.nrequests); + cache_bin->tstats.nrequests = 0; + } + } + + /* + * Large allocations need special prep done. Afterwards, we can + * drop the large lock. + */ + if (!small) { + for (unsigned i = 0; i < nflush; i++) { + void *ptr = ptrs->ptr[i]; + edata = item_edata[i].edata; + assert(ptr != NULL && edata != NULL); + + if (tcache_bin_flush_match(edata, cur_arena_ind, + cur_binshard, small)) { + large_dalloc_prep_locked(tsdn, + edata); + } + } + } + if (!small && !arena_is_auto(cur_arena)) { + malloc_mutex_unlock(tsdn, &cur_arena->large_mtx); + } + + /* Deallocate whatever we can. */ + unsigned ndeferred = 0; + /* Init only to avoid used-uninitialized warning. */ + arena_dalloc_bin_locked_info_t dalloc_bin_info = {0}; + if (small) { + arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind); + } + for (unsigned i = 0; i < nflush; i++) { + void *ptr = ptrs->ptr[i]; + edata = item_edata[i].edata; + assert(ptr != NULL && edata != NULL); + if (!tcache_bin_flush_match(edata, cur_arena_ind, + cur_binshard, small)) { + /* + * The object was allocated either via a + * different arena, or a different bin in this + * arena. Either way, stash the object so that + * it can be handled in a future pass. + */ + ptrs->ptr[ndeferred] = ptr; + item_edata[ndeferred].edata = edata; + ndeferred++; + continue; + } + if (small) { + if (arena_dalloc_bin_locked_step(tsdn, + cur_arena, cur_bin, &dalloc_bin_info, + binind, edata, ptr)) { + dalloc_slabs[dalloc_count] = edata; + dalloc_count++; + } + } else { + if (large_dalloc_safety_checks(edata, ptr, + binind)) { + /* See the comment in isfree. */ + continue; + } + large_dalloc_finish(tsdn, edata); + } + } + + if (small) { + arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin, + &dalloc_bin_info); + malloc_mutex_unlock(tsdn, &cur_bin->lock); + } + arena_decay_ticks(tsdn, cur_arena, nflush - ndeferred); + nflush = ndeferred; + } + + /* Handle all deferred slab dalloc. */ + assert(small || dalloc_count == 0); + for (unsigned i = 0; i < dalloc_count; i++) { + edata_t *slab = dalloc_slabs[i]; + arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab); + + } + + if (config_stats && !merged_stats) { + if (small) { + /* + * The flush loop didn't happen to flush to this + * thread's arena, so the stats didn't get merged. + * Manually do so now. + */ + bin_t *bin = arena_bin_choose(tsdn, tcache_arena, + binind, NULL); + malloc_mutex_lock(tsdn, &bin->lock); + bin->stats.nflushes++; + bin->stats.nrequests += cache_bin->tstats.nrequests; + cache_bin->tstats.nrequests = 0; + malloc_mutex_unlock(tsdn, &bin->lock); + } else { + arena_stats_large_flush_nrequests_add(tsdn, + &tcache_arena->stats, binind, + cache_bin->tstats.nrequests); + cache_bin->tstats.nrequests = 0; + } + } + +} + +JEMALLOC_ALWAYS_INLINE void +tcache_bin_flush_bottom(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, + szind_t binind, unsigned rem, bool small) { + tcache_bin_flush_stashed(tsd, tcache, cache_bin, binind, small); + + cache_bin_sz_t ncached = cache_bin_ncached_get_local(cache_bin, + &tcache_bin_info[binind]); + assert((cache_bin_sz_t)rem <= ncached); + unsigned nflush = ncached - rem; + + CACHE_BIN_PTR_ARRAY_DECLARE(ptrs, nflush); + cache_bin_init_ptr_array_for_flush(cache_bin, &tcache_bin_info[binind], + &ptrs, nflush); + + tcache_bin_flush_impl(tsd, tcache, cache_bin, binind, &ptrs, nflush, + small); + + cache_bin_finish_flush(cache_bin, &tcache_bin_info[binind], &ptrs, + ncached - rem); +} + +void +tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, + szind_t binind, unsigned rem) { + tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem, true); +} + +void +tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, + szind_t binind, unsigned rem) { + tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem, false); +} + +/* + * Flushing stashed happens when 1) tcache fill, 2) tcache flush, or 3) tcache + * GC event. This makes sure that the stashed items do not hold memory for too + * long, and new buffers can only be allocated when nothing is stashed. + * + * The downside is, the time between stash and flush may be relatively short, + * especially when the request rate is high. It lowers the chance of detecting + * write-after-free -- however that is a delayed detection anyway, and is less + * of a focus than the memory overhead. + */ +void +tcache_bin_flush_stashed(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, + szind_t binind, bool is_small) { + cache_bin_info_t *info = &tcache_bin_info[binind]; + /* + * The two below are for assertion only. The content of original cached + * items remain unchanged -- the stashed items reside on the other end + * of the stack. Checking the stack head and ncached to verify. + */ + void *head_content = *cache_bin->stack_head; + cache_bin_sz_t orig_cached = cache_bin_ncached_get_local(cache_bin, + info); + + cache_bin_sz_t nstashed = cache_bin_nstashed_get_local(cache_bin, info); + assert(orig_cached + nstashed <= cache_bin_info_ncached_max(info)); + if (nstashed == 0) { + return; + } + + CACHE_BIN_PTR_ARRAY_DECLARE(ptrs, nstashed); + cache_bin_init_ptr_array_for_stashed(cache_bin, binind, info, &ptrs, + nstashed); + san_check_stashed_ptrs(ptrs.ptr, nstashed, sz_index2size(binind)); + tcache_bin_flush_impl(tsd, tcache, cache_bin, binind, &ptrs, nstashed, + is_small); + cache_bin_finish_flush_stashed(cache_bin, info); + + assert(cache_bin_nstashed_get_local(cache_bin, info) == 0); + assert(cache_bin_ncached_get_local(cache_bin, info) == orig_cached); + assert(head_content == *cache_bin->stack_head); +} + +void +tcache_arena_associate(tsdn_t *tsdn, tcache_slow_t *tcache_slow, + tcache_t *tcache, arena_t *arena) { + assert(tcache_slow->arena == NULL); + tcache_slow->arena = arena; + + if (config_stats) { + /* Link into list of extant tcaches. */ + malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx); + + ql_elm_new(tcache_slow, link); + ql_tail_insert(&arena->tcache_ql, tcache_slow, link); + cache_bin_array_descriptor_init( + &tcache_slow->cache_bin_array_descriptor, tcache->bins); + ql_tail_insert(&arena->cache_bin_array_descriptor_ql, + &tcache_slow->cache_bin_array_descriptor, link); + + malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx); + } +} + +static void +tcache_arena_dissociate(tsdn_t *tsdn, tcache_slow_t *tcache_slow, + tcache_t *tcache) { + arena_t *arena = tcache_slow->arena; + assert(arena != NULL); + if (config_stats) { + /* Unlink from list of extant tcaches. */ + malloc_mutex_lock(tsdn, &arena->tcache_ql_mtx); + if (config_debug) { + bool in_ql = false; + tcache_slow_t *iter; + ql_foreach(iter, &arena->tcache_ql, link) { + if (iter == tcache_slow) { + in_ql = true; + break; + } + } + assert(in_ql); + } + ql_remove(&arena->tcache_ql, tcache_slow, link); + ql_remove(&arena->cache_bin_array_descriptor_ql, + &tcache_slow->cache_bin_array_descriptor, link); + tcache_stats_merge(tsdn, tcache_slow->tcache, arena); + malloc_mutex_unlock(tsdn, &arena->tcache_ql_mtx); + } + tcache_slow->arena = NULL; +} + +void +tcache_arena_reassociate(tsdn_t *tsdn, tcache_slow_t *tcache_slow, + tcache_t *tcache, arena_t *arena) { + tcache_arena_dissociate(tsdn, tcache_slow, tcache); + tcache_arena_associate(tsdn, tcache_slow, tcache, arena); +} + +bool +tsd_tcache_enabled_data_init(tsd_t *tsd) { + /* Called upon tsd initialization. */ + tsd_tcache_enabled_set(tsd, opt_tcache); + tsd_slow_update(tsd); + + if (opt_tcache) { + /* Trigger tcache init. */ + tsd_tcache_data_init(tsd); + } + + return false; +} + +static void +tcache_init(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache, + void *mem) { + tcache->tcache_slow = tcache_slow; + tcache_slow->tcache = tcache; + + memset(&tcache_slow->link, 0, sizeof(ql_elm(tcache_t))); + tcache_slow->next_gc_bin = 0; + tcache_slow->arena = NULL; + tcache_slow->dyn_alloc = mem; + + /* + * We reserve cache bins for all small size classes, even if some may + * not get used (i.e. bins higher than nhbins). This allows the fast + * and common paths to access cache bin metadata safely w/o worrying + * about which ones are disabled. + */ + unsigned n_reserved_bins = nhbins < SC_NBINS ? SC_NBINS : nhbins; + memset(tcache->bins, 0, sizeof(cache_bin_t) * n_reserved_bins); + + size_t cur_offset = 0; + cache_bin_preincrement(tcache_bin_info, nhbins, mem, + &cur_offset); + for (unsigned i = 0; i < nhbins; i++) { + if (i < SC_NBINS) { + tcache_slow->lg_fill_div[i] = 1; + tcache_slow->bin_refilled[i] = false; + tcache_slow->bin_flush_delay_items[i] + = tcache_gc_item_delay_compute(i); + } + cache_bin_t *cache_bin = &tcache->bins[i]; + cache_bin_init(cache_bin, &tcache_bin_info[i], mem, + &cur_offset); + } + /* + * For small size classes beyond tcache_maxclass (i.e. nhbins < NBINS), + * their cache bins are initialized to a state to safely and efficiently + * fail all fastpath alloc / free, so that no additional check around + * nhbins is needed on fastpath. + */ + for (unsigned i = nhbins; i < SC_NBINS; i++) { + /* Disabled small bins. */ + cache_bin_t *cache_bin = &tcache->bins[i]; + void *fake_stack = mem; + size_t fake_offset = 0; + + cache_bin_init(cache_bin, &tcache_bin_info[i], fake_stack, + &fake_offset); + assert(tcache_small_bin_disabled(i, cache_bin)); + } + + cache_bin_postincrement(tcache_bin_info, nhbins, mem, + &cur_offset); + /* Sanity check that the whole stack is used. */ + assert(cur_offset == tcache_bin_alloc_size); +} + +/* Initialize auto tcache (embedded in TSD). */ +bool +tsd_tcache_data_init(tsd_t *tsd) { + tcache_slow_t *tcache_slow = tsd_tcache_slowp_get_unsafe(tsd); + tcache_t *tcache = tsd_tcachep_get_unsafe(tsd); + + assert(cache_bin_still_zero_initialized(&tcache->bins[0])); + size_t alignment = tcache_bin_alloc_alignment; + size_t size = sz_sa2u(tcache_bin_alloc_size, alignment); + + void *mem = ipallocztm(tsd_tsdn(tsd), size, alignment, true, NULL, + true, arena_get(TSDN_NULL, 0, true)); + if (mem == NULL) { + return true; + } + + tcache_init(tsd, tcache_slow, tcache, mem); + /* + * Initialization is a bit tricky here. After malloc init is done, all + * threads can rely on arena_choose and associate tcache accordingly. + * However, the thread that does actual malloc bootstrapping relies on + * functional tsd, and it can only rely on a0. In that case, we + * associate its tcache to a0 temporarily, and later on + * arena_choose_hard() will re-associate properly. + */ + tcache_slow->arena = NULL; + arena_t *arena; + if (!malloc_initialized()) { + /* If in initialization, assign to a0. */ + arena = arena_get(tsd_tsdn(tsd), 0, false); + tcache_arena_associate(tsd_tsdn(tsd), tcache_slow, tcache, + arena); + } else { + arena = arena_choose(tsd, NULL); + /* This may happen if thread.tcache.enabled is used. */ + if (tcache_slow->arena == NULL) { + tcache_arena_associate(tsd_tsdn(tsd), tcache_slow, + tcache, arena); + } + } + assert(arena == tcache_slow->arena); + + return false; +} + +/* Created manual tcache for tcache.create mallctl. */ +tcache_t * +tcache_create_explicit(tsd_t *tsd) { + /* + * We place the cache bin stacks, then the tcache_t, then a pointer to + * the beginning of the whole allocation (for freeing). The makes sure + * the cache bins have the requested alignment. + */ + size_t size = tcache_bin_alloc_size + sizeof(tcache_t) + + sizeof(tcache_slow_t); + /* Naturally align the pointer stacks. */ + size = PTR_CEILING(size); + size = sz_sa2u(size, tcache_bin_alloc_alignment); + + void *mem = ipallocztm(tsd_tsdn(tsd), size, tcache_bin_alloc_alignment, + true, NULL, true, arena_get(TSDN_NULL, 0, true)); + if (mem == NULL) { + return NULL; + } + tcache_t *tcache = (void *)((uintptr_t)mem + tcache_bin_alloc_size); + tcache_slow_t *tcache_slow = + (void *)((uintptr_t)mem + tcache_bin_alloc_size + sizeof(tcache_t)); + tcache_init(tsd, tcache_slow, tcache, mem); + + tcache_arena_associate(tsd_tsdn(tsd), tcache_slow, tcache, + arena_ichoose(tsd, NULL)); + + return tcache; +} + +static void +tcache_flush_cache(tsd_t *tsd, tcache_t *tcache) { + tcache_slow_t *tcache_slow = tcache->tcache_slow; + assert(tcache_slow->arena != NULL); + + for (unsigned i = 0; i < nhbins; i++) { + cache_bin_t *cache_bin = &tcache->bins[i]; + if (i < SC_NBINS) { + tcache_bin_flush_small(tsd, tcache, cache_bin, i, 0); + } else { + tcache_bin_flush_large(tsd, tcache, cache_bin, i, 0); + } + if (config_stats) { + assert(cache_bin->tstats.nrequests == 0); + } + } +} + +void +tcache_flush(tsd_t *tsd) { + assert(tcache_available(tsd)); + tcache_flush_cache(tsd, tsd_tcachep_get(tsd)); +} + +static void +tcache_destroy(tsd_t *tsd, tcache_t *tcache, bool tsd_tcache) { + tcache_slow_t *tcache_slow = tcache->tcache_slow; + tcache_flush_cache(tsd, tcache); + arena_t *arena = tcache_slow->arena; + tcache_arena_dissociate(tsd_tsdn(tsd), tcache_slow, tcache); + + if (tsd_tcache) { + cache_bin_t *cache_bin = &tcache->bins[0]; + cache_bin_assert_empty(cache_bin, &tcache_bin_info[0]); + } + idalloctm(tsd_tsdn(tsd), tcache_slow->dyn_alloc, NULL, NULL, true, + true); + + /* + * The deallocation and tcache flush above may not trigger decay since + * we are on the tcache shutdown path (potentially with non-nominal + * tsd). Manually trigger decay to avoid pathological cases. Also + * include arena 0 because the tcache array is allocated from it. + */ + arena_decay(tsd_tsdn(tsd), arena_get(tsd_tsdn(tsd), 0, false), + false, false); + + if (arena_nthreads_get(arena, false) == 0 && + !background_thread_enabled()) { + /* Force purging when no threads assigned to the arena anymore. */ + arena_decay(tsd_tsdn(tsd), arena, + /* is_background_thread */ false, /* all */ true); + } else { + arena_decay(tsd_tsdn(tsd), arena, + /* is_background_thread */ false, /* all */ false); + } +} + +/* For auto tcache (embedded in TSD) only. */ +void +tcache_cleanup(tsd_t *tsd) { + tcache_t *tcache = tsd_tcachep_get(tsd); + if (!tcache_available(tsd)) { + assert(tsd_tcache_enabled_get(tsd) == false); + assert(cache_bin_still_zero_initialized(&tcache->bins[0])); + return; + } + assert(tsd_tcache_enabled_get(tsd)); + assert(!cache_bin_still_zero_initialized(&tcache->bins[0])); + + tcache_destroy(tsd, tcache, true); + if (config_debug) { + /* + * For debug testing only, we want to pretend we're still in the + * zero-initialized state. + */ + memset(tcache->bins, 0, sizeof(cache_bin_t) * nhbins); + } +} + +void +tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) { + cassert(config_stats); + + /* Merge and reset tcache stats. */ + for (unsigned i = 0; i < nhbins; i++) { + cache_bin_t *cache_bin = &tcache->bins[i]; + if (i < SC_NBINS) { + bin_t *bin = arena_bin_choose(tsdn, arena, i, NULL); + malloc_mutex_lock(tsdn, &bin->lock); + bin->stats.nrequests += cache_bin->tstats.nrequests; + malloc_mutex_unlock(tsdn, &bin->lock); + } else { + arena_stats_large_flush_nrequests_add(tsdn, + &arena->stats, i, cache_bin->tstats.nrequests); + } + cache_bin->tstats.nrequests = 0; + } +} + +static bool +tcaches_create_prep(tsd_t *tsd, base_t *base) { + bool err; + + malloc_mutex_assert_owner(tsd_tsdn(tsd), &tcaches_mtx); + + if (tcaches == NULL) { + tcaches = base_alloc(tsd_tsdn(tsd), base, + sizeof(tcache_t *) * (MALLOCX_TCACHE_MAX+1), CACHELINE); + if (tcaches == NULL) { + err = true; + goto label_return; + } + } + + if (tcaches_avail == NULL && tcaches_past > MALLOCX_TCACHE_MAX) { + err = true; + goto label_return; + } + + err = false; +label_return: + return err; +} + +bool +tcaches_create(tsd_t *tsd, base_t *base, unsigned *r_ind) { + witness_assert_depth(tsdn_witness_tsdp_get(tsd_tsdn(tsd)), 0); + + bool err; + + malloc_mutex_lock(tsd_tsdn(tsd), &tcaches_mtx); + + if (tcaches_create_prep(tsd, base)) { + err = true; + goto label_return; + } + + tcache_t *tcache = tcache_create_explicit(tsd); + if (tcache == NULL) { + err = true; + goto label_return; + } + + tcaches_t *elm; + if (tcaches_avail != NULL) { + elm = tcaches_avail; + tcaches_avail = tcaches_avail->next; + elm->tcache = tcache; + *r_ind = (unsigned)(elm - tcaches); + } else { + elm = &tcaches[tcaches_past]; + elm->tcache = tcache; + *r_ind = tcaches_past; + tcaches_past++; + } + + err = false; +label_return: + malloc_mutex_unlock(tsd_tsdn(tsd), &tcaches_mtx); + witness_assert_depth(tsdn_witness_tsdp_get(tsd_tsdn(tsd)), 0); + return err; +} + +static tcache_t * +tcaches_elm_remove(tsd_t *tsd, tcaches_t *elm, bool allow_reinit) { + malloc_mutex_assert_owner(tsd_tsdn(tsd), &tcaches_mtx); + + if (elm->tcache == NULL) { + return NULL; + } + tcache_t *tcache = elm->tcache; + if (allow_reinit) { + elm->tcache = TCACHES_ELM_NEED_REINIT; + } else { + elm->tcache = NULL; + } + + if (tcache == TCACHES_ELM_NEED_REINIT) { + return NULL; + } + return tcache; +} + +void +tcaches_flush(tsd_t *tsd, unsigned ind) { + malloc_mutex_lock(tsd_tsdn(tsd), &tcaches_mtx); + tcache_t *tcache = tcaches_elm_remove(tsd, &tcaches[ind], true); + malloc_mutex_unlock(tsd_tsdn(tsd), &tcaches_mtx); + if (tcache != NULL) { + /* Destroy the tcache; recreate in tcaches_get() if needed. */ + tcache_destroy(tsd, tcache, false); + } +} + +void +tcaches_destroy(tsd_t *tsd, unsigned ind) { + malloc_mutex_lock(tsd_tsdn(tsd), &tcaches_mtx); + tcaches_t *elm = &tcaches[ind]; + tcache_t *tcache = tcaches_elm_remove(tsd, elm, false); + elm->next = tcaches_avail; + tcaches_avail = elm; + malloc_mutex_unlock(tsd_tsdn(tsd), &tcaches_mtx); + if (tcache != NULL) { + tcache_destroy(tsd, tcache, false); + } +} + +static unsigned +tcache_ncached_max_compute(szind_t szind) { + if (szind >= SC_NBINS) { + assert(szind < nhbins); + return opt_tcache_nslots_large; + } + unsigned slab_nregs = bin_infos[szind].nregs; + + /* We may modify these values; start with the opt versions. */ + unsigned nslots_small_min = opt_tcache_nslots_small_min; + unsigned nslots_small_max = opt_tcache_nslots_small_max; + + /* + * Clamp values to meet our constraints -- even, nonzero, min < max, and + * suitable for a cache bin size. + */ + if (opt_tcache_nslots_small_max > CACHE_BIN_NCACHED_MAX) { + nslots_small_max = CACHE_BIN_NCACHED_MAX; + } + if (nslots_small_min % 2 != 0) { + nslots_small_min++; + } + if (nslots_small_max % 2 != 0) { + nslots_small_max--; + } + if (nslots_small_min < 2) { + nslots_small_min = 2; + } + if (nslots_small_max < 2) { + nslots_small_max = 2; + } + if (nslots_small_min > nslots_small_max) { + nslots_small_min = nslots_small_max; + } + + unsigned candidate; + if (opt_lg_tcache_nslots_mul < 0) { + candidate = slab_nregs >> (-opt_lg_tcache_nslots_mul); + } else { + candidate = slab_nregs << opt_lg_tcache_nslots_mul; + } + if (candidate % 2 != 0) { + /* + * We need the candidate size to be even -- we assume that we + * can divide by two and get a positive number (e.g. when + * flushing). + */ + ++candidate; + } + if (candidate <= nslots_small_min) { + return nslots_small_min; + } else if (candidate <= nslots_small_max) { + return candidate; + } else { + return nslots_small_max; + } +} + +bool +tcache_boot(tsdn_t *tsdn, base_t *base) { + tcache_maxclass = sz_s2u(opt_tcache_max); + assert(tcache_maxclass <= TCACHE_MAXCLASS_LIMIT); + nhbins = sz_size2index(tcache_maxclass) + 1; + + if (malloc_mutex_init(&tcaches_mtx, "tcaches", WITNESS_RANK_TCACHES, + malloc_mutex_rank_exclusive)) { + return true; + } + + /* Initialize tcache_bin_info. See comments in tcache_init(). */ + unsigned n_reserved_bins = nhbins < SC_NBINS ? SC_NBINS : nhbins; + size_t size = n_reserved_bins * sizeof(cache_bin_info_t); + tcache_bin_info = (cache_bin_info_t *)base_alloc(tsdn, base, size, + CACHELINE); + if (tcache_bin_info == NULL) { + return true; + } + + for (szind_t i = 0; i < nhbins; i++) { + unsigned ncached_max = tcache_ncached_max_compute(i); + cache_bin_info_init(&tcache_bin_info[i], ncached_max); + } + for (szind_t i = nhbins; i < SC_NBINS; i++) { + /* Disabled small bins. */ + cache_bin_info_init(&tcache_bin_info[i], 0); + assert(tcache_small_bin_disabled(i, NULL)); + } + + cache_bin_info_compute_alloc(tcache_bin_info, nhbins, + &tcache_bin_alloc_size, &tcache_bin_alloc_alignment); + + return false; +} + +void +tcache_prefork(tsdn_t *tsdn) { + malloc_mutex_prefork(tsdn, &tcaches_mtx); +} + +void +tcache_postfork_parent(tsdn_t *tsdn) { + malloc_mutex_postfork_parent(tsdn, &tcaches_mtx); +} + +void +tcache_postfork_child(tsdn_t *tsdn) { + malloc_mutex_postfork_child(tsdn, &tcaches_mtx); +} + +void tcache_assert_initialized(tcache_t *tcache) { + assert(!cache_bin_still_zero_initialized(&tcache->bins[0])); +} diff --git a/BeefRT/JEMalloc/src/test_hooks.c b/BeefRT/JEMalloc/src/test_hooks.c new file mode 100644 index 00000000..ace00d9c --- /dev/null +++ b/BeefRT/JEMalloc/src/test_hooks.c @@ -0,0 +1,12 @@ +#include "jemalloc/internal/jemalloc_preamble.h" + +/* + * The hooks are a little bit screwy -- they're not genuinely exported in the + * sense that we want them available to end-users, but we do want them visible + * from outside the generated library, so that we can use them in test code. + */ +JEMALLOC_EXPORT +void (*test_hooks_arena_new_hook)() = NULL; + +JEMALLOC_EXPORT +void (*test_hooks_libc_hook)() = NULL; diff --git a/BeefRT/JEMalloc/src/thread_event.c b/BeefRT/JEMalloc/src/thread_event.c new file mode 100644 index 00000000..37eb5827 --- /dev/null +++ b/BeefRT/JEMalloc/src/thread_event.c @@ -0,0 +1,343 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/thread_event.h" + +/* + * Signatures for event specific functions. These functions should be defined + * by the modules owning each event. The signatures here verify that the + * definitions follow the right format. + * + * The first two are functions computing new / postponed event wait time. New + * event wait time is the time till the next event if an event is currently + * being triggered; postponed event wait time is the time till the next event + * if an event should be triggered but needs to be postponed, e.g. when the TSD + * is not nominal or during reentrancy. + * + * The third is the event handler function, which is called whenever an event + * is triggered. The parameter is the elapsed time since the last time an + * event of the same type was triggered. + */ +#define E(event, condition_unused, is_alloc_event_unused) \ +uint64_t event##_new_event_wait(tsd_t *tsd); \ +uint64_t event##_postponed_event_wait(tsd_t *tsd); \ +void event##_event_handler(tsd_t *tsd, uint64_t elapsed); + +ITERATE_OVER_ALL_EVENTS +#undef E + +/* Signatures for internal functions fetching elapsed time. */ +#define E(event, condition_unused, is_alloc_event_unused) \ +static uint64_t event##_fetch_elapsed(tsd_t *tsd); + +ITERATE_OVER_ALL_EVENTS +#undef E + +static uint64_t +tcache_gc_fetch_elapsed(tsd_t *tsd) { + return TE_INVALID_ELAPSED; +} + +static uint64_t +tcache_gc_dalloc_fetch_elapsed(tsd_t *tsd) { + return TE_INVALID_ELAPSED; +} + +static uint64_t +prof_sample_fetch_elapsed(tsd_t *tsd) { + uint64_t last_event = thread_allocated_last_event_get(tsd); + uint64_t last_sample_event = prof_sample_last_event_get(tsd); + prof_sample_last_event_set(tsd, last_event); + return last_event - last_sample_event; +} + +static uint64_t +stats_interval_fetch_elapsed(tsd_t *tsd) { + uint64_t last_event = thread_allocated_last_event_get(tsd); + uint64_t last_stats_event = stats_interval_last_event_get(tsd); + stats_interval_last_event_set(tsd, last_event); + return last_event - last_stats_event; +} + +static uint64_t +peak_alloc_fetch_elapsed(tsd_t *tsd) { + return TE_INVALID_ELAPSED; +} + +static uint64_t +peak_dalloc_fetch_elapsed(tsd_t *tsd) { + return TE_INVALID_ELAPSED; +} + +/* Per event facilities done. */ + +static bool +te_ctx_has_active_events(te_ctx_t *ctx) { + assert(config_debug); +#define E(event, condition, alloc_event) \ + if (condition && alloc_event == ctx->is_alloc) { \ + return true; \ + } + ITERATE_OVER_ALL_EVENTS +#undef E + return false; +} + +static uint64_t +te_next_event_compute(tsd_t *tsd, bool is_alloc) { + uint64_t wait = TE_MAX_START_WAIT; +#define E(event, condition, alloc_event) \ + if (is_alloc == alloc_event && condition) { \ + uint64_t event_wait = \ + event##_event_wait_get(tsd); \ + assert(event_wait <= TE_MAX_START_WAIT); \ + if (event_wait > 0U && event_wait < wait) { \ + wait = event_wait; \ + } \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E + assert(wait <= TE_MAX_START_WAIT); + return wait; +} + +static void +te_assert_invariants_impl(tsd_t *tsd, te_ctx_t *ctx) { + uint64_t current_bytes = te_ctx_current_bytes_get(ctx); + uint64_t last_event = te_ctx_last_event_get(ctx); + uint64_t next_event = te_ctx_next_event_get(ctx); + uint64_t next_event_fast = te_ctx_next_event_fast_get(ctx); + + assert(last_event != next_event); + if (next_event > TE_NEXT_EVENT_FAST_MAX || !tsd_fast(tsd)) { + assert(next_event_fast == 0U); + } else { + assert(next_event_fast == next_event); + } + + /* The subtraction is intentionally susceptible to underflow. */ + uint64_t interval = next_event - last_event; + + /* The subtraction is intentionally susceptible to underflow. */ + assert(current_bytes - last_event < interval); + uint64_t min_wait = te_next_event_compute(tsd, te_ctx_is_alloc(ctx)); + /* + * next_event should have been pushed up only except when no event is + * on and the TSD is just initialized. The last_event == 0U guard + * below is stronger than needed, but having an exactly accurate guard + * is more complicated to implement. + */ + assert((!te_ctx_has_active_events(ctx) && last_event == 0U) || + interval == min_wait || + (interval < min_wait && interval == TE_MAX_INTERVAL)); +} + +void +te_assert_invariants_debug(tsd_t *tsd) { + te_ctx_t ctx; + te_ctx_get(tsd, &ctx, true); + te_assert_invariants_impl(tsd, &ctx); + + te_ctx_get(tsd, &ctx, false); + te_assert_invariants_impl(tsd, &ctx); +} + +/* + * Synchronization around the fast threshold in tsd -- + * There are two threads to consider in the synchronization here: + * - The owner of the tsd being updated by a slow path change + * - The remote thread, doing that slow path change. + * + * As a design constraint, we want to ensure that a slow-path transition cannot + * be ignored for arbitrarily long, and that if the remote thread causes a + * slow-path transition and then communicates with the owner thread that it has + * occurred, then the owner will go down the slow path on the next allocator + * operation (so that we don't want to just wait until the owner hits its slow + * path reset condition on its own). + * + * Here's our strategy to do that: + * + * The remote thread will update the slow-path stores to TSD variables, issue a + * SEQ_CST fence, and then update the TSD next_event_fast counter. The owner + * thread will update next_event_fast, issue an SEQ_CST fence, and then check + * its TSD to see if it's on the slow path. + + * This is fairly straightforward when 64-bit atomics are supported. Assume that + * the remote fence is sandwiched between two owner fences in the reset pathway. + * The case where there is no preceding or trailing owner fence (i.e. because + * the owner thread is near the beginning or end of its life) can be analyzed + * similarly. The owner store to next_event_fast preceding the earlier owner + * fence will be earlier in coherence order than the remote store to it, so that + * the owner thread will go down the slow path once the store becomes visible to + * it, which is no later than the time of the second fence. + + * The case where we don't support 64-bit atomics is trickier, since word + * tearing is possible. We'll repeat the same analysis, and look at the two + * owner fences sandwiching the remote fence. The next_event_fast stores done + * alongside the earlier owner fence cannot overwrite any of the remote stores + * (since they precede the earlier owner fence in sb, which precedes the remote + * fence in sc, which precedes the remote stores in sb). After the second owner + * fence there will be a re-check of the slow-path variables anyways, so the + * "owner will notice that it's on the slow path eventually" guarantee is + * satisfied. To make sure that the out-of-band-messaging constraint is as well, + * note that either the message passing is sequenced before the second owner + * fence (in which case the remote stores happen before the second set of owner + * stores, so malloc sees a value of zero for next_event_fast and goes down the + * slow path), or it is not (in which case the owner sees the tsd slow-path + * writes on its previous update). This leaves open the possibility that the + * remote thread will (at some arbitrary point in the future) zero out one half + * of the owner thread's next_event_fast, but that's always safe (it just sends + * it down the slow path earlier). + */ +static void +te_ctx_next_event_fast_update(te_ctx_t *ctx) { + uint64_t next_event = te_ctx_next_event_get(ctx); + uint64_t next_event_fast = (next_event <= TE_NEXT_EVENT_FAST_MAX) ? + next_event : 0U; + te_ctx_next_event_fast_set(ctx, next_event_fast); +} + +void +te_recompute_fast_threshold(tsd_t *tsd) { + if (tsd_state_get(tsd) != tsd_state_nominal) { + /* Check first because this is also called on purgatory. */ + te_next_event_fast_set_non_nominal(tsd); + return; + } + + te_ctx_t ctx; + te_ctx_get(tsd, &ctx, true); + te_ctx_next_event_fast_update(&ctx); + te_ctx_get(tsd, &ctx, false); + te_ctx_next_event_fast_update(&ctx); + + atomic_fence(ATOMIC_SEQ_CST); + if (tsd_state_get(tsd) != tsd_state_nominal) { + te_next_event_fast_set_non_nominal(tsd); + } +} + +static void +te_adjust_thresholds_helper(tsd_t *tsd, te_ctx_t *ctx, + uint64_t wait) { + /* + * The next threshold based on future events can only be adjusted after + * progressing the last_event counter (which is set to current). + */ + assert(te_ctx_current_bytes_get(ctx) == te_ctx_last_event_get(ctx)); + assert(wait <= TE_MAX_START_WAIT); + + uint64_t next_event = te_ctx_last_event_get(ctx) + (wait <= + TE_MAX_INTERVAL ? wait : TE_MAX_INTERVAL); + te_ctx_next_event_set(tsd, ctx, next_event); +} + +static uint64_t +te_clip_event_wait(uint64_t event_wait) { + assert(event_wait > 0U); + if (TE_MIN_START_WAIT > 1U && + unlikely(event_wait < TE_MIN_START_WAIT)) { + event_wait = TE_MIN_START_WAIT; + } + if (TE_MAX_START_WAIT < UINT64_MAX && + unlikely(event_wait > TE_MAX_START_WAIT)) { + event_wait = TE_MAX_START_WAIT; + } + return event_wait; +} + +void +te_event_trigger(tsd_t *tsd, te_ctx_t *ctx) { + /* usize has already been added to thread_allocated. */ + uint64_t bytes_after = te_ctx_current_bytes_get(ctx); + /* The subtraction is intentionally susceptible to underflow. */ + uint64_t accumbytes = bytes_after - te_ctx_last_event_get(ctx); + + te_ctx_last_event_set(ctx, bytes_after); + + bool allow_event_trigger = tsd_nominal(tsd) && + tsd_reentrancy_level_get(tsd) == 0; + bool is_alloc = ctx->is_alloc; + uint64_t wait = TE_MAX_START_WAIT; + +#define E(event, condition, alloc_event) \ + bool is_##event##_triggered = false; \ + if (is_alloc == alloc_event && condition) { \ + uint64_t event_wait = event##_event_wait_get(tsd); \ + assert(event_wait <= TE_MAX_START_WAIT); \ + if (event_wait > accumbytes) { \ + event_wait -= accumbytes; \ + } else if (!allow_event_trigger) { \ + event_wait = event##_postponed_event_wait(tsd); \ + } else { \ + is_##event##_triggered = true; \ + event_wait = event##_new_event_wait(tsd); \ + } \ + event_wait = te_clip_event_wait(event_wait); \ + event##_event_wait_set(tsd, event_wait); \ + if (event_wait < wait) { \ + wait = event_wait; \ + } \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E + + assert(wait <= TE_MAX_START_WAIT); + te_adjust_thresholds_helper(tsd, ctx, wait); + te_assert_invariants(tsd); + +#define E(event, condition, alloc_event) \ + if (is_alloc == alloc_event && condition && \ + is_##event##_triggered) { \ + assert(allow_event_trigger); \ + uint64_t elapsed = event##_fetch_elapsed(tsd); \ + event##_event_handler(tsd, elapsed); \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E + + te_assert_invariants(tsd); +} + +static void +te_init(tsd_t *tsd, bool is_alloc) { + te_ctx_t ctx; + te_ctx_get(tsd, &ctx, is_alloc); + /* + * Reset the last event to current, which starts the events from a clean + * state. This is necessary when re-init the tsd event counters. + * + * The event counters maintain a relationship with the current bytes: + * last_event <= current < next_event. When a reinit happens (e.g. + * reincarnated tsd), the last event needs progressing because all + * events start fresh from the current bytes. + */ + te_ctx_last_event_set(&ctx, te_ctx_current_bytes_get(&ctx)); + + uint64_t wait = TE_MAX_START_WAIT; +#define E(event, condition, alloc_event) \ + if (is_alloc == alloc_event && condition) { \ + uint64_t event_wait = event##_new_event_wait(tsd); \ + event_wait = te_clip_event_wait(event_wait); \ + event##_event_wait_set(tsd, event_wait); \ + if (event_wait < wait) { \ + wait = event_wait; \ + } \ + } + + ITERATE_OVER_ALL_EVENTS +#undef E + te_adjust_thresholds_helper(tsd, &ctx, wait); +} + +void +tsd_te_init(tsd_t *tsd) { + /* Make sure no overflow for the bytes accumulated on event_trigger. */ + assert(TE_MAX_INTERVAL <= UINT64_MAX - SC_LARGE_MAXCLASS + 1); + te_init(tsd, true); + te_init(tsd, false); + te_assert_invariants(tsd); +} diff --git a/BeefRT/JEMalloc/src/ticker.c b/BeefRT/JEMalloc/src/ticker.c new file mode 100644 index 00000000..790b5c20 --- /dev/null +++ b/BeefRT/JEMalloc/src/ticker.c @@ -0,0 +1,32 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +/* + * To avoid using floating point math down core paths (still necessary because + * versions of the glibc dynamic loader that did not preserve xmm registers are + * still somewhat common, requiring us to be compilable with -mno-sse), and also + * to avoid generally expensive library calls, we use a precomputed table of + * values. We want to sample U uniformly on [0, 1], and then compute + * ceil(log(u)/log(1-1/nticks)). We're mostly interested in the case where + * nticks is reasonably big, so 1/log(1-1/nticks) is well-approximated by + * -nticks. + * + * To compute log(u), we sample an integer in [1, 64] and divide, then just look + * up results in a table. As a space-compression mechanism, we store these as + * uint8_t by dividing the range (255) by the highest-magnitude value the log + * can take on, and using that as a multiplier. We then have to divide by that + * multiplier at the end of the computation. + * + * The values here are computed in src/ticker.py + */ + +const uint8_t ticker_geom_table[1 << TICKER_GEOM_NBITS] = { + 254, 211, 187, 169, 156, 144, 135, 127, + 120, 113, 107, 102, 97, 93, 89, 85, + 81, 77, 74, 71, 68, 65, 62, 60, + 57, 55, 53, 50, 48, 46, 44, 42, + 40, 39, 37, 35, 33, 32, 30, 29, + 27, 26, 24, 23, 21, 20, 19, 18, + 16, 15, 14, 13, 12, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0 +}; diff --git a/BeefRT/JEMalloc/src/ticker.py b/BeefRT/JEMalloc/src/ticker.py new file mode 100644 index 00000000..3807740c --- /dev/null +++ b/BeefRT/JEMalloc/src/ticker.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 + +import math + +# Must match TICKER_GEOM_NBITS +lg_table_size = 6 +table_size = 2**lg_table_size +byte_max = 255 +mul = math.floor(-byte_max/math.log(1 / table_size)) +values = [round(-mul * math.log(i / table_size)) + for i in range(1, table_size+1)] +print("mul =", mul) +print("values:") +for i in range(table_size // 8): + print(", ".join((str(x) for x in values[i*8 : i*8 + 8]))) diff --git a/BeefRT/JEMalloc/src/tsd.c b/BeefRT/JEMalloc/src/tsd.c new file mode 100644 index 00000000..e8e4f3a3 --- /dev/null +++ b/BeefRT/JEMalloc/src/tsd.c @@ -0,0 +1,549 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/san.h" +#include "jemalloc/internal/mutex.h" +#include "jemalloc/internal/rtree.h" + +/******************************************************************************/ +/* Data. */ + +/* TSD_INITIALIZER triggers "-Wmissing-field-initializer" */ +JEMALLOC_DIAGNOSTIC_PUSH +JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS + +#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP +JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls = TSD_INITIALIZER; +JEMALLOC_TSD_TYPE_ATTR(bool) JEMALLOC_TLS_MODEL tsd_initialized = false; +bool tsd_booted = false; +#elif (defined(JEMALLOC_TLS)) +JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls = TSD_INITIALIZER; +pthread_key_t tsd_tsd; +bool tsd_booted = false; +#elif (defined(_WIN32)) +DWORD tsd_tsd; +tsd_wrapper_t tsd_boot_wrapper = {false, TSD_INITIALIZER}; +bool tsd_booted = false; +#else + +/* + * This contains a mutex, but it's pretty convenient to allow the mutex code to + * have a dependency on tsd. So we define the struct here, and only refer to it + * by pointer in the header. + */ +struct tsd_init_head_s { + ql_head(tsd_init_block_t) blocks; + malloc_mutex_t lock; +}; + +pthread_key_t tsd_tsd; +tsd_init_head_t tsd_init_head = { + ql_head_initializer(blocks), + MALLOC_MUTEX_INITIALIZER +}; + +tsd_wrapper_t tsd_boot_wrapper = { + false, + TSD_INITIALIZER +}; +bool tsd_booted = false; +#endif + +JEMALLOC_DIAGNOSTIC_POP + +/******************************************************************************/ + +/* A list of all the tsds in the nominal state. */ +typedef ql_head(tsd_t) tsd_list_t; +static tsd_list_t tsd_nominal_tsds = ql_head_initializer(tsd_nominal_tsds); +static malloc_mutex_t tsd_nominal_tsds_lock; + +/* How many slow-path-enabling features are turned on. */ +static atomic_u32_t tsd_global_slow_count = ATOMIC_INIT(0); + +static bool +tsd_in_nominal_list(tsd_t *tsd) { + tsd_t *tsd_list; + bool found = false; + /* + * We don't know that tsd is nominal; it might not be safe to get data + * out of it here. + */ + malloc_mutex_lock(TSDN_NULL, &tsd_nominal_tsds_lock); + ql_foreach(tsd_list, &tsd_nominal_tsds, TSD_MANGLE(tsd_link)) { + if (tsd == tsd_list) { + found = true; + break; + } + } + malloc_mutex_unlock(TSDN_NULL, &tsd_nominal_tsds_lock); + return found; +} + +static void +tsd_add_nominal(tsd_t *tsd) { + assert(!tsd_in_nominal_list(tsd)); + assert(tsd_state_get(tsd) <= tsd_state_nominal_max); + ql_elm_new(tsd, TSD_MANGLE(tsd_link)); + malloc_mutex_lock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); + ql_tail_insert(&tsd_nominal_tsds, tsd, TSD_MANGLE(tsd_link)); + malloc_mutex_unlock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); +} + +static void +tsd_remove_nominal(tsd_t *tsd) { + assert(tsd_in_nominal_list(tsd)); + assert(tsd_state_get(tsd) <= tsd_state_nominal_max); + malloc_mutex_lock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); + ql_remove(&tsd_nominal_tsds, tsd, TSD_MANGLE(tsd_link)); + malloc_mutex_unlock(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); +} + +static void +tsd_force_recompute(tsdn_t *tsdn) { + /* + * The stores to tsd->state here need to synchronize with the exchange + * in tsd_slow_update. + */ + atomic_fence(ATOMIC_RELEASE); + malloc_mutex_lock(tsdn, &tsd_nominal_tsds_lock); + tsd_t *remote_tsd; + ql_foreach(remote_tsd, &tsd_nominal_tsds, TSD_MANGLE(tsd_link)) { + assert(tsd_atomic_load(&remote_tsd->state, ATOMIC_RELAXED) + <= tsd_state_nominal_max); + tsd_atomic_store(&remote_tsd->state, + tsd_state_nominal_recompute, ATOMIC_RELAXED); + /* See comments in te_recompute_fast_threshold(). */ + atomic_fence(ATOMIC_SEQ_CST); + te_next_event_fast_set_non_nominal(remote_tsd); + } + malloc_mutex_unlock(tsdn, &tsd_nominal_tsds_lock); +} + +void +tsd_global_slow_inc(tsdn_t *tsdn) { + atomic_fetch_add_u32(&tsd_global_slow_count, 1, ATOMIC_RELAXED); + /* + * We unconditionally force a recompute, even if the global slow count + * was already positive. If we didn't, then it would be possible for us + * to return to the user, have the user synchronize externally with some + * other thread, and then have that other thread not have picked up the + * update yet (since the original incrementing thread might still be + * making its way through the tsd list). + */ + tsd_force_recompute(tsdn); +} + +void tsd_global_slow_dec(tsdn_t *tsdn) { + atomic_fetch_sub_u32(&tsd_global_slow_count, 1, ATOMIC_RELAXED); + /* See the note in ..._inc(). */ + tsd_force_recompute(tsdn); +} + +static bool +tsd_local_slow(tsd_t *tsd) { + return !tsd_tcache_enabled_get(tsd) + || tsd_reentrancy_level_get(tsd) > 0; +} + +bool +tsd_global_slow() { + return atomic_load_u32(&tsd_global_slow_count, ATOMIC_RELAXED) > 0; +} + +/******************************************************************************/ + +static uint8_t +tsd_state_compute(tsd_t *tsd) { + if (!tsd_nominal(tsd)) { + return tsd_state_get(tsd); + } + /* We're in *a* nominal state; but which one? */ + if (malloc_slow || tsd_local_slow(tsd) || tsd_global_slow()) { + return tsd_state_nominal_slow; + } else { + return tsd_state_nominal; + } +} + +void +tsd_slow_update(tsd_t *tsd) { + uint8_t old_state; + do { + uint8_t new_state = tsd_state_compute(tsd); + old_state = tsd_atomic_exchange(&tsd->state, new_state, + ATOMIC_ACQUIRE); + } while (old_state == tsd_state_nominal_recompute); + + te_recompute_fast_threshold(tsd); +} + +void +tsd_state_set(tsd_t *tsd, uint8_t new_state) { + /* Only the tsd module can change the state *to* recompute. */ + assert(new_state != tsd_state_nominal_recompute); + uint8_t old_state = tsd_atomic_load(&tsd->state, ATOMIC_RELAXED); + if (old_state > tsd_state_nominal_max) { + /* + * Not currently in the nominal list, but it might need to be + * inserted there. + */ + assert(!tsd_in_nominal_list(tsd)); + tsd_atomic_store(&tsd->state, new_state, ATOMIC_RELAXED); + if (new_state <= tsd_state_nominal_max) { + tsd_add_nominal(tsd); + } + } else { + /* + * We're currently nominal. If the new state is non-nominal, + * great; we take ourselves off the list and just enter the new + * state. + */ + assert(tsd_in_nominal_list(tsd)); + if (new_state > tsd_state_nominal_max) { + tsd_remove_nominal(tsd); + tsd_atomic_store(&tsd->state, new_state, + ATOMIC_RELAXED); + } else { + /* + * This is the tricky case. We're transitioning from + * one nominal state to another. The caller can't know + * about any races that are occurring at the same time, + * so we always have to recompute no matter what. + */ + tsd_slow_update(tsd); + } + } + te_recompute_fast_threshold(tsd); +} + +static void +tsd_prng_state_init(tsd_t *tsd) { + /* + * A nondeterministic seed based on the address of tsd reduces + * the likelihood of lockstep non-uniform cache index + * utilization among identical concurrent processes, but at the + * cost of test repeatability. For debug builds, instead use a + * deterministic seed. + */ + *tsd_prng_statep_get(tsd) = config_debug ? 0 : + (uint64_t)(uintptr_t)tsd; +} + +static bool +tsd_data_init(tsd_t *tsd) { + /* + * We initialize the rtree context first (before the tcache), since the + * tcache initialization depends on it. + */ + rtree_ctx_data_init(tsd_rtree_ctxp_get_unsafe(tsd)); + tsd_prng_state_init(tsd); + tsd_te_init(tsd); /* event_init may use the prng state above. */ + tsd_san_init(tsd); + return tsd_tcache_enabled_data_init(tsd); +} + +static void +assert_tsd_data_cleanup_done(tsd_t *tsd) { + assert(!tsd_nominal(tsd)); + assert(!tsd_in_nominal_list(tsd)); + assert(*tsd_arenap_get_unsafe(tsd) == NULL); + assert(*tsd_iarenap_get_unsafe(tsd) == NULL); + assert(*tsd_tcache_enabledp_get_unsafe(tsd) == false); + assert(*tsd_prof_tdatap_get_unsafe(tsd) == NULL); +} + +static bool +tsd_data_init_nocleanup(tsd_t *tsd) { + assert(tsd_state_get(tsd) == tsd_state_reincarnated || + tsd_state_get(tsd) == tsd_state_minimal_initialized); + /* + * During reincarnation, there is no guarantee that the cleanup function + * will be called (deallocation may happen after all tsd destructors). + * We set up tsd in a way that no cleanup is needed. + */ + rtree_ctx_data_init(tsd_rtree_ctxp_get_unsafe(tsd)); + *tsd_tcache_enabledp_get_unsafe(tsd) = false; + *tsd_reentrancy_levelp_get(tsd) = 1; + tsd_prng_state_init(tsd); + tsd_te_init(tsd); /* event_init may use the prng state above. */ + tsd_san_init(tsd); + assert_tsd_data_cleanup_done(tsd); + + return false; +} + +tsd_t * +tsd_fetch_slow(tsd_t *tsd, bool minimal) { + assert(!tsd_fast(tsd)); + + if (tsd_state_get(tsd) == tsd_state_nominal_slow) { + /* + * On slow path but no work needed. Note that we can't + * necessarily *assert* that we're slow, because we might be + * slow because of an asynchronous modification to global state, + * which might be asynchronously modified *back*. + */ + } else if (tsd_state_get(tsd) == tsd_state_nominal_recompute) { + tsd_slow_update(tsd); + } else if (tsd_state_get(tsd) == tsd_state_uninitialized) { + if (!minimal) { + if (tsd_booted) { + tsd_state_set(tsd, tsd_state_nominal); + tsd_slow_update(tsd); + /* Trigger cleanup handler registration. */ + tsd_set(tsd); + tsd_data_init(tsd); + } + } else { + tsd_state_set(tsd, tsd_state_minimal_initialized); + tsd_set(tsd); + tsd_data_init_nocleanup(tsd); + } + } else if (tsd_state_get(tsd) == tsd_state_minimal_initialized) { + if (!minimal) { + /* Switch to fully initialized. */ + tsd_state_set(tsd, tsd_state_nominal); + assert(*tsd_reentrancy_levelp_get(tsd) >= 1); + (*tsd_reentrancy_levelp_get(tsd))--; + tsd_slow_update(tsd); + tsd_data_init(tsd); + } else { + assert_tsd_data_cleanup_done(tsd); + } + } else if (tsd_state_get(tsd) == tsd_state_purgatory) { + tsd_state_set(tsd, tsd_state_reincarnated); + tsd_set(tsd); + tsd_data_init_nocleanup(tsd); + } else { + assert(tsd_state_get(tsd) == tsd_state_reincarnated); + } + + return tsd; +} + +void * +malloc_tsd_malloc(size_t size) { + return a0malloc(CACHELINE_CEILING(size)); +} + +void +malloc_tsd_dalloc(void *wrapper) { + a0dalloc(wrapper); +} + +#if defined(JEMALLOC_MALLOC_THREAD_CLEANUP) || defined(_WIN32) +static unsigned ncleanups; +static malloc_tsd_cleanup_t cleanups[MALLOC_TSD_CLEANUPS_MAX]; + +#ifndef _WIN32 +JEMALLOC_EXPORT +#endif +void +_malloc_thread_cleanup(void) { + bool pending[MALLOC_TSD_CLEANUPS_MAX], again; + unsigned i; + + for (i = 0; i < ncleanups; i++) { + pending[i] = true; + } + + do { + again = false; + for (i = 0; i < ncleanups; i++) { + if (pending[i]) { + pending[i] = cleanups[i](); + if (pending[i]) { + again = true; + } + } + } + } while (again); +} + +#ifndef _WIN32 +JEMALLOC_EXPORT +#endif +void +_malloc_tsd_cleanup_register(bool (*f)(void)) { + assert(ncleanups < MALLOC_TSD_CLEANUPS_MAX); + cleanups[ncleanups] = f; + ncleanups++; +} + +#endif + +static void +tsd_do_data_cleanup(tsd_t *tsd) { + prof_tdata_cleanup(tsd); + iarena_cleanup(tsd); + arena_cleanup(tsd); + tcache_cleanup(tsd); + witnesses_cleanup(tsd_witness_tsdp_get_unsafe(tsd)); + *tsd_reentrancy_levelp_get(tsd) = 1; +} + +void +tsd_cleanup(void *arg) { + tsd_t *tsd = (tsd_t *)arg; + + switch (tsd_state_get(tsd)) { + case tsd_state_uninitialized: + /* Do nothing. */ + break; + case tsd_state_minimal_initialized: + /* This implies the thread only did free() in its life time. */ + /* Fall through. */ + case tsd_state_reincarnated: + /* + * Reincarnated means another destructor deallocated memory + * after the destructor was called. Cleanup isn't required but + * is still called for testing and completeness. + */ + assert_tsd_data_cleanup_done(tsd); + JEMALLOC_FALLTHROUGH; + case tsd_state_nominal: + case tsd_state_nominal_slow: + tsd_do_data_cleanup(tsd); + tsd_state_set(tsd, tsd_state_purgatory); + tsd_set(tsd); + break; + case tsd_state_purgatory: + /* + * The previous time this destructor was called, we set the + * state to tsd_state_purgatory so that other destructors + * wouldn't cause re-creation of the tsd. This time, do + * nothing, and do not request another callback. + */ + break; + default: + not_reached(); + } +#ifdef JEMALLOC_JET + test_callback_t test_callback = *tsd_test_callbackp_get_unsafe(tsd); + int *data = tsd_test_datap_get_unsafe(tsd); + if (test_callback != NULL) { + test_callback(data); + } +#endif +} + +tsd_t * +malloc_tsd_boot0(void) { + tsd_t *tsd; + +#if defined(JEMALLOC_MALLOC_THREAD_CLEANUP) || defined(_WIN32) + ncleanups = 0; +#endif + if (malloc_mutex_init(&tsd_nominal_tsds_lock, "tsd_nominal_tsds_lock", + WITNESS_RANK_OMIT, malloc_mutex_rank_exclusive)) { + return NULL; + } + if (tsd_boot0()) { + return NULL; + } + tsd = tsd_fetch(); + return tsd; +} + +void +malloc_tsd_boot1(void) { + tsd_boot1(); + tsd_t *tsd = tsd_fetch(); + /* malloc_slow has been set properly. Update tsd_slow. */ + tsd_slow_update(tsd); +} + +#ifdef _WIN32 +static BOOL WINAPI +_tls_callback(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) { + switch (fdwReason) { +#ifdef JEMALLOC_LAZY_LOCK + case DLL_THREAD_ATTACH: + isthreaded = true; + break; +#endif + case DLL_THREAD_DETACH: + _malloc_thread_cleanup(); + break; + default: + break; + } + return true; +} + +/* + * We need to be able to say "read" here (in the "pragma section"), but have + * hooked "read". We won't read for the rest of the file, so we can get away + * with unhooking. + */ +#ifdef read +# undef read +#endif + +#ifdef _MSC_VER +# ifdef _M_IX86 +# pragma comment(linker, "/INCLUDE:__tls_used") +# pragma comment(linker, "/INCLUDE:_tls_callback") +# else +# pragma comment(linker, "/INCLUDE:_tls_used") +# pragma comment(linker, "/INCLUDE:" STRINGIFY(tls_callback) ) +# endif +# pragma section(".CRT$XLY",long,read) +#endif +JEMALLOC_SECTION(".CRT$XLY") JEMALLOC_ATTR(used) +BOOL (WINAPI *const tls_callback)(HINSTANCE hinstDLL, + DWORD fdwReason, LPVOID lpvReserved) = _tls_callback; +#endif + +#if (!defined(JEMALLOC_MALLOC_THREAD_CLEANUP) && !defined(JEMALLOC_TLS) && \ + !defined(_WIN32)) +void * +tsd_init_check_recursion(tsd_init_head_t *head, tsd_init_block_t *block) { + pthread_t self = pthread_self(); + tsd_init_block_t *iter; + + /* Check whether this thread has already inserted into the list. */ + malloc_mutex_lock(TSDN_NULL, &head->lock); + ql_foreach(iter, &head->blocks, link) { + if (iter->thread == self) { + malloc_mutex_unlock(TSDN_NULL, &head->lock); + return iter->data; + } + } + /* Insert block into list. */ + ql_elm_new(block, link); + block->thread = self; + ql_tail_insert(&head->blocks, block, link); + malloc_mutex_unlock(TSDN_NULL, &head->lock); + return NULL; +} + +void +tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block) { + malloc_mutex_lock(TSDN_NULL, &head->lock); + ql_remove(&head->blocks, block, link); + malloc_mutex_unlock(TSDN_NULL, &head->lock); +} +#endif + +void +tsd_prefork(tsd_t *tsd) { + malloc_mutex_prefork(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); +} + +void +tsd_postfork_parent(tsd_t *tsd) { + malloc_mutex_postfork_parent(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); +} + +void +tsd_postfork_child(tsd_t *tsd) { + malloc_mutex_postfork_child(tsd_tsdn(tsd), &tsd_nominal_tsds_lock); + ql_new(&tsd_nominal_tsds); + + if (tsd_state_get(tsd) <= tsd_state_nominal_max) { + tsd_add_nominal(tsd); + } +} diff --git a/BeefRT/JEMalloc/src/witness.c b/BeefRT/JEMalloc/src/witness.c new file mode 100644 index 00000000..4474af04 --- /dev/null +++ b/BeefRT/JEMalloc/src/witness.c @@ -0,0 +1,122 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" +#include "jemalloc/internal/malloc_io.h" + +void +witness_init(witness_t *witness, const char *name, witness_rank_t rank, + witness_comp_t *comp, void *opaque) { + witness->name = name; + witness->rank = rank; + witness->comp = comp; + witness->opaque = opaque; +} + +static void +witness_print_witness(witness_t *w, unsigned n) { + assert(n > 0); + if (n == 1) { + malloc_printf(" %s(%u)", w->name, w->rank); + } else { + malloc_printf(" %s(%u)X%u", w->name, w->rank, n); + } +} + +static void +witness_print_witnesses(const witness_list_t *witnesses) { + witness_t *w, *last = NULL; + unsigned n = 0; + ql_foreach(w, witnesses, link) { + if (last != NULL && w->rank > last->rank) { + assert(w->name != last->name); + witness_print_witness(last, n); + n = 0; + } else if (last != NULL) { + assert(w->rank == last->rank); + assert(w->name == last->name); + } + last = w; + ++n; + } + if (last != NULL) { + witness_print_witness(last, n); + } +} + +static void +witness_lock_error_impl(const witness_list_t *witnesses, + const witness_t *witness) { + malloc_printf(": Lock rank order reversal:"); + witness_print_witnesses(witnesses); + malloc_printf(" %s(%u)\n", witness->name, witness->rank); + abort(); +} +witness_lock_error_t *JET_MUTABLE witness_lock_error = witness_lock_error_impl; + +static void +witness_owner_error_impl(const witness_t *witness) { + malloc_printf(": Should own %s(%u)\n", witness->name, + witness->rank); + abort(); +} +witness_owner_error_t *JET_MUTABLE witness_owner_error = + witness_owner_error_impl; + +static void +witness_not_owner_error_impl(const witness_t *witness) { + malloc_printf(": Should not own %s(%u)\n", witness->name, + witness->rank); + abort(); +} +witness_not_owner_error_t *JET_MUTABLE witness_not_owner_error = + witness_not_owner_error_impl; + +static void +witness_depth_error_impl(const witness_list_t *witnesses, + witness_rank_t rank_inclusive, unsigned depth) { + malloc_printf(": Should own %u lock%s of rank >= %u:", depth, + (depth != 1) ? "s" : "", rank_inclusive); + witness_print_witnesses(witnesses); + malloc_printf("\n"); + abort(); +} +witness_depth_error_t *JET_MUTABLE witness_depth_error = + witness_depth_error_impl; + +void +witnesses_cleanup(witness_tsd_t *witness_tsd) { + witness_assert_lockless(witness_tsd_tsdn(witness_tsd)); + + /* Do nothing. */ +} + +void +witness_prefork(witness_tsd_t *witness_tsd) { + if (!config_debug) { + return; + } + witness_tsd->forking = true; +} + +void +witness_postfork_parent(witness_tsd_t *witness_tsd) { + if (!config_debug) { + return; + } + witness_tsd->forking = false; +} + +void +witness_postfork_child(witness_tsd_t *witness_tsd) { + if (!config_debug) { + return; + } +#ifndef JEMALLOC_MUTEX_INIT_CB + witness_list_t *witnesses; + + witnesses = &witness_tsd->witnesses; + ql_new(witnesses); +#endif + witness_tsd->forking = false; +} diff --git a/BeefRT/JEMalloc/src/zone.c b/BeefRT/JEMalloc/src/zone.c new file mode 100644 index 00000000..23dfdd04 --- /dev/null +++ b/BeefRT/JEMalloc/src/zone.c @@ -0,0 +1,469 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/assert.h" + +#ifndef JEMALLOC_ZONE +# error "This source file is for zones on Darwin (OS X)." +#endif + +/* Definitions of the following structs in malloc/malloc.h might be too old + * for the built binary to run on newer versions of OSX. So use the newest + * possible version of those structs. + */ +typedef struct _malloc_zone_t { + void *reserved1; + void *reserved2; + size_t (*size)(struct _malloc_zone_t *, const void *); + void *(*malloc)(struct _malloc_zone_t *, size_t); + void *(*calloc)(struct _malloc_zone_t *, size_t, size_t); + void *(*valloc)(struct _malloc_zone_t *, size_t); + void (*free)(struct _malloc_zone_t *, void *); + void *(*realloc)(struct _malloc_zone_t *, void *, size_t); + void (*destroy)(struct _malloc_zone_t *); + const char *zone_name; + unsigned (*batch_malloc)(struct _malloc_zone_t *, size_t, void **, unsigned); + void (*batch_free)(struct _malloc_zone_t *, void **, unsigned); + struct malloc_introspection_t *introspect; + unsigned version; + void *(*memalign)(struct _malloc_zone_t *, size_t, size_t); + void (*free_definite_size)(struct _malloc_zone_t *, void *, size_t); + size_t (*pressure_relief)(struct _malloc_zone_t *, size_t); +} malloc_zone_t; + +typedef struct { + vm_address_t address; + vm_size_t size; +} vm_range_t; + +typedef struct malloc_statistics_t { + unsigned blocks_in_use; + size_t size_in_use; + size_t max_size_in_use; + size_t size_allocated; +} malloc_statistics_t; + +typedef kern_return_t memory_reader_t(task_t, vm_address_t, vm_size_t, void **); + +typedef void vm_range_recorder_t(task_t, void *, unsigned type, vm_range_t *, unsigned); + +typedef struct malloc_introspection_t { + kern_return_t (*enumerator)(task_t, void *, unsigned, vm_address_t, memory_reader_t, vm_range_recorder_t); + size_t (*good_size)(malloc_zone_t *, size_t); + boolean_t (*check)(malloc_zone_t *); + void (*print)(malloc_zone_t *, boolean_t); + void (*log)(malloc_zone_t *, void *); + void (*force_lock)(malloc_zone_t *); + void (*force_unlock)(malloc_zone_t *); + void (*statistics)(malloc_zone_t *, malloc_statistics_t *); + boolean_t (*zone_locked)(malloc_zone_t *); + boolean_t (*enable_discharge_checking)(malloc_zone_t *); + boolean_t (*disable_discharge_checking)(malloc_zone_t *); + void (*discharge)(malloc_zone_t *, void *); +#ifdef __BLOCKS__ + void (*enumerate_discharged_pointers)(malloc_zone_t *, void (^)(void *, void *)); +#else + void *enumerate_unavailable_without_blocks; +#endif + void (*reinit_lock)(malloc_zone_t *); +} malloc_introspection_t; + +extern kern_return_t malloc_get_all_zones(task_t, memory_reader_t, vm_address_t **, unsigned *); + +extern malloc_zone_t *malloc_default_zone(void); + +extern void malloc_zone_register(malloc_zone_t *zone); + +extern void malloc_zone_unregister(malloc_zone_t *zone); + +/* + * The malloc_default_purgeable_zone() function is only available on >= 10.6. + * We need to check whether it is present at runtime, thus the weak_import. + */ +extern malloc_zone_t *malloc_default_purgeable_zone(void) +JEMALLOC_ATTR(weak_import); + +/******************************************************************************/ +/* Data. */ + +static malloc_zone_t *default_zone, *purgeable_zone; +static malloc_zone_t jemalloc_zone; +static struct malloc_introspection_t jemalloc_zone_introspect; +static pid_t zone_force_lock_pid = -1; + +/******************************************************************************/ +/* Function prototypes for non-inline static functions. */ + +static size_t zone_size(malloc_zone_t *zone, const void *ptr); +static void *zone_malloc(malloc_zone_t *zone, size_t size); +static void *zone_calloc(malloc_zone_t *zone, size_t num, size_t size); +static void *zone_valloc(malloc_zone_t *zone, size_t size); +static void zone_free(malloc_zone_t *zone, void *ptr); +static void *zone_realloc(malloc_zone_t *zone, void *ptr, size_t size); +static void *zone_memalign(malloc_zone_t *zone, size_t alignment, + size_t size); +static void zone_free_definite_size(malloc_zone_t *zone, void *ptr, + size_t size); +static void zone_destroy(malloc_zone_t *zone); +static unsigned zone_batch_malloc(struct _malloc_zone_t *zone, size_t size, + void **results, unsigned num_requested); +static void zone_batch_free(struct _malloc_zone_t *zone, + void **to_be_freed, unsigned num_to_be_freed); +static size_t zone_pressure_relief(struct _malloc_zone_t *zone, size_t goal); +static size_t zone_good_size(malloc_zone_t *zone, size_t size); +static kern_return_t zone_enumerator(task_t task, void *data, unsigned type_mask, + vm_address_t zone_address, memory_reader_t reader, + vm_range_recorder_t recorder); +static boolean_t zone_check(malloc_zone_t *zone); +static void zone_print(malloc_zone_t *zone, boolean_t verbose); +static void zone_log(malloc_zone_t *zone, void *address); +static void zone_force_lock(malloc_zone_t *zone); +static void zone_force_unlock(malloc_zone_t *zone); +static void zone_statistics(malloc_zone_t *zone, + malloc_statistics_t *stats); +static boolean_t zone_locked(malloc_zone_t *zone); +static void zone_reinit_lock(malloc_zone_t *zone); + +/******************************************************************************/ +/* + * Functions. + */ + +static size_t +zone_size(malloc_zone_t *zone, const void *ptr) { + /* + * There appear to be places within Darwin (such as setenv(3)) that + * cause calls to this function with pointers that *no* zone owns. If + * we knew that all pointers were owned by *some* zone, we could split + * our zone into two parts, and use one as the default allocator and + * the other as the default deallocator/reallocator. Since that will + * not work in practice, we must check all pointers to assure that they + * reside within a mapped extent before determining size. + */ + return ivsalloc(tsdn_fetch(), ptr); +} + +static void * +zone_malloc(malloc_zone_t *zone, size_t size) { + return je_malloc(size); +} + +static void * +zone_calloc(malloc_zone_t *zone, size_t num, size_t size) { + return je_calloc(num, size); +} + +static void * +zone_valloc(malloc_zone_t *zone, size_t size) { + void *ret = NULL; /* Assignment avoids useless compiler warning. */ + + je_posix_memalign(&ret, PAGE, size); + + return ret; +} + +static void +zone_free(malloc_zone_t *zone, void *ptr) { + if (ivsalloc(tsdn_fetch(), ptr) != 0) { + je_free(ptr); + return; + } + + free(ptr); +} + +static void * +zone_realloc(malloc_zone_t *zone, void *ptr, size_t size) { + if (ivsalloc(tsdn_fetch(), ptr) != 0) { + return je_realloc(ptr, size); + } + + return realloc(ptr, size); +} + +static void * +zone_memalign(malloc_zone_t *zone, size_t alignment, size_t size) { + void *ret = NULL; /* Assignment avoids useless compiler warning. */ + + je_posix_memalign(&ret, alignment, size); + + return ret; +} + +static void +zone_free_definite_size(malloc_zone_t *zone, void *ptr, size_t size) { + size_t alloc_size; + + alloc_size = ivsalloc(tsdn_fetch(), ptr); + if (alloc_size != 0) { + assert(alloc_size == size); + je_free(ptr); + return; + } + + free(ptr); +} + +static void +zone_destroy(malloc_zone_t *zone) { + /* This function should never be called. */ + not_reached(); +} + +static unsigned +zone_batch_malloc(struct _malloc_zone_t *zone, size_t size, void **results, + unsigned num_requested) { + unsigned i; + + for (i = 0; i < num_requested; i++) { + results[i] = je_malloc(size); + if (!results[i]) + break; + } + + return i; +} + +static void +zone_batch_free(struct _malloc_zone_t *zone, void **to_be_freed, + unsigned num_to_be_freed) { + unsigned i; + + for (i = 0; i < num_to_be_freed; i++) { + zone_free(zone, to_be_freed[i]); + to_be_freed[i] = NULL; + } +} + +static size_t +zone_pressure_relief(struct _malloc_zone_t *zone, size_t goal) { + return 0; +} + +static size_t +zone_good_size(malloc_zone_t *zone, size_t size) { + if (size == 0) { + size = 1; + } + return sz_s2u(size); +} + +static kern_return_t +zone_enumerator(task_t task, void *data, unsigned type_mask, + vm_address_t zone_address, memory_reader_t reader, + vm_range_recorder_t recorder) { + return KERN_SUCCESS; +} + +static boolean_t +zone_check(malloc_zone_t *zone) { + return true; +} + +static void +zone_print(malloc_zone_t *zone, boolean_t verbose) { +} + +static void +zone_log(malloc_zone_t *zone, void *address) { +} + +static void +zone_force_lock(malloc_zone_t *zone) { + if (isthreaded) { + /* + * See the note in zone_force_unlock, below, to see why we need + * this. + */ + assert(zone_force_lock_pid == -1); + zone_force_lock_pid = getpid(); + jemalloc_prefork(); + } +} + +static void +zone_force_unlock(malloc_zone_t *zone) { + /* + * zone_force_lock and zone_force_unlock are the entry points to the + * forking machinery on OS X. The tricky thing is, the child is not + * allowed to unlock mutexes locked in the parent, even if owned by the + * forking thread (and the mutex type we use in OS X will fail an assert + * if we try). In the child, we can get away with reinitializing all + * the mutexes, which has the effect of unlocking them. In the parent, + * doing this would mean we wouldn't wake any waiters blocked on the + * mutexes we unlock. So, we record the pid of the current thread in + * zone_force_lock, and use that to detect if we're in the parent or + * child here, to decide which unlock logic we need. + */ + if (isthreaded) { + assert(zone_force_lock_pid != -1); + if (getpid() == zone_force_lock_pid) { + jemalloc_postfork_parent(); + } else { + jemalloc_postfork_child(); + } + zone_force_lock_pid = -1; + } +} + +static void +zone_statistics(malloc_zone_t *zone, malloc_statistics_t *stats) { + /* We make no effort to actually fill the values */ + stats->blocks_in_use = 0; + stats->size_in_use = 0; + stats->max_size_in_use = 0; + stats->size_allocated = 0; +} + +static boolean_t +zone_locked(malloc_zone_t *zone) { + /* Pretend no lock is being held */ + return false; +} + +static void +zone_reinit_lock(malloc_zone_t *zone) { + /* As of OSX 10.12, this function is only used when force_unlock would + * be used if the zone version were < 9. So just use force_unlock. */ + zone_force_unlock(zone); +} + +static void +zone_init(void) { + jemalloc_zone.size = zone_size; + jemalloc_zone.malloc = zone_malloc; + jemalloc_zone.calloc = zone_calloc; + jemalloc_zone.valloc = zone_valloc; + jemalloc_zone.free = zone_free; + jemalloc_zone.realloc = zone_realloc; + jemalloc_zone.destroy = zone_destroy; + jemalloc_zone.zone_name = "jemalloc_zone"; + jemalloc_zone.batch_malloc = zone_batch_malloc; + jemalloc_zone.batch_free = zone_batch_free; + jemalloc_zone.introspect = &jemalloc_zone_introspect; + jemalloc_zone.version = 9; + jemalloc_zone.memalign = zone_memalign; + jemalloc_zone.free_definite_size = zone_free_definite_size; + jemalloc_zone.pressure_relief = zone_pressure_relief; + + jemalloc_zone_introspect.enumerator = zone_enumerator; + jemalloc_zone_introspect.good_size = zone_good_size; + jemalloc_zone_introspect.check = zone_check; + jemalloc_zone_introspect.print = zone_print; + jemalloc_zone_introspect.log = zone_log; + jemalloc_zone_introspect.force_lock = zone_force_lock; + jemalloc_zone_introspect.force_unlock = zone_force_unlock; + jemalloc_zone_introspect.statistics = zone_statistics; + jemalloc_zone_introspect.zone_locked = zone_locked; + jemalloc_zone_introspect.enable_discharge_checking = NULL; + jemalloc_zone_introspect.disable_discharge_checking = NULL; + jemalloc_zone_introspect.discharge = NULL; +#ifdef __BLOCKS__ + jemalloc_zone_introspect.enumerate_discharged_pointers = NULL; +#else + jemalloc_zone_introspect.enumerate_unavailable_without_blocks = NULL; +#endif + jemalloc_zone_introspect.reinit_lock = zone_reinit_lock; +} + +static malloc_zone_t * +zone_default_get(void) { + malloc_zone_t **zones = NULL; + unsigned int num_zones = 0; + + /* + * On OSX 10.12, malloc_default_zone returns a special zone that is not + * present in the list of registered zones. That zone uses a "lite zone" + * if one is present (apparently enabled when malloc stack logging is + * enabled), or the first registered zone otherwise. In practice this + * means unless malloc stack logging is enabled, the first registered + * zone is the default. So get the list of zones to get the first one, + * instead of relying on malloc_default_zone. + */ + if (KERN_SUCCESS != malloc_get_all_zones(0, NULL, + (vm_address_t**)&zones, &num_zones)) { + /* + * Reset the value in case the failure happened after it was + * set. + */ + num_zones = 0; + } + + if (num_zones) { + return zones[0]; + } + + return malloc_default_zone(); +} + +/* As written, this function can only promote jemalloc_zone. */ +static void +zone_promote(void) { + malloc_zone_t *zone; + + do { + /* + * Unregister and reregister the default zone. On OSX >= 10.6, + * unregistering takes the last registered zone and places it + * at the location of the specified zone. Unregistering the + * default zone thus makes the last registered one the default. + * On OSX < 10.6, unregistering shifts all registered zones. + * The first registered zone then becomes the default. + */ + malloc_zone_unregister(default_zone); + malloc_zone_register(default_zone); + + /* + * On OSX 10.6, having the default purgeable zone appear before + * the default zone makes some things crash because it thinks it + * owns the default zone allocated pointers. We thus + * unregister/re-register it in order to ensure it's always + * after the default zone. On OSX < 10.6, there is no purgeable + * zone, so this does nothing. On OSX >= 10.6, unregistering + * replaces the purgeable zone with the last registered zone + * above, i.e. the default zone. Registering it again then puts + * it at the end, obviously after the default zone. + */ + if (purgeable_zone != NULL) { + malloc_zone_unregister(purgeable_zone); + malloc_zone_register(purgeable_zone); + } + + zone = zone_default_get(); + } while (zone != &jemalloc_zone); +} + +JEMALLOC_ATTR(constructor) +void +zone_register(void) { + /* + * If something else replaced the system default zone allocator, don't + * register jemalloc's. + */ + default_zone = zone_default_get(); + if (!default_zone->zone_name || strcmp(default_zone->zone_name, + "DefaultMallocZone") != 0) { + return; + } + + /* + * The default purgeable zone is created lazily by OSX's libc. It uses + * the default zone when it is created for "small" allocations + * (< 15 KiB), but assumes the default zone is a scalable_zone. This + * obviously fails when the default zone is the jemalloc zone, so + * malloc_default_purgeable_zone() is called beforehand so that the + * default purgeable zone is created when the default zone is still + * a scalable_zone. As purgeable zones only exist on >= 10.6, we need + * to check for the existence of malloc_default_purgeable_zone() at + * run time. + */ + purgeable_zone = (malloc_default_purgeable_zone == NULL) ? NULL : + malloc_default_purgeable_zone(); + + /* Register the custom zone. At this point it won't be the default. */ + zone_init(); + malloc_zone_register(&jemalloc_zone); + + /* Promote the custom zone to be default. */ + zone_promote(); +} diff --git a/BeefRT/TCMalloc/TCMalloc.cpp b/BeefRT/TCMalloc/TCMalloc.cpp new file mode 100644 index 00000000..83a1ecbc --- /dev/null +++ b/BeefRT/TCMalloc/TCMalloc.cpp @@ -0,0 +1,27 @@ +#define TCMALLOC_NO_MALLOCGUARD +#define TCMALLOC_NAMESPACE tcmalloc +#include "gperftools/src/tcmalloc.cc" + +#include "gperftools/src/central_freelist.cc" +#include "gperftools/src/internal_logging.cc" +//#include "malloc_extension.cc" +#include "gperftools/src/page_heap.cc" +#include "gperftools/src/sampler.cc" +#include "gperftools/src/span.cc" +#include "gperftools/src/stack_trace_table.cc" +#include "gperftools/src/static_vars.cc" +#include "gperftools/src/tc_common.cc" +#include "gperftools/src/thread_cache.cc" + +namespace tcmalloc +{ + extern "C" int RunningOnValgrind(void) + { + return 0; + } +} + +void TCMalloc_RecordAlloc(void* ptr, int size) +{ + +} \ No newline at end of file diff --git a/BeefRT/TCMalloc/TCMalloc.vcxproj b/BeefRT/TCMalloc/TCMalloc.vcxproj new file mode 100644 index 00000000..9f889ff5 --- /dev/null +++ b/BeefRT/TCMalloc/TCMalloc.vcxproj @@ -0,0 +1,760 @@ + + + + + Debug Static CStatic + Win32 + + + Debug Static CStatic + x64 + + + Debug Static + Win32 + + + Debug Static + x64 + + + Debug + Win32 + + + Debug + x64 + + + Release Static CStatic + Win32 + + + Release Static CStatic + x64 + + + Release Static + Win32 + + + Release Static + x64 + + + Release + Win32 + + + Release + x64 + + + + {C3F649DF-28FA-4ED3-B4A7-FB0CF5D63136} + Win32Proj + BeefMem + 10.0.17763.0 + + + + DynamicLibrary + true + v142 + Unicode + + + DynamicLibrary + true + v142 + Unicode + + + DynamicLibrary + true + v142 + Unicode + + + DynamicLibrary + true + v142 + Unicode + + + StaticLibrary + true + v142 + Unicode + + + StaticLibrary + true + v142 + Unicode + + + DynamicLibrary + false + v142 + true + Unicode + + + StaticLibrary + false + v142 + true + Unicode + + + StaticLibrary + false + v142 + true + Unicode + + + DynamicLibrary + false + v142 + true + Unicode + + + StaticLibrary + false + v142 + true + Unicode + + + StaticLibrary + false + v142 + true + Unicode + + + + + true + true + true + true + true + true + true + true + true + true + true + true + + + + + true + true + true + true + true + true + true + true + true + true + true + true + + + true + true + true + true + true + true + true + true + true + true + true + true + + + true + true + true + true + true + true + true + true + true + true + true + true + + + true + true + true + true + true + true + true + true + true + true + true + true + + + + + true + true + true + true + true + true + true + true + true + true + true + true + + + true + true + true + true + true + true + true + true + true + true + true + true + + + true + true + true + true + true + true + true + true + true + true + true + true + + + + + + + true + true + true + true + true + true + true + true + true + true + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true + true + true + true + true + true + true + true + true + true + true + true + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + TCMalloc64_d + $(SolutionDir)\ide\dist\ + + + TCMalloc64_ssd + $(SolutionDir)\ide\dist\ + + + TCMalloc64_sd + $(SolutionDir)\ide\dist\ + + + TCMalloc64 + $(SolutionDir)\ide\dist\ + false + + + TCMalloc64_ss + $(SolutionDir)\ide\dist\ + false + + + TCMalloc64_s + $(SolutionDir)\ide\dist\ + false + + + $(SolutionDir)\ide\dist\ + TCMalloc32_d + + + $(SolutionDir)\ide\dist\ + TCMalloc32_ssd + + + $(SolutionDir)\ide\dist\ + TCMalloc32_sd + + + $(SolutionDir)\ide\dist\ + TCMalloc32 + + + $(SolutionDir)\ide\dist\ + TCMalloc32_ss + + + $(SolutionDir)\ide\dist\ + TCMalloc32_s + + + + + + Level3 + Disabled + BFRTDBG;BP_DISABLED;BF_NO_BFAPP;BFRT_DYNAMIC;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + .;../;../..;../../BeefySysLib/third_party;../../BeefySysLib;../../BeefySysLib/platform/win;../gperftools/src;../gperftools/src/windows + MultiThreadedDebug + ProgramDatabase + + + Windows + DebugFull + 0x00DB0000 + false + + + + + + + + + Level3 + Disabled + BFRTDBG;BP_DISABLED;BF_NO_BFAPP;BFRT_DYNAMIC;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + .;../;../..;../../BeefySysLib/third_party;../../BeefySysLib;../../BeefySysLib/platform/win;../gperftools/src;../gperftools/src/windows + MultiThreadedDebug + ProgramDatabase + + + Windows + DebugFull + 0x00DB0000 + false + + + + + + + + + Level3 + Disabled + BFRTDBG;BP_DISABLED;BF_NO_BFAPP;BFRT_DYNAMIC;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + .;../;../..;../../BeefySysLib/third_party;../../BeefySysLib;../../BeefySysLib/platform/win;../gperftools/src;../gperftools/src/windows + MultiThreadedDebug + ProgramDatabase + + + Windows + DebugFull + 0x00DB0000 + false + + + + + + + + + Level3 + Disabled + BFRTDBG;BP_DISABLED;BF_NO_BFAPP;TCMALLOC_EXPORT;BFRT_DYNAMIC;BF_STOMP_EXPORT;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + .;../;../..;../../BeefySysLib/third_party;../../BeefySysLib;../../BeefySysLib/platform/win;../gperftools/src;../gperftools/src/windows + + + false + MultiThreadedDebug + ProgramDatabase + + + Windows + DebugFull + $(SolutionDir)\ide\dist\$(TargetName).dll + $(SolutionDir)\ide\dist\$(TargetName).lib + false + true + + + + + + + + + Level3 + Disabled + BFRTDBG;BP_DISABLED;BF_NO_BFAPP;BF_STOMP_EXPORT;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + .;../;../..;../../BeefySysLib/third_party;../../BeefySysLib;../../BeefySysLib/platform/win;../gperftools/src;../gperftools/src/windows + + + false + MultiThreadedDebug + ProgramDatabase + false + + + Windows + DebugFull + $(SolutionDir)\ide\dist\$(TargetName).dll + $(SolutionDir)\ide\dist\$(TargetName).lib + false + true + + + + + + + + + Level3 + Disabled + BFRTDBG;BP_DISABLED;BF_NO_BFAPP;BF_STOMP_EXPORT;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + .;../;../..;../../BeefySysLib/third_party;../../BeefySysLib;../../BeefySysLib/platform/win;../gperftools/src;../gperftools/src/windows + + + false + MultiThreadedDebugDLL + ProgramDatabase + + + Windows + DebugFull + $(SolutionDir)\ide\dist\$(TargetName).dll + $(SolutionDir)\ide\dist\$(TargetName).lib + false + true + + + + + + + Level3 + + + MaxSpeed + true + true + BFRTDBG;BP_DISABLED;BF_NO_BFAPP;BFRT_DYNAMIC;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + .;../;../..;../../BeefySysLib/third_party;../../BeefySysLib;../../BeefySysLib/platform/win;../gperftools/src;../gperftools/src/windows + MultiThreaded + false + + + Windows + DebugFull + true + true + + + + + Level3 + + + MaxSpeed + true + true + BFRTDBG;BP_DISABLED;BF_NO_BFAPP;BFRT_DYNAMIC;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + .;../;../..;../../BeefySysLib/third_party;../../BeefySysLib;../../BeefySysLib/platform/win;../gperftools/src;../gperftools/src/windows + MultiThreaded + false + + + Windows + DebugFull + true + true + + + + + Level3 + + + MaxSpeed + true + true + BFRTDBG;BP_DISABLED;BF_NO_BFAPP;BFRT_DYNAMIC;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + .;../;../..;../../BeefySysLib/third_party;../../BeefySysLib;../../BeefySysLib/platform/win;../gperftools/src;../gperftools/src/windows + MultiThreadedDLL + false + + + Windows + DebugFull + true + true + + + + + Level3 + + + MaxSpeed + true + true + BFRTDBG;BP_DISABLED;BF_NO_BFAPP;TCMALLOC_EXPORT;BFRT_DYNAMIC;;BF_STOMP_EXPORT;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + .;../;../..;../../BeefySysLib/third_party;../../BeefySysLib;../../BeefySysLib/platform/win;../gperftools/src;../gperftools/src/windows + false + MultiThreaded + + + Windows + DebugFull + true + true + $(SolutionDir)\ide\dist\$(TargetName).dll + $(SolutionDir)\ide\dist\$(TargetName).lib + + + + + Level3 + + + MaxSpeed + true + true + BFRTDBG;BP_DISABLED;BF_NO_BFAPP;BF_STOMP_EXPORT;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + .;../;../..;../../BeefySysLib/third_party;../../BeefySysLib;../../BeefySysLib/platform/win;../gperftools/src;../gperftools/src/windows + false + MultiThreaded + + + Windows + DebugFull + true + true + $(SolutionDir)\ide\dist\$(TargetName).dll + $(SolutionDir)\ide\dist\$(TargetName).lib + + + + + Level3 + + + MaxSpeed + true + true + BFRTDBG;BP_DISABLED;BF_NO_BFAPP;BF_STOMP_EXPORT;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + .;../;../..;../../BeefySysLib/third_party;../../BeefySysLib;../../BeefySysLib/platform/win;../gperftools/src;../gperftools/src/windows + false + MultiThreadedDLL + + + Windows + DebugFull + true + true + $(SolutionDir)\ide\dist\$(TargetName).dll + $(SolutionDir)\ide\dist\$(TargetName).lib + + + + + + \ No newline at end of file diff --git a/BeefRT/TCMalloc/TCMalloc.vcxproj.filters b/BeefRT/TCMalloc/TCMalloc.vcxproj.filters new file mode 100644 index 00000000..e17abeac --- /dev/null +++ b/BeefRT/TCMalloc/TCMalloc.vcxproj.filters @@ -0,0 +1,247 @@ + + + + + {1f66966e-836d-49d0-a8c1-b8f6bcdc9b36} + + + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + gperftools + + + \ No newline at end of file diff --git a/BeefRT/gperftools/src/tcmalloc.cc b/BeefRT/gperftools/src/tcmalloc.cc index 4fa839bb..ef52054b 100644 --- a/BeefRT/gperftools/src/tcmalloc.cc +++ b/BeefRT/gperftools/src/tcmalloc.cc @@ -1585,7 +1585,7 @@ TCMALLOC_EXTERN PERFTOOLS_DLL_DECL int tc_set_new_mode(int flag) __THROW { //int main( int argc, const char* argv[]); -TCMALLOC_EXTERN PERFTOOLS_DLL_DECL void* tc_malloc(size_t size) __THROW { +PERFTOOLS_TCMALLOC_EXPORT void* tc_malloc(size_t size) __THROW { // main(0, 0); void* result = do_malloc_or_cpp_alloc(size); @@ -1593,7 +1593,7 @@ TCMALLOC_EXTERN PERFTOOLS_DLL_DECL void* tc_malloc(size_t size) __THROW { return result; } -TCMALLOC_EXTERN PERFTOOLS_DLL_DECL void tc_free(void* ptr) __THROW { +PERFTOOLS_TCMALLOC_EXPORT void tc_free(void* ptr) __THROW { MallocHook::InvokeDeleteHook(ptr); do_free(ptr); } diff --git a/BeefRT/gperftools/src/thread_cache.cc b/BeefRT/gperftools/src/thread_cache.cc index 96e8f28e..222a5aeb 100644 --- a/BeefRT/gperftools/src/thread_cache.cc +++ b/BeefRT/gperftools/src/thread_cache.cc @@ -43,7 +43,6 @@ using std::min; using std::max; - namespace TCMALLOC_NAMESPACE { DEFINE_int64(tcmalloc_max_total_thread_cache_bytes, diff --git a/BeefRT/gperftools/src/windows/config.h b/BeefRT/gperftools/src/windows/config.h index b9c0dc5e..36b913bb 100644 --- a/BeefRT/gperftools/src/windows/config.h +++ b/BeefRT/gperftools/src/windows/config.h @@ -259,7 +259,15 @@ "config.h" before anything else. */ #ifndef PERFTOOLS_DLL_DECL # define PERFTOOLS_IS_A_DLL 0 /* not set if you're statically linking */ -# define PERFTOOLS_DLL_DECL + +# define PERFTOOLS_DLL_DECL + +#ifdef TCMALLOC_EXPORT +#define PERFTOOLS_TCMALLOC_EXPORT __declspec(dllexport) +#else +#define PERFTOOLS_TCMALLOC_EXPORT static +#endif + # define PERFTOOLS_DLL_DECL_FOR_UNITTESTS //# define PERFTOOLS_DLL_DECL __declspec(dllexport) //# define PERFTOOLS_DLL_DECL_FOR_UNITTESTS __declspec(dllimport) diff --git a/BeefRT/gperftools/src/windows/gperftools/tcmalloc.h b/BeefRT/gperftools/src/windows/gperftools/tcmalloc.h index dcc2c671..88bb4064 100644 --- a/BeefRT/gperftools/src/windows/gperftools/tcmalloc.h +++ b/BeefRT/gperftools/src/windows/gperftools/tcmalloc.h @@ -77,8 +77,8 @@ extern "C" { PERFTOOLS_DLL_DECL const char* tc_version(int* major, int* minor, const char** patch) __THROW; - PERFTOOLS_DLL_DECL void* tc_malloc(size_t size) __THROW; - PERFTOOLS_DLL_DECL void tc_free(void* ptr) __THROW; + PERFTOOLS_TCMALLOC_EXPORT void* tc_malloc(size_t size) __THROW; + PERFTOOLS_TCMALLOC_EXPORT void tc_free(void* ptr) __THROW; PERFTOOLS_DLL_DECL void* tc_realloc(void* ptr, size_t size) __THROW; PERFTOOLS_DLL_DECL void* tc_calloc(size_t nmemb, size_t size) __THROW; PERFTOOLS_DLL_DECL void tc_cfree(void* ptr) __THROW;