Add: eBPF profiling
Overview
This pull request introduces an eBPF benchmark to the codebase, enabling advanced performance measurement capabilities. The eBPF benchmark collects the following metrics:
- Allocated and freed Memory, grouped by a stack
- Top 10 instances of significant one-time allocations
- Kernel cache usage
- System Calls with measured time
Setup
eBPF benchmark requires bcc toolkits to be installed. Please follow the instructions here.
Implementation details
The eBPF benchmark is implemented as a separate process that runs in parallel to the workload. The eBPF benchmark is implemented in Python and uses the bcc toolkit to collect the metrics. The eBPF benchmark is started by the run.py script and is stopped when the workload finishes, due to some limitations lazy flag was added to the benchmark, to enable communication via SIGUSR1 and SIGUSR2 to the start and the end of the workload.
To reduce overhead when collecting time related metrics, --with-ebpf-memory is instroducesed, which (if not given) disables the collection of memory related metrics. The memory related metrics are collected by the eBPF benchmark using the kprobes mechanism, which is known to have a significant overhead. Without --with-ebpf-memory flag, time overhead is around 1%.
Some Results
System Calls
The following benchmark were run on an XPS 15 7590, Intel® Core™ i7-9750H CPU @ 2.60GHz × 12 on a 100MB configuration.
Init
Read
Batch Read
Range Select
Scan
Read Update 50 50
Read Upsert 95 5
Batch Upsert
Remove
Execution
When running following command
sudo python3 ./run.py --with-ebpf --with-ebpf-memory --workload-names RangeSelect --db-names leveldb
The eBPF benchmark will create a file bench/ebpf/snapshots/rocksdb_100MB/RangeSelect-1690716880466.json whose content would look like something like this
{
"time": 1690716880466,
"stats": {
"memory": 32321992,
"combined_allocs": [
{
"alloc_size": 19489096,
"free_size": 31744,
"number_of_allocs": 5162,
"number_of_frees": 16,
"trace": [
"readahead+0xb",
"rocksdb::BlockPrefetcher::PrefetchIfNeeded(rocksdb::BlockBasedTable::Rep const*, rocksdb::BlockHandle const&, unsigned long, bool, bool, rocksdb::Env::IOPriority)+0x400",
"rocksdb::BlockBasedTableIterator::InitDataBlock()+0xd3",
"rocksdb::BlockBasedTableIterator::FindBlockForward()+0xc0",
"rocksdb::BlockBasedTableIterator::Next()+0x3e",
"rocksdb::BlockBasedTableIterator::NextAndGetResult(rocksdb::IterateResult*)+0x2d",
"rocksdb::(anonymous namespace)::LevelIterator::NextAndGetResult(rocksdb::IterateResult*)+0xd4",
"rocksdb::MergingIterator::Next()+0x41",
"rocksdb::MergingIterator::NextAndGetResult(rocksdb::IterateResult*)+0x17",
"rocksdb::DBIter::Next()+0x4c6",
"ucsb::facebook::rocksdb_t::range_select(unsigned long, unsigned long, std::span<std::byte, 18446744073709551615ul>) const+0x14e"
]
},
{
"alloc_size": 1021480,
"free_size": 192,
"number_of_allocs": 289,
"number_of_frees": 1,
"trace": [
"readahead+0xb",
"rocksdb::BlockBasedTable::PrefetchTail(rocksdb::ReadOptions const&, rocksdb::RandomAccessFileReader*, unsigned long, bool, rocksdb::TailPrefetchStats*, bool, bool, std::unique_ptr<rocksdb::FilePrefetchBuffer, std::default_delete<rocksdb::FilePrefetchBuffer> >*)+0x450",
"rocksdb::BlockBasedTable::Open(rocksdb::ReadOptions const&, rocksdb::ImmutableOptions const&, rocksdb::EnvOptions const&, rocksdb::BlockBasedTableOptions const&, rocksdb::InternalKeyComparator const&, std::unique_ptr<rocksdb::RandomAccessFileReader, std::default_delete<rocksdb::RandomAccessFileReader> >&&, unsigned long, std::unique_ptr<rocksdb::TableReader, std::default_delete<rocksdb::TableReader> >*, std::shared_ptr<rocksdb::CacheReservationManager>, std::shared_ptr<rocksdb::SliceTransform const> const&, bool, bool, int, bool, unsigned long, bool, rocksdb::TailPrefetchStats*, rocksdb::BlockCacheTracer*, unsigned long, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long, std::array<unsigned long, 2ul>)+0x1ff",
"rocksdb::BlockBasedTableFactory::NewTableReader(rocksdb::ReadOptions const&, rocksdb::TableReaderOptions const&, std::unique_ptr<rocksdb::RandomAccessFileReader, std::default_delete<rocksdb::RandomAccessFileReader> >&&, unsigned long, std::unique_ptr<rocksdb::TableReader, std::default_delete<rocksdb::TableReader> >*, bool) const+0xf7",
"rocksdb::TableCache::GetTableReader(rocksdb::ReadOptions const&, rocksdb::FileOptions const&, rocksdb::InternalKeyComparator const&, rocksdb::FileMetaData const&, bool, bool, rocksdb::HistogramImpl*, std::unique_ptr<rocksdb::TableReader, std::default_delete<rocksdb::TableReader> >*, std::shared_ptr<rocksdb::SliceTransform const> const&, bool, int, bool, unsigned long, rocksdb::Temperature)+0xc8c",
"rocksdb::TableCache::FindTable(rocksdb::ReadOptions const&, rocksdb::FileOptions const&, rocksdb::InternalKeyComparator const&, rocksdb::FileMetaData const&, rocksdb::Cache::Handle**, std::shared_ptr<rocksdb::SliceTransform const> const&, bool, bool, rocksdb::HistogramImpl*, bool, int, bool, unsigned long, rocksdb::Temperature)+0x398",
"rocksdb::VersionBuilder::Rep::LoadTableHandlers(rocksdb::InternalStats*, int, bool, bool, std::shared_ptr<rocksdb::SliceTransform const> const&, unsigned long)::{lambda()#1}::operator()() const+0x143",
"[unknown]"
]
},
],
"stack_traces": 2119
},
"outstanding": [
{
"stack": [
"0x00007f6d55325c1b readahead+0xb [libc.so.6]",
"0x00005565e36f5030 rocksdb::BlockPrefetcher::PrefetchIfNeeded(rocksdb::BlockBasedTable::Rep const*, rocksdb::BlockHandle const&, unsigned long, bool, bool, rocksdb::Env::IOPriority)+0x400 [ucsb_bench]",
"0x00005565e36f1713 rocksdb::BlockBasedTableIterator::InitDataBlock()+0xd3 [ucsb_bench]",
"0x00005565e36f1d80 rocksdb::BlockBasedTableIterator::FindBlockForward()+0xc0 [ucsb_bench]",
"0x00005565e36f1f8e rocksdb::BlockBasedTableIterator::Next()+0x3e [ucsb_bench]",
"0x00005565e36f1fdd rocksdb::BlockBasedTableIterator::NextAndGetResult(rocksdb::IterateResult*)+0x2d [ucsb_bench]",
"0x00005565e342d534 rocksdb::(anonymous namespace)::LevelIterator::NextAndGetResult(rocksdb::IterateResult*)+0xd4 [ucsb_bench]",
"0x00005565e357b921 rocksdb::MergingIterator::Next()+0x41 [ucsb_bench]",
"0x00005565e3578787 rocksdb::MergingIterator::NextAndGetResult(rocksdb::IterateResult*)+0x17 [ucsb_bench]",
"0x00005565e339c956 rocksdb::DBIter::Next()+0x4c6 [ucsb_bench]",
"0x00005565e325213e ucsb::facebook::rocksdb_t::range_select(unsigned long, unsigned long, std::span<std::byte, 18446744073709551615ul>) const+0x14e [ucsb_bench]"
],
"size": 18317896,
"count": 4531
},
{
"stack": [
"0x00007f6d552a4930 [unknown] [libc.so.6]"
],
"size": 4898816,
"count": 1196
},
...
],
"kernel_caches": [
{
"name": "vmap_area",
"alloc_count": 12,
"free_count": 1,
"alloc_size": 768,
"free_size": 64
},
{
"name": "proc_inode_cache",
"alloc_count": 7,
"free_count": 0,
"alloc_size": 4984,
"free_size": 0
},
...
],
"syscalls": [
{
"name": "futex",
"count": 22,
"total_ns": 552268976
},
{
"name": "pread64",
"count": 2728,
"total_ns": 43157739
},
{
"name": "fsync",
"count": 3,
"total_ns": 5430908
},
{
"name": "newfstatat",
"count": 68,
"total_ns": 4228567
},
{
"name": "fdatasync",
"count": 2,
"total_ns": 3091786
},
{
"name": "openat",
"count": 79,
"total_ns": 1561006
},
{
"name": "read",
"count": 65,
"total_ns": 1311081
},
{
"name": "brk",
"count": 63,
"total_ns": 474281
},
...
]
}
Thanks for the updates. Do you also have the tables requested in the second part of the comment?
Can you please add an output example in the PR showcasing the logs when using your eBPF version vs the original? Does it already include the break-down for separate workloads - showcasing time spent on the system-call side vs actual user-space code?