perf-book
Chapter 8 edits
Not too many changes, but I have some notes:
- 8-0 algorithmic: maybe also note that by using linear instead of binary search, you needn't keep the data sorted?
- 8-1: AFAIK, "von Neumann" just means the code and data paths are integrated, while "Harvard" keeps them distinct
- didn't know that was called "Eytzinger layout", interesting. added to the Eponyms list!
- 8-2: stack allocation isn't necessarily immediate, if for instance it grows into a new page
- 8.4.3: another option is to request the pages at runtime, from within the process itself, though this requires root privs. i've done this in code for high-speed networking. here's some example code:
#include <cctype>
#include <cerrno>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <unordered_set>
#include <vector>
#include "numset.h"
#include "Sysfs.h"
#include "fdesc.h"
#include "zones.h"
static inline const char* plurstr(unsigned c){
return c == 1 ? "" : "s";
}
// get the current number of free hugepages of some size on some node. hfd
// ought be an open pathfd into a node/hugepages-fookB directory.
static bool
readFreeHugePages(int hfd, unsigned long& freehp){
return Sysfs::readSysfsUInt(hfd, "free_hugepages", freehp);
}
static bool
readCountHugePages(int hfd, unsigned long& counthp){
return Sysfs::readSysfsUInt(hfd, "nr_hugepages", counthp);
}
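// grow this node's pool of huge pages (of the size hfd refers to) by count,
// by bumping nr_hugepages, then verify the kernel actually provided them,
// rolling back any partial allocation on failure. psize (bytes) is only
// used, converted to MiB, for logging.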
static bool
allocateHugePages(uint32_t psize, int hfd, unsigned long count){
psize /= (1024ul * 1024);
unsigned long cur;
if(!readCountHugePages(hfd, cur)){
return false;
}
count += cur; // FIXME check for overflow
std::cout << " requesting " << count << " " << psize << "MiB pages (have " << cur << ")" << std::endl;
if(!Sysfs::writeSysfsUInt(hfd, "nr_hugepages", count)){
return false;
}
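// read back nr_hugepages; the kernel may have satisfied only part of the request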
unsigned long fhp;
if(!readCountHugePages(hfd, fhp)){
return false;
}
if(fhp < count){
std::cerr << " requested " << count << " " << psize << "MiB pages, only got "
<< fhp << ", have " << cur << std::endl;
if(fhp > cur){ // undo a partial allocation
std::cout << " releasing " << fhp - cur << " huge pages" << std::endl;
Sysfs::writeSysfsUInt(hfd, "nr_hugepages", cur);
}
return false;
}
return true;
}
// determine the huge page size we'll use to fulfill this request. one day we
// might do something smarter where we look at current usage and even
// fragmentation. for now, we use static size thresholds [sad horn noise].
//
// this assumes x86 huge page sizes, and that both 2M and 1GB are supported.
// we have the info available to do otherwise, detected from sysfs FIXME.
static inline unsigned long
pageSize(size_t req){
// there are 512 2MB regions in a 1GB region. L2 TLBs run a few kiloentries.
// given the difficulty of assembling a contiguous 1GB region after the
// address space gets fragmented to hell, plus the lack of widespread 1GB
// page support, we require 2GB to use 1GB pages.
if(req < 2 * X86_GIGAPAGE_SIZE){
return X86_HUGEPAGE_SIZE;
}
return X86_GIGAPAGE_SIZE;
}
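// reserve enough huge pages on this zone to cover bytes, preferring the size
// chosen by pageSize() and falling back to 2MB pages if the larger size can't
// be had. on success, pgsize and pcount describe the reservation.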
bool NUMAZone::reserveFastMemory(uint64_t bytes, uint32_t& pgsize, unsigned& pcount){
pgsize = pageSize(bytes);
pcount = bytes / pgsize + !!(bytes % pgsize);
if(!reserveHugePages(pgsize, pcount)){
if(pgsize == X86_GIGAPAGE_SIZE){
pgsize = X86_HUGEPAGE_SIZE;
pcount = bytes / pgsize + !!(bytes % pgsize);
if(reserveHugePages(pgsize, pcount)){
return true;
}
}
pcount = 0;
return false;
}
return true;
}
// some number of pages may be free, but they might be from one of our own
// outstanding reservations (remember, reservations aren't atomic). so we track
// our outstanding allocations, and rely on the caller to report back post-mapping.
bool NUMAZone::reserveHugePages(uint32_t size, unsigned count){
for(auto& hpinfo : HPages){
if(hpinfo.PageSize == size){
std::lock_guard lg{mtx};
unsigned long freehp;
if(!readFreeHugePages(hpinfo.fd.FD(), freehp)){
return false;
}
// freehp might be less than OutstandingPages due to a worker having
// just successfully mapped them, but not yet called reportMapped(),
// so don't consider this an error, but don't count them towards our
// total, either. of course, they might have actually been freed by
// someone else in that same interim, in which case we'll be demanding
// more than we strictly need, but that's not any kind of big deal.
if(freehp < hpinfo.OutstandingPages){
freehp = 0;
}else{
freehp -= hpinfo.OutstandingPages;
}
if(freehp >= count){
std::cout << " wanted " << count << " " << (size / 1024 / 1024)
<< "MiB page" << plurstr(count) << ", had " << freehp << std::endl;
hpinfo.OutstandingPages += count;
return true;
}
count -= freehp;
if(!allocateHugePages(hpinfo.PageSize, hpinfo.fd.FD(), count)){
return false;
}
hpinfo.OutstandingPages += count + freehp;
return true;
}
}
std::cerr << "requested unsupported page size " << size << std::endl;
return false; // no such page size, sorry
}
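// return (Size / denom) / 2 bytes, rounded down to a multiple of the largest
// page size we support that doesn't exceed it (falling back to the base page size).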
uint64_t NUMAZone::percentAlloc(uint32_t denom) const {
if(!denom){
return 0;
}
uint64_t alloc = Size / denom / 2;
uint32_t hsize = Basepagesize;
for(const auto& h : HPages){
if(h.PageSize > alloc){
break;
}
hsize = h.PageSize;
}
uint64_t ialloc = alloc;
ialloc -= ialloc % hsize;
return ialloc;
}
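// the caller reports pages it has actually mapped, retiring them from our
// outstanding reservation for that page size.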
void NUMAZone::reportMapped(uint32_t pgsize, unsigned pcount){
for(auto& hpinfo : HPages){
if(hpinfo.PageSize == pgsize){
std::lock_guard lg{mtx};
if(hpinfo.OutstandingPages < pcount){
std::cerr << "reported " << pcount << " page" << plurstr(pcount)
<< " mapped, had only " << hpinfo.OutstandingPages << " outstanding" << std::endl;
throw std::invalid_argument("invalid outstanding page count");
}
hpinfo.OutstandingPages -= pcount;
return;
}
}
std::cerr << "your huge page size sucks " << pgsize << std::endl;
throw std::invalid_argument("invalid hugepage size");
}
// extract the current number of hugepages of this size on this node, and also
// the number of free hugepages. this is of course only a snapshot.
bool NUMAZone::getZoneHugePageClassInfo(int hfd, unsigned long& freep, unsigned long& mappedp){
if(!readCountHugePages(hfd, mappedp)){
return false;
}
if(!readFreeHugePages(hfd, freep)){
return false;
}
return true;
}
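// open the node's hugepages/ directory and record a descriptor for each
// supported huge page size, logging its current allocated/free counts.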
bool NUMAZone::getZoneHugePageInfo(int zfd){
Fdesc hpfd{openat(zfd, "hugepages", O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH)};
if(hpfd.FD() < 0){
return false;
}
// FIXME we ought just browse the directory, and find all supported
// arbitrary sizes, but this works for now. subdirectories are of the
// form hugepages-{foo}kB, so define in terms of 1024B.
const std::vector<uint32_t> candidateSizes = {
X86_HUGEPAGE_SIZE / 1024,
X86_GIGAPAGE_SIZE / 1024,
};
#define HPPREFIX "hugepages-"
#define HPSUFFIX "kB"
char hpname[20 + __builtin_strlen(HPPREFIX) + __builtin_strlen(HPSUFFIX) + 1]; // 20: largest 64-bit uint
strcpy(hpname, HPPREFIX);
const size_t maxwrite = sizeof(hpname) - __builtin_strlen(HPPREFIX);
for(auto sz : candidateSizes){
auto s = snprintf(hpname + __builtin_strlen(HPPREFIX), maxwrite, "%u" HPSUFFIX, sz);
if(s < 0 || static_cast<size_t>(s) >= maxwrite){
std::cerr << "couldn't look for huge pages of size " << sz << "kB" << std::endl;
return false;
}
Fdesc hpsfd{openat(hpfd.FD(), hpname, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH)};
if(hpsfd.FD() < 0){
if(errno != ENOENT){
std::cerr << "error opening " << hpname << ": " << strerror(errno) << std::endl;
return false;
}
continue;
}
if(sz * 1024 < Basepagesize){
std::cerr << "huge page size " << sz * 1024 << " >= base page size " << Basepagesize << std::endl;
return false;
}
unsigned long freep, mappedp;
if(!getZoneHugePageClassInfo(hpsfd.FD(), freep, mappedp)){
return false;
}
std::cout << " " << sz / 1024 << "MiB hugepage" << plurstr(mappedp) << ": " <<
mappedp << " allocated, " << freep << " free" << std::endl;
HPages.emplace_back(0, sz * 1024, std::move(hpsfd));
}
return true;
}
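// parse this node's meminfo, extracting MemTotal (converted from kB to bytes) into size.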
bool NUMAZone::getZoneInfo(int zfd, uint64_t& size){
auto buf = Sysfs::readSysfs(zfd, "meminfo");
if(!buf){
return false;
}
const char tag[] = "MemTotal:";
const char* s = buf.get();
const char* ts = tag;
bool foundtag = false;
bool foundvalue = false;
size = 0;
// MemTotal ought always be the first line. it has no repeated internal
// characters, so we can always safely match to the beginning of the tag.
// once foundtag goes high, start accumulating size.
while(*s && *s != '\n'){
if(foundtag && !foundvalue){
if(isdigit(*s)){
foundvalue = true;
}else if(!isspace(*s)){
std::cerr << "found mystery character " << *s << " for " << tag << " value" << std::endl;
return false;
}
}
if(foundvalue){
if(isdigit(*s)){
size *= 10;
size += *s - '0';
}else{
if(isspace(*s)){ // " kB" follows value; we're done!
size *= 1024;
break;
}else{
std::cerr << "found mystery character " << *s << " for " << tag << " value" << std::endl;
size = 0;
return false;
}
}
}else{
if(*s == *ts){
if(!*++ts){
foundtag = true;
}
}else{
ts = tag;
}
}
++s;
}
if(size){
return true;
}
std::cerr << "didn't find expected " << tag << " in meminfo" << std::endl;
return false;
}
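// construct a zone from its sysfs nodeN directory: read its cpulist and total
// memory, then discover its huge page support. throws on any failure.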
NUMAZone::NUMAZone(int zfd, int zid, uint32_t basepagesize) :
ZoneId(zid),
Basepagesize(basepagesize) {
// 64 bit value max is 20 char
#define PREFIX "node"
char nbuf[__builtin_strlen(PREFIX) + 20 + 1];
auto s = snprintf(nbuf, sizeof(nbuf), PREFIX "%d", zid);
if(s < 0 || static_cast<size_t>(s) >= sizeof(nbuf)){
std::cerr << "crazy zone id " << zid << std::endl;
throw std::invalid_argument("invalid id for numa zone");
}
Fdesc nfd{openat(zfd, nbuf, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH)};
if(nfd.FD() < 0){
std::cerr << "error opening zone " << zid << " directory (" << strerror(errno) << ")" << std::endl;
throw std::runtime_error("couldn't open numa zone in sysfs");
}
auto cpulist = Sysfs::readSysfs(nfd.FD(), "cpulist");
if(!cpulist){
std::cerr << "error reading zone " << zid << " cpu list (" << strerror(errno) << ")" << std::endl;
throw std::runtime_error("couldn't read numa zone's cpulist");
}
if(!lexNumberCollection(cpulist.get(), Cores)){
throw std::runtime_error("invalid numa zone's cpulist");
}
if(!getZoneInfo(nfd.FD(), Size)){
throw std::runtime_error("invalid numa zone info");
}
std::cout << "numa memory zone " << zid << " ("
<< static_cast<float>(Size) / (1024lu * 1024lu * 1024lu) << " GiB): "
<< cpulist.get() << std::endl;
if(!getZoneHugePageInfo(nfd.FD())){
throw std::runtime_error("invalid numa zone huge page info");
}
}
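// enumerate the NUMA nodes that have memory (system/node/has_memory) and
// construct a NUMAZone for each.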
void NUMATopology::discoverZones(uint32_t basepagesize){
Fdesc sysfd{open("/sys/devices/system/node", O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH)};
if(sysfd.FD() < 0){
std::cerr << "couldn't open sysfs system/node (" << strerror(errno) << ")" << std::endl;
throw std::runtime_error("couldn't open sysfs system/node");
}
auto memzones = Sysfs::readSysfs(sysfd.FD(), "has_memory");
if(!memzones){
throw std::runtime_error("couldn't read system/node/has_memory");
}
std::unordered_set<int> zset;
if(!lexNumberCollection(memzones.get(), zset)){
throw std::runtime_error("couldn't parse system/node/has_memory");
}
for(int z : zset){
Zones.try_emplace(z, sysfd.FD(), z, basepagesize);
}
}
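for completeness, here's a minimal sketch (mine, not part of the code above) of how a caller might then consume the reservation: an anonymous MAP_HUGETLB mapping whose flags encode the page size that reserveFastMemory() selected, with reportMapped() retiring the outstanding count afterwards. the zone parameter is assumed to be whichever NUMAZone the reservation came from; NUMA placement (e.g. mbind() before first touch) and most error handling are omitted.
#include <cstddef>
#include <cstdint>
#include <sys/mman.h>
#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT 26 // stable kernel ABI constant; older glibc headers lack it
#endif
// map pcount huge pages of pgsize bytes each; returns nullptr on failure.
static void* mapReservedHugePages(NUMAZone& zone, uint32_t pgsize, unsigned pcount){
size_t len = static_cast<size_t>(pgsize) * pcount;
// encode log2(pgsize) in the flags so the kernel draws from the right size pool
int sizeflag = __builtin_ctz(pgsize) << MAP_HUGE_SHIFT;
void* m = mmap(nullptr, len, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | sizeflag, -1, 0);
if(m == MAP_FAILED){
return nullptr;
}
zone.reportMapped(pgsize, pcount); // retire the outstanding reservation
return m;
}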