perf-book icon indicating copy to clipboard operation
perf-book copied to clipboard

Chapter 8 edits

Open dankamongmen opened this issue 5 months ago • 0 comments

Not too many changes, but I have some notes:

  • 8-0 algorithmic: maybe also note that by using linear instead of binary search, you needn't keep the data sorted?
  • 8-1: AFAIK, "von Neumann" just means the code and data paths are integrated, while "Harvard" keeps them distinct
  • didn't know that was called "Eytzinger layout", interesting. added to the Eponyms list!
  • 8-2: stack allocation isn't necessarily immediate — if, for instance, the stack grows into a new page, a page fault must be serviced before the allocation is usable
  • 8.4.3: another option is to manually request the pages (from the process) at runtime, though this requires root privileges. I've done this in code for high-speed networking; here's some example code:
#include <fcntl.h>

#include <cstring>
#include <iomanip>
#include <iostream>
#include <limits>
#include <stdexcept>

#include "Sysfs.h"
#include "fdesc.h"
#include "numset.h"
#include "zones.h"

// Pluralization suffix: "" for a count of exactly one, "s" otherwise.
static inline const char* plurstr(unsigned c){
  if(c == 1){
    return "";
  }
  return "s";
}

// get the current number of free hugepages of some size on some node. hfd
// ought be an open pathfd into a node/hugepages-fookB directory.
// Thin wrapper over Sysfs::readSysfsUInt, returning its success/failure
// directly (matches the style of readCountHugePages below). On failure
// freehp is presumably left untouched — Sysfs semantics not visible here.
static bool
readFreeHugePages(int hfd, unsigned long& freehp){
  return Sysfs::readSysfsUInt(hfd, "free_hugepages", freehp);
}

// read nr_hugepages (the node's total allocated hugepage count for this
// size class) via the sysfs pathfd hfd; false on read failure.
static bool
readCountHugePages(int hfd, unsigned long& counthp){
  const bool ok = Sysfs::readSysfsUInt(hfd, "nr_hugepages", counthp);
  return ok;
}

// Grow this node's hugepage pool (page size `psize` bytes) by `count`
// pages, by writing nr_hugepages through the sysfs pathfd `hfd`. If the
// kernel only partially satisfies the request, roll the pool back to its
// previous size and report failure.
static bool
allocateHugePages(uint32_t psize, int hfd, unsigned long count){
  psize /= (1024ul * 1024); // bytes -> MiB, used only in log messages
  unsigned long cur;
  if(!readCountHugePages(hfd, cur)){
    return false;
  }
  // nr_hugepages takes a total, so we must write (existing + new);
  // guard the addition against wraparound before performing it.
  if(count > std::numeric_limits<unsigned long>::max() - cur){
    std::cerr << " hugepage count overflow (" << count << " + " << cur << ")" << std::endl;
    return false;
  }
  count += cur;
  std::cout << " requesting " << count << " " << psize << "MiB pages (have " << cur << ")" << std::endl;
  if(!Sysfs::writeSysfsUInt(hfd, "nr_hugepages", count)){
    return false;
  }
  // read back how many pages the kernel actually managed to allocate
  unsigned long fhp;
  if(!readCountHugePages(hfd, fhp)){
    return false;
  }
  if(fhp < count){
    std::cerr << " requested " << count << " " << psize << "MiB pages, only got "
              << fhp << ", have " << cur << std::endl;
    if(fhp > cur){ // undo a partial allocation
      std::cout << " releasing " << fhp - cur << " huge pages" << std::endl;
      Sysfs::writeSysfsUInt(hfd, "nr_hugepages", cur);
    }
    return false;
  }
  return true;
}

// determine the huge page size we'll use to fulfill this request. one day we
// might do something smarter where we look at current usage and even
// fragmentation. for now, we use static size thresholds [sad horn noise].
//
// this assumes x86 huge page sizes, and that both 2M and 1GB are supported.
// we have the info available to do otherwise, detected from sysfs FIXME.
static inline unsigned long
pageSize(size_t req){
  // there are 512 2MB regions in a 1GB region. L2 TLBs run a few kiloentries.
  // given the difficulty of assembling a contiguous 1GB region after the
  // address space gets fragmented to hell, plus the lack of widespread 1GB
  // page support, we require 2GB to use 1GB pages.
  const bool useGiga = req >= 2 * X86_GIGAPAGE_SIZE;
  return useGiga ? X86_GIGAPAGE_SIZE : X86_HUGEPAGE_SIZE;
}

// Reserve huge pages covering `bytes`, preferring the size pageSize()
// picks. On failure with gigapages, fall back to 2MiB pages before
// giving up. On success pgsize/pcount describe the reservation; on
// failure pcount is zeroed.
bool NUMAZone::reserveFastMemory(uint64_t bytes, uint32_t& pgsize, unsigned& pcount){
  pgsize = pageSize(bytes);
  pcount = bytes / pgsize + !!(bytes % pgsize); // round up to whole pages
  if(reserveHugePages(pgsize, pcount)){
    return true;
  }
  if(pgsize == X86_GIGAPAGE_SIZE){
    // retry the whole request with the smaller page size
    pgsize = X86_HUGEPAGE_SIZE;
    pcount = bytes / pgsize + !!(bytes % pgsize);
    if(reserveHugePages(pgsize, pcount)){
      return true;
    }
  }
  pcount = 0;
  return false;
}

// some number are free, but they might be from one of our outstanding
// reservations (remember, they're not atomic). so we track our outstanding
// allocations, and rely on the caller to report back post-mapping.
//
// Returns true when `count` pages of `size` bytes are reserved (either
// already free beyond our outstanding claims, or newly allocated from the
// kernel); false when the size class is unknown or allocation fails.
bool NUMAZone::reserveHugePages(uint32_t size, unsigned count){
  for(auto& hpinfo : HPages){
    if(hpinfo.PageSize == size){
      std::lock_guard lg{mtx};
      unsigned long freehp;
      if(!readFreeHugePages(hpinfo.fd.FD(), freehp)){
        return false;
      }
      // freehp might be less than OutstandingPages due to a worker having
      // just successfully mapped them, but not yet called reportMapped(),
      // so don't consider this an error, but don't count them towards our
      // total, either. of course, they might have actually been freed by
      // someone else in that same interim, in which case we'll be demanding
      // more than we strictly need, but that's not any kind of big deal.
      if(freehp < hpinfo.OutstandingPages){
        freehp = 0;
      }else{
        freehp -= hpinfo.OutstandingPages;
      }
      if(freehp >= count){
        std::cout << " wanted " << count << " " << (size / 1024 / 1024)
                  << "MiB page" << plurstr(count) << ", had " << freehp << std::endl;
        hpinfo.OutstandingPages += count;
        return true;
      }
      count -= freehp;
      if(!allocateHugePages(hpinfo.PageSize, hpinfo.fd.FD(), count)){
        return false;
      }
      // BUGFIX: accumulate the full original request (newly allocated +
      // previously free) on top of existing outstanding reservations;
      // plain assignment here discarded pages still awaiting
      // reportMapped(), unlike the `+=` on the fast path above.
      hpinfo.OutstandingPages += count + freehp;
      return true;
    }
  }
  std::cerr << "requested unsupported page size " << size << std::endl;
  return false; // no such page size, sorry
}

// Compute half of (zone Size / denom), rounded down to a multiple of the
// largest page size not exceeding that amount. Falls back to the base
// page size; presumably HPages is sorted ascending by PageSize — the
// scan breaks at the first entry that is too large.
uint64_t NUMAZone::percentAlloc(uint32_t denom) const {
  if(denom == 0){
    return 0;
  }
  const uint64_t target = Size / denom / 2;
  uint64_t granularity = Basepagesize;
  for(const auto& hp : HPages){
    if(hp.PageSize > target){
      break;
    }
    granularity = hp.PageSize;
  }
  return target - target % granularity;
}

// A worker has successfully mapped pcount pages of pgsize bytes; retire
// that many outstanding reservations from the matching size class.
// Throws std::invalid_argument on an unknown size or over-report.
void NUMAZone::reportMapped(uint32_t pgsize, unsigned pcount){
  for(auto& hp : HPages){
    if(hp.PageSize != pgsize){
      continue;
    }
    std::lock_guard lg{mtx};
    if(hp.OutstandingPages < pcount){
      std::cerr << "reported " << pcount << " page" << plurstr(pcount)
                << " mapped, had only " << hp.OutstandingPages << " outstanding" << std::endl;
      throw std::invalid_argument("invalid outstanding page count");
    }
    hp.OutstandingPages -= pcount;
    return;
  }
  std::cerr << "your huge page size sucks " << pgsize << std::endl;
  throw std::invalid_argument("invalid hugepage size");
}

// extract the current number of hugepages of this size on this node, and also
// the number of free hugepages. this is of course only a snapshot.
// Short-circuits: freep is not read if the total-count read fails.
bool NUMAZone::getZoneHugePageClassInfo(int hfd, unsigned long& freep, unsigned long& mappedp){
  return readCountHugePages(hfd, mappedp) && readFreeHugePages(hfd, freep);
}

// Probe this node's hugepages/ sysfs directory for each candidate page
// size, record a snapshot of its allocated/free counts, and append a
// tracking entry to HPages for every size class present. zfd is a pathfd
// for the node directory. Returns false on any unexpected sysfs error;
// a missing size-class directory (ENOENT) is simply skipped.
bool NUMAZone::getZoneHugePageInfo(int zfd){
  Fdesc hpfd{openat(zfd, "hugepages", O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH)};
  if(hpfd.FD() < 0){
    return false;
  }
  // FIXME we ought just browse the directory, and find all supported
  // arbitrary sizes, but this works for now. subdirectories are of the
  // form hugepages-{foo}kB, so define in terms of 1024B.
  const std::vector<uint32_t> candidateSizes = {
    X86_HUGEPAGE_SIZE / 1024,
    X86_GIGAPAGE_SIZE / 1024,
  };
#define HPPREFIX "hugepages-"
#define HPSUFFIX "kB"
  char hpname[20 + __builtin_strlen(HPPREFIX) + __builtin_strlen(HPSUFFIX) + 1]; // 20: largest 64-bit uint
  strcpy(hpname, HPPREFIX);
  const size_t maxwrite = sizeof(hpname) - __builtin_strlen(HPPREFIX);
  for(auto sz : candidateSizes){
    auto s = snprintf(hpname + __builtin_strlen(HPPREFIX), maxwrite, "%u" HPSUFFIX, sz);
    if(s < 0 || static_cast<size_t>(s) >= maxwrite){
      std::cerr << "couldn't look for huge pages of size " << sz << "kB" << std::endl;
      return false;
    }
    Fdesc hpsfd{openat(hpfd.FD(), hpname, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH)};
    if(hpsfd.FD() < 0){
      if(errno != ENOENT){
        std::cerr << "error opening " << hpname << ": " << strerror(errno) << std::endl;
        return false;
      }
      continue; // this size class simply isn't supported here
    }
    // sanity check: a "huge" page smaller than the base page is nonsense.
    // BUGFIX: the message previously claimed ">=", inverting the actual
    // failed condition (hugepage size < base page size).
    if(sz * 1024 < Basepagesize){
      std::cerr << "huge page size " << sz * 1024 << " < base page size " << Basepagesize << std::endl;
      return false;
    }
    unsigned long freep, mappedp;
    if(!getZoneHugePageClassInfo(hpsfd.FD(), freep, mappedp)){
      return false;
    }
    std::cout << " " << sz / 1024 << "MiB hugepage" << plurstr(mappedp) << ": " <<
              mappedp << " allocated, " << freep << " free" << std::endl;
    // HPages entry: zero outstanding, size in bytes, owning the pathfd
    HPages.emplace_back(0, sz * 1024, std::move(hpsfd));
  }
  return true;
}

// Parse this node's sysfs meminfo for the "MemTotal:" line and return the
// value, converted from kB to bytes, in `size`. A small hand-rolled state
// machine scans only the first line of the buffer (MemTotal is expected
// first). Returns false on read failure, malformed input, or a zero size.
bool NUMAZone::getZoneInfo(int zfd, uint64_t& size){
  auto buf = Sysfs::readSysfs(zfd, "meminfo");
  if(!buf){
    return false;
  }
  const char tag[] = "MemTotal:";
  const char* s = buf.get();
  const char* ts = tag;           // cursor into tag while matching
  bool foundtag = false;          // saw the full "MemTotal:" tag
  bool foundvalue = false;        // saw the first digit of the value
  size = 0;
  // MemTotal ought always be the first line. it has no repeated internal
  // characters, so we can always safely match to the beginning of the tag.
  // once foundtag goes high, start accumulating size.
  while(*s && *s != '\n'){
    if(foundtag && !foundvalue){
      // between the tag and the value: only whitespace is legal
      if(isdigit(*s)){
        foundvalue = true;
      }else if(!isspace(*s)){
        std::cerr << "found mystery character " << *s << " for " << tag << " value" << std::endl;
        return false;
      }
    }
    if(foundvalue){
      if(isdigit(*s)){
        // accumulate the decimal value digit by digit
        size *= 10;
        size += *s - '0';
      }else{
        if(isspace(*s)){ // " kB" follows value; we're done!
          size *= 1024;  // meminfo reports kB; convert to bytes
          break;
        }else{
          std::cerr << "found mystery character " << *s << " for " << tag << " value" << std::endl;
          size = 0;
          return false;
        }
      }
    }else{
      // still matching the tag; reset to the start on any mismatch
      if(*s == *ts){
        if(!*++ts){
          foundtag = true;
        }
      }else{
        ts = tag;
      }
    }
    ++s;
  }
  // NOTE: if the line ends immediately after the digits (no " kB"), the
  // kB->bytes conversion above is skipped — assumed not to happen with
  // real meminfo output.
  if(size){
    return true;
  }
  std::cerr << "didn't find expected " << tag << " in meminfo" << std::endl;
  return false;
}

// Construct the zone object for NUMA node `zid`: open its sysfs node
// directory (relative to zfd, the system/node pathfd), read its cpulist
// and total memory, and discover its hugepage size classes. Throws on any
// sysfs failure — a NUMAZone is fully initialized or not constructed.
NUMAZone::NUMAZone(int zfd, int zid, uint32_t basepagesize) :
 ZoneId(zid),
 Basepagesize(basepagesize) {
  // 64 bit value max is 20 char
  #define PREFIX "node"
  char nbuf[__builtin_strlen(PREFIX) + 20 + 1];
  auto s = snprintf(nbuf, sizeof(nbuf), PREFIX "%d", zid);
  if(s < 0 || static_cast<size_t>(s) >= sizeof(nbuf)){
    std::cerr << "crazy zone id " << zid << std::endl;
    throw std::invalid_argument("invalid id for numa zone");
  }
  Fdesc nfd{openat(zfd, nbuf, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH)};
  if(nfd.FD() < 0){
    std::cerr << "error opening zone " << zid << " directory (" << strerror(errno) << ")" << std::endl;
    throw std::runtime_error("couldn't open numa zone in sysfs");
  }
  auto cpulist = Sysfs::readSysfs(nfd.FD(), "cpulist");
  if(!cpulist){
    std::cerr << "error reading zone " << zid << " cpu list (" << strerror(errno) << ")" << std::endl;
    throw std::runtime_error("couldn't read numa zone's cpulist");
  }
  // parse e.g. "0-3,8-11" into the Cores collection
  if(!lexNumberCollection(cpulist.get(), Cores)){
    throw std::runtime_error("invalid numa zone's cpulist");
  }
  // fills Size (bytes) from meminfo's MemTotal
  if(!getZoneInfo(nfd.FD(), Size)){
    throw std::runtime_error("invalid numa zone info");
  }
  std::cout << "numa memory zone " << zid << " ("
            << static_cast<float>(Size) / (1024lu * 1024lu * 1024lu) << " GiB): "
            << cpulist.get() << std::endl;
  if(!getZoneHugePageInfo(nfd.FD())){
    throw std::runtime_error("invalid numa zone huge page info");
  }
}

void NUMATopology::discoverZones(uint32_t basepagesize){
  Fdesc sysfd{open("/sys/devices/system/node", O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH)};
  if(sysfd.FD() < 0){
    std::cerr << "couldn't open sysfs system/node (" << strerror(errno) << ")" << std::endl;
    throw std::runtime_error("couldn't open sysfs system/node");
  }
  auto memzones = Sysfs::readSysfs(sysfd.FD(), "has_memory");
  if(!memzones){
    throw std::runtime_error("couldn't read system/node/has_memory");
  }
  std::unordered_set<int> zset;
  if(!lexNumberCollection(memzones.get(), zset)){
    throw std::runtime_error("couldn't parse system/node/has_memory");
  }
  for(int z : zset){
    Zones.try_emplace(z, sysfd.FD(), z, basepagesize);
  }
}

dankamongmen avatar Sep 13 '24 16:09 dankamongmen