amdsmi
amdsmi copied to clipboard
[Issue]: ROCm 7.0.1 (MI300A): amdsmi_topo_get_p2p_status use-after-free and amdsmi_get_gpu_memory_partition_config uninitialized read inside libamd_smi.so
Problem Description
Running the attached test (amdsmi_partitions.cpp) under Valgrind on an MI300A system shows memory issues inside libamd_smi.so:
- Uninitialized read: Repeated warnings when calling amdsmi_get_gpu_memory_partition_config, where the library calls strlen() on a buffer whose contents appear to be uninitialized (or not fully initialized):
Conditional jump or move depends on uninitialised value(s)
at strlen
by amdsmi_get_gpu_memory_partition_config (libamd_smi.so.26.0.70001)
- Use-after-free: Calling amdsmi_topo_get_p2p_status yields multiple “Invalid read … inside a block … free’d” with both the operator delete and subsequent reads occurring within libamd_smi.so frames, e.g.:
Invalid read of size 8
at rsmi_topo_get_p2p_status (libamd_smi.so.26.0.70001)
by amdsmi_topo_get_p2p_status (libamd_smi.so.26.0.70001)
Address ... is ... inside a block of size ... free'd
at operator delete
by ... (libamd_smi.so.26.0.70001)
Block was alloc'd at
at operator new
by ... (libamd_smi.so.26.0.70001)
Operating System
OS: NAME="Red Hat Enterprise Linux" VERSION="8.10 (Ootpa)"
CPU
AMD Instinct MI300A Accelerator
GPU
AMD Instinct MI300A Accelerator
ROCm Version
ROCm 7.0.1
ROCm Component
amdsmi
Steps to Reproduce
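Build the test against the ROCm AMD SMI library and run it under Valgrind, e.g. `g++ -g amdsmi_partitions.cpp -I/opt/rocm/include -L/opt/rocm/lib -lamd_smi -o amdsmi_partitions && valgrind ./amdsmi_partitions` (adjust the paths to your ROCm installation).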
// file: amdsmi_partitions.cpp
#include <algorithm>
#include <cstdio>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <sstream>
#include <string>
#include <unordered_set>
#include <vector>
#if defined(__GLIBC__)
# include <malloc.h>
#endif
#include <amd_smi/amdsmi.h>
// Compatibility shim: when AMDSMI_PROCESSOR_TYPE_AMD_GPU is not defined as a
// preprocessor macro, alias it to whichever alternative macro is defined
// (AMDSMI_PROCESSOR_TYPE_GPU first, then AMD_GPU). If neither alternative is
// a macro, the name is left undefined at the preprocessor level.
// NOTE(review): #ifndef/#ifdef only see macros, not enumerators. If the
// header declares these names as enum constants, none of the branches below
// fires and the `#ifdef AMDSMI_PROCESSOR_TYPE_AMD_GPU` processor-type filter
// in main() is compiled out -- confirm against the installed amdsmi.h.
#ifndef AMDSMI_PROCESSOR_TYPE_AMD_GPU
# ifdef AMDSMI_PROCESSOR_TYPE_GPU
# define AMDSMI_PROCESSOR_TYPE_AMD_GPU AMDSMI_PROCESSOR_TYPE_GPU
# elif defined(AMD_GPU)
# define AMDSMI_PROCESSOR_TYPE_AMD_GPU AMD_GPU
# endif
#endif
namespace {
// Largest heap growth (in bytes) over a repeat loop that is still written off
// as allocator fluctuation instead of being flagged as a potential leak.
constexpr long kLeakThresholdBytes = 64L * 1024L;

// How many back-to-back calls each API receives while heap usage is watched.
constexpr int kIterations = 32;

// Verdict for a single API call: it worked, it could not be exercised in this
// environment, or it failed outright.
enum class StatusEval {
    Success,
    Skip,
    Failure
};
// Maps a status code to the text supplied by amdsmi_status_code_to_string().
// Falls back to the literal "unknown" when that lookup does not report
// success or leaves the output pointer null.
static const char *status_to_cstr(amdsmi_status_t s) {
    const char *text = nullptr;
    const auto lookup = amdsmi_status_code_to_string(s, &text);
    if (lookup != AMDSMI_STATUS_SUCCESS || text == nullptr) {
        return "unknown";
    }
    return text;
}
// Prints the numeric and textual form of `status` for the call named by
// `label`, then classifies it:
//   - AMDSMI_STATUS_SUCCESS                         -> StatusEval::Success
//   - NOT_SUPPORTED / NOT_YET_IMPLEMENTED / NO_PERM /
//     NO_DATA / BUSY / NO_DRV                       -> StatusEval::Skip
//   - anything else                                 -> StatusEval::Failure
// Skip and Failure additionally print a one-line explanation.
static StatusEval log_status(const std::string &label, amdsmi_status_t status) {
    const char *const name = label.c_str();
    std::printf(" %s -> %d (%s)\n", name, static_cast<int>(status), status_to_cstr(status));

    if (status == AMDSMI_STATUS_SUCCESS) {
        return StatusEval::Success;
    }

    if (status == AMDSMI_STATUS_NOT_SUPPORTED ||
        status == AMDSMI_STATUS_NOT_YET_IMPLEMENTED) {
        std::printf(" [SKIP] %s unsupported on this system.\n", name);
        return StatusEval::Skip;
    }

    if (status == AMDSMI_STATUS_NO_PERM) {
        std::printf(" [SKIP] %s requires elevated privileges.\n", name);
        return StatusEval::Skip;
    }

    if (status == AMDSMI_STATUS_NO_DATA ||
        status == AMDSMI_STATUS_BUSY ||
        status == AMDSMI_STATUS_NO_DRV) {
        std::printf(" [SKIP] %s blocked by runtime environment.\n", name);
        return StatusEval::Skip;
    }

    std::printf(" [FAIL] %s call failed.\n", name);
    return StatusEval::Failure;
}
// Returns an estimate, in bytes, of the memory this process currently holds.
//
// glibc builds: the `uordblks` field reported by mallinfo2() (glibc >= 2.33)
// or by the legacy mallinfo().
// Other builds: the "VmRSS:" entry of /proc/self/status (scaled from kB), or
// 0 when that file is unavailable or has no such line.
static size_t current_allocated_bytes() {
#if defined(__GLIBC__)
# if defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2, 33)
    const struct mallinfo2 info = mallinfo2();
    return static_cast<size_t>(info.uordblks);
# else
    const struct mallinfo info = mallinfo();
    // The legacy struct stores `uordblks` as an int, which wraps negative once
    // usage passes INT_MAX. Casting that straight to size_t would sign-extend
    // into a value near 2^64, so reinterpret it as unsigned first.
    return static_cast<size_t>(static_cast<unsigned int>(info.uordblks));
# endif
#else
    std::ifstream status("/proc/self/status");
    std::string line;
    while (std::getline(status, line)) {
        // Only the line that starts with the "VmRSS:" key is of interest.
        if (line.rfind("VmRSS:", 0) != 0) {
            continue;
        }
        std::istringstream fields(line.substr(6));
        size_t value = 0;
        std::string unit;
        fields >> value >> unit;
        if (unit == "kB") {
            value *= 1024;
        }
        return value;
    }
    return 0;
#endif
}
// On glibc builds, calls malloc_trim(0) before a measurement is taken; on any
// other C library this function does nothing.
static void release_allocator_cache() {
#ifdef __GLIBC__
    (void)malloc_trim(0);
#endif
}
// Records current_allocated_bytes() at construction so that report() can
// print and return how far usage has moved since then. Callers compare the
// returned delta against their own threshold.
struct MemoryGuard {
    std::string label;  // Text prefixed to the line printed by report().
    size_t before;      // Usage snapshot taken in the constructor.

    explicit MemoryGuard(std::string name)
        : label(std::move(name)),
          before(current_allocated_bytes()) {}

    // Trims the allocator cache, re-samples usage, prints the signed
    // difference from the constructor snapshot, and returns that difference.
    long report() const {
        release_allocator_cache();
        const long now = static_cast<long>(current_allocated_bytes());
        const long growth = now - static_cast<long>(before);
        std::printf(" %s memory delta: %ld bytes\n", label.c_str(), growth);
        return growth;
    }
};
// Hash functor for processor handles: the handle's bit pattern, reinterpreted
// as an integer, is used directly as the hash value.
struct HandleHash {
    size_t operator()(amdsmi_processor_handle h) const noexcept {
        const size_t bits = reinterpret_cast<size_t>(h);
        return bits;
    }
};
// Equality functor for processor handles: two handles match exactly when they
// compare equal with operator==.
struct HandleEq {
    bool operator()(amdsmi_processor_handle lhs,
                    amdsmi_processor_handle rhs) const noexcept {
        const bool same = (lhs == rhs);
        return same;
    }
};
} // namespace
int main() {
bool overall_ok = true;
amdsmi_status_t status = amdsmi_init(AMDSMI_INIT_AMD_GPUS);
std::printf("amdsmi_init -> %d (%s)\n", (int)status, status_to_cstr(status));
if (status != AMDSMI_STATUS_SUCCESS)
return 1;
uint32_t socket_count = 0;
status = amdsmi_get_socket_handles(&socket_count, nullptr);
std::printf("amdsmi_get_socket_handles(count) -> %d (%s)\n", (int)status,
status_to_cstr(status));
if (status != AMDSMI_STATUS_SUCCESS || socket_count == 0) {
std::printf("No sockets discovered; nothing to test.\n");
amdsmi_shut_down();
return status == AMDSMI_STATUS_SUCCESS ? 0 : 1;
}
std::vector<amdsmi_socket_handle> sockets(socket_count);
status = amdsmi_get_socket_handles(&socket_count, sockets.data());
std::printf("amdsmi_get_socket_handles(handles) -> %d (%s)\n", (int)status,
status_to_cstr(status));
if (status != AMDSMI_STATUS_SUCCESS) {
amdsmi_shut_down();
return 1;
}
std::unordered_set<amdsmi_processor_handle, HandleHash, HandleEq> seen;
std::vector<amdsmi_processor_handle> gpu_handles;
for (uint32_t si = 0; si < socket_count; ++si) {
uint32_t device_count = 0;
status = amdsmi_get_processor_handles(sockets[si], &device_count, nullptr);
if (status != AMDSMI_STATUS_SUCCESS || device_count == 0)
continue;
std::vector<amdsmi_processor_handle> devices(device_count);
status = amdsmi_get_processor_handles(sockets[si], &device_count, devices.data());
std::printf("amdsmi_get_processor_handles(socket=%u) -> %d (%s)\n", si, (int)status,
status_to_cstr(status));
if (status != AMDSMI_STATUS_SUCCESS)
continue;
for (uint32_t di = 0; di < device_count; ++di) {
#ifdef AMDSMI_PROCESSOR_TYPE_AMD_GPU
processor_type_t type;
if (amdsmi_get_processor_type(devices[di], &type) == AMDSMI_STATUS_SUCCESS &&
type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
continue;
}
#endif
if (!seen.insert(devices[di]).second)
continue;
gpu_handles.push_back(devices[di]);
std::printf("\n=== GPU handle %zu ===\n", gpu_handles.size() - 1);
// Memory partition configuration test
{
amdsmi_memory_partition_config_t cfg{};
StatusEval eval = log_status("amdsmi_get_gpu_memory_partition_config",
amdsmi_get_gpu_memory_partition_config(devices[di], &cfg));
if (eval == StatusEval::Success) {
std::printf(" mp_mode=%u, numa_ranges=%u, caps=0x%08x\n",
static_cast<unsigned>(cfg.mp_mode), cfg.num_numa_ranges,
cfg.partition_caps.nps_cap_mask);
MemoryGuard guard("amdsmi_get_gpu_memory_partition_config (repeat)");
for (int iter = 0; iter < kIterations; ++iter) {
amdsmi_memory_partition_config_t tmp{};
amdsmi_status_t st =
amdsmi_get_gpu_memory_partition_config(devices[di], &tmp);
if (st != AMDSMI_STATUS_SUCCESS) {
std::printf(" iteration %d failed -> %d (%s)\n", iter, (int)st,
status_to_cstr(st));
overall_ok = false;
break;
}
}
long delta = guard.report();
if (delta > kLeakThresholdBytes) {
std::printf(
" [FAIL] Potential leak: delta exceeds %ld bytes threshold.\n",
kLeakThresholdBytes);
overall_ok = false;
}
} else if (eval == StatusEval::Failure) {
overall_ok = false;
}
}
}
}
if (gpu_handles.size() > 1) {
std::printf("\n=== P2P topology checks ===\n");
for (size_t src = 0; src + 1 < gpu_handles.size(); ++src) {
for (size_t dst = src + 1; dst < gpu_handles.size(); ++dst) {
std::string label = "amdsmi_topo_get_p2p_status(" + std::to_string(src) +
"->" + std::to_string(dst) + ")";
amdsmi_link_type_t type = AMDSMI_LINK_TYPE_UNKNOWN;
amdsmi_p2p_capability_t cap{};
StatusEval eval = log_status(label, amdsmi_topo_get_p2p_status(
gpu_handles[src], gpu_handles[dst],
&type, &cap));
if (eval == StatusEval::Success) {
std::printf(" initial link type=%d, coherent=%u, atomics32=%u, atomics64=%u, dma=%u, bidi=%u\n",
(int)type, cap.is_iolink_coherent, cap.is_iolink_atomics_32bit,
cap.is_iolink_atomics_64bit, cap.is_iolink_dma,
cap.is_iolink_bi_directional);
std::string repeat_label = label + " (repeat)";
MemoryGuard guard(repeat_label);
for (int iter = 0; iter < kIterations; ++iter) {
amdsmi_link_type_t loop_type = AMDSMI_LINK_TYPE_UNKNOWN;
amdsmi_p2p_capability_t loop_cap{};
amdsmi_status_t st = amdsmi_topo_get_p2p_status(
gpu_handles[src], gpu_handles[dst], &loop_type, &loop_cap);
if (st != AMDSMI_STATUS_SUCCESS) {
std::printf(" iteration %d failed -> %d (%s)\n", iter, (int)st,
status_to_cstr(st));
overall_ok = false;
break;
}
}
long delta = guard.report();
if (delta > kLeakThresholdBytes) {
std::printf(
" [FAIL] Potential leak: delta exceeds %ld bytes threshold.\n",
kLeakThresholdBytes);
overall_ok = false;
}
} else if (eval == StatusEval::Failure) {
overall_ok = false;
}
}
}
} else {
std::printf("\nOnly one GPU handle found; skipping P2P checks.\n");
}
status = amdsmi_shut_down();
std::printf("amdsmi_shut_down -> %d (%s)\n", (int)status, status_to_cstr(status));
return (overall_ok && status == AMDSMI_STATUS_SUCCESS) ? 0 : 1;
}
(Optional for Linux users) Output of /opt/rocm/bin/rocminfo --support
/opt/rocm/bin/rocminfo --support
ROCk module version 6.14.14 is loaded
=====================
HSA System Attributes
=====================
Runtime Version: 1.18
Runtime Ext Version: 1.11
System Timestamp Freq.: 1000.000000MHz
Sig. Max Wait Duration: 18446744073709551615 (0xFFFFFFFFFFFFFFFF) (timestamp count)
Machine Model: LARGE
System Endianness: LITTLE
Mwaitx: DISABLED
XNACK enabled: NO
DMAbuf Support: YES
VMM Support: YES
==========
HSA Agents
==========
*******
Agent 1
*******
Name: AMD Instinct MI300A Accelerator
Uuid: CPU-XX
Marketing Name: AMD Instinct MI300A Accelerator
Vendor Name: CPU
Feature: None specified
Profile: FULL_PROFILE
Float Round Mode: NEAR
Max Queue Number: 0(0x0)
Queue Min Size: 0(0x0)
Queue Max Size: 0(0x0)
Queue Type: MULTI
Node: 0
Device Type: CPU
Cache Info:
L1: 32768(0x8000) KB
Chip ID: 0(0x0)
ASIC Revision: 0(0x0)
Cacheline Size: 64(0x40)
Max Clock Freq. (MHz): 3700
BDFID: 0
Internal Node ID: 0
Compute Unit: 48
SIMDs per CU: 0
Shader Engines: 0
Shader Arrs. per Eng.: 0
WatchPts on Addr. Ranges:1
Memory Properties:
Features: None
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: FINE GRAINED
Size: 130750664(0x7cb18c8) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 2
Segment: GLOBAL; FLAGS: EXTENDED FINE GRAINED
Size: 130750664(0x7cb18c8) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 3
Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED
Size: 130750664(0x7cb18c8) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 4
Segment: GLOBAL; FLAGS: COARSE GRAINED
Size: 130750664(0x7cb18c8) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
ISA Info:
*******
Agent 2
*******
Name: AMD Instinct MI300A Accelerator
Uuid: CPU-XX
Marketing Name: AMD Instinct MI300A Accelerator
Vendor Name: CPU
Feature: None specified
Profile: FULL_PROFILE
Float Round Mode: NEAR
Max Queue Number: 0(0x0)
Queue Min Size: 0(0x0)
Queue Max Size: 0(0x0)
Queue Type: MULTI
Node: 1
Device Type: CPU
Cache Info:
L1: 32768(0x8000) KB
Chip ID: 0(0x0)
ASIC Revision: 0(0x0)
Cacheline Size: 64(0x40)
Max Clock Freq. (MHz): 3700
BDFID: 0
Internal Node ID: 1
Compute Unit: 48
SIMDs per CU: 0
Shader Engines: 0
Shader Arrs. per Eng.: 0
WatchPts on Addr. Ranges:1
Memory Properties:
Features: None
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: FINE GRAINED
Size: 131809200(0x7db3fb0) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 2
Segment: GLOBAL; FLAGS: EXTENDED FINE GRAINED
Size: 131809200(0x7db3fb0) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 3
Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED
Size: 131809200(0x7db3fb0) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 4
Segment: GLOBAL; FLAGS: COARSE GRAINED
Size: 131809200(0x7db3fb0) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
ISA Info:
*******
Agent 3
*******
Name: AMD Instinct MI300A Accelerator
Uuid: CPU-XX
Marketing Name: AMD Instinct MI300A Accelerator
Vendor Name: CPU
Feature: None specified
Profile: FULL_PROFILE
Float Round Mode: NEAR
Max Queue Number: 0(0x0)
Queue Min Size: 0(0x0)
Queue Max Size: 0(0x0)
Queue Type: MULTI
Node: 2
Device Type: CPU
Cache Info:
L1: 32768(0x8000) KB
Chip ID: 0(0x0)
ASIC Revision: 0(0x0)
Cacheline Size: 64(0x40)
Max Clock Freq. (MHz): 3700
BDFID: 0
Internal Node ID: 2
Compute Unit: 48
SIMDs per CU: 0
Shader Engines: 0
Shader Arrs. per Eng.: 0
WatchPts on Addr. Ranges:1
Memory Properties:
Features: None
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: FINE GRAINED
Size: 131809208(0x7db3fb8) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 2
Segment: GLOBAL; FLAGS: EXTENDED FINE GRAINED
Size: 131809208(0x7db3fb8) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 3
Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED
Size: 131809208(0x7db3fb8) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 4
Segment: GLOBAL; FLAGS: COARSE GRAINED
Size: 131809208(0x7db3fb8) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
ISA Info:
*******
Agent 4
*******
Name: AMD Instinct MI300A Accelerator
Uuid: CPU-XX
Marketing Name: AMD Instinct MI300A Accelerator
Vendor Name: CPU
Feature: None specified
Profile: FULL_PROFILE
Float Round Mode: NEAR
Max Queue Number: 0(0x0)
Queue Min Size: 0(0x0)
Queue Max Size: 0(0x0)
Queue Type: MULTI
Node: 3
Device Type: CPU
Cache Info:
L1: 32768(0x8000) KB
Chip ID: 0(0x0)
ASIC Revision: 0(0x0)
Cacheline Size: 64(0x40)
Max Clock Freq. (MHz): 3700
BDFID: 0
Internal Node ID: 3
Compute Unit: 48
SIMDs per CU: 0
Shader Engines: 0
Shader Arrs. per Eng.: 0
WatchPts on Addr. Ranges:1
Memory Properties:
Features: None
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: FINE GRAINED
Size: 131752964(0x7da6404) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 2
Segment: GLOBAL; FLAGS: EXTENDED FINE GRAINED
Size: 131752964(0x7da6404) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 3
Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED
Size: 131752964(0x7da6404) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 4
Segment: GLOBAL; FLAGS: COARSE GRAINED
Size: 131752964(0x7da6404) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
ISA Info:
*******
Agent 5
*******
Name: gfx942
Uuid: GPU-f68e1753dc12b715
Marketing Name: AMD Instinct MI300A
Vendor Name: AMD
Feature: KERNEL_DISPATCH
Profile: BASE_PROFILE
Float Round Mode: NEAR
Max Queue Number: 128(0x80)
Queue Min Size: 64(0x40)
Queue Max Size: 131072(0x20000)
Queue Type: MULTI
Node: 4
Device Type: GPU
Cache Info:
L1: 32(0x20) KB
L2: 4096(0x1000) KB
L3: 262144(0x40000) KB
Chip ID: 29856(0x74a0)
ASIC Revision: 1(0x1)
Cacheline Size: 128(0x80)
Max Clock Freq. (MHz): 2100
BDFID: 512
Internal Node ID: 4
Compute Unit: 228
SIMDs per CU: 4
Shader Engines: 24
Shader Arrs. per Eng.: 1
WatchPts on Addr. Ranges:4
Coherent Host Access: TRUE
Memory Properties: APU
Features: KERNEL_DISPATCH
Fast F16 Operation: TRUE
Wavefront Size: 64(0x40)
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Max Waves Per CU: 32(0x20)
Max Work-item Per CU: 2048(0x800)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 2147483647(0x7fffffff)
y 65535(0xffff)
z 65535(0xffff)
Max fbarriers/Workgrp: 32
Packet Processor uCode:: 177
SDMA engine uCode:: 24
IOMMU Support:: None
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: COARSE GRAINED
Size: 98647880(0x5e13f48) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:2048KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 2
Segment: GLOBAL; FLAGS: EXTENDED FINE GRAINED
Size: 98647880(0x5e13f48) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:2048KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 3
Segment: GLOBAL; FLAGS: FINE GRAINED
Size: 98647880(0x5e13f48) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:2048KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 4
Segment: GROUP
Size: 64(0x40) KB
Allocatable: FALSE
Alloc Granule: 0KB
Alloc Recommended Granule:0KB
Alloc Alignment: 0KB
Accessible by all: FALSE
ISA Info:
ISA 1
Name: amdgcn-amd-amdhsa--gfx942:sramecc+:xnack-
Machine Models: HSA_MACHINE_MODEL_LARGE
Profiles: HSA_PROFILE_BASE
Default Rounding Mode: NEAR
Default Rounding Mode: NEAR
Fast f16: TRUE
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 2147483647(0x7fffffff)
y 65535(0xffff)
z 65535(0xffff)
FBarrier Max Size: 32
ISA 2
Name: amdgcn-amd-amdhsa--gfx9-4-generic:sramecc+:xnack-
Machine Models: HSA_MACHINE_MODEL_LARGE
Profiles: HSA_PROFILE_BASE
Default Rounding Mode: NEAR
Default Rounding Mode: NEAR
Fast f16: TRUE
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 2147483647(0x7fffffff)
y 65535(0xffff)
z 65535(0xffff)
FBarrier Max Size: 32
*******
Agent 6
*******
Name: gfx942
Uuid: GPU-5bd3148dc76b831a
Marketing Name: AMD Instinct MI300A
Vendor Name: AMD
Feature: KERNEL_DISPATCH
Profile: BASE_PROFILE
Float Round Mode: NEAR
Max Queue Number: 128(0x80)
Queue Min Size: 64(0x40)
Queue Max Size: 131072(0x20000)
Queue Type: MULTI
Node: 5
Device Type: GPU
Cache Info:
L1: 32(0x20) KB
L2: 4096(0x1000) KB
L3: 262144(0x40000) KB
Chip ID: 29856(0x74a0)
ASIC Revision: 1(0x1)
Cacheline Size: 128(0x80)
Max Clock Freq. (MHz): 2100
BDFID: 16640
Internal Node ID: 5
Compute Unit: 228
SIMDs per CU: 4
Shader Engines: 24
Shader Arrs. per Eng.: 1
WatchPts on Addr. Ranges:4
Coherent Host Access: TRUE
Memory Properties: APU
Features: KERNEL_DISPATCH
Fast F16 Operation: TRUE
Wavefront Size: 64(0x40)
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Max Waves Per CU: 32(0x20)
Max Work-item Per CU: 2048(0x800)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 2147483647(0x7fffffff)
y 65535(0xffff)
z 65535(0xffff)
Max fbarriers/Workgrp: 32
Packet Processor uCode:: 177
SDMA engine uCode:: 24
IOMMU Support:: None
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: COARSE GRAINED
Size: 98647880(0x5e13f48) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:2048KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 2
Segment: GLOBAL; FLAGS: EXTENDED FINE GRAINED
Size: 98647880(0x5e13f48) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:2048KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 3
Segment: GLOBAL; FLAGS: FINE GRAINED
Size: 98647880(0x5e13f48) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:2048KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 4
Segment: GROUP
Size: 64(0x40) KB
Allocatable: FALSE
Alloc Granule: 0KB
Alloc Recommended Granule:0KB
Alloc Alignment: 0KB
Accessible by all: FALSE
ISA Info:
ISA 1
Name: amdgcn-amd-amdhsa--gfx942:sramecc+:xnack-
Machine Models: HSA_MACHINE_MODEL_LARGE
Profiles: HSA_PROFILE_BASE
Default Rounding Mode: NEAR
Default Rounding Mode: NEAR
Fast f16: TRUE
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 2147483647(0x7fffffff)
y 65535(0xffff)
z 65535(0xffff)
FBarrier Max Size: 32
ISA 2
Name: amdgcn-amd-amdhsa--gfx9-4-generic:sramecc+:xnack-
Machine Models: HSA_MACHINE_MODEL_LARGE
Profiles: HSA_PROFILE_BASE
Default Rounding Mode: NEAR
Default Rounding Mode: NEAR
Fast f16: TRUE
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 2147483647(0x7fffffff)
y 65535(0xffff)
z 65535(0xffff)
FBarrier Max Size: 32
*******
Agent 7
*******
Name: gfx942
Uuid: GPU-a57d04507c371187
Marketing Name: AMD Instinct MI300A
Vendor Name: AMD
Feature: KERNEL_DISPATCH
Profile: BASE_PROFILE
Float Round Mode: NEAR
Max Queue Number: 128(0x80)
Queue Min Size: 64(0x40)
Queue Max Size: 131072(0x20000)
Queue Type: MULTI
Node: 6
Device Type: GPU
Cache Info:
L1: 32(0x20) KB
L2: 4096(0x1000) KB
L3: 262144(0x40000) KB
Chip ID: 29856(0x74a0)
ASIC Revision: 1(0x1)
Cacheline Size: 128(0x80)
Max Clock Freq. (MHz): 2100
BDFID: 33024
Internal Node ID: 6
Compute Unit: 228
SIMDs per CU: 4
Shader Engines: 24
Shader Arrs. per Eng.: 1
WatchPts on Addr. Ranges:4
Coherent Host Access: TRUE
Memory Properties: APU
Features: KERNEL_DISPATCH
Fast F16 Operation: TRUE
Wavefront Size: 64(0x40)
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Max Waves Per CU: 32(0x20)
Max Work-item Per CU: 2048(0x800)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 2147483647(0x7fffffff)
y 65535(0xffff)
z 65535(0xffff)
Max fbarriers/Workgrp: 32
Packet Processor uCode:: 177
SDMA engine uCode:: 24
IOMMU Support:: None
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: COARSE GRAINED
Size: 98647880(0x5e13f48) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:2048KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 2
Segment: GLOBAL; FLAGS: EXTENDED FINE GRAINED
Size: 98647880(0x5e13f48) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:2048KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 3
Segment: GLOBAL; FLAGS: FINE GRAINED
Size: 98647880(0x5e13f48) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:2048KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 4
Segment: GROUP
Size: 64(0x40) KB
Allocatable: FALSE
Alloc Granule: 0KB
Alloc Recommended Granule:0KB
Alloc Alignment: 0KB
Accessible by all: FALSE
ISA Info:
ISA 1
Name: amdgcn-amd-amdhsa--gfx942:sramecc+:xnack-
Machine Models: HSA_MACHINE_MODEL_LARGE
Profiles: HSA_PROFILE_BASE
Default Rounding Mode: NEAR
Default Rounding Mode: NEAR
Fast f16: TRUE
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 2147483647(0x7fffffff)
y 65535(0xffff)
z 65535(0xffff)
FBarrier Max Size: 32
ISA 2
Name: amdgcn-amd-amdhsa--gfx9-4-generic:sramecc+:xnack-
Machine Models: HSA_MACHINE_MODEL_LARGE
Profiles: HSA_PROFILE_BASE
Default Rounding Mode: NEAR
Default Rounding Mode: NEAR
Fast f16: TRUE
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 2147483647(0x7fffffff)
y 65535(0xffff)
z 65535(0xffff)
FBarrier Max Size: 32
*******
Agent 8
*******
Name: gfx942
Uuid: GPU-cdcfb00895cff553
Marketing Name: AMD Instinct MI300A
Vendor Name: AMD
Feature: KERNEL_DISPATCH
Profile: BASE_PROFILE
Float Round Mode: NEAR
Max Queue Number: 128(0x80)
Queue Min Size: 64(0x40)
Queue Max Size: 131072(0x20000)
Queue Type: MULTI
Node: 7
Device Type: GPU
Cache Info:
L1: 32(0x20) KB
L2: 4096(0x1000) KB
L3: 262144(0x40000) KB
Chip ID: 29856(0x74a0)
ASIC Revision: 1(0x1)
Cacheline Size: 128(0x80)
Max Clock Freq. (MHz): 2100
BDFID: 49408
Internal Node ID: 7
Compute Unit: 228
SIMDs per CU: 4
Shader Engines: 24
Shader Arrs. per Eng.: 1
WatchPts on Addr. Ranges:4
Coherent Host Access: TRUE
Memory Properties: APU
Features: KERNEL_DISPATCH
Fast F16 Operation: TRUE
Wavefront Size: 64(0x40)
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Max Waves Per CU: 32(0x20)
Max Work-item Per CU: 2048(0x800)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 2147483647(0x7fffffff)
y 65535(0xffff)
z 65535(0xffff)
Max fbarriers/Workgrp: 32
Packet Processor uCode:: 177
SDMA engine uCode:: 24
IOMMU Support:: None
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: COARSE GRAINED
Size: 98647880(0x5e13f48) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:2048KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 2
Segment: GLOBAL; FLAGS: EXTENDED FINE GRAINED
Size: 98647880(0x5e13f48) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:2048KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 3
Segment: GLOBAL; FLAGS: FINE GRAINED
Size: 98647880(0x5e13f48) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Recommended Granule:2048KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 4
Segment: GROUP
Size: 64(0x40) KB
Allocatable: FALSE
Alloc Granule: 0KB
Alloc Recommended Granule:0KB
Alloc Alignment: 0KB
Accessible by all: FALSE
ISA Info:
ISA 1
Name: amdgcn-amd-amdhsa--gfx942:sramecc+:xnack-
Machine Models: HSA_MACHINE_MODEL_LARGE
Profiles: HSA_PROFILE_BASE
Default Rounding Mode: NEAR
Default Rounding Mode: NEAR
Fast f16: TRUE
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 2147483647(0x7fffffff)
y 65535(0xffff)
z 65535(0xffff)
FBarrier Max Size: 32
ISA 2
Name: amdgcn-amd-amdhsa--gfx9-4-generic:sramecc+:xnack-
Machine Models: HSA_MACHINE_MODEL_LARGE
Profiles: HSA_PROFILE_BASE
Default Rounding Mode: NEAR
Default Rounding Mode: NEAR
Fast f16: TRUE
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 2147483647(0x7fffffff)
y 65535(0xffff)
z 65535(0xffff)
FBarrier Max Size: 32
*** Done ***
Additional Information
No response
Hi @djwoun. Internal ticket has been created to investigate this issue. Thanks!