ROCK-Kernel-Driver
ROCK-Kernel-Driver copied to clipboard
NULL pointer dereference in kfd_dbgmgr_address_watch
This is similar to, and probably related to, #70. There are probably many more such bugs. It is not good that a system with ROCm can be destabilised by an unprivileged user without sudo.
#include <hc.hpp>
#include <hsa.h>
#include <hsakmt.h>
int main()
{
hc::accelerator_view view = hc::accelerator().get_default_view();
hsa_agent_t agent = *static_cast<hsa_agent_t*>(view.get_hsa_agent());
unsigned int node;
hsa_agent_get_info(agent, HSA_AGENT_INFO_NODE, &node);
HSA_DBG_WATCH_MODE modes[1] = {HSA_DBG_WATCH_ALL};
void *addrs[1] = {0};
HSAuint64 mask[1] = {0};
hsaKmtDbgAddressWatch(node, 1, modes, addrs, mask, NULL);
return 0;
}
hcc -hc -lhsa-runtime64 -lhsakmt main.cpp
./a.out
[ 28.478193] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
[ 28.478255] IP: kfd_dbgmgr_address_watch+0x12/0x60 [amdgpu]
[ 28.478257] PGD 835f5f067 P4D 835f5f067 PUD 7ee4f5067 PMD 0
[ 28.478263] Oops: 0000 [#1] SMP NOPTI
[ 28.478265] Modules linked in: msr nls_utf8 cifs ccm fscache cmac bnep binfmt_misc nls_iso8859_1 arc4 snd_hda_codec_realtek snd_hda_codec_generic edac_mce_amd snd_hda_codec_hdmi iwlmvm kvm_amd snd_seq_midi snd_hda_intel snd_seq_midi_event kvm snd_hda_codec mac80211 irqbypass snd_hda_core snd_rawmidi snd_hwdep snd_pcm snd_seq btusb iwlwifi btrtl btbcm btintel snd_seq_device snd_timer snd bluetooth wmi_bmof cfg80211 ecdh_generic ccp soundcore k10temp shpchp mac_hid sch_fq_codel ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi nct6775 hwmon_vid parport_pc ppdev lp parport ip_tables x_tables autofs4 btrfs zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 multipath linear raid0 amdgpu(OE) amdchash(OE)
[ 28.478313] amdttm(OE) amd_sched(OE) crct10dif_pclmul mxm_wmi crc32_pclmul ghash_clmulni_intel pcbc amdkcl(OE) aesni_intel amd_iommu_v2 aes_x86_64 crypto_simd glue_helper drm_kms_helper cryptd igb syscopyarea sysfillrect dca sysimgblt i2c_algo_bit fb_sys_fops ptp drm nvme atlantic i2c_piix4 pps_core ahci nvme_core libahci gpio_amdpt wmi gpio_generic
[ 28.478333] CPU: 17 PID: 4186 Comm: a.out Tainted: G OE 4.15.0-45-generic #48-Ubuntu
[ 28.478335] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./X399 Professional Gaming, BIOS P3.30 08/14/2018
[ 28.478388] RIP: 0010:kfd_dbgmgr_address_watch+0x12/0x60 [amdgpu]
[ 28.478389] RSP: 0018:ffffc0bcc79c7d18 EFLAGS: 00010246
[ 28.478392] RAX: ffff9fe7ec9a6800 RBX: ffffc0bcc79c7d28 RCX: ffff9fe84f57e310
[ 28.478393] RDX: 000000000000800b RSI: ffffc0bcc79c7d28 RDI: 0000000000000000
[ 28.478395] RBP: ffffc0bcc79c7d18 R08: ffffc0bcc79c8000 R09: 0000000000000020
[ 28.478396] R10: 0000000000000020 R11: 00000000000005c0 R12: ffff9fe83e81e400
[ 28.478398] R13: ffff9fe7ec9a6800 R14: ffff9fe84f57e300 R15: ffffc0bcc79c7db0
[ 28.478400] FS: 00007f6cee04ac00(0000) GS:ffff9fe85cc40000(0000) knlGS:0000000000000000
[ 28.478402] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 28.478403] CR2: 0000000000000000 CR3: 00000007e9658000 CR4: 00000000003406e0
[ 28.478405] Call Trace:
[ 28.478456] kfd_ioctl_dbg_address_watch+0x125/0x160 [amdgpu]
[ 28.478505] kfd_ioctl+0x271/0x450 [amdgpu]
[ 28.478553] ? kfd_ioctl_dbg_wave_control+0x1a0/0x1a0 [amdgpu]
[ 28.478558] ? __handle_mm_fault+0x478/0x5c0
[ 28.478562] do_vfs_ioctl+0xa8/0x630
[ 28.478565] ? handle_mm_fault+0xb1/0x1f0
[ 28.478568] ? __do_page_fault+0x270/0x4d0
[ 28.478571] SyS_ioctl+0x79/0x90
[ 28.478574] do_syscall_64+0x73/0x130
[ 28.478578] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[ 28.478580] RIP: 0033:0x7f6cec4c85d7
[ 28.478582] RSP: 002b:00007ffef082cad8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ 28.478584] RAX: ffffffffffffffda RBX: 0000000000000008 RCX: 00007f6cec4c85d7
[ 28.478585] RDX: 0000000000d99a50 RSI: 0000000040104b0f RDI: 0000000000000003
[ 28.478587] RBP: 0000000000d99a50 R08: 00007ffef082cba8 R09: 0000000000000000
[ 28.478588] R10: 0000000000d99a78 R11: 0000000000000246 R12: 0000000040104b0f
[ 28.478589] R13: 0000000000000003 R14: 0000000000d99a50 R15: 0000000000000028
[ 28.478591] Code: fa 48 c7 c0 ea ff ff ff 5d c3 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 8b 06 48 89 e5 8b 90 90 00 00 00 <39> 17 75 11 48 8b 7f 10 48 8b 47 30 e8 3d 4e 69 fb 48 98 5d c3
[ 28.478671] RIP: kfd_dbgmgr_address_watch+0x12/0x60 [amdgpu] RSP: ffffc0bcc79c7d18
[ 28.478673] CR2: 0000000000000000
[ 28.478675] ---[ end trace a03e90876b0f96e4 ]---
@misos1 Apologies for the lack of response. Could you please check whether your issue still exists with the latest ROCm 6.2? If not, please close the ticket. Thanks!
Should be resolved now