storage
storage copied to clipboard
Single node with resnet-50 and 32 accelerators but only single log
I'm trying to run a single-node benchmark with resnet-50 and 32 accelerators on the v1.0 tag.
ubuntu@ip-xxx-xxx-xxx-xxx:/mnt/training_volume/benchmark/storage$ ./benchmark.sh run --hosts xxx.xxx.xxx.xxx --workload resnet50 --accelerator-type h100 --num-accelerators 32 --results-dir run2 --param dataset.num_files_train=2395 --param dataset.data_folder=resnet50_data
The test runs successfully; however, the results directory contains the logs of only a single process.
[INFO] Averaged metric over all epochs
[METRIC] ==========================================================
[METRIC] Number of Simulated Accelerators: 1
[METRIC] Training Accelerator Utilization [AU] (%): 90.2184 (1.4735)
[METRIC] Training Throughput (samples/second): 1610.7861 (26.2801)
[METRIC] Training I/O Throughput (MB/second): 176.1368 (2.8737)
[METRIC] train_au_meet_expectation: success
[METRIC] ==========================================================
[/mnt/training_volume/benchmark/storage/dlio_benchmark/dlio_benchmark/utils/statscounter.py:185]
[INFO] 2024-08-18T13:20:46.858001 outputs saved in RANKID_output.json [/mnt/training_volume/benchmark/storage/dlio_benchmark/dlio_benchmark/utils/statscounter.py:378]
The processes are certainly running in parallel as you can see in the ps output:
ubuntu@ip-xxx-xxx-xxx-xxx:/mnt/training_volume/benchmark/storage/resnet50_report/run1$ ps aux | grep python
root 927 0.0 0.0 32456 15616 ? Ss Aug16 0:00 /usr/bin/python3 /usr/bin/networkd-dispatcher --run-startup-triggers
root 953 0.0 0.0 109988 15872 ? Ssl Aug16 0:00 /usr/bin/python3 /usr/share/unattended-upgrades/unattended-upgrade-shutdown --wait-for-signal
ubuntu 347568 0.6 0.0 6128 3328 pts/1 S+ 10:45 0:24 mpirun -hosts xxx.xxx.xxx.xxx -np 32 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347570 41.0 2.5 12543372 1619792 ? Ssl 10:45 25:21 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347571 40.8 2.4 12545752 1592628 ? Ssl 10:45 25:13 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347572 41.1 2.4 12544520 1581320 ? Ssl 10:45 25:25 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347573 40.8 2.4 12542596 1589704 ? Ssl 10:45 25:15 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347574 40.6 2.4 12541904 1558112 ? Ssl 10:45 25:08 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347575 40.9 2.4 12543368 1574980 ? Ssl 10:45 25:18 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347576 41.1 2.4 12544460 1588604 ? Ssl 10:45 25:25 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347577 41.0 2.3 12542036 1551128 ? Ssl 10:45 25:21 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347578 40.7 2.4 12544520 1566408 ? Ssl 10:45 25:10 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347579 40.7 2.4 12543360 1587716 ? Ssl 10:45 25:12 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347580 41.2 2.4 12545680 1595396 ? Ssl 10:45 25:30 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347581 40.4 2.4 12543496 1607728 ? Ssl 10:45 25:00 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347582 40.7 2.4 12544588 1566136 ? Ssl 10:45 25:11 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347583 40.7 2.4 12543288 1589156 ? Ssl 10:45 25:11 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347584 40.9 2.4 12544392 1571904 ? Ssl 10:45 25:18 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347585 40.8 2.4 12541848 1574680 ? Ssl 10:45 25:13 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347586 40.6 2.4 12544524 1582300 ? Ssl 10:45 25:09 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347587 40.7 2.4 12544400 1581052 ? Ssl 10:45 25:12 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347589 41.1 2.4 12542872 1619236 ? Ssl 10:45 25:27 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347590 40.8 2.3 12544520 1552464 ? Ssl 10:45 25:15 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347591 41.2 2.4 12542368 1575644 ? Ssl 10:45 25:31 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347592 41.1 2.4 12541648 1572900 ? Ssl 10:45 25:24 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347593 40.8 2.4 12543128 1586076 ? Ssl 10:45 25:14 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347594 40.8 2.4 12541832 1600536 ? Ssl 10:45 25:14 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347595 40.6 2.4 12543284 1617284 ? Ssl 10:45 25:05 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347596 40.5 2.4 12541836 1585508 ? Ssl 10:45 25:03 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347597 40.7 2.5 12541836 1632196 ? Ssl 10:45 25:12 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347598 41.0 2.4 12541844 1603544 ? Ssl 10:45 25:23 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347599 41.1 2.4 12543504 1596068 ? Ssl 10:45 25:25 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347600 40.8 2.4 12543372 1592648 ? Ssl 10:45 25:15 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 347601 41.0 2.4 12543124 1584156 ? Ssl 10:45 25:23 python3 dlio_benchmark/dlio_benchmark/main.py --config-path=/mnt/training_volume/benchmark/storage/storage-conf workload=resnet50_h100 ++workload.workflow.generate_data=False ++workload.workflow.train=True ++workload.dataset.num_files_train=2395 ++workload.dataset.data_folder=resnet50_data ++workload.workflow.profiling=False ++workload.profiling.profiler=none ++hydra.output_subdir=configs ++hydra.run.dir=run2
ubuntu 356935 0.0 0.0 7076 1536 pts/0 S+ 11:47 0:00 grep --color=auto python
ubuntu@ip-xxx-xxx-xxx-xxx:/mnt/training_volume/benchmark/storage/resnet50_report/run1$
Here's the directory content:
ubuntu@ip-xxx-xxx-xxx-xxx:/mnt/training_volume/benchmark/storage/run2$ ls -la
total 17860
drwxrwxr-x 3 ubuntu ubuntu 149 Aug 18 13:19 .
drwxrwxr-x 14 ubuntu ubuntu 4096 Aug 18 10:45 ..
-rw-rw-r-- 1 ubuntu ubuntu 3682441 Aug 18 13:20 0_output.json
drwxrwxr-x 2 ubuntu ubuntu 81 Aug 18 12:22 configs
-rw-rw-r-- 1 ubuntu ubuntu 14581832 Aug 18 13:20 dlio.log
-rw-rw-r-- 1 ubuntu ubuntu 0 Aug 18 10:45 dlp.log
-rw-rw-r-- 1 ubuntu ubuntu 1527 Aug 18 13:20 per_epoch_stats.json
-rw-rw-r-- 1 ubuntu ubuntu 4848 Aug 18 13:20 summary.json
Content of summary:
{
"start": "2024-08-18T10:45:21.410265",
"num_accelerators": 1,
"num_hosts": 1,
"hostname": "ip-xxx-xxx-xxx-xxx",
"metric": {
"train_au_percentage": [
92.54704772034549,
91.2520240442806,
89.71968127822399,
88.69772347160792,
88.8753794687775
],
"train_au_mean_percentage": 90.2183711966471,
"train_au_meet_expectation": "success",
"train_au_stdev_percentage": 1.4734897679301848,
"train_throughput_samples_per_second": [
1652.3173163411957,
1629.2251374756245,
1601.8853174657795,
1583.6518456226847,
1586.8507999556778
],
"train_throughput_mean_samples_per_second": 1610.7860833721925,
"train_throughput_stdev_samples_per_second": 26.280145784798727,
"train_io_mean_MB_per_second": 176.1368227715315,
"train_io_stdev_MB_per_second": 2.873690943999507
},
"num_files_train": 2395,
"num_files_eval": 0,
"num_samples_per_file": 1251,
"host_cpu_count": [
32
],
"host_processor_name": "x86_64",
"potential_caching": [
0
],
"host_cpuinfo": {
"vendor_id": "GenuineIntel",
"cpu family": "6",
"model": "106",
"model name": "Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz",
"stepping": "6",
"microcode": "0xd0003e7",
"cpu MHz": "3500.266",
"cache size": "55296 KB",
"physical id": "0",
"siblings": "32",
"core id": "15",
"cpu cores": "16",
"apicid": "31",
"initial apicid": "31",
"fpu": "yes",
"fpu_exception": "yes",
"cpuid level": "27",
"wp": "yes",
"flags": "fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities",
"bugs": "spectre_v1 spectre_v2 spec_store_bypass swapgs mmio_stale_data eibrs_pbrsb gds bhi",
"bogomips": "5799.92",
"clflush size": "64",
"cache_alignment": "64",
"address sizes": "46 bits physical, 48 bits virtual",
"power management": ""
},
"host_meminfo": {
"MemTotal": "64770764 kB",
"MemFree": "27909852 kB",
"MemAvailable": "56402940 kB",
"Buffers": "1560 kB",
"Cached": "28091988 kB",
"SwapCached": "0 kB",
"Active": "11343028 kB",
"Inactive": "23513612 kB",
"Active(anon)": "6945236 kB",
"Inactive(anon)": "39508 kB",
"Active(file)": "4397792 kB",
"Inactive(file)": "23474104 kB",
"Unevictable": "37136 kB",
"Mlocked": "27412 kB",
"SwapTotal": "0 kB",
"SwapFree": "0 kB",
"Zswap": "0 kB",
"Zswapped": "0 kB",
"Dirty": "40 kB",
"Writeback": "0 kB",
"AnonPages": "6801412 kB",
"Mapped": "447240 kB",
"Shmem": "205032 kB",
"KReclaimable": "1339468 kB",
"Slab": "1634372 kB",
"SReclaimable": "1339468 kB",
"SUnreclaim": "294904 kB",
"KernelStack": "25104 kB",
"PageTables": "58976 kB",
"SecPageTables": "0 kB",
"NFS_Unstable": "0 kB",
"Bounce": "0 kB",
"WritebackTmp": "0 kB",
"CommitLimit": "32385380 kB",
"Committed_AS": "49067816 kB",
"VmallocTotal": "34359738367 kB",
"VmallocUsed": "44700 kB",
"VmallocChunk": "0 kB",
"Percpu": "24704 kB",
"HardwareCorrupted": "0 kB",
"AnonHugePages": "0 kB",
"ShmemHugePages": "0 kB",
"ShmemPmdMapped": "0 kB",
"FileHugePages": "0 kB",
"FilePmdMapped": "0 kB",
"Unaccepted": "0 kB",
"HugePages_Total": "0",
"HugePages_Free": "0",
"HugePages_Rsvd": "0",
"HugePages_Surp": "0",
"Hugepagesize": "2048 kB",
"Hugetlb": "0 kB",
"DirectMap4k": "401840 kB",
"DirectMap2M": "8998912 kB",
"DirectMap1G": "56623104 kB"
},
"host_memory_GB": [
61.77021408081055
],
"data_size_per_host_GB": 319.94487664676274,
"epochs": 5,
"end": "2024-08-18T13:20:46.763598"
}
Maybe try to run "mpirun -hosts xxx.xxx.xxx.xxx -np 32 ..." directly.
This is moot (no longer relevant), so it is being closed.