HDF5 Errors
While looking at the debug output on CircleCI, I noticed a problem with the gnu_openmpi3 environment. HDF5 reports some errors, but the unit test on the development branch passes. Is this relevant?
[ RUN ] HDF5MatrixTest.UnderfilledPatMultiple
HDF5-DIAG: Error detected in HDF5 (1.10.2) MPI-process 0:
#000: H5Dio.c line 315 in H5Dwrite(): can't prepare for writing data
major: Dataset
minor: Write failed
#001: H5Dio.c line 388 in H5D__pre_write(): can't write data
major: Dataset
minor: Write failed
#002: H5Dio.c line 829 in H5D__write(): can't write data
major: Dataset
minor: Write failed
#003: H5Dmpio.c line 760 in H5D__contig_collective_write(): couldn't finish shared collective MPI-IO
major: Low-level I/O
minor: Write failed
#004: H5Dmpio.c line 2193 in H5D__inter_collective_io(): couldn't finish collective MPI-IO
major: Low-level I/O
minor: Can't get value
#005: H5Dmpio.c line 2237 in H5D__final_collective_io(): optimized write failed
major: Dataset
minor: Write failed
#006: H5Dmpio.c line 421 in H5D__mpio_select_write(): can't finish collective parallel write
major: Low-level I/O
minor: Write failed
#007: H5Fio.c line 195 in H5F_block_write(): write through page buffer failed
major: Low-level I/O
minor: Write failed
#008: H5PB.c line 1041 in H5PB_write(): write through metadata accumulator failed
major: Page Buffering
minor: Write failed
#009: H5Faccum.c line 834 in H5F__accum_write(): file write failed
major: Low-level I/O
minor: Write failed
#010: H5FDint.c line 308 in H5FD_write(): driver write request failed
major: Virtual File Layer
minor: Write failed
#011: H5FDmpio.c line 1874 in H5FD_mpio_write(): file write failed
major: Low-level I/O
minor: Write failed
[... the same HDF5-DIAG error stack is printed three more times ...]
[ OK ] HDF5MatrixTest.UnderfilledPatMultiple
I'm going to have a look at this over the weekend. However, since the unit test succeeds, the data should have been written to disk correctly.
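To make sure such diagnostics can never pass silently in CI, one option would be to install an HDF5 error handler in the test setup that turns any raised error stack into a hard failure. A minimal sketch (assuming the tests can register a handler on the default error stack; install_hdf5_error_trap is a hypothetical helper, not existing DASH code):

#include <hdf5.h>
#include <cstdio>
#include <cstdlib>

// Called by HDF5 when an API call fails and the error stack would normally
// just be printed to stderr.
static herr_t fail_on_hdf5_error(hid_t estack, void* /*client_data*/) {
  H5Eprint2(estack, stderr);  // keep the usual HDF5-DIAG output
  std::abort();               // escalate to a hard failure instead of continuing
  return 0;
}

// Hypothetical helper: call once per unit, e.g. in the test fixture's SetUp().
void install_hdf5_error_trap() {
  H5Eset_auto2(H5E_DEFAULT, fail_on_hdf5_error, nullptr);
}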
Yes, maybe you're right. However, only the UnderfilledPatMultiple test shows these errors, which seems a bit strange.
I just tried to update the openmpi3 container to the latest version (3.1.2), but the MPI_THREAD_MULTIPLE bugs reported by @devreal are still present. When building with MPI_THREAD_MULTIPLE enabled, no jobs can be run at all:
root@b48730d2fd78:/opt/dash# mpirun -n 4 --allow-run-as-root build/dash/dash-test-mpi
[----------] 62 tests will be run.
[----------] run 2 out of 2 tests from TestPrinterTest
[ RUN ] TestPrinterTest.FailOnOneUnit
[= -1 LOG =] TestBase.h : 283 | ===> Running test case TestPrinterTest.FailOnOneUnit ...
[= -1 LOG =] TestBase.h : 283 | ===> Running test case TestPrinterTest.FailOnOneUnit ...
[= -1 LOG =] TestBase.h : 283 | ===> Running test case TestPrinterTest.FailOnOneUnit ...
[= -1 LOG =] TestBase.h : 283 | ===> Running test case TestPrinterTest.FailOnOneUnit ...
--------------------------------------------------------------------------
The OSC pt2pt component does not support MPI_THREAD_MULTIPLE in this release.
Workarounds are to run on a single node, or to use a system with an RDMA
capable network such as Infiniband.
--------------------------------------------------------------------------
[b48730d2fd78:01186] *** An error occurred in MPI_Win_create
[b48730d2fd78:01186] *** reported by process [893779969,2]
[b48730d2fd78:01186] *** on communicator MPI COMMUNICATOR 3 DUP FROM 0
[b48730d2fd78:01186] *** MPI_ERR_WIN: invalid window
[b48730d2fd78:01186] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[b48730d2fd78:01186] *** and potentially your MPI job)
[b48730d2fd78:01179] 3 more processes have sent help message help-osc-pt2pt.txt / mpi-thread-multiple-not-supported
[b48730d2fd78:01179] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages
[b48730d2fd78:01179] 3 more processes have sent help message help-mpi-errors.txt / mpi_errors_are_fatal
root@b48730d2fd78:/opt/dash#
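For reference, this failure should be reproducible without DASH at all. A minimal sketch that only requests MPI_THREAD_MULTIPLE and then creates an RMA window (the combination rejected by the osc/pt2pt component according to the help message above) would look roughly like this:

#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
  int provided = 0;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
  if (provided < MPI_THREAD_MULTIPLE) {
    std::printf("MPI_THREAD_MULTIPLE not provided (level %d)\n", provided);
  }

  // Expose a small buffer through an RMA window; with osc/pt2pt and
  // MPI_THREAD_MULTIPLE this is expected to abort with MPI_ERR_WIN,
  // as in the output above.
  int buf = 0;
  MPI_Win win;
  MPI_Win_create(&buf, sizeof(buf), sizeof(buf),
                 MPI_INFO_NULL, MPI_COMM_WORLD, &win);

  MPI_Win_free(&win);
  MPI_Finalize();
  return 0;
}

If mpirun -n 4 on the container already prints the same osc/pt2pt help text for this sketch, the issue is entirely on the Open MPI side and independent of DASH's window setup.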
Regarding the HDF5 bug: this is also a bug in the MPI-IO implementation, not on the HDF5 or DASH side. However, what is actually written to the file seems to be correct (at least in the unit test).
IMO we should either close the ticket or keep this as a known limitation until Open MPI 3 becomes a bit more stable.
Let's track it as a known issue.
@fmoessbauer Can you reproduce the MPI-IO problem with a small example or describe what the code is doing so we can file a bug report?
@devreal: If you run the test with the environment variable DASH_HDF5_PRESERVE_FILE=1 set, the generated file is preserved. What is stored in the file is a 2-D Cantor pairing of the Cartesian coordinates. This makes it possible to spot I/O errors almost instantly, but in this case there are none.
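For anyone wanting to check the dump by hand: the values below are consistent with the standard Cantor pairing pi(i, j) = (i + j)(i + j + 1)/2 + j, with i the row and j the column index (formula and index order are my reading of the dumped data, not taken from the DASH source). A tiny sketch to regenerate the expected content:

#include <cstdio>

// 2-D Cantor pairing of the Cartesian coordinates (i = row, j = column).
long cantor(long i, long j) {
  return (i + j) * (i + j + 1) / 2 + j;
}

int main() {
  const long rows = 33, cols = 11;  // extents of the dumped test matrix
  for (long i = 0; i < rows; ++i) {
    for (long j = 0; j < cols; ++j) {
      // Compare against the DATA section of the h5dump output below.
      std::printf("%ld%c", cantor(i, j), j + 1 < cols ? ',' : '\n');
    }
  }
  return 0;
}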
The DASH_PATTERN attribute contains an encoded version of a dash::BlockPattern, specifying the extents per dimension:
<size 1> <size 2> <team 1> <team 2> <dist 1> <dist 2>
For a visualization of the mapping algorithm, see here: http://doc.dash-project.org/internal/Publications/DASH-IO
I'll try to write a short reproducer, but that is anything but trivial. We can chat/Skype about it if you are interested in more detail.
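As a rough starting point for such a reproducer (a sketch of what I assume the failing path boils down to, not the actual DASH I/O code): a collective H5Dwrite on a contiguous dataset where the per-rank hyperslabs have different sizes, i.e. the last rank owns an underfilled block. File name and extents below are made up.

#include <hdf5.h>
#include <mpi.h>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank, nranks;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);

  // Open a file with the MPI-IO driver.
  hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
  H5Pset_fapl_mpio(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);
  hid_t file = H5Fcreate("repro.hdf5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl);

  // 1-D dataset whose extent is not a multiple of the number of ranks,
  // so the last rank owns a smaller ("underfilled") block.
  // Run with e.g. mpirun -n 4 so that every rank still owns at least one element.
  const hsize_t extent = 33;
  hid_t filespace = H5Screate_simple(1, &extent, NULL);
  hid_t dset = H5Dcreate2(file, "data", H5T_NATIVE_INT, filespace,
                          H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);

  // Block-wise distribution with a differently sized last block.
  hsize_t block  = (extent + nranks - 1) / nranks;
  hsize_t offset = rank * block;
  hsize_t count  = (offset + block <= extent) ? block : extent - offset;
  H5Sselect_hyperslab(filespace, H5S_SELECT_SET, &offset, NULL, &count, NULL);
  hid_t memspace = H5Screate_simple(1, &count, NULL);

  // Collective transfer, matching the H5D__contig_collective_write frames
  // in the error stack above.
  hid_t dxpl = H5Pcreate(H5P_DATASET_XFER);
  H5Pset_dxpl_mpio(dxpl, H5FD_MPIO_COLLECTIVE);

  int buf[64] = {0};  // large enough for any block size with extent 33
  H5Dwrite(dset, H5T_NATIVE_INT, memspace, filespace, dxpl, buf);

  H5Pclose(dxpl);
  H5Sclose(memspace);
  H5Sclose(filespace);
  H5Dclose(dset);
  H5Fclose(file);
  H5Pclose(fapl);
  MPI_Finalize();
  return 0;
}

If something along these lines already triggers the HDF5-DIAG stack in the openmpi3 container, the bug report could go straight to Open MPI without involving DASH.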
root@aeaffa162df0:/opt/dash/build# /opt/phdf5/bin/h5dump test_matrix.hdf5
HDF5 "test_matrix.hdf5" {
GROUP "/" {
DATASET "data" {
DATATYPE H5T_STD_I32LE
DATASPACE SIMPLE { ( 33, 11 ) / ( 33, 11 ) }
DATA {
(0,0): 0, 2, 5, 9, 14, 20, 27, 35, 44, 54, 65,
(1,0): 1, 4, 8, 13, 19, 26, 34, 43, 53, 64, 76,
(2,0): 3, 7, 12, 18, 25, 33, 42, 52, 63, 75, 88,
(3,0): 6, 11, 17, 24, 32, 41, 51, 62, 74, 87, 101,
(4,0): 10, 16, 23, 31, 40, 50, 61, 73, 86, 100, 115,
(5,0): 15, 22, 30, 39, 49, 60, 72, 85, 99, 114, 130,
(6,0): 21, 29, 38, 48, 59, 71, 84, 98, 113, 129, 146,
(7,0): 28, 37, 47, 58, 70, 83, 97, 112, 128, 145, 163,
(8,0): 36, 46, 57, 69, 82, 96, 111, 127, 144, 162, 181,
(9,0): 45, 56, 68, 81, 95, 110, 126, 143, 161, 180, 200,
(10,0): 55, 67, 80, 94, 109, 125, 142, 160, 179, 199, 220,
(11,0): 66, 79, 93, 108, 124, 141, 159, 178, 198, 219, 241,
(12,0): 78, 92, 107, 123, 140, 158, 177, 197, 218, 240, 263,
(13,0): 91, 106, 122, 139, 157, 176, 196, 217, 239, 262, 286,
(14,0): 105, 121, 138, 156, 175, 195, 216, 238, 261, 285, 310,
(15,0): 120, 137, 155, 174, 194, 215, 237, 260, 284, 309, 335,
(16,0): 136, 154, 173, 193, 214, 236, 259, 283, 308, 334, 361,
(17,0): 153, 172, 192, 213, 235, 258, 282, 307, 333, 360, 388,
(18,0): 171, 191, 212, 234, 257, 281, 306, 332, 359, 387, 416,
(19,0): 190, 211, 233, 256, 280, 305, 331, 358, 386, 415, 445,
(20,0): 210, 232, 255, 279, 304, 330, 357, 385, 414, 444, 475,
(21,0): 231, 254, 278, 303, 329, 356, 384, 413, 443, 474, 506,
(22,0): 253, 277, 302, 328, 355, 383, 412, 442, 473, 505, 538,
(23,0): 276, 301, 327, 354, 382, 411, 441, 472, 504, 537, 571,
(24,0): 300, 326, 353, 381, 410, 440, 471, 503, 536, 570, 605,
(25,0): 325, 352, 380, 409, 439, 470, 502, 535, 569, 604, 640,
(26,0): 351, 379, 408, 438, 469, 501, 534, 568, 603, 639, 676,
(27,0): 378, 407, 437, 468, 500, 533, 567, 602, 638, 675, 713,
(28,0): 406, 436, 467, 499, 532, 566, 601, 637, 674, 712, 751,
(29,0): 435, 466, 498, 531, 565, 600, 636, 673, 711, 750, 790,
(30,0): 465, 497, 530, 564, 599, 635, 672, 710, 749, 789, 830,
(31,0): 496, 529, 563, 598, 634, 671, 709, 748, 788, 829, 871,
(32,0): 528, 562, 597, 633, 670, 708, 747, 787, 828, 870, 913
}
ATTRIBUTE "DASH_PATTERN" {
DATATYPE H5T_STD_I64LE
DATASPACE SIMPLE { ( 8 ) / ( 8 ) }
DATA {
(0): 33, 11, 2, 2, 3, 3, 12, 4
}
}
}
}
}