netcdf-c
netcdf-c copied to clipboard
4.8.1 build returns NC_EHDFERR from nc_enddef() in some simple tests
This issue has been confirmed on my laptop (Ubuntu 18, GCC 7.4.0) and ANL GCE workstations (Ubuntu 18, GCC 11.1.0).
This issue is reproduced with NetCDF 4.8.1 using latest HDF5 1.12 or 1.13. It is not reproducible with older NetCDF versions like 4.7.4 and 4.8.0, though.
On my laptop, it sometimes shows the following assertion failure:
H5MM.c:594: H5MM_memcpy: Assertion `(char *)dest >= (const char *)src + n || (const char *)src >= (char *)dest + n' failed.
It is likely a bug introduced by the code changes between NetCDF 4.8.0 and 4.8.1.
Due to this issue, some E3SM cases fail to run with netcdf4p type (NetCDF 4.8.1) on ANL GCE machines. The following is a simple NETCDF4 test program to reproduce this issue.
#include <stdio.h>
#include <mpi.h>
#include <netcdf.h>
#include <netcdf_par.h>
/*
 * Report-only error check used after each netCDF call.
 *
 * Expands to a braced block that prints the rank, the error code and the
 * current source line whenever `ret` differs from NC_NOERR. It does not
 * abort or return, so execution continues after a failure.
 *
 * The macro reads variables named `ret` and `my_rank` from the scope where
 * it is used, and call sites write it without a trailing semicolon.
 */
#define ERR { if (ret != NC_NOERR) printf("rank = %d, error code %d returned at line = %d\n", my_rank, ret, __LINE__); }
int main(int argc, char* argv[])
{
int my_rank;
int ntasks;
int dimid;
int ncid;
int varid;
char varname[32];
int i;
int ret;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &ntasks);
ret = nc_create_par("test.nc", NC_CLOBBER | NC_MPIIO | NC_NETCDF4, MPI_COMM_WORLD, MPI_INFO_NULL, &ncid); ERR
ret = nc_def_dim(ncid, "time", NC_UNLIMITED, &dimid); ERR
ret = nc_def_var(ncid, "time", NC_INT, 1, &dimid, &varid); ERR
ret = nc_put_att_text(ncid, varid, "Att1", 4, "att1"); ERR
ret = nc_put_att_text(ncid, varid, "Att2", 4, "att2"); ERR
ret = nc_put_att_text(ncid, varid, "Att3", 4, "att3"); ERR
ret = nc_put_att_text(ncid, varid, "Att4", 4, "att4"); ERR
for (i = 0; i < 300; i++) {
snprintf(varname, 32, "dummy_var_%d", i);
ret = nc_def_var(ncid, varname, NC_INT, 1, &dimid, &varid); ERR
}
ret = nc_enddef(ncid); ERR
if (ret == NC_EHDFERR) {
printf("rank = %d, nc_enddef() returned error code NC_EHDFERR\n", my_rank);
MPI_Finalize();
return -1;
}
ret = nc_close(ncid); ERR
MPI_Finalize();
return 0;
}
Did you build netcdf-c with --enable-parallel-tests? And did those tests pass?
Did you build netcdf-c with --enable-parallel-tests? And did those tests pass?
Yes we did and they all passed. You can try to reproduce this issue with the test program included in this PR, thanks.
I am taking a look at this issue today.
I have added the test code above to tst_parallel6.c and indeed have reproduced the problem.
I also added a sequential version of this test to tst_vars3.c and it passes without problem. So this only fails for parallel I/O.
Here's some address sanitizer output:
ed@mikado:~/netcdf-c/nc_test4$ make tst_parallel6 && mpiexec -n 4 ./tst_parallel6
make: 'tst_parallel6' is up to date.
*** Testing parallel I/O.
*** testing a reported enddef problem...=================================================================
=================================================================
==459041==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x62100018fd0c at pc 0x7fde4359b490 bp 0x7ffd26a3c0a0 sp 0x7ffd26a3b848
==459042==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x6210001bf50c at pc 0x7ff03943b490 bp 0x7fffad5dc0a0 sp 0x7fffad5db848
READ of size 4032 at 0x62100018fd0c thread T0
READ of size 4032 at 0x6210001bf50c thread T0
#0 0x7fde4359b48f in __interceptor_memcpy ../../../../src/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc:790
#1 0x7fde42ed2323 in H5O__attr_shared_decode (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x1a4323)
#2 0x7fde42ef92ab in H5O_msg_decode (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x1cb2ab)
#3 0x7fde42d82283 in H5A__dense_fh_name_cmp (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x54283)
#4 0x7fde42e953cb in H5HF__huge_op_real (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x1673cb)
#5 0x7fde42e96b9c in H5HF__huge_op (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x168b9c)
#6 0x7fde42e88447 in H5HF_op (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x15a447)
#7 0x7fde42d827b1 in H5A__dense_btree2_name_compare (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x547b1)
#8 0x7fde42da0311 in H5B2__locate_record (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x72311)
#9 0x7fde42d99c09 in H5B2_find (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x6bc09)
#10 0x7fde42d83f2f in H5A__dense_open (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x55f2f)
#11 0x7fde42ed3ee8 in H5O__attr_open_by_name (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x1a5ee8)
#12 0x7fde42d89b35 in H5A__open (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x5bb35)
#13 0x7fde43047892 in H5VL__native_attr_open (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x319892)
#14 0x7fde43030093 in H5VL_attr_open (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x302093)
#15 0x7fde42d7d898 in H5Aopen (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x4f898)
#16 0x7fde431879c0 in H5DSattach_scale (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5_hl.so.200+0x89c0)
#17 0x7fde432ee237 in attach_dimscales /home/ed/netcdf-c/libhdf5/nc4hdf.c:1420
#18 0x7fde432f1016 in nc4_rec_write_metadata /home/ed/netcdf-c/libhdf5/nc4hdf.c:1962
#19 0x7fde432f6189 in sync_netcdf4_file /home/ed/netcdf-c/libhdf5/hdf5file.c:167
#20 0x7fde432f89e8 in nc4_enddef_netcdf4_file /home/ed/netcdf-c/libhdf5/hdf5file.c:743
#21 0x7fde432f775d in NC4_enddef /home/ed/netcdf-c/libhdf5/hdf5file.c:512
#22 0x7fde432f7422 in NC4__enddef /home/ed/netcdf-c/libhdf5/hdf5file.c:473
#23 0x7fde431f9be5 in nc_enddef /home/ed/netcdf-c/libdispatch/dfile.c:1029
#24 0x5645600661d4 in main /home/ed/netcdf-c/nc_test4/tst_parallel6.c:89
#25 0x7fde425fb0b2 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x240b2)
#26 0x56456006538d in _start (/home/ed/netcdf-c/nc_test4/.libs/tst_parallel6+0x238d)
0x62100018fd0c is located 0 bytes to the right of 4108-byte region [0x62100018ed00,0x62100018fd0c)
allocated by thread T0 here:
#0 0x7fde4360d808 in __interceptor_malloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cc:144
#1 0x7fde42e95371 in H5HF__huge_op_real (/usr/local/hdf5-1.12.1_mpich/lib/libhdf5.so.200+0x167371)
SUMMARY: AddressSanitizer: heap-buffer-overflow ../../../../src/libsanitizer/sanitizer_common/sanitizer_common_interceptors.inc:790 in __interceptor_memcpy
Shadow bytes around the buggy address:
0x0c4280029f50: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x0c4280029f60: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x0c4280029f70: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x0c4280029f80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x0c4280029f90: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
=>0x0c4280029fa0: 00[04]fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c4280029fb0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c4280029fc0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c4280029fd0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c4280029fe0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c4280029ff0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
Shadow byte legend (one shadow byte represents 8 application bytes):
Addressable: 00
Partially addressable: 01 02 03 04 05 06 07
Heap left redzone: fa
Freed heap region: fd
Stack left redzone: f1
Stack mid redzone: f2
Stack right redzone: f3
Stack after return: f5
Stack use after scope: f8
Global redzone: f9
Global init order: f6
Poisoned by user: f7
Container overflow: fc
Array cookie: ac
Intra object redzone: bb
ASan internal: fe
Left alloca redzone: ca
Right alloca redzone: cb
Shadow gap: cc
==459041==ABORTING
ed@mikado:~/netcdf-c/nc_test4$
And if I run this program with the new mode flag: NC_NODIMSCALE_ATTACH, the error does not occur.
@dqwu you may wish to turn off dimscales in files with many variables. See the discussion here: #2128.
Using this mode flag is also a workaround for your original problem, since the trouble seems to be in the dimscale code...
I think the next step is going to be a pure-HDF5 test to see if this is an HDF5 problem or a netcdf-c problem.
@edwardhartnett I have a pure-HDF5 test to reproduce this issue.
#include <mpi.h>
#include <hdf5.h>
#include <hdf5_hl.h>
/*
 * Sizes used by the reproducer:
 *   MAX_VARS - number of "var_<i>" datasets created and attached to the scale
 *   MAX_ATTS - number of "att_<i>" attributes written on the "Time" dataset
 */
enum {
    MAX_VARS = 500,
    MAX_ATTS = 6
};
/*
 * Pure-HDF5 reproducer: creates "test.h5" through the MPI-IO file driver,
 * builds a "Time" dataset with MAX_ATTS attributes, creates MAX_VARS further
 * datasets, marks "Time" as a dimension scale and attaches it to dimension 0
 * of every other dataset.
 *
 * No HDF5 or MPI return value is checked. Always returns 0.
 */
int main(int argc, char* argv[])
{
hid_t fapl_id;
hid_t file_id;
hid_t dcpl_id;
hid_t dataset_id_Time;
hid_t dataset_id_vars[MAX_VARS];
hid_t attr_id;
hid_t space_id;
hsize_t dims[1], mdims[1];
hsize_t att_size;
/* 16 bytes is enough for the longest names produced below ("var_499", "att_5"). */
char var_name[16];
char att_name[16];
int i;
MPI_Init(&argc, &argv);
/* File access property list selecting MPI-IO on MPI_COMM_WORLD; it is closed as soon as the file exists. */
fapl_id = H5Pcreate(H5P_FILE_ACCESS);
H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL);
file_id = H5Fcreate("test.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
H5Pclose(fapl_id);
/* Only the "Time" dataset gets attribute creation-order tracking and indexing. */
dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
H5Pset_attr_creation_order(dcpl_id, H5P_CRT_ORDER_TRACKED | H5P_CRT_ORDER_INDEXED); /* No HDF5 errors if this line is commented out */
/* One-dimensional dataspace with current and maximum size 1. */
dims[0] = mdims[0] = 1;
space_id = H5Screate_simple(1, dims, mdims);
dataset_id_Time = H5Dcreate2(file_id, "Time", H5T_NATIVE_INT, space_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
H5Sclose(space_id);
H5Pclose(dcpl_id);
/* Write MAX_ATTS one-element integer attributes "att_<i>" on "Time", each holding its own index. */
for (i = 0; i < MAX_ATTS; i++) {
/* NOTE(review): <stdio.h> is not included directly by this program; sprintf is presumably declared via the HDF5 headers - confirm. */
sprintf(att_name, "att_%d", i);
att_size = 1;
space_id = H5Screate_simple(1, &att_size, &att_size);
attr_id = H5Acreate2(dataset_id_Time, att_name, H5T_NATIVE_INT, space_id, H5P_DEFAULT, H5P_DEFAULT);
H5Awrite(attr_id, H5T_NATIVE_INT, &i);
H5Sclose(space_id);
H5Aclose(attr_id);
}
/* Create MAX_VARS datasets "var_<i>", each with a freshly created, unmodified dataset creation property list. */
for (i = 0; i < MAX_VARS; i++) {
sprintf(var_name, "var_%d", i);
dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
dims[0] = mdims[0] = 1;
space_id = H5Screate_simple(1, dims, mdims);
/* The handles stay open; they are needed for the attach loop and closed at the end. */
dataset_id_vars[i] = H5Dcreate2(file_id, var_name, H5T_NATIVE_INT, space_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
H5Sclose(space_id);
H5Pclose(dcpl_id);
}
/* Turn "Time" into a dimension scale, then attach it to dimension 0 of every var_<i> dataset. */
H5DSset_scale(dataset_id_Time, "Time");
for (i = 0; i < MAX_VARS; i++) {
/* No HDF5 errors if we add MPI_Barrier(MPI_COMM_WORLD) before each H5DSattach_scale call */
H5DSattach_scale(dataset_id_vars[i], dataset_id_Time, 0);
}
/* Release all dataset handles, then the file, before shutting MPI down. */
H5Dclose(dataset_id_Time);
for (i = 0; i < MAX_VARS; i++)
H5Dclose(dataset_id_vars[i]);
H5Fclose(file_id);
MPI_Finalize();
return 0;
}
If you have a pure HDF5 test that demonstrates the problem please send it to the HDF5 support team and hopefully they will fix it...
If you have a pure HDF5 test that demonstrates the problem please send it to the HDF5 support team and hopefully they will fix it...
I have already sent it to @brtnfld , and he has confirmed that it is related to https://github.com/Unidata/netcdf-c/issues/1822
A similar issue is https://github.com/Unidata/netcdf-c/issues/2251
Thanks! Given that this is an hdf5 issue and not a netCDF one, I'll close this out. Thanks!