pandora icon indicating copy to clipboard operation
pandora copied to clipboard

Segmentation fault with hdf5 serialization

Open mrceresa opened this issue 8 years ago • 5 comments

Dear all, I am getting a strange error on F24. The serialization code crashes with SIGSEGV:

./epidemy 
simulation: 0 of: 1 initialized
[ws121924:07778] *** Process received signal ***
[ws121924:07778] Signal: Segmentation fault (11)
[ws121924:07778] Signal code: Address not mapped (1)
[ws121924:07778] Failing at address: 0x7
[ws121924:07778] [ 0] /lib64/libc.so.6(+0x347e0)[0x7f9d6825f7e0]
[ws121924:07778] [ 1] /lib64/libc.so.6(strlen+0x26)[0x7f9d682b3336]
[ws121924:07778] [ 2] /usr/local/HDF_Group/HDF5/1.8.17/lib/libhdf5.so.10.2.0(H5T__conv_vlen+0x43e)[0x7f9d6901227e]
[ws121924:07778] [ 3] /usr/local/HDF_Group/HDF5/1.8.17/lib/libhdf5.so.10.2.0(H5T_convert+0x6f)[0x7f9d69007cbf]
[ws121924:07778] [ 4] /usr/local/HDF_Group/HDF5/1.8.17/lib/libhdf5.so.10.2.0(H5D__scatgath_write+0x1e7)[0x7f9d68f20db7]
[ws121924:07778] [ 5] /usr/local/HDF_Group/HDF5/1.8.17/lib/libhdf5.so.10.2.0(+0x73a54)[0x7f9d68f08a54]
[ws121924:07778] [ 6] /usr/local/HDF_Group/HDF5/1.8.17/lib/libhdf5.so.10.2.0(+0x84a3d)[0x7f9d68f19a3d]
[ws121924:07778] [ 7] /usr/local/HDF_Group/HDF5/1.8.17/lib/libhdf5.so.10.2.0(H5Dwrite+0x104)[0x7f9d68f19fc4]
[ws121924:07778] [ 8] /home/mario/git/pandora/build-release/libpandora.so(_ZN6Engine10Serializer25executeAgentSerializationERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEi+0x12c8)[0x7f9d6b559ab8]
[ws121924:07778] [ 9] /home/mario/git/pandora/build-release/libpandora.so(_ZN6Engine10Serializer14serializeAgentEPNS_5AgentERKii+0x19e)[0x7f9d6b55b49e]
[ws121924:07778] [10] /home/mario/git/pandora/build-release/libpandora.so(_ZN6Engine10Serializer15serializeAgentsERKiSt20_List_const_iteratorISt10shared_ptrINS_5AgentEEES7_+0x4b)[0x7f9d6b55b58b]
[ws121924:07778] [11] /home/mario/git/pandora/build-release/libpandora.so(_ZN6Engine14SpacePartition15serializeAgentsERKi+0x19)[0x7f9d6b565819]
[ws121924:07778] [12] /home/mario/git/pandora/build-release/libpandora.so(_ZN6Engine5World4stepEv+0x3dd)[0x7f9d6b57558d]
[ws121924:07778] [13] /home/mario/git/pandora/build-release/libpandora.so(_ZN6Engine5World3runEv+0x3d8)[0x7f9d6b576068]
[ws121924:07778] [14] ./epidemy(main+0x9b)[0x40c37b]
[ws121924:07778] [15] /lib64/libc.so.6(__libc_start_main+0xf1)[0x7f9d6824b731]
[ws121924:07778] [16] ./epidemy(_start+0x29)[0x40c599]
[ws121924:07778] *** End of error message ***
Segmentation fault (core dumped)

I've tried to debug it for a while, but I cannot understand why it's happening. I link against a parallel build of HDF5 v1.8.17 that was built separately.

Thanks for any help you could give me.

Best,

Mario

mrceresa avatar Feb 09 '17 12:02 mrceresa

The data it is trying to write is passed in this call (from Serializer::executeAgentSerialization): H5Dwrite(datasetId, idType, memorySpace, fileSpace, H5P_DEFAULT, &(data->at(0)));

Type: Human/step0/id File: 16777217 Size: 20000
fileSpace: 67108871 Offset: 0 Stride: 1 Count: 1 Block: 20000
datasetId: 83886080 idType: 50331843 memorySpace: 67108872 fileSpace: 67108871 Data: Human_0

mrceresa avatar Feb 09 '17 14:02 mrceresa

After some more debugging I found that the problem happens when the array has more than one element (it usually has one element per agent). If I leave only one string in it, it works fine.

mrceresa avatar Feb 09 '17 18:02 mrceresa

This is confirmed by the fact that if I serialize each agent ID separately, it (seems to) work, with no segfault:

		hsize_t	block;
		block = 1;
		hsize_t simpleDimension = 1;
		hsize_t newSize;
		newSize = currentIndex+1;
		
		itI->second = currentIndex+data->size();
		
		std::ostringstream oss;
		oss << type << "/step" << step << "/" << itM->first;

		hid_t datasetId = H5Dopen(_agentsFileId, oss.str().c_str(), H5P_DEFAULT);
		
		for (auto d : *data){
			H5Dset_extent( datasetId, &newSize);
			hid_t fileSpace = H5Dget_space(datasetId);
			H5Sselect_hyperslab(fileSpace, H5S_SELECT_SET, offset, stride, count, &block);
			log_INFO(logName.str(), "block " << block);
			hid_t idType = H5Tcopy(H5T_C_S1);
			H5Tset_size (idType, H5T_VARIABLE);
			hid_t memorySpace = H5Screate_simple(1, &simpleDimension, 0);
			log_INFO(logName.str(), "Dimension " << simpleDimension);
			log_INFO(logName.str(), "d " << d);
			H5Dwrite(datasetId, idType, memorySpace, fileSpace, H5P_DEFAULT, &d);
			data->clear();

			H5Sclose(memorySpace);
			H5Sclose(fileSpace);
		}

mrceresa avatar Feb 09 '17 18:02 mrceresa

Everything points to a problem in the way the original code accesses the dataset, because I can write the data with this (inspired by an answer on Stack Overflow):

		const size_t n = data->size();
		hsize_t simpleDimension = n;
		//log_INFO(logName.str(), "Serializing: " << simpleDimension);
		char* dataA[n];
		for (size_t i = 0; i < n; i++) {
			dataA[i] = data->at(i).c_str();
		}
		data->clear();
		
				
		hid_t hdf5file= H5Fcreate("test.hdf5", H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
		hid_t group = H5Gcreate2(hdf5file, "/MyGroup", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
		hsize_t dim1=data->size();
		hid_t dataspace = H5Screate_simple(1, &dim1, NULL);
		hid_t datatype = H5Tcopy(H5T_C_S1);
		int ret = H5Tset_size (datatype, H5T_VARIABLE);
		hid_t dataset = H5Dcreate2(group, "Samples", datatype, dataspace, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
		H5Dwrite(dataset, datatype, dataspace, H5S_ALL, H5P_DEFAULT, dataA);
		
		H5Dclose(dataset);
		H5Tclose(datatype);
		H5Sclose(dataspace);
		H5Gclose(group);
		H5Fclose(hdf5file);

mrceresa avatar Feb 09 '17 19:02 mrceresa

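Okay, now it works if I first convert the strings to be serialized into a contiguous array of C-string pointers (with a variable-length string type, H5Dwrite expects a buffer of char* rather than std::string objects):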

@@ -503,18 +503,30 @@ void Serializer::executeAgentSerialization( const std::string & type, int step)
 	for(StringMap::iterator itM=attributesS->begin(); itM!=attributesS->end(); itM++)
 	{
 		std::vector<std::string> * data = itM->second;
-		hsize_t	block[1];
-		block[0] = data->size();
 		
-		hsize_t simpleDimension = data->size();
-		// TODO es repeteix per cada atribut
-		hsize_t newSize[1];
-		newSize[0] = currentIndex+data->size();
 		itI->second = currentIndex+data->size();
-
+		
+		std::stringstream logName;
+		logName << "Serializer_" << _scheduler.getId();
+				
+		const size_t n = data->size();
+		hsize_t simpleDimension = n;
+		//log_INFO(logName.str(), "Serializing: " << simpleDimension);
+		char* dataA[n];
+		for (size_t i = 0; i < n; i++) {
+			dataA[i] = data->at(i).c_str();
+		}
+				
+		hsize_t	block[1];
+		block[0] = data->size();
+		
+		// TODO es repeteix per cada atribut
+		hsize_t newSize[1];
+		newSize[0] = currentIndex+data->size();
+		
 		std::ostringstream oss;
 		oss << type << "/step" << step << "/" << itM->first;
-
+		
 		hid_t datasetId = H5Dopen(_agentsFileId, oss.str().c_str(), H5P_DEFAULT);
 		H5Dset_extent( datasetId, newSize);
 		hid_t fileSpace = H5Dget_space(datasetId);
@@ -522,12 +534,13 @@ void Serializer::executeAgentSerialization( const std::string & type, int step)
 		hid_t idType = H5Tcopy(H5T_C_S1);
 		H5Tset_size (idType, H5T_VARIABLE);
   		hid_t memorySpace = H5Screate_simple(1, &simpleDimension, 0);
-		H5Dwrite(datasetId, idType, memorySpace, fileSpace, H5P_DEFAULT, &(data->at(0)));
+		H5Dwrite(datasetId, idType, memorySpace, fileSpace, H5P_DEFAULT, dataA);
 		data->clear();

 		H5Sclose(memorySpace);
 		H5Sclose(fileSpace);
 		H5Dclose(datasetId);
 	}
 }

mrceresa avatar Feb 09 '17 19:02 mrceresa