Transformers-Tutorials
Error while training the model for document image classification using LayoutLMV2 model.
Hi @NielsRogge, thank you for sharing the code. I followed your code, but I am getting an error while training the LayoutLMv2 model for document image classification. Can you please help me with this?
from datasets import Dataset
# read the pandas DataFrame as a HuggingFace Dataset object
dataset = Dataset.from_pandas(data_set)
#train_ds = dataset.class_encode_column("Label")
#train_ds
dataset
This is the output:
Dataset({
    features: ['Image_File_Path', 'Label'],
    num_rows: 12
})
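For context, the labels and label2id objects used further below come from the Label column; roughly like this (a sketch of my setup, not the exact code):

# labels / label2id setup (sketch) -- note that pandas unique() returns a NumPy ndarray
labels = data_set["Label"].unique()
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for idx, label in enumerate(labels)}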
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D

# we need to define custom features
features = Features({
    'image': Array3D(dtype="int64", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': ClassLabel(num_classes=len(labels), names=labels),
})
from PIL import Image

def preprocess_data(examples):
    # take a batch of images and run them through the LayoutLMv2 processor
    images = [Image.open(path).convert("RGB") for path in examples['Image_File_Path']]
    encoded_inputs = processor(images, padding="max_length", truncation=True)
    # add labels
    encoded_inputs["labels"] = [label2id[label] for label in examples["Label"]]
    return encoded_inputs

encoded_dataset = dataset.map(preprocess_data, remove_columns=dataset.column_names, features=features, batched=True, batch_size=2)
This is the error:
Parameter 'function'=<function preprocess_data at 0x000002E786CF6440> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.
0% | 0/6 [00:10<?, ?ba/s]
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
File c:\Users\name\.conda\envs\detectron_env\lib\site-packages\datasets\arrow_dataset.py:2781, in Dataset._map_single(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset, disable_tqdm, desc, cache_only)
2780 else:
-> 2781 writer.write_batch(batch)
2782 if update_data and writer is not None:
File c:\Users\name\.conda\envs\detectron_env\lib\site-packages\datasets\arrow_writer.py:507, in ArrowWriter.write_batch(self, batch_examples, writer_batch_size)
506 pa_table = pa.Table.from_arrays(arrays, schema=schema)
--> 507 self.write_table(pa_table, writer_batch_size)
File c:\Users\name\.conda\envs\detectron_env\lib\site-packages\datasets\arrow_writer.py:518, in ArrowWriter.write_table(self, pa_table, writer_batch_size)
517 if self.pa_writer is None:
--> 518 self._build_writer(inferred_schema=pa_table.schema)
519 pa_table = table_cast(pa_table, self._schema)
File c:\Users\name\.conda\envs\detectron_env\lib\site-packages\datasets\arrow_writer.py:369, in ArrowWriter._build_writer(self, inferred_schema)
368 if self.with_metadata:
--> 369 schema = schema.with_metadata(self._build_metadata(DatasetInfo(features=self._features), self.fingerprint))
370 self._schema = schema
File c:\Users\name\.conda\envs\detectron_env\lib\site-packages\datasets\arrow_writer.py:392, in ArrowWriter._build_metadata(info, fingerprint)
391 metadata["fingerprint"] = fingerprint
--> 392 return {"huggingface": json.dumps(metadata)}
File c:\Users\name\.conda\envs\detectron_env\lib\json\__init__.py:231, in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
227 if (not skipkeys and ensure_ascii and
228 check_circular and allow_nan and
229 cls is None and indent is None and separators is None and
230 default is None and not sort_keys and not kw):
--> 231 return _default_encoder.encode(obj)
232 if cls is None:
File c:\Users\name\.conda\envs\detectron_env\lib\json\encoder.py:199, in JSONEncoder.encode(self, o)
196 # This doesn't pass the iterator directly to ''.join() because the
...
178 """
--> 179 raise TypeError(f'Object of type {o.__class__.__name__} '
180 f'is not JSON serializable')
TypeError: Object of type ndarray is not JSON serializable
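Looking at the last frame, the TypeError is raised by json.dumps while the Arrow writer serializes the dataset features into metadata, so I suspect that passing a NumPy ndarray as names= to ClassLabel (pandas unique() returns an ndarray) might be the problem. Would converting the label names to a plain Python list, as in the sketch below, be the right fix? This is just a guess on my side, not tested:

# possible fix (untested): make the ClassLabel names a plain Python list of str,
# since a NumPy ndarray cannot be JSON-serialized into the dataset metadata
labels = [str(label) for label in data_set["Label"].unique()]
label2id = {label: idx for idx, label in enumerate(labels)}
# ... then define Features with ClassLabel(num_classes=len(labels), names=labels) as above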