
TypeError: Couldn't cast array of type string to null in long json

Open · nokados opened this issue 1 year ago · 6 comments

Describe the bug

In general, mixing null and string values in a column is allowed within a dataset; there are even examples of this in the documentation.

However, if the dataset is large and the nulls are unevenly distributed, this stops working: the schema is locked in after the first chunk is read.

Consequently, if all values of a field in the first chunk are null, the field is locked as type null, and a string appearing in that field in a later chunk triggers this error:
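
You can see the type inference behind this directly in PyArrow (a minimal illustration of the mechanism, not the exact datasets code path):

import pyarrow as pa

# An all-null chunk is inferred as the null type...
print(pa.array([None, None]).type)        # null
# ...while a chunk containing at least one string is inferred as string.
print(pa.array(["Not Null", None]).type)  # string

datasets' array_cast refuses to cast a string array back to that locked null type, which is what the traceback below shows.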

Traceback
   TypeError                                 Traceback (most recent call last)

/usr/local/lib/python3.10/dist-packages/datasets/builder.py in _prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1868                     try:
-> 1869                         writer.write_table(table)
   1870                     except CastError as cast_error:

14 frames

/usr/local/lib/python3.10/dist-packages/datasets/arrow_writer.py in write_table(self, pa_table, writer_batch_size)
    579         pa_table = pa_table.combine_chunks()
--> 580         pa_table = table_cast(pa_table, self._schema)
    581         if self.embed_local_files:

/usr/local/lib/python3.10/dist-packages/datasets/table.py in table_cast(table, schema)
   2291     if table.schema != schema:
-> 2292         return cast_table_to_schema(table, schema)
   2293     elif table.schema.metadata != schema.metadata:

/usr/local/lib/python3.10/dist-packages/datasets/table.py in cast_table_to_schema(table, schema)
   2244         )
-> 2245     arrays = [
   2246         cast_array_to_feature(

/usr/local/lib/python3.10/dist-packages/datasets/table.py in <listcomp>(.0)
   2245     arrays = [
-> 2246         cast_array_to_feature(
   2247             table[name] if name in table_column_names else pa.array([None] * len(table), type=schema.field(name).type),

/usr/local/lib/python3.10/dist-packages/datasets/table.py in wrapper(array, *args, **kwargs)
   1794         if isinstance(array, pa.ChunkedArray):
-> 1795             return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
   1796         else:

/usr/local/lib/python3.10/dist-packages/datasets/table.py in <listcomp>(.0)
   1794         if isinstance(array, pa.ChunkedArray):
-> 1795             return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
   1796         else:

/usr/local/lib/python3.10/dist-packages/datasets/table.py in cast_array_to_feature(array, feature, allow_primitive_to_str, allow_decimal_to_str)
   2101     elif not isinstance(feature, (Sequence, dict, list, tuple)):
-> 2102         return array_cast(
   2103             array,

/usr/local/lib/python3.10/dist-packages/datasets/table.py in wrapper(array, *args, **kwargs)
   1796         else:
-> 1797             return func(array, *args, **kwargs)
   1798 

/usr/local/lib/python3.10/dist-packages/datasets/table.py in array_cast(array, pa_type, allow_primitive_to_str, allow_decimal_to_str)
   1947         if pa.types.is_null(pa_type) and not pa.types.is_null(array.type):
-> 1948             raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
   1949         return array.cast(pa_type)

TypeError: Couldn't cast array of type string to null


The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)

<ipython-input-353-e02f83980611> in <cell line: 1>()
----> 1 dd = load_dataset("json", data_files=["TEST.json"])

/usr/local/lib/python3.10/dist-packages/datasets/load.py in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)
   2094 
   2095     # Download and prepare data
-> 2096     builder_instance.download_and_prepare(
   2097         download_config=download_config,
   2098         download_mode=download_mode,

/usr/local/lib/python3.10/dist-packages/datasets/builder.py in download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, dl_manager, base_path, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    922                     if num_proc is not None:
    923                         prepare_split_kwargs["num_proc"] = num_proc
--> 924                     self._download_and_prepare(
    925                         dl_manager=dl_manager,
    926                         verification_mode=verification_mode,

/usr/local/lib/python3.10/dist-packages/datasets/builder.py in _download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
    997             try:
    998                 # Prepare split will record examples associated to the split
--> 999                 self._prepare_split(split_generator, **prepare_split_kwargs)
   1000             except OSError as e:
   1001                 raise OSError(

/usr/local/lib/python3.10/dist-packages/datasets/builder.py in _prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
   1738             job_id = 0
   1739             with pbar:
-> 1740                 for job_id, done, content in self._prepare_split_single(
   1741                     gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
   1742                 ):

/usr/local/lib/python3.10/dist-packages/datasets/builder.py in _prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1894             if isinstance(e, DatasetGenerationError):
   1895                 raise
-> 1896             raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1897 
   1898         yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset

Steps to reproduce the bug

import json
from datasets import load_dataset

with open("TEST.json", "w") as f:
    row = {"ballast": "qwerty" * 1000, "b": None}
    row_str = json.dumps(row) + "\n"
    line_size = len(row_str)
    chunk_size = 10 << 20
    lines_in_chunk = chunk_size // line_size + 1
    print(f"Writing {lines_in_chunk} lines")
    for i in range(lines_in_chunk):
        f.write(row_str)
    null_row = {"ballast": "Gotcha", "b": "Not Null"}
    f.write(json.dumps(null_row) + "\n")

load_dataset("json", data_files=["TEST.json"])

Expected behavior

The chunks should be concatenated without errors, with b inferred as an optional string column.

Environment info

  • datasets version: 3.0.1
  • Platform: Linux-6.1.85+-x86_64-with-glibc2.35
  • Python version: 3.10.12
  • huggingface_hub version: 0.24.7
  • PyArrow version: 16.1.0
  • Pandas version: 2.2.2
  • fsspec version: 2024.6.1

nokados avatar Oct 12 '24 08:10 nokados

I am encountering this same issue. It seems the library only recognises a column as optional (rather than exclusively null) if there is at least one non-null instance within the same file. For example, given a test_0.jsonl file:

{"a": "a1", "b": "b1", "c": null, "d": null}
{"a": "a2", "b": null, "c": "c2", "d": null}

the data is loaded correctly, recognising that columns b and c are optional, while d is inferred as null.

{'a': ['a1', 'a2'], 'b': ['b1', None], 'c': [None, 'c2'], 'd': [None, None]}

But if the config has another file, say test_1.jsonl, where d now has some non-null values:

{"a": null, "b": "b3", "c": "c3", "d": "d3"}
{"a": "a4", "b": "b4", "c": null, "d": null}

then an error is raised:

TypeError                                 Traceback (most recent call last)

/usr/local/lib/python3.10/dist-packages/datasets/builder.py in _prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1869                     try:
-> 1870                         writer.write_table(table)
   1871                     except CastError as cast_error:

14 frames

TypeError: Couldn't cast array of type string to null


The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)

/usr/local/lib/python3.10/dist-packages/datasets/builder.py in _prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1895             if isinstance(e, DatasetGenerationError):
   1896                 raise
-> 1897             raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1898 
   1899         yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset

I have created a sample repository if that helps. Interestingly, the dataset viewer correctly shows the data across files, although it still indicates the above error.

KurtMica avatar Dec 12 '24 10:12 KurtMica

I managed to find a workaround by specifying the features explicitly, which can also be done directly in the YAML file configuration.
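
For anyone landing here, a minimal sketch of that workaround against the TEST.json reproduction above (column names taken from the first comment); passing features= pins the schema before any chunk is read, so nothing gets locked to null:

from datasets import Features, Value, load_dataset

# Declare the schema up front; Value columns are nullable, so JSON nulls
# simply become None instead of forcing the column to the null type.
features = Features({
    "ballast": Value("string"),
    "b": Value("string"),
})

dd = load_dataset("json", data_files=["TEST.json"], features=features)

The YAML route puts the same feature declarations under dataset_info.features in the dataset card.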

KurtMica avatar Jan 05 '25 15:01 KurtMica

I hit the same issue with datasets 3.2.0. Given two JSONL files with the same content but different row ordering, load_dataset worked for one but not for the other.

from datasets import load_dataset

issues_dataset = load_dataset(
    "json", data_files="NeMo-issues-fixed.jsonl", split="train"
)
issues_dataset

For NeMo-issues.jsonl, I got an exception:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File ~/anaconda3/envs/llm/lib/python3.12/site-packages/datasets/builder.py:1870, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1869 try:
-> 1870     writer.write_table(table)
   1871 except CastError as cast_error:

File ~/anaconda3/envs/llm/lib/python3.12/site-packages/datasets/arrow_writer.py:622, in ArrowWriter.write_table(self, pa_table, writer_batch_size)
    621 pa_table = pa_table.combine_chunks()
--> 622 pa_table = table_cast(pa_table, self._schema)
    623 if self.embed_local_files:

File ~/anaconda3/envs/llm/lib/python3.12/site-packages/datasets/table.py:2292, in table_cast(table, schema)
   2291 if table.schema != schema:
-> 2292     return cast_table_to_schema(table, schema)
   2293 elif table.schema.metadata != schema.metadata:

File ~/anaconda3/envs/llm/lib/python3.12/site-packages/datasets/table.py:2246, in cast_table_to_schema(table, schema)
   2240     raise CastError(
   2241         f"Couldn't cast\n{_short_str(table.schema)}\nto\n{_short_str(features)}\nbecause column names don't match",
   2242         table_column_names=table.column_names,
   2243         requested_column_names=list(features),
   2244     )
   2245 arrays = [
-> 2246     cast_array_to_feature(
   2247         table[name] if name in table_column_names else pa.array([None] * len(table), type=schema.field(name).type),
   2248         feature,
   2249     )
   2250     for name, feature in features.items()
   2251 ]
   2252 return pa.Table.from_arrays(arrays, schema=schema)

File ~/anaconda3/envs/llm/lib/python3.12/site-packages/datasets/table.py:1795, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs)
   1794 if isinstance(array, pa.ChunkedArray):
-> 1795     return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
   1796 else:

File ~/anaconda3/envs/llm/lib/python3.12/site-packages/datasets/table.py:2102, in cast_array_to_feature(array, feature, allow_primitive_to_str, allow_decimal_to_str)
   2101 elif not isinstance(feature, (Sequence, dict, list, tuple)):
-> 2102     return array_cast(
   2103         array,
   2104         feature(),
   2105         allow_primitive_to_str=allow_primitive_to_str,
   2106         allow_decimal_to_str=allow_decimal_to_str,
   2107     )
   2108 raise TypeError(f"Couldn't cast array of type\n{_short_str(array.type)}\nto\n{_short_str(feature)}")

File ~/anaconda3/envs/llm/lib/python3.12/site-packages/datasets/table.py:1797, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs)
   1796 else:
-> 1797     return func(array, *args, **kwargs)

File ~/anaconda3/envs/llm/lib/python3.12/site-packages/datasets/table.py:1948, in array_cast(array, pa_type, allow_primitive_to_str, allow_decimal_to_str)
   1947 if pa.types.is_null(pa_type) and not pa.types.is_null(array.type):
-> 1948     raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
   1949 return array.cast(pa_type)

TypeError: Couldn't cast array of type string to null

The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)
Cell In[73], line 3
      1 from datasets import load_dataset
----> 3 issues_dataset = load_dataset(
      4     "json", data_files="NeMo-issues.jsonl", split="train"
      5 )
      6 issues_dataset

File ~/anaconda3/envs/llm/lib/python3.12/site-packages/datasets/load.py:2151, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)
   2148     return builder_instance.as_streaming_dataset(split=split)
   2150 # Download and prepare data
-> 2151 builder_instance.download_and_prepare(
   2152     download_config=download_config,
   2153     download_mode=download_mode,
   2154     verification_mode=verification_mode,
   2155     num_proc=num_proc,
   2156     storage_options=storage_options,
   2157 )
   2159 # Build dataset for splits
   2160 keep_in_memory = (
   2161     keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
   2162 )

File ~/anaconda3/envs/llm/lib/python3.12/site-packages/datasets/builder.py:924, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, dl_manager, base_path, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    922 if num_proc is not None:
    923     prepare_split_kwargs["num_proc"] = num_proc
--> 924 self._download_and_prepare(
    925     dl_manager=dl_manager,
    926     verification_mode=verification_mode,
    927     **prepare_split_kwargs,
    928     **download_and_prepare_kwargs,
    929 )
    930 # Sync info
    931 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File ~/anaconda3/envs/llm/lib/python3.12/site-packages/datasets/builder.py:1000, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
    996 split_dict.add(split_generator.split_info)
    998 try:
    999     # Prepare split will record examples associated to the split
-> 1000     self._prepare_split(split_generator, **prepare_split_kwargs)
   1001 except OSError as e:
   1002     raise OSError(
   1003         "Cannot find data file. "
   1004         + (self.manual_download_instructions or "")
   1005     + "\nOriginal error:\n"
   1006         + str(e)
   1007     ) from None

File ~/anaconda3/envs/llm/lib/python3.12/site-packages/datasets/builder.py:1741, in ArrowBasedBuilder._prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
   1739 job_id = 0
   1740 with pbar:
-> 1741     for job_id, done, content in self._prepare_split_single(
   1742         gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
   1743     ):
   1744         if done:
   1745             result = content

File ~/anaconda3/envs/llm/lib/python3.12/site-packages/datasets/builder.py:1897, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1895     if isinstance(e, DatasetGenerationError):
   1896         raise
-> 1897     raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1899 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset

For NeMo-issues-fixed.jsonl, which consists of the last 1000 lines followed by the first 9000 lines of NeMo-issues.jsonl, I could load the data:

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason'],
    num_rows: 10000
})
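
A hedged sketch of the reordering idea, with a hypothetical column name (problem_col) standing in for whichever column is entirely null early in the file; a stable sort that moves non-null rows to the front achieves the same effect as splicing line ranges:

import json

# Put rows where the problematic column is non-null first, so type
# inference sees a real value for it in the first chunk it reads.
with open("NeMo-issues.jsonl") as f:
    rows = [json.loads(line) for line in f]

rows.sort(key=lambda r: r.get("problem_col") is None)  # non-null rows first

with open("NeMo-issues-fixed.jsonl", "w") as f:
    for r in rows:
        f.write(json.dumps(r) + "\n")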

renweizhukov avatar Jan 21 '25 02:01 renweizhukov

having the same issue as well!

DronHazra avatar Feb 23 '25 13:02 DronHazra

Is this fixed in the latest version?

Met4physics avatar Jul 12 '25 17:07 Met4physics

@DronHazra @renweizhukov Is this fixed in the latest version?

Cooperx521 avatar Jul 21 '25 03:07 Cooperx521