The config isn't consistent between chunks
🐛 Bug
I was processing large files and got the following error. It failed at around 80% of the data, after about 1h 20min. The full error is very long, but this is the beginning of it. I'm essentially storing 5 columns, where each column holds a numpy array of variable length.
File "/root/.nextflow-bin/litdata_dataset.py", line 91, in <module>
main(
File "/root/.nextflow-bin/litdata_dataset.py", line 37, in main
ld.optimize(
File "/usr/local/lib/python3.12/site-packages/litdata/processing/functions.py", line 445, in optimize
data_processor.run(
File "/usr/local/lib/python3.12/site-packages/litdata/processing/data_processor.py", line 1134, in run
result = data_recipe._done(len(user_items), self.delete_cached_files, self.output_dir)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/site-packages/litdata/processing/data_processor.py", line 802, in _done
merge_cache._merge_no_wait(node_rank if num_nodes > 1 else None, getattr(self, "existing_index", None))
File "/usr/local/lib/python3.12/site-packages/litdata/streaming/cache.py", line 156, in _merge_no_wait
self._writer._merge_no_wait(node_rank=node_rank, existing_index=existing_index)
File "/usr/local/lib/python3.12/site-packages/litdata/streaming/writer.py", line 470, in _merge_no_wait
raise Exception(
Exception: The config isn't consistent between chunks. This shouldn't have happened.Found {'chunk_bytes': 64000000, 'chunk_size': None, 'compression': 'zstd', 'data_format': ['int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'int'], 'data_spec': '[1, {"type": "builtins.dict", "context": "[\\"input_ids\\", 
\\"chromosome_idx\\", \\"pos_in_chr_ones\\",...
To Reproduce
Unfortunately, I'm not sure how to provide a reproduction without sharing the ~100GB dataset.
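For anyone who wants a stand-in without the real data, a small synthetic parquet file with the same shape can be generated. This is only a sketch; the column names and sizes are placeholders, not the actual dataset:

import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

# Sketch of a stand-in dataset: 5 columns, each holding a
# variable-length integer array per row (column names are made up).
rng = np.random.default_rng(0)
columns = ["col_a", "col_b", "col_c", "col_d", "col_e"]
table = pa.table({
    name: [rng.integers(0, 100, size=rng.integers(1, 50)).tolist() for _ in range(1_000)]
    for name in columns
})
pq.write_table(table, "synthetic.parquet", row_group_size=100)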
import glob

import litdata as ld
import pyarrow.parquet as pq


def get_data_from_file_row_group(group):
    """
    Concurrency-safe batch sampling from parquet row groups.
    """
    file_path, row_group = group
    with pq.ParquetFile(file_path) as pf:
        yield from pf.read_row_groups([row_group]).to_pylist()
file_paths = glob.glob(f"{input_dir}/*.parquet")
groups = generate_file_row_groups(file_paths)  # helper not shown in the report; see the sketch below

ld.optimize(
    fn=get_data_from_file_row_group,
    inputs=groups,
    chunk_bytes="64MB",
    num_workers=num_workers,
    output_dir=f"./output/{dir_prefix}",
    compression="zstd",
)
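generate_file_row_groups isn't included in the report; a plausible sketch, assuming it just enumerates (file_path, row_group_index) pairs across the input files:

import pyarrow.parquet as pq


def generate_file_row_groups(file_paths):
    """Enumerate (file_path, row_group_index) pairs across all parquet files."""
    groups = []
    for file_path in file_paths:
        with pq.ParquetFile(file_path) as pf:
            groups.extend((file_path, i) for i in range(pf.num_row_groups))
    return groups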
Additional context
Environment details
- PyTorch Version: 2.4.1
- OS (e.g., Linux): Debian 11
- LitData version: 0.2.26
- Python version: 3.10
Hi @AugustDev, sorry that it failed at ~80%.
Btw, were you using use_checkpoint=True? It can help you recover in case of a failure.
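For reference, that flag would slot into the optimize call from the report like this (a sketch; use_checkpoint is the only change from the snippet above):

ld.optimize(
    fn=get_data_from_file_row_group,
    inputs=groups,
    chunk_bytes="64MB",
    num_workers=num_workers,
    output_dir=f"./output/{dir_prefix}",
    compression="zstd",
    use_checkpoint=True,  # resume from the last saved checkpoint instead of restarting after a failure
)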
Also, the "config isn't consistent between chunks" exception should have printed the config and data['config'] values that mismatched. If the logs are still available, can you check what caused the mismatch?
Yes, LitData encodes each leaf of the pytree as a single object, so it doesn't know that a Python list is meant to be one sample.
You can convert each column to a numpy array or torch tensor directly to tell LitData it is a single item and not a list of items.
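Applied to the reproduction above, that looks roughly like this (a sketch; it assumes every column is a numeric variable-length list, which matches the long ['int', 'int', ...] data_format in the error):

import numpy as np
import pyarrow.parquet as pq


def get_data_from_file_row_group(group):
    file_path, row_group = group
    with pq.ParquetFile(file_path) as pf:
        for row in pf.read_row_groups([row_group]).to_pylist():
            # np.asarray marks each column as a single pytree leaf; a plain
            # Python list would be flattened into one leaf per int.
            yield {key: np.asarray(value) for key, value in row.items()}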
Hi @AugustDev, I wanted to follow up and see if the solution recommended by @tchaton was helpful for you.
Hi @bhimrazy @tchaton, thank you for the reply. Would you say that as long as I am saving data as a torch.Tensor or numpy array, there should be no problems loading the data?
Yes, @AugustDev. Let us know how it goes.
Also, if you could recommend any similar publicly available data for testing on my end, that would be helpful. Thank you! 😊
Hey @gluonfield, LitData supports Parquet files directly now, so you don't need to optimize them anymore.
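Roughly along these lines (a sketch based on recent LitData releases; the ParquetLoader import path and the index_parquet_dataset helper are assumptions here, so check the docs for your installed version):

import litdata as ld
from litdata.streaming.item_loader import ParquetLoader  # assumed import path

# Index the raw parquet files once, then stream them with no optimize step.
ld.index_parquet_dataset("path/to/parquet_dir")  # hypothetical location
dataset = ld.StreamingDataset("path/to/parquet_dir", item_loader=ParquetLoader())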
I stumbled upon the same issue, but my outcome is a bit different. I was processing a relatively small dataset of 100k images, with the work divided across multiple workers. Each worker seemed to run fine: they all created *-index.json files as well as chunks. However, one worker ended up writing the following index file:
{"chunks": [], "config": null}
While others have this:
{"chunks": [{"chunk_bytes": 199788789, "chunk_size": 568, "dim": null, "filename": "chunk-2-0.bin"}, {"chunk_bytes": 199806838, "chunk_size": 570, "dim": null, "filename": "chunk-2-1.bin"}, {"chunk_bytes": 199972921, "chunk_size": 570, "dim": null, "filename": "chunk-2-2.bin"}, {"chunk_bytes": 199874518, "chunk_size": 571, "dim": null, "filename": "chunk-2-3.bin"}, {"chunk_bytes": 199768541, "chunk_size": 569, "dim": null, "filename": "chunk-2-4.bin"}, {"chunk_bytes": 199998419, "chunk_size": 572, "dim": null, "filename": "chunk-2-5.bin"}, {"chunk_bytes": 199856281, "chunk_size": 570, "dim": null, "filename": "chunk-2-6.bin"}, {"chunk_bytes": 199743966, "chunk_size": 569, "dim": null, "filename": "chunk-2-7.bin"}, {"chunk_bytes": 199910861, "chunk_size": 570, "dim": null, "filename": "chunk-2-8.bin"}, {"chunk_bytes": 199853015, "chunk_size": 569, "dim": null, "filename": "chunk-2-9.bin"}, {"chunk_bytes": 199873843, "chunk_size": 570, "dim": null, "filename": "chunk-2-10.bin"}, {"chunk_bytes": 199662578, "chunk_size": 570, "dim": null, "filename": "chunk-2-11.bin"}, {"chunk_bytes": 199861044, "chunk_size": 570, "dim": null, "filename": "chunk-2-12.bin"}, {"chunk_bytes": 199808455, "chunk_size": 569, "dim": null, "filename": "chunk-2-13.bin"}, {"chunk_bytes": 199775745, "chunk_size": 570, "dim": null, "filename": "chunk-2-14.bin"}, {"chunk_bytes": 199747189, "chunk_size": 568, "dim": null, "filename": "chunk-2-15.bin"}, {"chunk_bytes": 199922076, "chunk_size": 570, "dim": null, "filename": "chunk-2-16.bin"}, {"chunk_bytes": 199988340, "chunk_size": 570, "dim": null, "filename": "chunk-2-17.bin"}, {"chunk_bytes": 199862319, "chunk_size": 569, "dim": null, "filename": "chunk-2-18.bin"}, {"chunk_bytes": 199817702, "chunk_size": 569, "dim": null, "filename": "chunk-2-19.bin"}, {"chunk_bytes": 199767705, "chunk_size": 569, "dim": null, "filename": "chunk-2-20.bin"}, {"chunk_bytes": 199777672, "chunk_size": 570, "dim": null, "filename": "chunk-2-21.bin"}, {"chunk_bytes": 199722535, "chunk_size": 567, "dim": null, "filename": "chunk-2-22.bin"}, {"chunk_bytes": 199690816, "chunk_size": 570, "dim": null, "filename": "chunk-2-23.bin"}, {"chunk_bytes": 199863298, "chunk_size": 568, "dim": null, "filename": "chunk-2-24.bin"}, {"chunk_bytes": 199944409, "chunk_size": 570, "dim": null, "filename": "chunk-2-25.bin"}, {"chunk_bytes": 199931892, "chunk_size": 570, "dim": null, "filename": "chunk-2-26.bin"}, {"chunk_bytes": 199801350, "chunk_size": 569, "dim": null, "filename": "chunk-2-27.bin"}, {"chunk_bytes": 199856427, "chunk_size": 570, "dim": null, "filename": "chunk-2-28.bin"}, {"chunk_bytes": 199676517, "chunk_size": 570, "dim": null, "filename": "chunk-2-29.bin"}, {"chunk_bytes": 199978570, "chunk_size": 569, "dim": null, "filename": "chunk-2-30.bin"}, {"chunk_bytes": 52061402, "chunk_size": 149, "dim": null, "filename": "chunk-2-31.bin"}], "config": {"chunk_bytes": 200000000, "chunk_size": null, "compression": null, "data_format": ["tensor", "str"], "data_spec": "[1, {\"type\": \"builtins.dict\", \"context\": \"[\\\"features\\\", \\\"meta\\\"]\", \"children_spec\": [{\"type\": null, \"context\": null, \"children_spec\": []}, {\"type\": \"builtins.list\", \"context\": \"null\", \"children_spec\": [{\"type\": null, \"context\": null, \"children_spec\": []}]}]}]", "encryption": null, "item_loader": "PyTreeLoader"}}
As a result, I ended up with the following error in my logs:
Traceback (most recent call last):
  File "./extract_features_wit_base.py", line 59, in <module>
    main()
  File "./extract_features_wit_base.py", line 55, in main
    create_litdata_dataset("wikimedia/wit_base", dataloader, args)
  File "./utils.py", line 165, in create_litdata_dataset
    ld.optimize(
  File "/user/anon/envs/prh/lib/python3.10/site-packages/litdata/processing/functions.py", line 599, in optimize
    data_processor.run(recipe)
  File "/user/anon/envs/prh/lib/python3.10/site-packages/litdata/processing/data_processor.py", line 1380, in run
    result = data_recipe._done(size, self.delete_cached_files, self.output_dir)
  File "/user/anon/envs/prh/lib/python3.10/site-packages/litdata/processing/data_processor.py", line 983, in _done
    self._upload_index(output_dir, cache_dir, num_nodes, node_rank)
  File "/user/anon/envs/prh/lib/python3.10/site-packages/litdata/processing/data_processor.py", line 1054, in _upload_index
    merge_cache._merge_no_wait()
  File "/user/anon/envs/prh/lib/python3.10/site-packages/litdata/streaming/cache.py", line 167, in _merge_no_wait
    self._writer._merge_no_wait(node_rank=node_rank, existing_index=existing_index)
  File "/user/anon/envs/prh/lib/python3.10/site-packages/litdata/streaming/writer.py", line 511, in _merge_no_wait
    raise Exception(
Exception: The config isn't consistent between chunks. This shouldn't have happened.Found {'chunk_bytes': 200000000, 'chunk_size': None, 'compression': None, 'data_format': ['tensor', 'str'], 'data_spec': '[1, {"type": "builtins.dict", "context": "[\\"features\\", \\"meta\\"]", "children_spec": [{"type": null, "context": null, "children_spec": []}, {"type": "builtins.list", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}]}]', 'encryption': None, 'item_loader': 'PyTreeLoader'}; None.
/user/anon/envs/prh/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown
warnings.warn('resource_tracker: There appear to be %d '
I attached the log files of each worker, so you can see that they ran fine.
Hi @Bizilizi,
Could you check your output directory and see if there are binary files matching the pattern chunk-{worker_index}-{counter}.bin, where {worker_index} corresponds to the file whose {worker_index}-index.json is empty?
Also, could you share details on your optimize_fn, specifically the kind of data it returns?
Thanks!
Yep, all the chunks are there, indexes as well:
total 24G
3.3K 0-index.json
3.3K 1-index.json
3.3K 2-index.json
30 3-index.json
191M chunk-0-0.bin
191M chunk-0-10.bin
191M chunk-0-11.bin
191M chunk-0-12.bin
191M chunk-0-13.bin
191M chunk-0-14.bin
191M chunk-0-15.bin
191M chunk-0-16.bin
191M chunk-0-17.bin
191M chunk-0-18.bin
191M chunk-0-19.bin
191M chunk-0-1.bin
191M chunk-0-20.bin
191M chunk-0-21.bin
191M chunk-0-22.bin
191M chunk-0-23.bin
191M chunk-0-24.bin
191M chunk-0-25.bin
191M chunk-0-26.bin
191M chunk-0-27.bin
191M chunk-0-28.bin
191M chunk-0-29.bin
191M chunk-0-2.bin
191M chunk-0-30.bin
23M chunk-0-31.bin
191M chunk-0-3.bin
191M chunk-0-4.bin
191M chunk-0-5.bin
191M chunk-0-6.bin
191M chunk-0-7.bin
191M chunk-0-8.bin
191M chunk-0-9.bin
191M chunk-1-0.bin
191M chunk-1-10.bin
191M chunk-1-11.bin
191M chunk-1-12.bin
191M chunk-1-13.bin
191M chunk-1-14.bin
191M chunk-1-15.bin
191M chunk-1-16.bin
191M chunk-1-17.bin
191M chunk-1-18.bin
191M chunk-1-19.bin
191M chunk-1-1.bin
191M chunk-1-20.bin
191M chunk-1-21.bin
191M chunk-1-22.bin
191M chunk-1-23.bin
191M chunk-1-24.bin
191M chunk-1-25.bin
191M chunk-1-26.bin
191M chunk-1-27.bin
191M chunk-1-28.bin
191M chunk-1-29.bin
191M chunk-1-2.bin
191M chunk-1-30.bin
51M chunk-1-31.bin
191M chunk-1-3.bin
191M chunk-1-4.bin
191M chunk-1-5.bin
191M chunk-1-6.bin
191M chunk-1-7.bin
191M chunk-1-8.bin
191M chunk-1-9.bin
191M chunk-2-0.bin
191M chunk-2-10.bin
191M chunk-2-11.bin
191M chunk-2-12.bin
191M chunk-2-13.bin
191M chunk-2-14.bin
191M chunk-2-15.bin
191M chunk-2-16.bin
191M chunk-2-17.bin
191M chunk-2-18.bin
191M chunk-2-19.bin
191M chunk-2-1.bin
191M chunk-2-20.bin
191M chunk-2-21.bin
191M chunk-2-22.bin
191M chunk-2-23.bin
191M chunk-2-24.bin
191M chunk-2-25.bin
191M chunk-2-26.bin
191M chunk-2-27.bin
191M chunk-2-28.bin
191M chunk-2-29.bin
191M chunk-2-2.bin
191M chunk-2-30.bin
50M chunk-2-31.bin
191M chunk-2-3.bin
191M chunk-2-4.bin
191M chunk-2-5.bin
191M chunk-2-6.bin
191M chunk-2-7.bin
191M chunk-2-8.bin
191M chunk-2-9.bin
191M chunk-3-0.bin
191M chunk-3-10.bin
191M chunk-3-11.bin
191M chunk-3-12.bin
191M chunk-3-13.bin
191M chunk-3-14.bin
191M chunk-3-15.bin
191M chunk-3-16.bin
191M chunk-3-17.bin
191M chunk-3-18.bin
191M chunk-3-19.bin
191M chunk-3-1.bin
191M chunk-3-20.bin
191M chunk-3-21.bin
191M chunk-3-22.bin
191M chunk-3-23.bin
191M chunk-3-24.bin
191M chunk-3-25.bin
191M chunk-3-26.bin
191M chunk-3-27.bin
191M chunk-3-28.bin
191M chunk-3-29.bin
191M chunk-3-2.bin
191M chunk-3-30.bin
191M chunk-3-3.bin
191M chunk-3-4.bin
191M chunk-3-5.bin
191M chunk-3-6.bin
191M chunk-3-7.bin
191M chunk-3-8.bin
191M chunk-3-9.bin
The empty index is number 3, i.e.:
cat results/3-index.json
{"chunks": [], "config": null}%
I ran it on SLURM with the following sh file:
#!/bin/bash
#SBATCH --job-name=litdata_wit_base
#SBATCH --ntasks=4
#SBATCH --gpus-per-task=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=96GB
#SBATCH --time=1-00:00:0
#SBATCH --partition=...
#SBATCH --nodelist=...
#SBATCH --output=logs/%j/%t.log
#SBATCH --error=logs/%j/%t.log
CMD="
# Set litdata environment variables
export DATA_OPTIMIZER_NUM_NODES=$SLURM_NTASKS
export DATA_OPTIMIZER_NODE_RANK=\$SLURM_PROCID
# Same Python script runs on ALL nodes
python extract_features_wit_base.py --modelset custom \
    --cache_dir /.cache \
    --chunk_bytes 200mb \
    --dataset datasets/wit/data/100k \
    --subset 100k
"
srun bash -c "$CMD"
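A note on the quoting, in case anyone copies this: $SLURM_NTASKS is expanded once when CMD is defined (the task count is the same everywhere), while \$SLURM_PROCID is escaped so it only expands inside each srun task, giving every task its own DATA_OPTIMIZER_NODE_RANK.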
The output of optimize_fn is a dict with torch tensors and meta strings:
def extract_llm_features(sample, llm_model_name, args):
    """
    Extract features from a language model for a given sample.

    Args:
        sample: Text sample.
        llm_model_name: The language model name.
        args: argparse.Namespace containing arguments, including pooling method.
    """
    ...
    return {
        "features": output_feats,
        "meta": sample["meta"] if "meta" in sample else None,
    }
Hi @Bizilizi, thanks for sharing the detailed context 🙌
It looks like None might be causing the issue. Could you try replacing it with an empty string "" instead?
return {
    "features": output_feats,
    "meta": sample["meta"] if "meta" in sample else "",
}
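If that is indeed the cause, swapping None for "" keeps the returned pytree spec identical across samples, so every worker should write the same config to its index file.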