wenet
wenet copied to clipboard
[dataset] support sharding by many jsonl files
- [ ] 需要验证下
# usage1:
json_files = ["1.jsonl", "2.jsonl", "3.jsonl"]
dataset = WenetRawDatasetSource(json_files, partition=True, shard_by_files=True)
# usage2:
json_files = "all.jsonl"
dataset = WenetRawDatasetSource(json_files, partition=True, shard_by_files=True)