spaCy
spaCy copied to clipboard
"ValueError: bytes object is too large" when using to_disk on a large model.
Hi,
I'm attempting to initialize the gpt2-xl Hugging Face model in spaCy using the following code, provided in examples/init_model.py:
#!/usr/bin/env python
"""Initialize a transformer-backed spaCy pipeline and round-trip it via disk."""
import plac
from wasabi import Printer
from spacy_transformers import (
    TransformersLanguage,
    TransformersTok2Vec,
    TransformersWordPiecer,
)


@plac.annotations(
    path=("Output path", "positional", None, str),
    name=("Name of pre-trained model", "option", "n", str),
    lang=("Language code to use", "option", "l", str),
)
def main(path, name="gpt2-xl", lang="en"):
    """Build a pipeline around the pretrained model *name*, save it to *path*,
    then reload it from *path* to verify the serialized model is usable."""
    printer = Printer()
    printer.info(f"Creating model for '{name}' ({lang})")
    with printer.loading(f"Setting up the pipeline..."):
        # Language object wrapping the pretrained transformer, plus the three
        # pipeline components the example installs.
        nlp = TransformersLanguage(trf_name=name, meta={"lang": lang})
        nlp.add_pipe(nlp.create_pipe("sentencizer"))
        nlp.add_pipe(TransformersWordPiecer.from_pretrained(nlp.vocab, name))
        nlp.add_pipe(TransformersTok2Vec.from_pretrained(nlp.vocab, name))
    printer.good("Initialized the model pipeline")
    # Serialize the whole pipeline (this is where the reported error occurs).
    nlp.to_disk(path)
    printer.good(f"Saved '{name}' ({lang})")
    printer.text(f"Pipeline: {nlp.pipe_names}")
    printer.text(f"Location: {path}")
    # Round-trip check: deserialize what was just written.
    with printer.loading("Verifying model loads..."):
        nlp.from_disk(path)
    printer.good("Model loads!")


if __name__ == "__main__":
    plac.call(main)
After downloading the model, the call to nlp.to_disk(path) raises an exception:
$ python3 test.py models/gpt2-xl/
ℹ Creating model for 'gpt2-xl' (en)
✔ Initialized the model pipeline
Traceback (most recent call last):
File "test.py", line 49, in <module>
plac.call(main)
File "/home/jaronmm2/.local/lib/python3.6/site-packages/plac_core.py", line 367, in call
cmd, result = parser.consume(arglist)
File "/home/jaronmm2/.local/lib/python3.6/site-packages/plac_core.py", line 232, in consume
return cmd, self.func(*(args + varargs + extraopts), **kwargs)
File "test.py", line 39, in main
init_gpt2(path, name, lang)
File "test.py", line 23, in init_gpt2
nlp.to_disk(path)
File "/home/jaronmm2/.local/lib/python3.6/site-packages/spacy/language.py", line 924, in to_disk
util.to_disk(path, serializers, exclude)
File "/home/jaronmm2/.local/lib/python3.6/site-packages/spacy/util.py", line 677, in to_disk
writer(path / key)
File "/home/jaronmm2/.local/lib/python3.6/site-packages/spacy/language.py", line 922, in <lambda>
serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"])
File "pipes.pyx", line 208, in spacy.pipeline.pipes.Pipe.to_disk
File "/home/jaronmm2/.local/lib/python3.6/site-packages/spacy/util.py", line 677, in to_disk
writer(path / key)
File "pipes.pyx", line 206, in spacy.pipeline.pipes.Pipe.to_disk.lambda6
File "/home/jaronmm2/.local/lib/python3.6/site-packages/thinc/neural/_classes/model.py", line 405, in to_disk
file_.write(self.to_bytes())
File "/home/jaronmm2/.local/lib/python3.6/site-packages/thinc/neural/_classes/model.py", line 372, in to_bytes
return srsly.msgpack_dumps({b"weights": weights})
File "/home/jaronmm2/.local/lib/python3.6/site-packages/srsly/_msgpack_api.py", line 16, in msgpack_dumps
return msgpack.dumps(data, use_bin_type=True)
File "/home/jaronmm2/.local/lib/python3.6/site-packages/srsly/msgpack/__init__.py", line 40, in packb
return Packer(**kwargs).pack(o)
File "_packer.pyx", line 285, in srsly.msgpack._packer.Packer.pack
File "_packer.pyx", line 291, in srsly.msgpack._packer.Packer.pack
File "_packer.pyx", line 288, in srsly.msgpack._packer.Packer.pack
File "_packer.pyx", line 235, in srsly.msgpack._packer.Packer._pack
File "_packer.pyx", line 264, in srsly.msgpack._packer.Packer._pack
File "_packer.pyx", line 206, in srsly.msgpack._packer.Packer._pack
ValueError: bytes object is too large
Thanks for the report! This seems to be a hard-coded limit in msgpack:
https://github.com/explosion/srsly/blob/03e8861eb08b3c33cc86e7c2e049e5b126538dff/srsly/msgpack/_packer.pyx#L44
We'll look into it, but since I'm not sure why msgpack enforces this limit, I can't promise there will be an easy solution. For now, your only option may be to initialize the model from the pretrained weights each time, as you're doing here, rather than saving and loading it from disk as a spaCy model.