glove-python
Error loading nlp.stanford.edu vectors
I'm getting the following error when trying to load http://nlp.stanford.edu/data/glove.840B.300d.zip
```
In [1]: import glove

In [2]: %time glv = glove.Glove.load_stanford("glove.840B.300d.txt")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-2-5e84d129b242> in <module>()
----> 1 get_ipython().magic(u'time glv = glove.Glove.load_stanford("glove.840B.300d.txt")')

virtualEnv/local/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
   2161         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2162         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2163         return self.run_line_magic(magic_name, magic_arg_s)
   2164
   2165     #-------------------------------------------------------------------------

virtualEnv/local/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
   2082             kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2083             with self.builtin_trap:
-> 2084                 result = fn(*args,**kwargs)
   2085             return result
   2086

<decorator-gen-60> in time(self, line, cell, local_ns)

virtualEnv/local/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194
    195     if callable(arg):

virtualEnv/local/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1175         else:
   1176             st = clock2()
-> 1177             exec(code, glob, local_ns)
   1178             end = clock2()
   1179             out = None

<timed exec> in <module>()

virtualEnv/local/lib/python2.7/site-packages/glove/glove.pyc in load_stanford(cls, filename)
    265         instance.word_vectors = (np.array(vectors)
    266                                  .reshape(no_vectors,
--> 267                                           no_components))
    268         instance.word_biases = np.zeros(no_vectors)
    269         instance.add_dictionary(dct)

ValueError: total size of new array must be unchanged
```
Any suggestions on how to load the vectors?
Did anyone solve this?
@thomasj02 I solved it. This is not a clean solution, but it worked on Python 3. It uses a lot of RAM, so I'd advise against running anything RAM-heavy on the side.
```python
@classmethod
def load_stanford(cls, filename):
    """
    Load model from the output files generated by the C code
    from http://nlp.stanford.edu/projects/glove/.

    The entries of the word dictionary will be of type
    unicode in Python 2 and str in Python 3.
    """
    dct = {}
    vectors = []
    vector_size = 0

    # Read in the data.
    with io.open(filename, 'r', encoding='utf-8') as savefile:
        for i, line in enumerate(savefile):
            tokens = line.split(' ')
            word = tokens[0]
            entries = tokens[1:]
            vector_size = len(entries)
            dct[word] = i
            vectors.append([float(x) for x in entries])

    print("dct keys", len(dct))

    # Set up the model instance.
    instance = Glove()
    # no_components is the vector dimension, not the vocabulary size.
    instance.no_components = vector_size

    # Back the vector matrix with an on-disk memmap so the full
    # matrix does not have to live in RAM.
    word_vec = np.memmap("word_vec", dtype=np.float32, mode="w+",
                         shape=(len(vectors), vector_size))
    word_vec[:] = vectors[:]
    instance.word_vectors = word_vec
    print("word_vec_new", instance.word_vectors.shape)

    # mode="w+" zero-initialises the biases, matching np.zeros.
    instance.word_biases = np.memmap("word_biases", dtype=np.float32,
                                     mode="w+", shape=len(vectors))
    print("word_biases", instance.word_biases.shape)

    instance.add_dictionary(dct)

    return instance
```
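For reference, you can query the memmap-backed model the same way as a normally loaded one; `most_similar` is glove-python's built-in nearest-neighbour method (a usage sketch, assuming the patched `load_stanford` above has been installed in place of the original):

```python
import glove

# The patched loader creates two backing files ("word_vec" and
# "word_biases") in the current working directory.
glv = glove.Glove.load_stanford("glove.840B.300d.txt")

# Nearest neighbours by similarity over the word vectors.
print(glv.most_similar("king", number=5))
```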
It looks like there are some unknowns in the original corpus, which means the total size of `vectors` is different from `num_words * dimensions`, so the `reshape` won't work. Adding a little catch for `<unk>` helped in my case with the Twitter corpus.
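You can confirm this is what's happening by scanning the file for lines whose token count doesn't match the expected 1 word + dimensions (a quick diagnostic sketch; the filename and the 300 dimensions are the ones from the original post):

```python
import io

expected = 1 + 300  # one word plus 300 vector entries per line
bad = []
with io.open("glove.840B.300d.txt", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        tokens = line.rstrip("\n").split(" ")
        if len(tokens) != expected:
            bad.append((i, tokens[0]))

print(len(bad), "malformed lines")
print(bad[:10])
```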
This is the `glove.py` file for the `Glove` class: https://github.com/maciejkula/glove-python/blob/master/glove/glove.py#L235
```python
@classmethod
def load_stanford(cls, filename):
    """
    Load model from the output files generated by
    the C code from http://nlp.stanford.edu/projects/glove/.

    The entries of the word dictionary will be of type
    unicode in Python 2 and str in Python 3.
    """
    dct = {}
    vectors = array.array('d')

    # Read in the data.
    with io.open(filename, 'r', encoding='utf-8') as savefile:
        for i, line in enumerate(savefile):
            tokens = line.split(' ')
            word = tokens[0]
            entries = tokens[1:]
            ################# This part
            if word == '<unk>':
                continue
            #################
            # Use len(dct) instead of the line index i so the indices
            # stay dense (i would leave a gap after the skipped line).
            dct[word] = len(dct)
            vectors.extend([float(x) for x in entries])

    # Infer word vector dimensions.
    no_components = len(entries)
    no_vectors = len(dct)

    # Set up the model instance.
    instance = Glove()
    instance.no_components = no_components
    instance.word_vectors = (np.array(vectors)
                             .reshape(no_vectors,
                                      no_components))
    instance.word_biases = np.zeros(no_vectors)
    instance.add_dictionary(dct)

    return instance
```
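With that indexing fix, a quick sanity check after loading confirms the dictionary and the vector matrix agree (just an illustration; `add_dictionary` stores the mapping as `instance.dictionary`, and the filename is one of the Twitter corpus files mentioned above):

```python
from glove import Glove

glv = Glove.load_stanford("glove.twitter.27B.200d.txt")

# Every dictionary index must point at a valid row of the matrix.
assert glv.word_vectors.shape[0] == len(glv.dictionary)
assert max(glv.dictionary.values()) == glv.word_vectors.shape[0] - 1
```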
Actually, on second thought, it should probably be less hardcoded: skip any word that has already been seen rather than `<unk>` specifically, more like

```python
if word in dct:
    continue
```

(`word in dct` is the idiomatic form of `word in dct.keys()`.)
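In the context of the loop above, that deduplicating check would look like this (a sketch, assuming repeated tokens are the only malformation in the file):

```python
with io.open(filename, 'r', encoding='utf-8') as savefile:
    for line in savefile:
        tokens = line.split(' ')
        word = tokens[0]
        entries = tokens[1:]

        # Skip any repeated token so exactly one vector is kept per word.
        if word in dct:
            continue

        dct[word] = len(dct)
        vectors.extend([float(x) for x in entries])
```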