PicklingError numpy 1.20
I am having the following issue using numpy 1.20 and umap-learn.
See the following traceback.
PicklingError Traceback (most recent call last)
<command-1906172682191955> in <module>
19
20 UMAP = reducer.fit_transform(article_cluster_data[[
---> 21 s for s in eans_embeddings.columns if "embedding" in s
22 ]])
23
/databricks/python/lib/python3.7/site-packages/umap/umap_.py in fit_transform(self, X, y)
2633 Local radii of data points in the embedding (log-transformed).
2634 """
-> 2635 self.fit(X, y)
2636 if self.transform_mode == "embedding":
2637 if self.output_dens:
/databricks/python/lib/python3.7/site-packages/umap/umap_.py in fit(self, X, y)
2571
2572 numba.set_num_threads(self._original_n_threads)
-> 2573 self._input_hash = joblib.hash(self._raw_data)
2574
2575 return self
/databricks/python/lib/python3.7/site-packages/joblib/hashing.py in hash(obj, hash_name, coerce_mmap)
265 else:
266 hasher = Hasher(hash_name=hash_name)
--> 267 return hasher.hash(obj)
/databricks/python/lib/python3.7/site-packages/joblib/hashing.py in hash(self, obj, return_digest)
66 def hash(self, obj, return_digest=True):
67 try:
---> 68 self.dump(obj)
69 except pickle.PicklingError as e:
70 e.args += ('PicklingError while hashing %r: %r' % (obj, e),)
/databricks/python/lib/python3.7/pickle.py in dump(self, obj)
435 if self.proto >= 4:
436 self.framer.start_framing()
--> 437 self.save(obj)
438 self.write(STOP)
439 self.framer.end_framing()
/databricks/python/lib/python3.7/site-packages/joblib/hashing.py in save(self, obj)
240 klass = obj.__class__
241 obj = (klass, ('HASHED', obj.descr))
--> 242 Hasher.save(self, obj)
243
244
/databricks/python/lib/python3.7/site-packages/joblib/hashing.py in save(self, obj)
92 cls = obj.__self__.__class__
93 obj = _MyHash(func_name, inst, cls)
---> 94 Pickler.save(self, obj)
95
96 def memoize(self, obj):
/databricks/python/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
502 f = self.dispatch.get(t)
503 if f is not None:
--> 504 f(self, obj) # Call unbound method with explicit self
505 return
506
/databricks/python/lib/python3.7/pickle.py in save_tuple(self, obj)
772 if n <= 3 and self.proto >= 2:
773 for element in obj:
--> 774 save(element)
775 # Subtle. Same as in the big comment below.
776 if id(obj) in memo:
/databricks/python/lib/python3.7/site-packages/joblib/hashing.py in save(self, obj)
240 klass = obj.__class__
241 obj = (klass, ('HASHED', obj.descr))
--> 242 Hasher.save(self, obj)
243
244
/databricks/python/lib/python3.7/site-packages/joblib/hashing.py in save(self, obj)
92 cls = obj.__self__.__class__
93 obj = _MyHash(func_name, inst, cls)
---> 94 Pickler.save(self, obj)
95
96 def memoize(self, obj):
/databricks/python/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
502 f = self.dispatch.get(t)
503 if f is not None:
--> 504 f(self, obj) # Call unbound method with explicit self
505 return
506
/databricks/python/lib/python3.7/pickle.py in save_tuple(self, obj)
787 write(MARK)
788 for element in obj:
--> 789 save(element)
790
791 if id(obj) in memo:
/databricks/python/lib/python3.7/site-packages/joblib/hashing.py in save(self, obj)
240 klass = obj.__class__
241 obj = (klass, ('HASHED', obj.descr))
--> 242 Hasher.save(self, obj)
243
244
/databricks/python/lib/python3.7/site-packages/joblib/hashing.py in save(self, obj)
92 cls = obj.__self__.__class__
93 obj = _MyHash(func_name, inst, cls)
---> 94 Pickler.save(self, obj)
95
96 def memoize(self, obj):
/databricks/python/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
502 f = self.dispatch.get(t)
503 if f is not None:
--> 504 f(self, obj) # Call unbound method with explicit self
505 return
506
/databricks/python/lib/python3.7/pickle.py in save_tuple(self, obj)
772 if n <= 3 and self.proto >= 2:
773 for element in obj:
--> 774 save(element)
775 # Subtle. Same as in the big comment below.
776 if id(obj) in memo:
/databricks/python/lib/python3.7/site-packages/joblib/hashing.py in save(self, obj)
240 klass = obj.__class__
241 obj = (klass, ('HASHED', obj.descr))
--> 242 Hasher.save(self, obj)
243
244
/databricks/python/lib/python3.7/site-packages/joblib/hashing.py in save(self, obj)
92 cls = obj.__self__.__class__
93 obj = _MyHash(func_name, inst, cls)
---> 94 Pickler.save(self, obj)
95
96 def memoize(self, obj):
/databricks/python/lib/python3.7/pickle.py in save(self, obj, save_persistent_id)
516 issc = False
517 if issc:
--> 518 self.save_global(obj)
519 return
520
/databricks/python/lib/python3.7/site-packages/joblib/hashing.py in save_global(self, obj, name, pack)
115 Pickler.save_global(self, obj, **kwargs)
116 except pickle.PicklingError:
--> 117 Pickler.save_global(self, obj, **kwargs)
118 module = getattr(obj, "__module__", None)
119 if module == '__main__':
/databricks/python/lib/python3.7/pickle.py in save_global(self, obj, name)
958 raise PicklingError(
959 "Can't pickle %r: it's not found as %s.%s" %
--> 960 (obj, module_name, name)) from None
961 else:
962 if obj2 is not obj:
PicklingError: ("Can't pickle <class 'numpy.dtype[float32]'>: it's not found as numpy.dtype[float32]", 'PicklingError while hashing array([[-0.3997416 , -0.19219466, -0.83981943, ..., -0.9273374 ,\n 1.4046632 , 0.30895016],\n [-0.04274601, -0.12016755, -0.53093857, ..., -0.9320015 ,\n 0.8004919 , 0.14586882],\n [ 0.10363793, 0.21220148, -0.5180615 , ..., -1.103286 ,\n 1.030384 , 0.33772892],\n ...,\n [ 0.45876223, 0.13564155, -0.37127146, ..., -0.24023826,\n 0.6981608 , 0.5868731 ],\n [-0.12448474, -0.12088505, -0.5615971 , ..., -0.42116365,\n 1.4583211 , 0.395956 ],\n [-0.10243232, -0.24882779, 0.15550528, ..., -0.7924694 ,\n 1.1544111 , 0.19003616]], dtype=float32): PicklingError("Can\'t pickle <class \'numpy.dtype[float32]\'>: it\'s not found as numpy.dtype[float32]")')
@Augusttell how did you fix it?
That is a very odd error. I'm not really sure where to begin with that. I'll see if I can manage a reproducer.
There is a similar issue floating around with HDBSCAN not working due to what seems to be a compilation issue with Numpy. You can find references to these issues here and here.
@lmcinnes @Augusttell I am not sure this should be closed, as this problem, at least for me and I believe others as well, still persists.
I didn't close it, but I will re-open it if it is ongoing.
So I can't currently reproduce this error, which makes it especially hard to track down. Obviously something has changed with numpy, but given that I am only using the public API it seems very odd that any problems should arise. I'll leave this open but I can't really promise any hope of a fix until I can manage to reproduce the error myself.
I resolved it using the following requirements list:
umap-learn==0.5.0
numpy==1.20.0
scipy==1.5.4
scikit-learn==0.24.1
numba==0.52
pynndescent==0.5.1
tbb==2021.1.1
For me it seems to be some kind of package conflict with the new numpy release.
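For anyone applying these pins, a quick sanity check (a sketch, nothing umap-specific) that the running interpreter actually imports the pinned versions -- a mismatch here usually points to a second environment or stale site-packages:

```python
# Print the versions the running interpreter actually resolves.
import numpy, scipy, sklearn, numba, joblib

for mod in (numpy, scipy, sklearn, numba, joblib):
    print(mod.__name__, mod.__version__)
```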
I suspect there is something going on with dependency resolution / solving with PyPI (or conda, depending on which people are using) that means that, depending on exactly what other packages are installed, a conflict ends up occurring. That likely means it is going to be something subtle in the other packages installed, and what their dependencies are, that results in something going astray. It is going to make this hard to track down. Realistically nothing has changed in the umap-learn release, but numpy 1.20 seems to have had some flow-on effects. Starting from a clean virtual environment may be one option to get something working.
I tried the library versions from @Augusttell but the error still exists. I'm using Python 3.7.9. I also tried 3.7.6 and the error still exists, so I think the Python version does not matter.
@yamengzhang In the worst case, if you just want to get things working, you can simply comment out the relevant lines in your umap installation -- the joblib hashing isn't required; it just speeds things up in certain very specific cases. If you remove the following lines:
https://github.com/lmcinnes/umap/blob/9113f4a3f1fa091e6874134bf26ac98e48c5c7ed/umap/umap_.py#L2572
and
https://github.com/lmcinnes/umap/blob/9113f4a3f1fa091e6874134bf26ac98e48c5c7ed/umap/umap_.py#L2670-L2681
it may actually work (although the numpy issues may rear their head elsewhere -- I'm not sure).
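If editing the installed file isn't practical (e.g. on a managed Databricks runtime), a monkey-patch sketch with the same effect -- note this is a hypothetical workaround, not part of the umap API, and it replaces joblib.hash for the whole process:

```python
# Sketch: umap only uses joblib.hash() to detect transform() being called on
# the exact training data. Returning a fresh value on every call means that
# check simply never matches, which is equivalent to removing the lines above,
# and the problematic pickling never runs.
import uuid
import joblib

joblib.hash = lambda obj, *args, **kwargs: uuid.uuid4().hex
```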
Glad it worked in the end. I may update the requirements to pick out the right joblib version so we can hopefully avoid this in future.
@lmcinnes Sorry for the previous confusion. The job failed in the end. I think the version of joblib is indeed crucial. However, the problem becomes more complicated because I also need hdbscan.
In the failed job the settings are:
joblib==1.0.0
umap-learn==0.5.0
numpy==1.20.0
hdbscan==0.8.26
scipy==1.5.4
scikit-learn==0.24.1
numba==0.52
Using the versions above I ran into the issue. However, after downgrading joblib to 0.17.0, as the solution in the issue suggested, I ran into this pickle error again.
I think this is a matter of version conflicts. I think the easiest thing to do right now is to push out a new version of hdbscan that fixes this (it was fixed in master, but I haven't pushed a new release out). Alternatively if you install hdbscan directly from the master branch on github that should also work.
Ran into the same problem with numpy 1.20; downgraded to numpy==1.19 and umap worked.
numpy 1.20 seems to be causing a variety of issues right now. I don't think it is the numpy package itself, but rather the way pip is handling dependencies for packages that depend on numpy and may, or may not, be built against specific numpy versions. It is, to be honest, quite complicated and beyond my knowledge. I am hoping this will eventually sort itself out over the next few weeks as the upstream issues with numpy 1.20 get resolved.
Same pickling issue. I commented out the lines in ~/anaconda3/envs/ai/lib/python3.8/site-packages/umap/umap_.py, which circumvented the exception.
$ ipython
Python 3.8.5 (default, Sep 4 2020, 02:22:02)
% conda list umap
# packages in environment at /Users/davidlaxer/anaconda3/envs/ai:
#
# Name Version Build Channel
umap-learn 0.5.1 pypi_0 pypi
(ai) davidlaxer@x86_64-apple-darwin13 src % conda list joblib
# packages in environment at /Users/davidlaxer/anaconda3/envs/ai:
#
# Name Version Build Channel
joblib 1.0.1 pypi_0 pypi
(ai) davidlaxer@x86_64-apple-darwin13 src % conda list numpy
# packages in environment at /Users/davidlaxer/anaconda3/envs/ai:
#
# Name Version Build Channel
numpy 1.21.2 pypi_0 pypi