pangeo-forge-recipes icon indicating copy to clipboard operation
pangeo-forge-recipes copied to clipboard

workflow with SSL authentication is broken

Open larsbuntemeyer opened this issue 1 year ago • 2 comments

I just stumbled across an issue when working on a recipe for https://github.com/pangeo-forge/staged-recipes/issues/96. My recipe requires authentication for ESGF to read NetCDF files, which I do using an SSLContext object. My workflow is broken by #359. I can reproduce this issue using slightly adapted tutorial code, e.g., I'll add fsspec_open_kwargs={"ssl": ssl.create_default_context()} to the FilePattern object:

import pandas as pd
import ssl
import xarray as xr

from pangeo_forge_recipes.recipes import setup_logging, XarrayZarrRecipe
from pangeo_forge_recipes.patterns import ConcatDim, FilePattern

dates = pd.date_range('1981-09-01', '2022-02-01', freq='D')

URL_FORMAT = (
    "https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/"
    "v2.1/access/avhrr/{time:%Y%m}/oisst-avhrr-v02r01.{time:%Y%m%d}.nc"
)

def make_url(time):
    return URL_FORMAT.format(time=time)

time_concat_dim = ConcatDim("time", dates, nitems_per_file=1)
pattern = FilePattern(make_url, time_concat_dim, fsspec_open_kwargs={"ssl": ssl.create_default_context()})

and running the second part of the tutorial...

# Create recipe object
recipe = XarrayZarrRecipe(pattern, inputs_per_chunk=2)

# Set up logging
setup_logging()

# Prune the recipe
recipe_pruned = recipe.copy_pruned()

# Run the recipe
run_function = recipe_pruned.to_function()
run_function()

# Check the output
oisst_zarr = xr.open_zarr(recipe.target_mapper, consolidated=True)
oisst_zarr

... is broken because the SSLContext object cannot be deep-copied during hashing in the asdict call in https://github.com/pangeo-forge/pangeo-forge-recipes/blob/16eb6bfd23a26997523b86c0fe5428103ec95b8a/pangeo_forge_recipes/serialization.py#L70

I see that storage_config is ignored during hashing anyway... https://github.com/pangeo-forge/pangeo-forge-recipes/blob/16eb6bfd23a26997523b86c0fe5428103ec95b8a/pangeo_forge_recipes/recipes/base.py#L18

I guess, ignoring that during the asdict call might solve the problem already since it is removed afterwards anyway. This might be related to https://github.com/python/cpython/issues/88071.

larsbuntemeyer avatar Oct 04 '22 14:10 larsbuntemeyer

Here is the full traceback...

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In [2], line 12
     10 # Run the recipe
     11 run_function = recipe_pruned.to_function()
---> 12 run_function()
     14 # Check the output
     15 oisst_zarr = xr.open_zarr(recipe.target_mapper, consolidated=True)

File ~/python/packages/pangeo-forge-recipes/pangeo_forge_recipes/executors/python.py:46, in FunctionPipelineExecutor.compile.<locals>.function()
     44         stage.function(m, config=pipeline.config)
     45 else:
---> 46     stage.function(config=pipeline.config)

File ~/python/packages/pangeo-forge-recipes/pangeo_forge_recipes/recipes/xarray_zarr.py:581, in prepare_target(config)
    578     config.storage_config.metadata[_GLOBAL_METADATA_KEY] = recipe_meta
    580 zgroup = zarr.open_group(config.target_mapper)
--> 581 for k, v in config.get_execution_context().items():
    582     zgroup.attrs[f"pangeo-forge:{k}"] = v

File ~/python/packages/pangeo-forge-recipes/pangeo_forge_recipes/recipes/base.py:59, in BaseRecipe.get_execution_context(self)
     55 def get_execution_context(self):
     56     return dict(
     57         # See https://stackoverflow.com/a/2073599 re: version
     58         version=pkg_resources.require("pangeo-forge-recipes")[0].version,
---> 59         recipe_hash=self.sha256().hex(),
     60         inputs_hash=self.file_pattern.sha256().hex(),
     61     )

File ~/python/packages/pangeo-forge-recipes/pangeo_forge_recipes/recipes/base.py:53, in BaseRecipe.sha256(self)
     52 def sha256(self):
---> 53     return dataclass_sha256(self, ignore_keys=self._hash_exclude_)

File ~/python/packages/pangeo-forge-recipes/pangeo_forge_recipes/serialization.py:70, in dataclass_sha256(dclass, ignore_keys)
     59 def dataclass_sha256(dclass: type, ignore_keys: List[str]) -> bytes:
     60     """Generate a deterministic sha256 hash from a Python ``dataclass``. Fields for which the value
     61     is either ``None`` or an empty collection are excluded from the hash calculation automatically.
     62     To manually exclude other fields from the calculation, pass their names via ``igonore_keys``.
   (...)
     67     :param ignore_keys: A list of field names to exclude from the hash calculation.
     68     """
---> 70     d = asdict(dclass, dict_factory=dict_drop_empty)
     71     for k in ignore_keys:
     72         del d[k]

File ~/miniconda3/envs/pangeo-forge-esgf/lib/python3.10/dataclasses.py:1239, in asdict(obj, dict_factory)
   1237 if not _is_dataclass_instance(obj):
   1238     raise TypeError("asdict() should be called on dataclass instances")
-> 1239 return _asdict_inner(obj, dict_factory)

File ~/miniconda3/envs/pangeo-forge-esgf/lib/python3.10/dataclasses.py:1246, in _asdict_inner(obj, dict_factory)
   1244 result = []
   1245 for f in fields(obj):
-> 1246     value = _asdict_inner(getattr(obj, f.name), dict_factory)
   1247     result.append((f.name, value))
   1248 return dict_factory(result)

File ~/miniconda3/envs/pangeo-forge-esgf/lib/python3.10/dataclasses.py:1280, in _asdict_inner(obj, dict_factory)
   1276     return type(obj)((_asdict_inner(k, dict_factory),
   1277                       _asdict_inner(v, dict_factory))
   1278                      for k, v in obj.items())
   1279 else:
-> 1280     return copy.deepcopy(obj)

File ~/miniconda3/envs/pangeo-forge-esgf/lib/python3.10/copy.py:172, in deepcopy(x, memo, _nil)
    170                 y = x
    171             else:
--> 172                 y = _reconstruct(x, memo, *rv)
    174 # If is its own copy, don't memoize.
    175 if y is not x:

File ~/miniconda3/envs/pangeo-forge-esgf/lib/python3.10/copy.py:271, in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
    269 if state is not None:
    270     if deep:
--> 271         state = deepcopy(state, memo)
    272     if hasattr(y, '__setstate__'):
    273         y.__setstate__(state)

File ~/miniconda3/envs/pangeo-forge-esgf/lib/python3.10/copy.py:146, in deepcopy(x, memo, _nil)
    144 copier = _deepcopy_dispatch.get(cls)
    145 if copier is not None:
--> 146     y = copier(x, memo)
    147 else:
    148     if issubclass(cls, type):

File ~/miniconda3/envs/pangeo-forge-esgf/lib/python3.10/copy.py:231, in _deepcopy_dict(x, memo, deepcopy)
    229 memo[id(x)] = y
    230 for key, value in x.items():
--> 231     y[deepcopy(key, memo)] = deepcopy(value, memo)
    232 return y

File ~/miniconda3/envs/pangeo-forge-esgf/lib/python3.10/copy.py:146, in deepcopy(x, memo, _nil)
    144 copier = _deepcopy_dispatch.get(cls)
    145 if copier is not None:
--> 146     y = copier(x, memo)
    147 else:
    148     if issubclass(cls, type):

File ~/miniconda3/envs/pangeo-forge-esgf/lib/python3.10/copy.py:231, in _deepcopy_dict(x, memo, deepcopy)
    229 memo[id(x)] = y
    230 for key, value in x.items():
--> 231     y[deepcopy(key, memo)] = deepcopy(value, memo)
    232 return y

File ~/miniconda3/envs/pangeo-forge-esgf/lib/python3.10/copy.py:161, in deepcopy(x, memo, _nil)
    159 reductor = getattr(x, "__reduce_ex__", None)
    160 if reductor is not None:
--> 161     rv = reductor(4)
    162 else:
    163     reductor = getattr(x, "__reduce__", None)

TypeError: cannot pickle 'SSLContext' object

larsbuntemeyer avatar Oct 04 '22 14:10 larsbuntemeyer

Also related to https://github.com/python/cpython/issues/77204

larsbuntemeyer avatar Oct 04 '22 14:10 larsbuntemeyer