dask-cuda
Running large inputs results in "Worker process still alive after 3.1999992370605472 seconds, killing"
The following program:
import cupy as cp
import numpy as np
import dask.array as da
import rmm
import sys
import time
from helpers import *
from dask_cuda import LocalCUDACluster
from dask.distributed import Client, wait
from cupyx.profiler import benchmark

def generate_array(size, work_units):
    rs = da.random.RandomState(RandomState=cp.random.RandomState)
    rs = rs.randint(low=0, high=100_000, size=(size, work_units), chunks='auto')
    return rs

def multiplication(arr):
    array_mult = da.multiply(arr, 42)
    return array_mult.persist()

def bench(arr):
    r = multiplication(arr)
    wait(r)
    return r

if __name__ == '__main__':
    cluster = LocalCUDACluster('0', n_workers=1, rmm_managed_memory=True)
    client = Client(cluster)
    client.run(cp.cuda.set_allocator, rmm.rmm_cupy_allocator)
    rmm.reinitialize(managed_memory=True)
    cp.cuda.set_allocator(rmm.rmm_cupy_allocator)

    size = int(sys.argv[1])
    wk = int(sys.argv[2])
    arr = generate_array(size, wk)
    y = benchmark(bench, (arr,), n_repeat=5, n_warmup=0)
    print(parse_cupy(y))
This crashes when the input size gets too big, e.g. an input size of (2^22, 1000). The normal Dask workaround is to create the client with processes=False; however, this is not an option when using a GPU.
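For reference, a minimal sketch of that "normal Dask" workaround (plain distributed, no dask-cuda): with processes=False the scheduler and workers run as threads inside the calling process, so there is no worker process to kill at teardown, but also no way to pin one process per GPU, which is why it doesn't apply here.

from dask.distributed import Client

# Threaded local cluster: no separate worker processes, hence no
# "worker process still alive" timeout during shutdown.
client = Client(processes=False)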
Setup:
# packages in environment at /home/joos/miniconda3/envs/rapids-22.12:
#
# Name Version Build Channel
_libgcc_mutex 0.1 conda_forge conda-forge
_openmp_mutex 4.5 2_gnu conda-forge
aiohttp 3.8.3 py39hb9d737c_1 conda-forge
aiosignal 1.3.1 pyhd8ed1ab_0 conda-forge
anyio 3.6.2 pyhd8ed1ab_0 conda-forge
aom 3.5.0 h27087fc_0 conda-forge
appdirs 1.4.4 pyh9f0ad1d_0 conda-forge
argon2-cffi 21.3.0 pyhd8ed1ab_0 conda-forge
argon2-cffi-bindings 21.2.0 py39hb9d737c_3 conda-forge
arrow-cpp 9.0.0 py39hd3ccb9b_2_cpu conda-forge
asttokens 2.1.0 pyhd8ed1ab_0 conda-forge
async-timeout 4.0.2 pyhd8ed1ab_0 conda-forge
attrs 22.1.0 pyh71513ae_1 conda-forge
aws-c-cal 0.5.11 h95a6274_0 conda-forge
aws-c-common 0.6.2 h7f98852_0 conda-forge
aws-c-event-stream 0.2.7 h3541f99_13 conda-forge
aws-c-io 0.10.5 hfb6a706_0 conda-forge
aws-checksums 0.1.11 ha31a3da_7 conda-forge
aws-sdk-cpp 1.8.186 hecaee15_4 conda-forge
backcall 0.2.0 pyh9f0ad1d_0 conda-forge
backports 1.0 py_2 conda-forge
backports.functools_lru_cache 1.6.4 pyhd8ed1ab_0 conda-forge
beautifulsoup4 4.11.1 pyha770c72_0 conda-forge
bleach 5.0.1 pyhd8ed1ab_0 conda-forge
blosc 1.21.1 h83bc5f7_3 conda-forge
bokeh 2.4.3 pyhd8ed1ab_3 conda-forge
boost 1.78.0 py39h7c9e3ff_4 conda-forge
boost-cpp 1.78.0 h75c5d50_1 conda-forge
branca 0.6.0 pyhd8ed1ab_0 conda-forge
brotli 1.0.9 h166bdaf_8 conda-forge
brotli-bin 1.0.9 h166bdaf_8 conda-forge
brotlipy 0.7.0 py39hb9d737c_1005 conda-forge
brunsli 0.1 h9c3ff4c_0 conda-forge
bzip2 1.0.8 h7f98852_4 conda-forge
c-ares 1.18.1 h7f98852_0 conda-forge
c-blosc2 2.4.3 h7a311fb_0 conda-forge
ca-certificates 2022.9.24 ha878542_0 conda-forge
cachetools 5.2.0 pyhd8ed1ab_0 conda-forge
cairo 1.16.0 ha61ee94_1014 conda-forge
certifi 2022.9.24 pyhd8ed1ab_0 conda-forge
cffi 1.15.1 py39he91dace_2 conda-forge
cfitsio 4.1.0 hd9d235c_0 conda-forge
charls 2.3.4 h9c3ff4c_0 conda-forge
charset-normalizer 2.1.1 pyhd8ed1ab_0 conda-forge
click 8.1.3 unix_pyhd8ed1ab_2 conda-forge
click-plugins 1.1.1 py_0 conda-forge
cligj 0.7.2 pyhd8ed1ab_1 conda-forge
cloudpickle 2.2.0 pyhd8ed1ab_0 conda-forge
colorama 0.4.6 pyhd8ed1ab_0 conda-forge
colorcet 3.0.1 pyhd8ed1ab_0 conda-forge
contourpy 1.0.6 py39hf939315_0 conda-forge
cryptography 38.0.3 py39hd97740a_0 conda-forge
cubinlinker 0.2.0 py39h11215e4_1 rapidsai-nightly
cucim 22.12.00a221118 cuda_11_py39_g0d716fb_16 rapidsai-nightly
cuda-python 11.7.1 py39h1eff087_1 conda-forge
cudatoolkit 11.5.1 h59c8dcf_10 conda-forge
cudf 22.12.00a221118 cuda_11_py39_gcc4b4dd27c_281 rapidsai-nightly
cudf_kafka 22.12.00a221118 py39_gcc4b4dd27c_281 rapidsai-nightly
cugraph 22.12.00a221117 cuda11_py39_g3d95eee8_119 rapidsai-nightly
cuml 22.12.00a221117 cuda11_py39_gdd197b9db_44 rapidsai-nightly
cupy 11.3.0 py39hc3c280e_1 conda-forge
curl 7.86.0 h7bff187_1 conda-forge
cusignal 22.12.00a221118 py39_g5265ccb_7 rapidsai-nightly
cuspatial 22.12.00a221118 py39_g1e8267b_64 rapidsai-nightly
custreamz 22.12.00a221118 py39_gcc4b4dd27c_281 rapidsai-nightly
cuxfilter 22.12.00a221116 py39_g8726496_8 rapidsai-nightly
cycler 0.11.0 pyhd8ed1ab_0 conda-forge
cyrus-sasl 2.1.27 h230043b_5 conda-forge
cytoolz 0.12.0 py39hb9d737c_1 conda-forge
dask 2022.11.0 pyhd8ed1ab_0 conda-forge
dask-core 2022.11.0 pyhd8ed1ab_0 conda-forge
dask-cuda 22.12.00a221118 py39_gf11abe3_29 rapidsai-nightly
dask-cudf 22.12.00a221118 cuda_11_py39_gcc4b4dd27c_281 rapidsai-nightly
dask-glm 0.2.0 py_1 conda-forge
dask-ml 2022.5.27 pyhd8ed1ab_0 conda-forge
datashader 0.13.1a py_0 rapidsai-nightly
datashape 0.5.4 py_1 conda-forge
dav1d 1.0.0 h166bdaf_1 conda-forge
debugpy 1.6.3 py39h5a03fae_1 conda-forge
decorator 5.1.1 pyhd8ed1ab_0 conda-forge
defusedxml 0.7.1 pyhd8ed1ab_0 conda-forge
distributed 2022.11.0 pyhd8ed1ab_0 conda-forge
dlpack 0.5 h9c3ff4c_0 conda-forge
entrypoints 0.4 pyhd8ed1ab_0 conda-forge
executing 1.2.0 pyhd8ed1ab_0 conda-forge
expat 2.5.0 h27087fc_0 conda-forge
faiss-proc 1.0.0 cuda conda-forge
fastavro 1.7.0 py39hb9d737c_0 conda-forge
fastrlock 0.8 py39h5a03fae_3 conda-forge
fiona 1.8.22 py39h80939cc_2 conda-forge
flit-core 3.8.0 pyhd8ed1ab_0 conda-forge
folium 0.13.0 pyhd8ed1ab_0 conda-forge
font-ttf-dejavu-sans-mono 2.37 hab24e00_0 conda-forge
font-ttf-inconsolata 3.000 h77eed37_0 conda-forge
font-ttf-source-code-pro 2.038 h77eed37_0 conda-forge
font-ttf-ubuntu 0.83 hab24e00_0 conda-forge
fontconfig 2.14.1 hc2a2eb6_0 conda-forge
fonts-conda-ecosystem 1 0 conda-forge
fonts-conda-forge 1 0 conda-forge
fonttools 4.38.0 py39hb9d737c_1 conda-forge
freetype 2.12.1 hca18f0e_0 conda-forge
freexl 1.0.6 h166bdaf_1 conda-forge
frozenlist 1.3.3 py39hb9d737c_0 conda-forge
fsspec 2022.11.0 pyhd8ed1ab_0 conda-forge
gdal 3.5.3 py39h92c1d47_2 conda-forge
geopandas 0.12.1 pyhd8ed1ab_1 conda-forge
geopandas-base 0.12.1 pyha770c72_1 conda-forge
geos 3.11.1 h27087fc_0 conda-forge
geotiff 1.7.1 ha76d385_4 conda-forge
gettext 0.21.1 h27087fc_0 conda-forge
gflags 2.2.2 he1b5a44_1004 conda-forge
giflib 5.2.1 h36c2ea0_2 conda-forge
glog 0.6.0 h6f12383_0 conda-forge
grpc-cpp 1.47.1 hbad87ad_6 conda-forge
hdf4 4.2.15 h9772cbc_5 conda-forge
hdf5 1.12.2 nompi_h2386368_100 conda-forge
heapdict 1.0.1 py_0 conda-forge
holoviews 1.14.6 pyhd8ed1ab_0 conda-forge
icu 70.1 h27087fc_0 conda-forge
idna 3.4 pyhd8ed1ab_0 conda-forge
imagecodecs 2022.9.26 py39h702eeef_3 conda-forge
imageio 2.22.0 pyhfa7a67d_0 conda-forge
importlib-metadata 5.0.0 pyha770c72_1 conda-forge
importlib_resources 5.10.0 pyhd8ed1ab_0 conda-forge
ipykernel 6.17.1 pyh210e3f2_0 conda-forge
ipython 8.6.0 pyh41d4057_1 conda-forge
ipywidgets 8.0.2 pyhd8ed1ab_1 conda-forge
jbig 2.1 h7f98852_2003 conda-forge
jedi 0.18.1 pyhd8ed1ab_2 conda-forge
jinja2 3.1.2 pyhd8ed1ab_1 conda-forge
joblib 1.2.0 pyhd8ed1ab_0 conda-forge
jpeg 9e h166bdaf_2 conda-forge
json-c 0.16 hc379101_0 conda-forge
jsonschema 4.17.0 pyhd8ed1ab_0 conda-forge
jupyter-server-proxy 3.2.2 pyhd8ed1ab_0 conda-forge
jupyter_client 7.3.4 pyhd8ed1ab_0 conda-forge
jupyter_core 5.0.0 py39hf3d152e_0 conda-forge
jupyter_server 1.23.2 pyhd8ed1ab_0 conda-forge
jupyterlab_pygments 0.2.2 pyhd8ed1ab_0 conda-forge
jupyterlab_widgets 3.0.3 pyhd8ed1ab_0 conda-forge
jxrlib 1.1 h7f98852_2 conda-forge
kealib 1.4.15 ha7026e8_1 conda-forge
keyutils 1.6.1 h166bdaf_0 conda-forge
kiwisolver 1.4.4 py39hf939315_1 conda-forge
krb5 1.19.3 h3790be6_0 conda-forge
lcms2 2.14 h6ed2654_0 conda-forge
ld_impl_linux-64 2.39 hc81fddc_0 conda-forge
lerc 4.0.0 h27087fc_0 conda-forge
libabseil 20220623.0 cxx17_h48a1fff_5 conda-forge
libaec 1.0.6 h9c3ff4c_0 conda-forge
libavif 0.11.1 h5cdd6b5_0 conda-forge
libblas 3.9.0 16_linux64_openblas conda-forge
libbrotlicommon 1.0.9 h166bdaf_8 conda-forge
libbrotlidec 1.0.9 h166bdaf_8 conda-forge
libbrotlienc 1.0.9 h166bdaf_8 conda-forge
libcblas 3.9.0 16_linux64_openblas conda-forge
libcrc32c 1.1.2 h9c3ff4c_0 conda-forge
libcucim 22.12.00a221118 cuda11_g0d716fb_16 rapidsai-nightly
libcudf 22.12.00a221118 cuda11_gcc4b4dd27c_281 rapidsai-nightly
libcudf_kafka 22.12.00a221118 gcc4b4dd27c_281 rapidsai-nightly
libcugraph 22.12.00a221117 cuda11_ge2a34687_120 rapidsai-nightly
libcugraph_etl 22.12.00a221117 cuda11_ge2a34687_120 rapidsai-nightly
libcugraphops 22.12.00a221117 cuda11_gfbd98ce_21 rapidsai-nightly
libcuml 22.12.00a221117 cuda11_gdd197b9db_44 rapidsai-nightly
libcumlprims 22.12.00a221010 cuda11_geaadb5e_2 rapidsai-nightly
libcurl 7.86.0 h7bff187_1 conda-forge
libcusolver 11.4.1.48 0 nvidia
libcusparse 11.7.5.86 0 nvidia
libcuspatial 22.12.00a221118 cuda11_g1e8267b_64 rapidsai-nightly
libdap4 3.20.6 hd7c4107_2 conda-forge
libdeflate 1.14 h166bdaf_0 conda-forge
libedit 3.1.20191231 he28a2e2_2 conda-forge
libev 4.33 h516909a_1 conda-forge
libevent 2.1.10 h9b69904_4 conda-forge
libfaiss 1.7.0 cuda112h5bea7ad_8_cuda conda-forge
libffi 3.4.2 h7f98852_5 conda-forge
libgcc-ng 12.2.0 h65d4601_19 conda-forge
libgcrypt 1.10.1 h166bdaf_0 conda-forge
libgdal 3.5.3 h7bccd54_2 conda-forge
libgfortran-ng 12.2.0 h69a702a_19 conda-forge
libgfortran5 12.2.0 h337968e_19 conda-forge
libglib 2.74.1 h606061b_1 conda-forge
libgomp 12.2.0 h65d4601_19 conda-forge
libgoogle-cloud 2.1.0 h9ebe8e8_2 conda-forge
libgpg-error 1.45 hc0c96e0_0 conda-forge
libgsasl 1.10.0 h5b4c23d_0 conda-forge
libiconv 1.17 h166bdaf_0 conda-forge
libkml 1.3.0 h37653c0_1015 conda-forge
liblapack 3.9.0 16_linux64_openblas conda-forge
libllvm11 11.1.0 he0ac6c6_5 conda-forge
libnetcdf 4.8.1 nompi_h261ec11_106 conda-forge
libnghttp2 1.47.0 hdcd2b5c_1 conda-forge
libnsl 2.0.0 h7f98852_0 conda-forge
libntlm 1.4 h7f98852_1002 conda-forge
libopenblas 0.3.21 pthreads_h78a6416_3 conda-forge
libpng 1.6.38 h753d276_0 conda-forge
libpq 14.5 hd77ab85_1 conda-forge
libprotobuf 3.20.2 h6239696_0 conda-forge
libraft-distance 22.12.00a221118 cuda11_g5a00013_123 rapidsai-nightly
libraft-headers 22.12.00a221118 cuda11_g5a00013_123 rapidsai-nightly
libraft-nn 22.12.00a221118 cuda11_g5a00013_123 rapidsai-nightly
librdkafka 1.7.0 hc49e61c_1 conda-forge
librmm 22.12.00a221118 cuda11_gda7036aa_57 rapidsai-nightly
librttopo 1.1.0 ha49c73b_12 conda-forge
libsodium 1.0.18 h36c2ea0_1 conda-forge
libspatialindex 1.9.3 h9c3ff4c_4 conda-forge
libspatialite 5.0.1 h7c8129e_22 conda-forge
libsqlite 3.40.0 h753d276_0 conda-forge
libssh2 1.10.0 haa6b8db_3 conda-forge
libstdcxx-ng 12.2.0 h46fd767_19 conda-forge
libthrift 0.16.0 h491838f_2 conda-forge
libtiff 4.4.0 h55922b4_4 conda-forge
libutf8proc 2.8.0 h166bdaf_0 conda-forge
libuuid 2.32.1 h7f98852_1000 conda-forge
libuv 1.44.2 h166bdaf_0 conda-forge
libwebp 1.2.4 h522a892_0 conda-forge
libwebp-base 1.2.4 h166bdaf_0 conda-forge
libxcb 1.13 h7f98852_1004 conda-forge
libxgboost 1.6.2dev.rapidsai22.12 cuda_11_0 rapidsai-nightly
libxml2 2.10.3 h7463322_0 conda-forge
libzip 1.9.2 hc869a4a_1 conda-forge
libzlib 1.2.13 h166bdaf_4 conda-forge
libzopfli 1.0.3 h9c3ff4c_0 conda-forge
llvmlite 0.39.1 py39h7d9a04d_1 conda-forge
locket 1.0.0 pyhd8ed1ab_0 conda-forge
lz4 4.0.2 py39h029007f_0 conda-forge
lz4-c 1.9.3 h9c3ff4c_1 conda-forge
mapclassify 2.4.3 pyhd8ed1ab_0 conda-forge
markdown 3.4.1 pyhd8ed1ab_0 conda-forge
markupsafe 2.1.1 py39hb9d737c_2 conda-forge
matplotlib-base 3.6.2 py39hf9fd14e_0 conda-forge
matplotlib-inline 0.1.6 pyhd8ed1ab_0 conda-forge
mistune 2.0.4 pyhd8ed1ab_0 conda-forge
msgpack-python 1.0.4 py39hf939315_1 conda-forge
multidict 6.0.2 py39hb9d737c_2 conda-forge
multipledispatch 0.6.0 py_0 conda-forge
munch 2.5.0 py_0 conda-forge
munkres 1.1.4 pyh9f0ad1d_0 conda-forge
nbclient 0.7.0 pyhd8ed1ab_0 conda-forge
nbconvert-core 7.2.5 pyhd8ed1ab_0 conda-forge
nbformat 5.7.0 pyhd8ed1ab_0 conda-forge
nccl 2.14.3.1 h0800d71_0 conda-forge
ncurses 6.3 h27087fc_1 conda-forge
nest-asyncio 1.5.6 pyhd8ed1ab_0 conda-forge
networkx 2.6.3 pyhd8ed1ab_1 conda-forge
nodejs 18.12.1 h96d913c_0 conda-forge
nspr 4.32 h9c3ff4c_1 conda-forge
nss 3.78 h2350873_0 conda-forge
numba 0.56.3 py39h61ddf18_0 conda-forge
numpy 1.23.4 py39h3d75532_1 conda-forge
nvtx 0.2.3 py39hb9d737c_2 conda-forge
openjpeg 2.5.0 h7d73246_1 conda-forge
openssl 1.1.1s h166bdaf_0 conda-forge
orc 1.7.6 h6c59b99_0 conda-forge
packaging 21.3 pyhd8ed1ab_0 conda-forge
pandas 1.5.1 py39h4661b88_1 conda-forge
pandocfilters 1.5.0 pyhd8ed1ab_0 conda-forge
panel 0.12.7 pyhd8ed1ab_0 conda-forge
param 1.12.2 pyh6c4a22f_0 conda-forge
parquet-cpp 1.5.1 2 conda-forge
parso 0.8.3 pyhd8ed1ab_0 conda-forge
partd 1.3.0 pyhd8ed1ab_0 conda-forge
pcre 8.45 h9c3ff4c_0 conda-forge
pcre2 10.40 hc3806b6_0 conda-forge
pexpect 4.8.0 pyh1a96a4e_2 conda-forge
pickleshare 0.7.5 py_1003 conda-forge
pillow 9.2.0 py39hf3a2cdf_3 conda-forge
pip 22.3.1 pyhd8ed1ab_0 conda-forge
pixman 0.40.0 h36c2ea0_0 conda-forge
pkgutil-resolve-name 1.3.10 pyhd8ed1ab_0 conda-forge
platformdirs 2.5.2 pyhd8ed1ab_1 conda-forge
poppler 22.11.0 h92391eb_0 conda-forge
poppler-data 0.4.11 hd8ed1ab_0 conda-forge
postgresql 14.5 hdeef612_1 conda-forge
proj 9.1.0 h93bde94_0 conda-forge
prometheus_client 0.15.0 pyhd8ed1ab_0 conda-forge
prompt-toolkit 3.0.32 pyha770c72_0 conda-forge
protobuf 3.20.2 py39h5a03fae_0 conda-forge
psutil 5.9.4 py39hb9d737c_0 conda-forge
pthread-stubs 0.4 h36c2ea0_1001 conda-forge
ptxcompiler 0.7.0 py39h1eff087_2 conda-forge
ptyprocess 0.7.0 pyhd3deb0d_0 conda-forge
pure_eval 0.2.2 pyhd8ed1ab_0 conda-forge
py-xgboost 1.6.2dev.rapidsai22.12 cuda_11_py39_0 rapidsai-nightly
pyarrow 9.0.0 py39hc0775d8_2_cpu conda-forge
pycparser 2.21 pyhd8ed1ab_0 conda-forge
pyct 0.4.6 py_0 conda-forge
pyct-core 0.4.6 py_0 conda-forge
pydeck 0.5.0 pyh9f0ad1d_0 conda-forge
pyee 8.1.0 pyhd8ed1ab_0 conda-forge
pygments 2.13.0 pyhd8ed1ab_0 conda-forge
pylibcugraph 22.12.00a221117 cuda11_py39_g3d95eee8_119 rapidsai-nightly
pylibraft 22.12.00a221118 cuda11_py39_g5a00013_123 rapidsai-nightly
pynvml 11.4.1 pyhd8ed1ab_0 conda-forge
pyopenssl 22.1.0 pyhd8ed1ab_0 conda-forge
pyparsing 3.0.9 pyhd8ed1ab_0 conda-forge
pyppeteer 1.0.2 pyhd8ed1ab_0 conda-forge
pyproj 3.4.0 py39h14a8356_2 conda-forge
pyrsistent 0.19.2 py39hb9d737c_0 conda-forge
pysocks 1.7.1 pyha2e5f31_6 conda-forge
python 3.9.13 h9a8a25e_0_cpython conda-forge
python-confluent-kafka 1.7.0 py39h3811e60_2 conda-forge
python-dateutil 2.8.2 pyhd8ed1ab_0 conda-forge
python-fastjsonschema 2.16.2 pyhd8ed1ab_0 conda-forge
python_abi 3.9 2_cp39 conda-forge
pytz 2022.6 pyhd8ed1ab_0 conda-forge
pyviz_comms 2.2.1 pyhd8ed1ab_1 conda-forge
pywavelets 1.3.0 py39h2ae25f5_2 conda-forge
pyyaml 6.0 py39hb9d737c_5 conda-forge
pyzmq 24.0.1 py39headdf64_1 conda-forge
raft-dask 22.12.00a221118 cuda11_py39_g5a00013_123 rapidsai-nightly
rapids 22.12.00a221118 cuda11_py39_ge31b0c4_47 rapidsai-nightly
rapids-xgboost 22.12.00a221118 cuda11_py39_ge31b0c4_47 rapidsai-nightly
re2 2022.06.01 h27087fc_0 conda-forge
readline 8.1.2 h0f457ee_0 conda-forge
requests 2.28.1 pyhd8ed1ab_1 conda-forge
rmm 22.12.00a221118 cuda11_py39_gda7036aa_57 rapidsai-nightly
rtree 1.0.1 py39hb102c33_1 conda-forge
s2n 1.0.10 h9b69904_0 conda-forge
scikit-image 0.19.3 py39h4661b88_2 conda-forge
scikit-learn 1.1.3 py39hd5c8da3_1 conda-forge
scipy 1.9.3 py39hddc5342_2 conda-forge
send2trash 1.8.0 pyhd8ed1ab_0 conda-forge
setuptools 60.10.0 py39hf3d152e_0 conda-forge
shapely 1.8.5 py39h76a96b7_2 conda-forge
simpervisor 0.4 pyhd8ed1ab_0 conda-forge
six 1.16.0 pyh6c4a22f_0 conda-forge
snappy 1.1.9 hbd366e4_2 conda-forge
sniffio 1.3.0 pyhd8ed1ab_0 conda-forge
sortedcontainers 2.4.0 pyhd8ed1ab_0 conda-forge
soupsieve 2.3.2.post1 pyhd8ed1ab_0 conda-forge
spdlog 1.8.5 h4bd325d_1 conda-forge
sqlite 3.40.0 h4ff8645_0 conda-forge
stack_data 0.6.1 pyhd8ed1ab_0 conda-forge
streamz 0.6.4 pyh6c4a22f_0 conda-forge
tblib 1.7.0 pyhd8ed1ab_0 conda-forge
terminado 0.17.0 pyh41d4057_0 conda-forge
threadpoolctl 3.1.0 pyh8a188c0_0 conda-forge
tifffile 2022.10.10 pyhd8ed1ab_0 conda-forge
tiledb 2.11.3 h1e4a385_1 conda-forge
tinycss2 1.2.1 pyhd8ed1ab_0 conda-forge
tk 8.6.12 h27826a3_0 conda-forge
toolz 0.12.0 pyhd8ed1ab_0 conda-forge
tornado 6.1 py39hb9d737c_3 conda-forge
tqdm 4.64.1 pyhd8ed1ab_0 conda-forge
traitlets 5.5.0 pyhd8ed1ab_0 conda-forge
treelite 3.0.0 py39hc7ff369_1 conda-forge
treelite-runtime 3.0.0 pypi_0 pypi
typing-extensions 4.4.0 hd8ed1ab_0 conda-forge
typing_extensions 4.4.0 pyha770c72_0 conda-forge
tzcode 2022f h166bdaf_0 conda-forge
tzdata 2022f h191b570_0 conda-forge
ucx 1.13.1 h538f049_0 conda-forge
ucx-proc 1.0.0 gpu rapidsai-nightly
ucx-py 0.29.00a221117 py39_ge662e4d_20 rapidsai-nightly
unicodedata2 15.0.0 py39hb9d737c_0 conda-forge
urllib3 1.26.11 pyhd8ed1ab_0 conda-forge
wcwidth 0.2.5 pyh9f0ad1d_2 conda-forge
webencodings 0.5.1 py_1 conda-forge
websocket-client 1.4.2 pyhd8ed1ab_0 conda-forge
websockets 10.4 py39hb9d737c_1 conda-forge
wheel 0.38.4 pyhd8ed1ab_0 conda-forge
widgetsnbextension 4.0.3 pyhd8ed1ab_0 conda-forge
xarray 2022.11.0 pyhd8ed1ab_0 conda-forge
xerces-c 3.2.4 h55805fa_1 conda-forge
xgboost 1.6.2dev.rapidsai22.12 cuda_11_py39_0 rapidsai-nightly
xorg-kbproto 1.0.7 h7f98852_1002 conda-forge
xorg-libice 1.0.10 h7f98852_0 conda-forge
xorg-libsm 1.2.3 hd9c2040_1000 conda-forge
xorg-libx11 1.7.2 h7f98852_0 conda-forge
xorg-libxau 1.0.9 h7f98852_0 conda-forge
xorg-libxdmcp 1.1.3 h7f98852_0 conda-forge
xorg-libxext 1.3.4 h7f98852_1 conda-forge
xorg-libxrender 0.9.10 h7f98852_1003 conda-forge
xorg-renderproto 0.11.1 h7f98852_1002 conda-forge
xorg-xextproto 7.3.0 h7f98852_1002 conda-forge
xorg-xproto 7.0.31 h7f98852_1007 conda-forge
xyzservices 2022.9.0 pyhd8ed1ab_0 conda-forge
xz 5.2.6 h166bdaf_0 conda-forge
yaml 0.2.5 h7f98852_2 conda-forge
yarl 1.8.1 py39hb9d737c_0 conda-forge
zeromq 4.3.4 h9c3ff4c_1 conda-forge
zfp 1.0.0 h27087fc_3 conda-forge
zict 2.2.0 pyhd8ed1ab_0 conda-forge
zipp 3.10.0 pyhd8ed1ab_0 conda-forge
zlib 1.2.13 h166bdaf_4 conda-forge
zlib-ng 2.0.6 h166bdaf_0 conda-forge
zstd 1.5.2 h6239696_4 conda-forge
Rig:
description: Computer
width: 64 bits
capabilities: smp vsyscall32
*-core
description: Motherboard
physical id: 0
*-memory
description: System memory
physical id: 0
size: 15GiB
*-cpu
product: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz
vendor: Intel Corp.
physical id: 1
bus info: cpu@0
size: 1483MHz
capacity: 4200MHz
width: 64 bits
capabilities: fpu fpu_exception wp vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp x86-64 constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb invpcid_single pti ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx rdseed adx smap clflushopt intel_pt xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp md_clear flush_l1d arch_capabilities cpufreq
*-pci
description: Host bridge
product: Xeon E3-1200 v5/E3-1500 v5/6th Gen Core Processor Host Bridge/DRAM Registers
vendor: Intel Corporation
physical id: 100
bus info: pci@0000:00:00.0
version: 07
width: 32 bits
clock: 33MHz
configuration: driver=skl_uncore
resources: irq:0
*-display
description: VGA compatible controller
product: HD Graphics 530
vendor: Intel Corporation
physical id: 2
bus info: pci@0000:00:02.0
version: 06
width: 64 bits
clock: 33MHz
capabilities: vga_controller bus_master cap_list rom
configuration: driver=i915 latency=0
resources: irq:141 memory:f1000000-f1ffffff memory:e0000000-efffffff ioport:f000(size=64) memory:c0000-dffff
*-usb
description: USB controller
product: 100 Series/C230 Series Chipset Family USB 3.0 xHCI Controller
vendor: Intel Corporation
physical id: 14
bus info: pci@0000:00:14.0
version: 31
width: 64 bits
clock: 33MHz
capabilities: xhci bus_master cap_list
configuration: driver=xhci_hcd latency=0
resources: irq:135 memory:f5120000-f512ffff
*-generic
description: Signal processing controller
product: 100 Series/C230 Series Chipset Family Thermal Subsystem
vendor: Intel Corporation
physical id: 14.2
bus info: pci@0000:00:14.2
version: 31
width: 64 bits
clock: 33MHz
capabilities: bus_master cap_list
configuration: driver=intel_pch_thermal latency=0
resources: irq:18 memory:f5138000-f5138fff
*-communication
description: Communication controller
product: 100 Series/C230 Series Chipset Family MEI Controller #1
vendor: Intel Corporation
physical id: 16
bus info: pci@0000:00:16.0
version: 31
width: 64 bits
clock: 33MHz
capabilities: bus_master cap_list
configuration: driver=mei_me latency=0
resources: iomemory:2f0-2ef irq:142 memory:2ffff15000-2ffff15fff
*-storage
description: SATA controller
product: Q170/Q150/B150/H170/H110/Z170/CM236 Chipset SATA Controller [AHCI Mode]
vendor: Intel Corporation
physical id: 17
bus info: pci@0000:00:17.0
version: 31
width: 32 bits
clock: 66MHz
capabilities: storage ahci_1.0 bus_master cap_list
configuration: driver=ahci latency=0
resources: irq:139 memory:f5134000-f5135fff memory:f5137000-f51370ff ioport:f090(size=8) ioport:f080(size=4) ioport:f060(size=32) memory:f5136000-f51367ff
*-pci:0
description: PCI bridge
product: 100 Series/C230 Series Chipset Family PCI Express Root Port #5
vendor: Intel Corporation
physical id: 1c
bus info: pci@0000:00:1c.0
version: f1
width: 32 bits
clock: 33MHz
capabilities: pci normal_decode bus_master cap_list
configuration: driver=pcieport
resources: irq:122
*-pci:1
description: PCI bridge
product: 100 Series/C230 Series Chipset Family PCI Express Root Port #6
vendor: Intel Corporation
physical id: 1c.5
bus info: pci@0000:00:1c.5
version: f1
width: 32 bits
clock: 33MHz
capabilities: pci normal_decode bus_master cap_list
configuration: driver=pcieport
resources: irq:123 ioport:e000(size=4096) memory:f6000000-f70fffff ioport:2fe0000000(size=303038464)
*-display
description: VGA compatible controller
product: NVIDIA Corporation
vendor: NVIDIA Corporation
physical id: 0
bus info: pci@0000:02:00.0
version: a1
width: 64 bits
clock: 33MHz
capabilities: vga_controller bus_master cap_list rom
configuration: driver=nvidia latency=0
resources: iomemory:2f0-2ef iomemory:2f0-2ef irq:144 memory:f6000000-f6ffffff memory:2fe0000000-2fefffffff memory:2ff0000000-2ff1ffffff ioport:e000(size=128) memory:f7000000-f707ffff
*-multimedia
description: Audio device
product: NVIDIA Corporation
vendor: NVIDIA Corporation
physical id: 0.1
bus info: pci@0000:02:00.1
version: a1
width: 32 bits
clock: 33MHz
capabilities: bus_master cap_list
configuration: driver=snd_hda_intel latency=0
resources: irq:18 memory:f7080000-f7083fff
*-usb
description: USB controller
product: NVIDIA Corporation
vendor: NVIDIA Corporation
physical id: 0.2
bus info: pci@0000:02:00.2
version: a1
width: 64 bits
clock: 33MHz
capabilities: xhci bus_master cap_list
configuration: driver=xhci_hcd latency=0
resources: iomemory:2f0-2ef iomemory:2f0-2ef irq:136 memory:2ff2000000-2ff203ffff memory:2ff2040000-2ff204ffff
*-serial UNCLAIMED
description: Serial bus controller
product: NVIDIA Corporation
vendor: NVIDIA Corporation
physical id: 0.3
bus info: pci@0000:02:00.3
version: a1
width: 32 bits
clock: 33MHz
capabilities: bus_master cap_list
configuration: latency=0
resources: memory:f7084000-f7084fff
*-pci:2
description: PCI bridge
product: 100 Series/C230 Series Chipset Family PCI Express Root Port #8
vendor: Intel Corporation
physical id: 1c.7
bus info: pci@0000:00:1c.7
version: f1
width: 32 bits
clock: 33MHz
capabilities: pci normal_decode bus_master cap_list
configuration: driver=pcieport
resources: irq:124 ioport:d000(size=4096) memory:f4000000-f50fffff ioport:2fc0000000(size=303038464)
*-display
description: VGA compatible controller
product: NVIDIA Corporation
vendor: NVIDIA Corporation
physical id: 0
bus info: pci@0000:03:00.0
version: a1
width: 64 bits
clock: 33MHz
capabilities: vga_controller bus_master cap_list rom
configuration: driver=nvidia latency=0
resources: iomemory:2f0-2ef iomemory:2f0-2ef irq:145 memory:f4000000-f4ffffff memory:2fc0000000-2fcfffffff memory:2fd0000000-2fd1ffffff ioport:d000(size=128) memory:f5000000-f507ffff
*-multimedia
description: Audio device
product: NVIDIA Corporation
vendor: NVIDIA Corporation
physical id: 0.1
bus info: pci@0000:03:00.1
version: a1
width: 32 bits
clock: 33MHz
capabilities: bus_master cap_list
configuration: driver=snd_hda_intel latency=0
resources: irq:16 memory:f5080000-f5083fff
*-usb
description: USB controller
product: NVIDIA Corporation
vendor: NVIDIA Corporation
physical id: 0.2
bus info: pci@0000:03:00.2
version: a1
width: 64 bits
clock: 33MHz
capabilities: xhci bus_master cap_list
configuration: driver=xhci_hcd latency=0
resources: iomemory:2f0-2ef iomemory:2f0-2ef irq:137 memory:2fd2000000-2fd203ffff memory:2fd2040000-2fd204ffff
*-serial UNCLAIMED
description: Serial bus controller
product: NVIDIA Corporation
vendor: NVIDIA Corporation
physical id: 0.3
bus info: pci@0000:03:00.3
version: a1
width: 32 bits
clock: 33MHz
capabilities: bus_master cap_list
configuration: latency=0
resources: memory:f5084000-f5084fff
*-pci:3
description: PCI bridge
product: 100 Series/C230 Series Chipset Family PCI Express Root Port #9
vendor: Intel Corporation
physical id: 1d
bus info: pci@0000:00:1d.0
version: f1
width: 32 bits
clock: 33MHz
capabilities: pci normal_decode bus_master cap_list
configuration: driver=pcieport
resources: irq:125
*-pci:4
description: PCI bridge
product: 100 Series/C230 Series Chipset Family PCI Express Root Port #10
vendor: Intel Corporation
physical id: 1d.1
bus info: pci@0000:00:1d.1
version: f1
width: 32 bits
clock: 33MHz
capabilities: pci normal_decode bus_master cap_list
configuration: driver=pcieport
resources: irq:126 ioport:c000(size=4096) memory:f2000000-f30fffff ioport:2fa0000000(size=303038464)
*-pci
description: PCI bridge
product: ASMedia Technology Inc.
vendor: ASMedia Technology Inc.
physical id: 0
bus info: pci@0000:05:00.0
version: 00
width: 32 bits
clock: 33MHz
capabilities: pci normal_decode bus_master cap_list
configuration: driver=pcieport
resources: irq:127 ioport:c000(size=4096) memory:f2000000-f30fffff ioport:2fa0000000(size=303038464)
*-pci:0
description: PCI bridge
product: ASMedia Technology Inc.
vendor: ASMedia Technology Inc.
physical id: 1
bus info: pci@0000:06:01.0
version: 00
width: 32 bits
clock: 33MHz
capabilities: pci normal_decode bus_master cap_list
configuration: driver=pcieport
resources: irq:128
*-pci:1
description: PCI bridge
product: ASMedia Technology Inc.
vendor: ASMedia Technology Inc.
physical id: 2
bus info: pci@0000:06:02.0
version: 00
width: 32 bits
clock: 33MHz
capabilities: pci normal_decode bus_master cap_list
configuration: driver=pcieport
resources: irq:129
*-pci:2
description: PCI bridge
product: ASMedia Technology Inc.
vendor: ASMedia Technology Inc.
physical id: 3
bus info: pci@0000:06:03.0
version: 00
width: 32 bits
clock: 33MHz
capabilities: pci normal_decode bus_master cap_list
configuration: driver=pcieport
resources: irq:130 ioport:c000(size=4096) memory:f2000000-f30fffff ioport:2fa0000000(size=303038464)
*-display
description: VGA compatible controller
product: NVIDIA Corporation
vendor: NVIDIA Corporation
physical id: 0
bus info: pci@0000:09:00.0
version: a1
width: 64 bits
clock: 33MHz
capabilities: vga_controller bus_master cap_list rom
configuration: driver=nvidia latency=0
resources: iomemory:2f0-2ef iomemory:2f0-2ef irq:146 memory:f2000000-f2ffffff memory:2fa0000000-2fafffffff memory:2fb0000000-2fb1ffffff ioport:c000(size=128) memory:f3000000-f307ffff
*-multimedia
description: Audio device
product: NVIDIA Corporation
vendor: NVIDIA Corporation
physical id: 0.1
bus info: pci@0000:09:00.1
version: a1
width: 32 bits
clock: 33MHz
capabilities: bus_master cap_list
configuration: driver=snd_hda_intel latency=0
resources: irq:17 memory:f3080000-f3083fff
*-usb
description: USB controller
product: NVIDIA Corporation
vendor: NVIDIA Corporation
physical id: 0.2
bus info: pci@0000:09:00.2
version: a1
width: 64 bits
clock: 33MHz
capabilities: xhci bus_master cap_list
configuration: driver=xhci_hcd latency=0
resources: iomemory:2f0-2ef iomemory:2f0-2ef irq:138 memory:2fb2000000-2fb203ffff memory:2fb2040000-2fb204ffff
*-serial UNCLAIMED
description: Serial bus controller
product: NVIDIA Corporation
vendor: NVIDIA Corporation
physical id: 0.3
bus info: pci@0000:09:00.3
version: a1
width: 32 bits
clock: 33MHz
capabilities: bus_master cap_list
configuration: latency=0
resources: memory:f3084000-f3084fff
*-pci:3
description: PCI bridge
product: ASMedia Technology Inc.
vendor: ASMedia Technology Inc.
physical id: 4
bus info: pci@0000:06:04.0
version: 00
width: 32 bits
clock: 33MHz
capabilities: pci normal_decode bus_master cap_list
configuration: driver=pcieport
resources: irq:131
*-pci:4
description: PCI bridge
product: ASMedia Technology Inc.
vendor: ASMedia Technology Inc.
physical id: 5
bus info: pci@0000:06:05.0
version: 00
width: 32 bits
clock: 33MHz
capabilities: pci normal_decode bus_master cap_list
configuration: driver=pcieport
resources: irq:132
*-pci:5
description: PCI bridge
product: ASMedia Technology Inc.
vendor: ASMedia Technology Inc.
physical id: 6
bus info: pci@0000:06:06.0
version: 00
width: 32 bits
clock: 33MHz
capabilities: pci normal_decode bus_master cap_list
configuration: driver=pcieport
resources: irq:133
*-pci:6
description: PCI bridge
product: ASMedia Technology Inc.
vendor: ASMedia Technology Inc.
physical id: 7
bus info: pci@0000:06:07.0
version: 00
width: 32 bits
clock: 33MHz
capabilities: pci normal_decode bus_master cap_list
configuration: driver=pcieport
resources: irq:134
*-isa
description: ISA bridge
product: H110 Chipset LPC/eSPI Controller
vendor: Intel Corporation
physical id: 1f
bus info: pci@0000:00:1f.0
version: 31
width: 32 bits
clock: 33MHz
capabilities: isa bus_master
configuration: latency=0
*-memory UNCLAIMED
description: Memory controller
product: 100 Series/C230 Series Chipset Family Power Management Controller
vendor: Intel Corporation
physical id: 1f.2
bus info: pci@0000:00:1f.2
version: 31
width: 32 bits
clock: 33MHz (30.3ns)
capabilities: bus_master
configuration: latency=0
resources: memory:f5130000-f5133fff
*-multimedia
description: Audio device
product: 100 Series/C230 Series Chipset Family HD Audio Controller
vendor: Intel Corporation
physical id: 1f.3
bus info: pci@0000:00:1f.3
version: 31
width: 64 bits
clock: 33MHz
capabilities: bus_master cap_list
configuration: driver=snd_hda_intel latency=32
resources: iomemory:2f0-2ef iomemory:2f0-2ef irq:143 memory:2ffff10000-2ffff13fff memory:2ffff00000-2ffff0ffff
*-serial UNCLAIMED
description: SMBus
product: 100 Series/C230 Series Chipset Family SMBus
vendor: Intel Corporation
physical id: 1f.4
bus info: pci@0000:00:1f.4
version: 31
width: 64 bits
clock: 33MHz
configuration: latency=0
resources: iomemory:2f0-2ef memory:2ffff14000-2ffff140ff ioport:f040(size=32)
*-network
description: Ethernet interface
product: Ethernet Connection (2) I219-V
vendor: Intel Corporation
physical id: 1f.6
bus info: pci@0000:00:1f.6
logical name: enp0s31f6
version: 31
serial: 70:85:c2:76:d1:a5
size: 1Gbit/s
capacity: 1Gbit/s
width: 32 bits
clock: 33MHz
capabilities: bus_master cap_list ethernet physical tp 10bt 10bt-fd 100bt 100bt-fd 1000bt-fd autonegotiation
configuration: autonegotiation=on broadcast=yes driver=e1000e driverversion=3.2.6-k duplex=full firmware=0.8-4 ip=10.1.1.121 latency=0 link=yes multicast=yes port=twisted pair speed=1Gbit/s
resources: irq:140 memory:f5100000-f511ffff
What causes this error?
Is this something you experience during the workflow or at the end when the cluster is shutting down?
The program I run is in a loop where the input size increases, as follows:
from subprocess import check_output, CalledProcessError

def run_multiplication():
    for s in range(min, max):
        size = 2 ** s
        print_size = size * WORK_SIZE
        try:
            CMD_EXE = ["python3", "./multiplication.py", str(size), str(WORK_SIZE)]
            delay = check_output(CMD_EXE).decode("ascii")
            avg_watt = measure_watt(CMD_EXE)
            total_joules = watts_to_joules(avg_watt, float(delay))
            write_throughput_results("multiplicationGPU", size=print_size, wattage=total_joules, time=delay)
        except CalledProcessError as e:
            print(e)
            write_error_output("multiplication", print_size, e)
The program never terminates, so the problem occurs during the workflow. Since I write my output stats to a CSV, I can determine that exactly after executing the workflow for (2^20, 1000), the program prints the "worker process still alive" message but never terminates.
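As an aside, one hedged mitigation for the non-terminating driver loop (not from the original post; run_with_timeout and the 600-second cap are assumptions): subprocess.check_output accepts a timeout, so a hung child raises TimeoutExpired instead of blocking the loop forever.

from subprocess import check_output, TimeoutExpired

def run_with_timeout(cmd, limit=600):
    # `limit` (seconds) is an illustrative guess; on timeout, check_output
    # kills the direct child, though worker processes that child spawned
    # may still need manual cleanup.
    try:
        return check_output(cmd, timeout=limit).decode("ascii")
    except TimeoutExpired:
        return None  # caller can record the size as failed and move on

In the loop above, delay = run_with_timeout(CMD_EXE) would replace the bare check_output call, letting the benchmark sweep continue past a hung size.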
What I understand is that the program containing run_multiplication runs the code from the original description, which creates the LocalCUDACluster, etc. Is that right? If that is the case, you're starting a new cluster for each iteration of run_multiplication, which brings me back to the question of whether this happens during the execution of the actual Dask workflow as opposed to during cluster shutdown.
The way I understand it, you could disregard run_multiplication entirely and just run the workflow from the original post with the parameters that fail, and you would still experience the same issue. However, it matters whether this only happens at the end (when the work has completed and the Dask cluster is shutting down) or in the middle of the workflow, which is exactly what I'm trying to understand.
You are right. I have two separate programs: one calls the scripts where I implement the multiplication and so forth. The loop I posted is a .py script that runs the multiplication.py script for all integers in the range [5, 22]. So yes, there is a top-level program driving the cluster, which means a new cluster is started for every iteration of the main script.
The problem originates from the cluster not shutting down, which is why I now realize my original post may be mistitled; I apologize for that. I can run multiplication.py on its own with arbitrary sizes, including (2^21, 1000), with no problems, which is why the problem must be the cluster shutdown. The reason for my earlier doubt is that I added client.shutdown() to multiplication.py in order to run it from the master script, and this instantly resulted in the following error:
2022-11-23 12:32:12,869 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2022-11-23 12:32:12,869 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
0.2419048309326172
2022-11-23 12:32:13,744 - distributed.client - ERROR -
ConnectionRefusedError: [Errno 111] Connection refused
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/comm/core.py", line 291, in connect
comm = await asyncio.wait_for(
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/asyncio/tasks.py", line 479, in wait_for
return fut.result()
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/comm/tcp.py", line 511, in connect
convert_stream_closed_error(self, e)
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/comm/tcp.py", line 142, in convert_stream_closed_error
raise CommClosedError(f"in {obj}: {exc.__class__.__name__}: {exc}") from exc
distributed.comm.core.CommClosedError: in <distributed.comm.tcp.TCPConnector object at 0x7fa777db4b50>: ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/utils.py", line 742, in wrapper
return await func(*args, **kwargs)
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/client.py", line 1298, in _reconnect
await self._ensure_connected(timeout=timeout)
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/client.py", line 1328, in _ensure_connected
comm = await connect(
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/comm/core.py", line 315, in connect
await asyncio.sleep(backoff)
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/asyncio/tasks.py", line 652, in sleep
return await future
asyncio.exceptions.CancelledError
Traceback (most recent call last):
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/comm/tcp.py", line 225, in read
frames_nbytes = await stream.read_bytes(fmt_size)
tornado.iostream.StreamClosedError: Stream is closed
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/client.py", line 1500, in _handle_report
msgs = await self.scheduler_comm.comm.read()
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/comm/tcp.py", line 241, in read
convert_stream_closed_error(self, e)
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/comm/tcp.py", line 144, in convert_stream_closed_error
raise CommClosedError(f"in {obj}: {exc}") from exc
distributed.comm.core.CommClosedError: in <TCP (closed) Client->Scheduler local=tcp://127.0.0.1:40494 remote=tcp://127.0.0.1:34111>: Stream is closed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/utils.py", line 742, in wrapper
return await func(*args, **kwargs)
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/client.py", line 1508, in _handle_report
await self._reconnect()
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/utils.py", line 742, in wrapper
return await func(*args, **kwargs)
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/client.py", line 1298, in _reconnect
await self._ensure_connected(timeout=timeout)
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/client.py", line 1328, in _ensure_connected
comm = await connect(
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/site-packages/distributed/comm/core.py", line 315, in connect
await asyncio.sleep(backoff)
File "/home/joachim/anaconda3/envs/rapids-22.12/lib/python3.9/asyncio/tasks.py", line 652, in sleep
return await future
asyncio.exceptions.CancelledError
That is why I thought I would let Dask handle the cluster startups and shutdowns. However, I now realize that I will have to find a way to shut down the cluster cleanly after executing the workload, which I will investigate.
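A minimal sketch of what that could look like (my assumption, not the original script): using the cluster and client as context managers closes the client first and the cluster second on exit, mirroring a manual client.close() followed by cluster.close(), so no explicit client.shutdown() is needed.

from dask_cuda import LocalCUDACluster
from dask.distributed import Client

if __name__ == "__main__":
    # Leaving the `with` blocks tears everything down in order:
    # client first, then the cluster and its worker processes.
    with LocalCUDACluster('0', n_workers=1, rmm_managed_memory=True) as cluster:
        with Client(cluster) as client:
            pass  # run the workload from multiplication.py here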
I don't mean that you're necessarily doing anything wrong; I'm just trying to understand where the issue occurs. Unfortunately, ensuring everything closes correctly is more challenging than it may seem, so it's possible that, for whatever reason, your cluster is actually taking longer than expected to terminate, and that may have the effect of propagating down the stack.
So in this case, are you sure that the work has actually completed? In other words, are you able to verify that somehow? What could still be happening is that the cluster is not shutting down because some of the processes have deadlocked or died, making Dask unable to continue the job (for example, if a process stopped responding and can't be killed). So the next step here is to determine whether the cluster is attempting to shut down but failing to, or whether it's actually stuck for some other reason.
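One way to verify that (a hedged sketch; the phase markers are an addition of mine, not part of multiplication.py): bracket the compute and the shutdown with flushed prints, so the last marker visible on stderr tells you which phase hung.

import sys

def phase(msg):
    # Flush immediately so the marker survives even if the process is
    # killed right afterwards.
    print(msg, file=sys.stderr, flush=True)

# In multiplication.py's __main__ this would read:
#   phase("compute starting")
#   y = benchmark(bench, (arr,), n_repeat=5, n_warmup=0)
#   phase("compute done")      # printed => the work itself finished
#   client.shutdown()
#   phase("shutdown done")     # never printed => stuck in teardown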