Wandb.jl
Wandb.jl copied to clipboard
Ctrl+C during Training tends to kill the Wandb Process
This is admittedly not shocking, but with a startup time of ~10 minutes, I'd really like to avoid having to start a new session.
Problem at: (unknown file) 0 (unknown function)
Traceback (most recent call last):
File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_init.py", line 954, in init
run = wi.init()
File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_init.py", line 489, in init
tel.feature.init_return_run = True
File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/lib/telemetry.py", line 43, in __exit__
self._run._telemetry_callback(self._obj)
File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 470, in _telemetry_callback
self._telemetry_flush()
File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 481, in _telemetry_flush
self._backend.interface._publish_telemetry(self._telemetry_obj)
File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/interface/interface_shared.py", line 73, in _publish_telemetry
self._publish(rec)
File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/interface/interface_queue.py", line 49, in _publish
raise Exception("The wandb backend process has shutdown")
Exception: The wandb backend process has shutdown
ERROR: LoadError: PyError ($(Expr(:escape, :(ccall(#= /home/awadell/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:43 =# @pysym(:PyObject_Call), PyPtr, (PyPtr, PyPtr, PyPtr), o, pyargsptr, kw))))) <class 'Exception'>
Exception('problem')
wandb: ERROR Abnormal program exit
File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_init.py", line 992, in init
six.raise_from(Exception("problem"), error_seen)
File "<string>", line 3, in raise_from
Stacktrace:
[1] pyerr_check
@ ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:62 [inlined]
[2] pyerr_check
@ ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:66 [inlined]
[3] _handle_error(msg::String)
@ PyCall ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:83
[4] macro expansion
@ ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:97 [inlined]
[5] #107
@ ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:43 [inlined]
[6] disable_sigint
@ ./c.jl:458 [inlined]
[7] __pycall!
@ ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:42 [inlined]
[8] _pycall!(ret::PyCall.PyObject, o::PyCall.PyObject, args::Tuple{}, nargs::Int64, kw::PyCall.PyObject)
@ PyCall ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:29
[9] _pycall!(ret::PyCall.PyObject, o::PyCall.PyObject, args::Tuple{}, kwargs::Base.Pairs{Symbol, Any, Tuple{Symbol, Symbol, Symbol}, NamedTuple{(:project, :name, :config), Tuple{String, Nothing, Dict{String, Any}}}})
@ PyCall ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:11
[10] #_#114
@ ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:86 [inlined]
[11] WandbLogger(; project::String, name::Nothing, min_level::Base.CoreLogging.LogLevel, step_increment::Int64, start_step::Int64, kwargs::Base.Pairs{Symbol, Dict{String, Any}, Tuple{Symbol}, NamedTuple{(:config,), Tuple{Dict{String, Any}}}})
@ Wandb ~/project/.julia/packages/Wandb/8Eio5/src/main.jl:19
Why would you have to restart the session? You could call Wandb.close(lg)
and then restart the logger with the same parameters.
I had been including a script like this, where the logger is created inside the function, so I didn't have access to the logger afterwards.
function train()
lg = WandbLogger(...)
# Training stuff
end
train()
But even after switching to this, once the wandb backend has shut down, all calls to Wandb error out:
function train(lg)
...
end
lg = WandbLogger(...)
train(lg)
Wandb.close(lg)
Specifically, after pressing Ctrl+C in the training loop, calling Wandb.close(lg)
gives:
julia> Wandb.close(lg)
ERROR: PyError ($(Expr(:escape, :(ccall(#= /home/awadell/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:43 =# @pysym(:PyObject_Call), PyPtr, (PyPtr, PyPtr, PyPtr), o, pyargsptr, kw))))) <class 'Exception'>
Exception('The wandb backend process has shutdown')
Exception in thread NetStatThr:
raise Exception("The wandb backend process has shutdown")
Traceback (most recent call last):
Exception: The wandb backend process has shutdown
File "/home/awadell/project/.spack-env/._view/nz3aetbopueaqzrd7hqzg2w3h5busd3r/lib/python3.9/threading.py", line 973, in _bootstrap_inner
File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 1538, in finish
tel.feature.finish = True
File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/lib/telemetry.py", line 43, in __exit__
self._run._telemetry_callback(self._obj)
File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 470, in _telemetry_callback
self._telemetry_flush()
File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 481, in _telemetry_flush
self._backend.interface._publish_telemetry(self._telemetry_obj)
File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/interface/interface_shared.py", line 73, in _publish_telemetry
self._publish(rec)
File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/interface/interface_queue.py", line 49, in _publish
raise Exception("The wandb backend process has shutdown")
Stacktrace:
[1] pyerr_check
@ ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:62 [inlined]
[2] pyerr_check
@ ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:66 [inlined]
[3] _handle_error(msg::String)
@ PyCall ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:83
[4] macro expansion
@ ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:97 [inlined]
[5] #107
@ ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:43 [inlined]
[6] disable_sigint
@ ./c.jl:458 [inlined]
[7] __pycall!
@ ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:42 [inlined]
[8] _pycall!(ret::PyCall.PyObject, o::PyCall.PyObject, args::Tuple{}, nargs::Int64, kw::Ptr{Nothing})
@ PyCall ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:29
[9] _pycall!
@ ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:11 [inlined]
[10] #_#114
@ ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:86 [inlined]
[11] (::PyCall.PyObject)()
@ PyCall ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:86
[12] close(lg::WandbLogger; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Wandb ~/project/.julia/packages/Wandb/8Eio5/src/main.jl:46
[13] close(lg::WandbLogger)
@ Wandb ~/project/.julia/packages/Wandb/8Eio5/src/main.jl:46
[14] top-level scope
@ REPL[3]:1
[15] top-level scope
@ ~/project/.julia/packages/CUDA/5jdFl/src/initialization.jl:52
So if I want to keep using Wandb to log things (I do, great package btw), I need to restart Julia. Or at least that's the only strategy I've found that works.