Resume training with `Axon.Loop.from_state`
Hey,
resuming from an existing loop state does not seem to work properly right now. I am working in the horses_or_humans example and return the entire loop state from the train_model function:
defp train_model(model, data, optimizer, epochs) do
  model
  |> Axon.Loop.trainer(:binary_cross_entropy, optimizer)
  |> Axon.Loop.metric(:accuracy)
  |> Axon.Loop.handle(:iteration_completed, &log_metrics(&1, :train))
  |> Map.put(:output_transform, fn state -> state end)
  |> Axon.Loop.run(data, epochs: epochs, iterations: 100, compiler: EXLA)
end
This lets me store the state and try to resume from it during another execution:
def run_continue() do
  case File.read("./horses_or_humans_continue_model") do
    {:ok, file} ->
      {model, model_state} = :erlang.binary_to_term(file)
      optimizer = Axon.Optimizers.adam(1.0e-4)
      centralized_optimizer = Axon.Updates.compose(Axon.Updates.centralize(), optimizer)
      data = data()

      IO.write("\n\nTraining model with gradient centralization\n\n")

      result =
        model
        |> Axon.Loop.trainer(:binary_cross_entropy, centralized_optimizer)
        |> Axon.Loop.from_state(model_state)
        |> Axon.Loop.metric(:accuracy)
        |> Axon.Loop.handle(:iteration_completed, &log_metrics(&1, :train))
        |> Map.put(:output_transform, fn state -> state end)
        |> Axon.Loop.run(data, epochs: 5, iterations: 100, compiler: EXLA)

      binary = :erlang.term_to_binary({model, result})
      File.write!("./horses_or_humans_continue_model", binary)

    _ ->
      model = build_model({nil, 4, 300, 300})
      optimizer = Axon.Optimizers.adam(1.0e-4)
      centralized_optimizer = Axon.Updates.compose(Axon.Updates.centralize(), optimizer)
      data = data()

      IO.write("\n\nTraining model with gradient centralization\n\n")

      result = train_model(model, data, centralized_optimizer, 5)
      binary = :erlang.term_to_binary({model, result})
      File.write!("./horses_or_humans_continue_model", binary)
  end
end
But on the second run there appears to be an issue with anonymous functions:
phillipp@PHILLIPP-TWN:/mnt/c/Users/Phillipp/Code/axon$ ELIXIR_ERL_OPTIONS="+sssdio 128" XLA_TARGET=cuda110 elixir examples/vision/horses_or_humans.exs
Training model with gradient centralization
01:20:09.894 [info] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
01:20:09.900 [info] XLA service 0x7f2624008320 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
01:20:09.900 [info] StreamExecutor device (0): NVIDIA GeForce GTX 970, Compute Capability 5.2
01:20:09.900 [info] Using BFC allocator.
01:20:09.900 [info] XLA backend allocating 3191431496 bytes on device 0 for BFCAllocator.
01:20:11.045 [info] Start cannot spawn child process: No such file or directory
01:20:26.776 [info] Loaded cuDNN version 8005
Epoch: 0, Batch: 101, Loss: 1.14877 accuracy: 0.31095 loss: 1.15279
Epoch: 1, Batch: 101, Loss: 0.92957 accuracy: 0.32828 loss: 0.93075
Epoch: 2, Batch: 101, Loss: 0.85162 accuracy: 0.42915 loss: 0.85221
Epoch: 3, Batch: 101, Loss: 0.81645 accuracy: 0.44895 loss: 0.81675
Epoch: 4, Batch: 101, Loss: 0.79185 accuracy: 0.48639 loss: 0.79206
phillipp@PHILLIPP-TWN:/mnt/c/Users/Phillipp/Code/axon$ ELIXIR_ERL_OPTIONS="+sssdio 128" XLA_TARGET=cuda110 elixir examples/vision/horses_or_humans.exs
Training model with gradient centralization
01:25:31.809 [info] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
01:25:33.707 [info] XLA service 0x7f555c009b50 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
01:25:33.707 [info] StreamExecutor device (0): NVIDIA GeForce GTX 970, Compute Capability 5.2
01:25:33.707 [info] Using BFC allocator.
01:25:33.707 [info] XLA backend allocating 3191431496 bytes on device 0 for BFCAllocator.
01:25:33.986 [info] Start cannot spawn child process: No such file or directory
** (FunctionClauseError) no function clause matching in anonymous fn/2 in Axon.Loop.build_batch_fn/2

    The following arguments were given to anonymous fn/2 in Axon.Loop.build_batch_fn/2:

        # 1
        {0, %{"accuracy" => #Nx.Tensor<
           f32
           Nx.Defn.Expr
           parameter a:53 f32
         >, "loss" => #Nx.Tensor<
           f32
           Nx.Defn.Expr
           parameter a:54 f32
         >}}

        # 2
        {"accuracy", {#Function<7.87141647/3 in Axon.Metrics.running_average/1>, :accuracy}}

    (axon 0.1.0-dev) lib/axon/loop.ex:1146: anonymous fn/2 in Axon.Loop.build_batch_fn/2
    (elixir 1.13.3) lib/enum.ex:3724: anonymous fn/4 in Enum.zip_with/3
    (elixir 1.13.3) lib/enum.ex:3784: anonymous fn/3 in Enum.zip_reduce/4
    (elixir 1.13.3) lib/stream.ex:1337: Stream.do_zip_next/6
    (elixir 1.13.3) lib/stream.ex:1274: Stream.do_zip_enum/4
    (elixir 1.13.3) lib/enum.ex:3785: Enum.zip_reduce/4
    (elixir 1.13.3) lib/enum.ex:3724: Enum.zip_with/3
    (axon 0.1.0-dev) lib/axon/loop.ex:1146: anonymous fn/4 in Axon.Loop.build_batch_fn/2
@PhillippOhlandt Can you please try saving your training progress with Axon.Loop.checkpoint and then resuming training from the checkpoint using Axon.Loop.from_state now? If it works, then we can close this :)
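For reference, a minimal sketch of that suggestion, not verified against this Axon version. It assumes checkpoint files land under the default "checkpoint" directory with a name like "checkpoint_4.ckpt", and that Axon.Loop.deserialize_state/1 is available to read them back; model, data, and centralized_optimizer are the ones from the example above:

# First run: checkpoint the loop state at the end of every epoch.
model
|> Axon.Loop.trainer(:binary_cross_entropy, centralized_optimizer)
|> Axon.Loop.metric(:accuracy)
|> Axon.Loop.checkpoint(event: :epoch_completed)
|> Axon.Loop.run(data, epochs: 5, iterations: 100, compiler: EXLA)

# Later run: rebuild the same loop and resume from the deserialized state.
# The file name here is an assumption; use whatever the checkpoint handler
# actually wrote to the checkpoint directory.
state =
  "checkpoint/checkpoint_4.ckpt"
  |> File.read!()
  |> Axon.Loop.deserialize_state()

model
|> Axon.Loop.trainer(:binary_cross_entropy, centralized_optimizer)
|> Axon.Loop.metric(:accuracy)
|> Axon.Loop.from_state(state)
|> Axon.Loop.run(data, epochs: 5, iterations: 100, compiler: EXLA)

The intent is that only the numerical loop state is persisted, while the metric and handler functions are rebuilt fresh on each run, which should sidestep the function-clause mismatch that :erlang.term_to_binary on the whole loop state runs into above.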