axon icon indicating copy to clipboard operation
axon copied to clipboard

Resume training with `Axon.Loop.from_state`

Open PhillippOhlandt opened this issue 3 years ago • 1 comments

Hey,

Resuming from an existing loop state does not seem to work properly right now.

For this, I am working with the `horses_or_humans` example.

I am returning the entire state in the `train_model` function:

  defp train_model(model, data, optimizer, epochs) do
    model
    |> Axon.Loop.trainer(:binary_cross_entropy, optimizer)
    |> Axon.Loop.metric(:accuracy)
    |> Axon.Loop.handle(:iteration_completed, &log_metrics(&1, :train))
    |> Map.put(:output_transform, fn state -> state end)
    |> Axon.Loop.run(data, epochs: epochs, iterations: 100, compiler: EXLA)
  end

This way, I can store the state and try to resume from it during another execution:

def run_continue() do
    case File.read("./horses_or_humans_continue_model") do
      {:ok, file} ->
        {model, model_state} = :erlang.binary_to_term(file)

        optimizer = Axon.Optimizers.adam(1.0e-4)
        centralized_optimizer = Axon.Updates.compose(Axon.Updates.centralize(), optimizer)

        data = data()
        IO.write("\n\nTraining model with gradient centralization\n\n")
        result =
          model
          |> Axon.Loop.trainer(:binary_cross_entropy, centralized_optimizer)
          |> Axon.Loop.from_state(model_state)
          |> Axon.Loop.metric(:accuracy)
          |> Axon.Loop.handle(:iteration_completed, &log_metrics(&1, :train))
          |> Map.put(:output_transform, fn state -> state end)
          |> Axon.Loop.run(data, epochs: 5, iterations: 100, compiler: EXLA)

        binary = :erlang.term_to_binary({model, result})
        File.write!("./horses_or_humans_continue_model", binary)
      _ ->
        model = build_model({nil, 4, 300, 300})
        optimizer = Axon.Optimizers.adam(1.0e-4)
        centralized_optimizer = Axon.Updates.compose(Axon.Updates.centralize(), optimizer)
        data = data()
        IO.write("\n\nTraining model with gradient centralization\n\n")
        result = train_model(model, data, centralized_optimizer, 5)
        binary = :erlang.term_to_binary({model, result})
        File.write!("./horses_or_humans_continue_model", binary)
    end
  end

But there seems to be an issue with the anonymous functions contained in the serialized state:

phillipp@PHILLIPP-TWN:/mnt/c/Users/Phillipp/Code/axon$ ELIXIR_ERL_OPTIONS="+sssdio 128" XLA_TARGET=cuda110 elixir examples/vision/horses_or_humans.exs

Training model with gradient centralization


01:20:09.894 [info]  could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.

01:20:09.900 [info]  XLA service 0x7f2624008320 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:

01:20:09.900 [info]    StreamExecutor device (0): NVIDIA GeForce GTX 970, Compute Capability 5.2

01:20:09.900 [info]  Using BFC allocator.

01:20:09.900 [info]  XLA backend allocating 3191431496 bytes on device 0 for BFCAllocator.

01:20:11.045 [info]  Start cannot spawn child process: No such file or directory

01:20:26.776 [info]  Loaded cuDNN version 8005
Epoch: 0, Batch: 101, Loss: 1.14877 accuracy: 0.31095 loss: 1.15279
Epoch: 1, Batch: 101, Loss: 0.92957 accuracy: 0.32828 loss: 0.93075
Epoch: 2, Batch: 101, Loss: 0.85162 accuracy: 0.42915 loss: 0.85221
Epoch: 3, Batch: 101, Loss: 0.81645 accuracy: 0.44895 loss: 0.81675
Epoch: 4, Batch: 101, Loss: 0.79185 accuracy: 0.48639 loss: 0.79206




phillipp@PHILLIPP-TWN:/mnt/c/Users/Phillipp/Code/axon$ ELIXIR_ERL_OPTIONS="+sssdio 128" XLA_TARGET=cuda110 elixir examples/vision/horses_or_humans.exs

Training model with gradient centralization


01:25:31.809 [info]  could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.

01:25:33.707 [info]  XLA service 0x7f555c009b50 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:

01:25:33.707 [info]    StreamExecutor device (0): NVIDIA GeForce GTX 970, Compute Capability 5.2

01:25:33.707 [info]  Using BFC allocator.

01:25:33.707 [info]  XLA backend allocating 3191431496 bytes on device 0 for BFCAllocator.

01:25:33.986 [info]  Start cannot spawn child process: No such file or directory
** (FunctionClauseError) no function clause matching in anonymous fn/2 in Axon.Loop.build_batch_fn/2

    The following arguments were given to anonymous fn/2 in Axon.Loop.build_batch_fn/2:

        # 1
        {0, %{"accuracy" => #Nx.Tensor<
             f32

             Nx.Defn.Expr
             parameter a:53   f32
           >, "loss" => #Nx.Tensor<
             f32

             Nx.Defn.Expr
             parameter a:54   f32
           >}}

        # 2
        {"accuracy", {#Function<7.87141647/3 in Axon.Metrics.running_average/1>, :accuracy}}

    (axon 0.1.0-dev) lib/axon/loop.ex:1146: anonymous fn/2 in Axon.Loop.build_batch_fn/2
    (elixir 1.13.3) lib/enum.ex:3724: anonymous fn/4 in Enum.zip_with/3
    (elixir 1.13.3) lib/enum.ex:3784: anonymous fn/3 in Enum.zip_reduce/4
    (elixir 1.13.3) lib/stream.ex:1337: Stream.do_zip_next/6
    (elixir 1.13.3) lib/stream.ex:1274: Stream.do_zip_enum/4
    (elixir 1.13.3) lib/enum.ex:3785: Enum.zip_reduce/4
    (elixir 1.13.3) lib/enum.ex:3724: Enum.zip_with/3
    (axon 0.1.0-dev) lib/axon/loop.ex:1146: anonymous fn/4 in Axon.Loop.build_batch_fn/2

PhillippOhlandt avatar Feb 21 '22 15:02 PhillippOhlandt

@PhillippOhlandt Can you please try saving your training progress with `Axon.Loop.checkpoint` and then resuming training from the checkpoint using `Axon.Loop.from_state` now? If it works, then we can close this :)

seanmor5 avatar Mar 03 '22 23:03 seanmor5