
Slow metal performance

Open piedshag opened this issue 4 months ago • 1 comment

Describe the bug

I'm getting about 0.03 tokens/second running Qwen2 0.5B with the Metal backend on a MacBook M2 with 8 GB of RAM. With the Candle CPU backend I get about 20 tokens/second. The generated output also isn't right.

I get the same behaviour with both fp16 and fp32.
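
For context, the tokens/second numbers above come from simple wall-clock timing around the decode loop, along these lines (a minimal sketch using a hypothetical generate_next_token stand-in, not the actual lm code):

```rust
use std::time::Instant;

// Hypothetical stand-in for one decode step; the real model code is not shown here.
fn generate_next_token(step: usize) -> u32 {
    step as u32
}

fn main() {
    let num_tokens = 64;
    let start = Instant::now();
    for step in 0..num_tokens {
        let _token = generate_next_token(step);
    }
    // Throughput = tokens generated divided by elapsed wall-clock time.
    let elapsed = start.elapsed().as_secs_f64();
    println!("{:.2} tokens/second", num_tokens as f64 / elapsed);
}
```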

To Reproduce

  1. Clone https://github.com/piedshag/lm
  2. Run cargo run --release --features metal (backend selection sketched below)
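
I don't know exactly how the lm crate wires up its metal feature, but presumably it switches the burn backend type at compile time, roughly like the sketch below (the feature plumbing and device setup here are my guess, not the repo's actual code; burn type names are from memory and may differ between versions):

```rust
// Sketch only: how a crate-level `metal` cargo feature *might* pick the burn
// backend at compile time. This is an assumption about the lm repo, not its source.

#[cfg(feature = "metal")]
type B = burn::backend::Wgpu; // wgpu runtime, which uses Metal on macOS

#[cfg(not(feature = "metal"))]
type B = burn::backend::Candle; // candle CPU backend otherwise

#[cfg(feature = "metal")]
fn default_device() -> burn::backend::wgpu::WgpuDevice {
    burn::backend::wgpu::WgpuDevice::default()
}

#[cfg(not(feature = "metal"))]
fn default_device() -> burn::backend::candle::CandleDevice {
    burn::backend::candle::CandleDevice::default()
}

fn main() {
    let device = default_device();
    // Tiny matmul just to confirm the selected backend actually executes.
    let a = burn::tensor::Tensor::<B, 2>::ones([2, 2], &device);
    let b = burn::tensor::Tensor::<B, 2>::ones([2, 2], &device);
    println!("{}", a.matmul(b));
}
```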

Expected behavior

I would expect the Metal backend to be faster than the CPU backend.

Additional context

Logs

[2025-08-03T00:25:38Z INFO  lm] config: "{\n  \"architectures\": [\n    \"Qwen2ForCausalLM\"\n  ],\n  \"attention_dropout\": 0.0,\n  \"bos_token_id\": 151643,\n  \"eos_token_id\": 151643,\n  \"hidden_act\": \"silu\",\n  \"hidden_size\": 896,\n  \"initializer_range\": 0.02,\n  \"intermediate_size\": 4864,\n  \"max_position_embeddings\": 131072,\n  \"max_window_layers\": 24,\n  \"model_type\": \"qwen2\",\n  \"num_attention_heads\": 14,\n  \"num_hidden_layers\": 24,\n  \"num_key_value_heads\": 2,\n  \"rms_norm_eps\": 1e-06,\n  \"rope_theta\": 1000000.0,\n  \"sliding_window\": 131072,\n  \"tie_word_embeddings\": true,\n  \"torch_dtype\": \"bfloat16\",\n  \"transformers_version\": \"4.40.1\",\n  \"use_cache\": true,\n  \"use_sliding_window\": false,\n  \"vocab_size\": 151936\n}\n"
[2025-08-03T00:25:38Z INFO  cubecl_wgpu::runtime] Using adapter AdapterInfo { name: "Apple M2", vendor: 0, device: 0, device_type: IntegratedGpu, driver: "", driver_info: "", backend: Metal }
[2025-08-03T00:25:38Z INFO  cubecl_wgpu::runtime] Created wgpu compute server on device Device { inner: Core(CoreDevice { context: ContextWgpuCore { type: "Native" }, id: Id(0,1), error_sink: Mutex { data: ErrorSink }, features: Features { features_wgpu: FeaturesWGPU(SHADER_FLOAT32_ATOMIC | TEXTURE_FORMAT_16BIT_NORM | TEXTURE_COMPRESSION_ASTC_HDR | TEXTURE_ADAPTER_SPECIFIC_FORMAT_FEATURES | TIMESTAMP_QUERY_INSIDE_ENCODERS | TEXTURE_BINDING_ARRAY | SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING | STORAGE_TEXTURE_ARRAY_NON_UNIFORM_INDEXING | PARTIALLY_BOUND_BINDING_ARRAY | MULTI_DRAW_INDIRECT | PUSH_CONSTANTS | ADDRESS_MODE_CLAMP_TO_ZERO | ADDRESS_MODE_CLAMP_TO_BORDER | POLYGON_MODE_LINE | VERTEX_WRITABLE_STORAGE | CLEAR_TEXTURE | MSL_SHADER_PASSTHROUGH | SHADER_PRIMITIVE_INDEX | SHADER_INT64 | SUBGROUP | SUBGROUP_BARRIER | SHADER_INT64_ATOMIC_MIN_MAX), features_webgpu: FeaturesWebGPU(DEPTH_CLIP_CONTROL | DEPTH32FLOAT_STENCIL8 | TEXTURE_COMPRESSION_BC | TEXTURE_COMPRESSION_BC_SLICED_3D | TEXTURE_COMPRESSION_ETC2 | TEXTURE_COMPRESSION_ASTC | TIMESTAMP_QUERY | INDIRECT_FIRST_INSTANCE | SHADER_F16 | RG11B10UFLOAT_RENDERABLE | BGRA8UNORM_STORAGE | FLOAT32_FILTERABLE | DUAL_SOURCE_BLENDING) } }) } => AdapterInfo { name: "Apple M2", vendor: 0, device: 0, device_type: IntegratedGpu, driver: "", driver_info: "", backend: Metal }
[2025-08-03T00:25:40Z INFO  cubecl_runtime::tune::tune_cache] Load autotune cache ...
[2025-08-03T00:25:40Z INFO  cubecl_runtime::tune::tune_cache] Loaded 38 autotune cached entries
[2025-08-03T00:25:40Z INFO  cubecl_runtime::tune::tuner] Tuning FusedMatmulAutotuneKey - MatmulKey: MatmulAutotuneKey { definition: MatmulProblemDefinition { m: 131072, n: 32, k: 1, lhs_pow2_factor: 0, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: Contiguous }, analysis: MatmulAutotuneAnalysis { scale_global: Large, kind: OuterProduct } }, NumOutBuffers: 2, NumOps: 4
[2025-08-03T00:25:40Z INFO  cubecl_runtime::tune::tune_cache] Load autotune cache ...
[2025-08-03T00:25:40Z INFO  cubecl_runtime::tune::tune_cache] Loaded 27 autotune cached entries
[2025-08-03T00:25:40Z WARN  cubecl_runtime::tune::tune_cache] Autotune the same function multiple times for key PersistentCacheKey { key: FusedMatmulAutotuneKey { matmul_key: MatmulAutotuneKey { definition: MatmulProblemDefinition { m: 131072, n: 32, k: 1, lhs_pow2_factor: 0, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: Contiguous }, analysis: MatmulAutotuneAnalysis { scale_global: Large, kind: OuterProduct } }, num_out_buffers: 2, num_ops: 4 }, checksum: "932a677d265ed64ef5703fc5b76bfa4e" } => old PersistentCacheValue { fastest_index: 1, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::SimpleUnit>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 1, computation: BenchmarkComputations { mean: 408.55µs, median: 408.25µs, variance: 0ns, min: 364.084µs, max: 451.208µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 1.142249ms, median: 1.09275ms, variance: 30ns, min: 1.002792ms, max: 1.608ms } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::Specialized>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 6, computation: BenchmarkComputations { mean: 3.087254ms, median: 3.137ms, variance: 68ns, min: 2.45975ms, max: 3.486667ms } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::DoubleBuffering>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 7, computation: BenchmarkComputations { mean: 3.244416ms, median: 3.281375ms, variance: 6ns, min: 3.147833ms, max: 3.36025ms } }), Err(Skip), Err(Skip), Err(Skip), Err(Skip)] }, new PersistentCacheValue { fastest_index: 1, results: [Ok(AutotuneOutcome { name: 
"cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::SimpleUnit>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 1, computation: BenchmarkComputations { mean: 470.533µs, median: 466.333µs, variance: 0ns, min: 442.083µs, max: 510.75µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 933.191µs, median: 916.958µs, variance: 4ns, min: 824.708µs, max: 1.036208ms } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::Specialized>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 6, computation: BenchmarkComputations { mean: 3.344645ms, median: 3.357041ms, variance: 111ns, min: 2.7385ms, max: 3.756917ms } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::DoubleBuffering>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 7, computation: BenchmarkComputations { mean: 3.367162ms, median: 3.376542ms, variance: 53ns, min: 3.147ms, max: 3.81825ms } }), Err(Skip), Err(Skip), Err(Skip), Err(Skip)] }
[2025-08-03T00:26:04Z INFO  cubecl_runtime::tune::tune_cache] Load autotune cache ...
[2025-08-03T00:26:04Z INFO  cubecl_runtime::tune::tune_cache] Loaded 28 autotune cached entries
[2025-08-03T00:26:04Z INFO  cubecl_runtime::tune::tuner] Tuning FusedReduceAutotuneKey - ReduceKey: ReduceAutotuneKey { elem_input: Float(F32), elem_output: Float(F32), elem_acc: Float(F32), potential_line_size: 4, axis_is_contiguous: true, reduce_axis_shape: 1024, reduce_count: 16 }, FuseNumReads: 1, FuseNumWrites: 1, FuseNumOps: 2
[2025-08-03T00:26:04Z INFO  cubecl_runtime::tune::tune_cache] Load autotune cache ...
[2025-08-03T00:26:04Z INFO  cubecl_runtime::tune::tune_cache] Loaded 12 autotune cached entries
[2025-08-03T00:26:05Z WARN  cubecl_runtime::tune::tune_cache] Autotune the same function multiple times for key PersistentCacheKey { key: FusedReduceAutotuneKey { reduce_key: ReduceAutotuneKey { elem_input: Float(F32), elem_output: Float(F32), elem_acc: Float(F32), potential_line_size: 4, axis_is_contiguous: true, reduce_axis_shape: 1024, reduce_count: 16 }, fuse_num_reads: 1, fuse_num_writes: 1, fuse_num_ops: 2 }, checksum: "06b3b6e52acfaae8a1675f73d783b460" } => old PersistentCacheValue { fastest_index: 0, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::reduce::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::reduce::optimization::ReduceOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 53.141µs, median: 52.834µs, variance: 0ns, min: 52.292µs, max: 56.375µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::reduce::tune::tune_reduce<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::reduce::optimization::ReduceOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 1, computation: BenchmarkComputations { mean: 56.808µs, median: 56.292µs, variance: 0ns, min: 47.875µs, max: 71.75µs } }), Err(Unknown("RunnerError(LaunchError(PlanesUnavailable))")), Err(Unknown("RunnerError(LaunchError(PlanesUnavailable))"))] }, new PersistentCacheValue { fastest_index: 0, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::reduce::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::reduce::optimization::ReduceOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 28.891µs, median: 29.125µs, variance: 0ns, min: 24.416µs, max: 39.375µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::reduce::tune::tune_reduce<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::reduce::optimization::ReduceOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 1, computation: BenchmarkComputations { mean: 89.691µs, median: 90.666µs, variance: 0ns, min: 85.5µs, max: 98.333µs } }), Err(Unknown("RunnerError(LaunchError(PlanesUnavailable))")), Err(Unknown("RunnerError(LaunchError(PlanesUnavailable))"))] }
[2025-08-03T00:26:05Z INFO  cubecl_runtime::tune::tuner] Tuning FusedMatmulAutotuneKey - MatmulKey: MatmulAutotuneKey { definition: MatmulProblemDefinition { m: 8, n: 8, k: 64, lhs_pow2_factor: 3, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: MildlyPermuted { transposed: true, batch_swap: false } }, analysis: MatmulAutotuneAnalysis { scale_global: Small, kind: General } }, NumOutBuffers: 1, NumOps: 2
[2025-08-03T00:26:07Z WARN  cubecl_runtime::tune::tune_cache] Autotune the same function multiple times for key PersistentCacheKey { key: FusedMatmulAutotuneKey { matmul_key: MatmulAutotuneKey { definition: MatmulProblemDefinition { m: 8, n: 8, k: 64, lhs_pow2_factor: 3, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: MildlyPermuted { transposed: true, batch_swap: false } }, analysis: MatmulAutotuneAnalysis { scale_global: Small, kind: General } }, num_out_buffers: 1, num_ops: 2 }, checksum: "932a677d265ed64ef5703fc5b76bfa4e" } => old PersistentCacheValue { fastest_index: 3, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::Simple>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 3, computation: BenchmarkComputations { mean: 11.9µs, median: 11.875µs, variance: 0ns, min: 11.208µs, max: 13.208µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 24.412µs, median: 24.5µs, variance: 0ns, min: 24.125µs, max: 24.792µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::Ordered>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 5, computation: BenchmarkComputations { mean: 49.383µs, median: 49.375µs, variance: 0ns, min: 49.208µs, max: 49.583µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::SimpleMultiRows>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 4, computation: BenchmarkComputations { mean: 163.02µs, median: 161.833µs, variance: 0ns, min: 161.542µs, max: 173.75µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::SimpleUnit>, 
fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 1, computation: BenchmarkComputations { mean: 303.47µs, median: 319.584µs, variance: 0ns, min: 274.5µs, max: 337.833µs } }), Err(Skip), Err(Skip), Err(Skip)] }, new PersistentCacheValue { fastest_index: 0, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 15.179µs, median: 14.208µs, variance: 0ns, min: 13.75µs, max: 18.834µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::Simple>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 3, computation: BenchmarkComputations { mean: 21.045µs, median: 21.042µs, variance: 0ns, min: 20.958µs, max: 21.292µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::Ordered>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 5, computation: BenchmarkComputations { mean: 83.004µs, median: 90.625µs, variance: 0ns, min: 44.333µs, max: 98.292µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::SimpleMultiRows>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 4, computation: BenchmarkComputations { mean: 149.179µs, median: 148.209µs, variance: 0ns, min: 147.75µs, max: 159µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::SimpleUnit>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, 
burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 1, computation: BenchmarkComputations { mean: 568.379µs, median: 567.958µs, variance: 0ns, min: 563.667µs, max: 575.084µs } }), Err(Skip), Err(Skip), Err(Skip)] }
[2025-08-03T00:26:07Z INFO  cubecl_runtime::tune::tuner] Tuning FusedReduceAutotuneKey - ReduceKey: ReduceAutotuneKey { elem_input: Float(F32), elem_output: Float(F32), elem_acc: Float(F32), potential_line_size: 2, axis_is_contiguous: true, reduce_axis_shape: 16, reduce_count: 256 }, FuseNumReads: 2, FuseNumWrites: 2, FuseNumOps: 1
[2025-08-03T00:26:08Z INFO  cubecl_runtime::tune::tuner] Tuning FusedReduceAutotuneKey - ReduceKey: ReduceAutotuneKey { elem_input: Float(F32), elem_output: Float(F32), elem_acc: Float(F32), potential_line_size: 2, axis_is_contiguous: true, reduce_axis_shape: 16, reduce_count: 256 }, FuseNumReads: 2, FuseNumWrites: 2, FuseNumOps: 2
[2025-08-03T00:26:08Z INFO  cubecl_runtime::tune::tuner] Tuning MatmulAutotuneKey - Definition: MatmulProblemDefinition { m: 8, n: 64, k: 8, lhs_pow2_factor: 1, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: Contiguous }, Analysis: MatmulAutotuneAnalysis { scale_global: Small, kind: General }
[2025-08-03T00:26:08Z INFO  cubecl_runtime::tune::tuner] Tuning FusedMatmulAutotuneKey - MatmulKey: MatmulAutotuneKey { definition: MatmulProblemDefinition { m: 8, n: 8192, k: 1024, lhs_pow2_factor: 3, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: Contiguous }, analysis: MatmulAutotuneAnalysis { scale_global: Large, kind: General } }, NumOutBuffers: 1, NumOps: 2
[2025-08-03T00:26:09Z WARN  cubecl_runtime::tune::tune_cache] Autotune the same function multiple times for key PersistentCacheKey { key: FusedMatmulAutotuneKey { matmul_key: MatmulAutotuneKey { definition: MatmulProblemDefinition { m: 8, n: 8192, k: 1024, lhs_pow2_factor: 3, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: Contiguous }, analysis: MatmulAutotuneAnalysis { scale_global: Large, kind: General } }, num_out_buffers: 1, num_ops: 2 }, checksum: "932a677d265ed64ef5703fc5b76bfa4e" } => old PersistentCacheValue { fastest_index: 3, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::Simple>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 3, computation: BenchmarkComputations { mean: 625.524µs, median: 625.583µs, variance: 0ns, min: 622.625µs, max: 628µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 672.837µs, median: 674.042µs, variance: 0ns, min: 662µs, max: 680.291µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::Ordered>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 5, computation: BenchmarkComputations { mean: 1.026483ms, median: 902.125µs, variance: 72ns, min: 889.084µs, max: 1.8015ms } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::SimpleMultiRows>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 4, computation: BenchmarkComputations { mean: 2.174883ms, median: 2.361208ms, variance: 101ns, min: 1.716667ms, max: 2.637167ms } }), Err(Skip), Err(Skip), Err(Skip), Err(Skip)] }, new PersistentCacheValue { fastest_index: 3, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, 
u8, burn_cubecl_fusion::matmul::optimization::Simple>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 3, computation: BenchmarkComputations { mean: 631.712µs, median: 624.917µs, variance: 0ns, min: 622.666µs, max: 671.917µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 674.908µs, median: 674.709µs, variance: 0ns, min: 669.5µs, max: 682.167µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::Ordered>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 5, computation: BenchmarkComputations { mean: 708.887µs, median: 709.625µs, variance: 0ns, min: 703.875µs, max: 714.833µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::SimpleMultiRows>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 4, computation: BenchmarkComputations { mean: 3.438262ms, median: 2.549792ms, variance: 1.702µs, min: 2.358958ms, max: 5.62525ms } }), Err(Skip), Err(Skip), Err(Skip), Err(Skip)] }
[2025-08-03T00:26:09Z INFO  cubecl_runtime::tune::tuner] Tuning FusedMatmulAutotuneKey - MatmulKey: MatmulAutotuneKey { definition: MatmulProblemDefinition { m: 8, n: 1024, k: 8192, lhs_pow2_factor: 3, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: Contiguous }, analysis: MatmulAutotuneAnalysis { scale_global: Large, kind: General } }, NumOutBuffers: 2, NumOps: 4
[2025-08-03T00:26:09Z WARN  cubecl_runtime::tune::tune_cache] Autotune the same function multiple times for key PersistentCacheKey { key: FusedMatmulAutotuneKey { matmul_key: MatmulAutotuneKey { definition: MatmulProblemDefinition { m: 8, n: 1024, k: 8192, lhs_pow2_factor: 3, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: Contiguous }, analysis: MatmulAutotuneAnalysis { scale_global: Large, kind: General } }, num_out_buffers: 2, num_ops: 4 }, checksum: "932a677d265ed64ef5703fc5b76bfa4e" } => old PersistentCacheValue { fastest_index: 0, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 580.895µs, median: 529.792µs, variance: 7ns, min: 502.75µs, max: 712.958µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::Simple>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 3, computation: BenchmarkComputations { mean: 739.266µs, median: 739.542µs, variance: 0ns, min: 735.458µs, max: 743.5µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::Ordered>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 5, computation: BenchmarkComputations { mean: 1.81555ms, median: 2.2415ms, variance: 309ns, min: 1.211166ms, max: 2.418375ms } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::SimpleMultiRows>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 4, computation: BenchmarkComputations { mean: 2.728258ms, median: 2.507083ms, variance: 193ns, min: 2.330375ms, max: 3.298041ms } }), Err(Skip), Err(Skip), Err(Skip), Err(Skip)] }, new PersistentCacheValue { fastest_index: 3, results: [Ok(AutotuneOutcome { name: 
"cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::Simple>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 3, computation: BenchmarkComputations { mean: 326.379µs, median: 311.375µs, variance: 0ns, min: 295.667µs, max: 376.334µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 543.683µs, median: 546.166µs, variance: 0ns, min: 518.458µs, max: 574.042µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::Ordered>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 5, computation: BenchmarkComputations { mean: 949.662µs, median: 948.541µs, variance: 0ns, min: 941.208µs, max: 968.875µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::SimpleMultiRows>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 4, computation: BenchmarkComputations { mean: 2.988041ms, median: 3.170333ms, variance: 107ns, min: 2.436375ms, max: 3.480916ms } }), Err(Skip), Err(Skip), Err(Skip), Err(Skip)] }
[2025-08-03T00:26:13Z INFO  cubecl_runtime::tune::tuner] Tuning FusedReduceAutotuneKey - ReduceKey: ReduceAutotuneKey { elem_input: Float(F32), elem_output: Float(F32), elem_acc: Float(F32), potential_line_size: 4, axis_is_contiguous: true, reduce_axis_shape: 4096, reduce_count: 1 }, FuseNumReads: 2, FuseNumWrites: 2, FuseNumOps: 2
[2025-08-03T00:26:16Z WARN  cubecl_runtime::tune::tune_cache] Autotune the same function multiple times for key PersistentCacheKey { key: FusedReduceAutotuneKey { reduce_key: ReduceAutotuneKey { elem_input: Float(F32), elem_output: Float(F32), elem_acc: Float(F32), potential_line_size: 4, axis_is_contiguous: true, reduce_axis_shape: 4096, reduce_count: 1 }, fuse_num_reads: 2, fuse_num_writes: 2, fuse_num_ops: 2 }, checksum: "06b3b6e52acfaae8a1675f73d783b460" } => old PersistentCacheValue { fastest_index: 0, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::reduce::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::reduce::optimization::ReduceOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 103.579µs, median: 103.792µs, variance: 0ns, min: 95.209µs, max: 115.5µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::reduce::tune::tune_reduce<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::reduce::optimization::ReduceOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 1, computation: BenchmarkComputations { mean: 10.593245ms, median: 10.3755ms, variance: 442ns, min: 10.346166ms, max: 12.588125ms } }), Err(Unknown("RunnerError(LaunchError(PlanesUnavailable))")), Err(Unknown("RunnerError(LaunchError(PlanesUnavailable))"))] }, new PersistentCacheValue { fastest_index: 0, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::reduce::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::reduce::optimization::ReduceOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 84.745µs, median: 77.125µs, variance: 0ns, min: 73.542µs, max: 107.333µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::reduce::tune::tune_reduce<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::reduce::optimization::ReduceOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 1, computation: BenchmarkComputations { mean: 11.01017ms, median: 11.015834ms, variance: 49ns, min: 10.704417ms, max: 11.438833ms } }), Err(Unknown("RunnerError(LaunchError(PlanesUnavailable))")), Err(Unknown("RunnerError(LaunchError(PlanesUnavailable))"))] }
[2025-08-03T00:26:16Z INFO  cubecl_runtime::tune::tuner] Tuning FusedReduceAutotuneKey - ReduceKey: ReduceAutotuneKey { elem_input: Float(F32), elem_output: Float(F32), elem_acc: Float(F32), potential_line_size: 4, axis_is_contiguous: true, reduce_axis_shape: 1024, reduce_count: 1 }, FuseNumReads: 1, FuseNumWrites: 2, FuseNumOps: 2
[2025-08-03T00:26:17Z WARN  cubecl_runtime::tune::tune_cache] Autotune the same function multiple times for key PersistentCacheKey { key: FusedReduceAutotuneKey { reduce_key: ReduceAutotuneKey { elem_input: Float(F32), elem_output: Float(F32), elem_acc: Float(F32), potential_line_size: 4, axis_is_contiguous: true, reduce_axis_shape: 1024, reduce_count: 1 }, fuse_num_reads: 1, fuse_num_writes: 2, fuse_num_ops: 2 }, checksum: "06b3b6e52acfaae8a1675f73d783b460" } => old PersistentCacheValue { fastest_index: 0, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::reduce::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::reduce::optimization::ReduceOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 22.429µs, median: 22.292µs, variance: 0ns, min: 21µs, max: 24.708µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::reduce::tune::tune_reduce<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::reduce::optimization::ReduceOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 1, computation: BenchmarkComputations { mean: 89.845µs, median: 89.583µs, variance: 0ns, min: 89.167µs, max: 91.25µs } }), Err(Unknown("RunnerError(LaunchError(PlanesUnavailable))")), Err(Unknown("RunnerError(LaunchError(PlanesUnavailable))"))] }, new PersistentCacheValue { fastest_index: 0, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::reduce::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::reduce::optimization::ReduceOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 61.033µs, median: 60.25µs, variance: 0ns, min: 58.166µs, max: 70.708µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::reduce::tune::tune_reduce<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::reduce::optimization::ReduceOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 1, computation: BenchmarkComputations { mean: 279.662µs, median: 281.125µs, variance: 0ns, min: 273.458µs, max: 282.917µs } }), Err(Unknown("RunnerError(LaunchError(PlanesUnavailable))")), Err(Unknown("RunnerError(LaunchError(PlanesUnavailable))"))] }
[2025-08-03T00:26:18Z INFO  cubecl_runtime::tune::tuner] Tuning FusedMatmulAutotuneKey - MatmulKey: MatmulAutotuneKey { definition: MatmulProblemDefinition { m: 1, n: 8, k: 64, lhs_pow2_factor: 3, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: MildlyPermuted { transposed: true, batch_swap: false } }, analysis: MatmulAutotuneAnalysis { scale_global: Small, kind: VecMat } }, NumOutBuffers: 1, NumOps: 2
[2025-08-03T00:26:20Z WARN  cubecl_runtime::tune::tune_cache] Autotune the same function multiple times for key PersistentCacheKey { key: FusedMatmulAutotuneKey { matmul_key: MatmulAutotuneKey { definition: MatmulProblemDefinition { m: 1, n: 8, k: 64, lhs_pow2_factor: 3, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: MildlyPermuted { transposed: true, batch_swap: false } }, analysis: MatmulAutotuneAnalysis { scale_global: Small, kind: VecMat } }, num_out_buffers: 1, num_ops: 2 }, checksum: "932a677d265ed64ef5703fc5b76bfa4e" } => old PersistentCacheValue { fastest_index: 0, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 23.67µs, median: 24.208µs, variance: 0ns, min: 22.416µs, max: 24.791µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::SimpleUnit>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 1, computation: BenchmarkComputations { mean: 32.329µs, median: 32.125µs, variance: 0ns, min: 31.5µs, max: 33.209µs } }), Err(Skip), Err(Skip), Err(Skip), Err(Skip), Err(Skip), Err(Skip)] }, new PersistentCacheValue { fastest_index: 0, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 23.4µs, median: 23.334µs, variance: 0ns, min: 23.083µs, max: 23.917µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::SimpleUnit>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 1, computation: BenchmarkComputations { mean: 31.883µs, median: 31.792µs, variance: 0ns, min: 31.583µs, max: 32.958µs } }), Err(Skip), Err(Skip), Err(Skip), Err(Skip), Err(Skip), Err(Skip)] }
[2025-08-03T00:26:20Z INFO  cubecl_runtime::tune::tuner] Tuning FusedMatmulAutotuneKey - MatmulKey: MatmulAutotuneKey { definition: MatmulProblemDefinition { m: 1, n: 1024, k: 8192, lhs_pow2_factor: 3, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: Contiguous }, analysis: MatmulAutotuneAnalysis { scale_global: Large, kind: VecMat } }, NumOutBuffers: 2, NumOps: 4
[2025-08-03T00:26:24Z WARN  cubecl_runtime::tune::tune_cache] Autotune the same function multiple times for key PersistentCacheKey { key: FusedMatmulAutotuneKey { matmul_key: MatmulAutotuneKey { definition: MatmulProblemDefinition { m: 1, n: 1024, k: 8192, lhs_pow2_factor: 3, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: Contiguous }, analysis: MatmulAutotuneAnalysis { scale_global: Large, kind: VecMat } }, num_out_buffers: 2, num_ops: 4 }, checksum: "932a677d265ed64ef5703fc5b76bfa4e" } => old PersistentCacheValue { fastest_index: 1, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::SimpleUnit>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 1, computation: BenchmarkComputations { mean: 438.641µs, median: 435.834µs, variance: 0ns, min: 424.5µs, max: 476.833µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 550.279µs, median: 480.458µs, variance: 23ns, min: 426.333µs, max: 835.167µs } }), Err(Skip), Err(Skip), Err(Skip), Err(Skip), Err(Skip), Err(Skip)] }, new PersistentCacheValue { fastest_index: 0, results: [Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fallback<cubecl_wgpu::runtime::WgpuRuntime, u8>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 0, computation: BenchmarkComputations { mean: 801.4µs, median: 803.5µs, variance: 0ns, min: 787.792µs, max: 812.25µs } }), Ok(AutotuneOutcome { name: "cubecl_runtime::tune::function_tunable::FunctionTunable<burn_cubecl_fusion::matmul::tune::tune_fused<cubecl_wgpu::runtime::WgpuRuntime, u8, burn_cubecl_fusion::matmul::optimization::SimpleUnit>, fn(burn_cubecl_fusion::tune::TuneInput<cubecl_wgpu::runtime::WgpuRuntime, burn_cubecl_fusion::matmul::optimization::MatmulOptimizationTuneArg<cubecl_wgpu::runtime::WgpuRuntime>>) -> core::result::Result<burn_cubecl_fusion::shared::trace::base::TuneOutput<cubecl_wgpu::runtime::WgpuRuntime>, alloc::string::String>>", index: 1, computation: BenchmarkComputations { mean: 821.883µs, median: 822.958µs, variance: 0ns, min: 816.875µs, max: 826.667µs } }), Err(Skip), Err(Skip), Err(Skip), Err(Skip), Err(Skip), Err(Skip)] }
[2025-08-03T00:26:34Z INFO  cubecl_runtime::tune::tuner] Tuning FusedMatmulAutotuneKey - MatmulKey: MatmulAutotuneKey { definition: MatmulProblemDefinition { m: 1, n: 16, k: 64, lhs_pow2_factor: 3, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: MildlyPermuted { transposed: true, batch_swap: false } }, analysis: MatmulAutotuneAnalysis { scale_global: Small, kind: VecMat } }, NumOutBuffers: 1, NumOps: 2
[2025-08-03T00:26:36Z INFO  cubecl_runtime::tune::tuner] Tuning MatmulAutotuneKey - Definition: MatmulProblemDefinition { m: 1, n: 64, k: 16, lhs_pow2_factor: 0, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: Contiguous }, Analysis: MatmulAutotuneAnalysis { scale_global: Small, kind: VecMat }
[2025-08-03T00:26:42Z INFO  cubecl_runtime::tune::tuner] Tuning MatmulAutotuneKey - Definition: MatmulProblemDefinition { m: 1, n: 64, k: 16, lhs_pow2_factor: 1, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: Contiguous }, Analysis: MatmulAutotuneAnalysis { scale_global: Small, kind: VecMat }
[2025-08-03T00:27:11Z INFO  cubecl_runtime::tune::tuner] Tuning MatmulAutotuneKey - Definition: MatmulProblemDefinition { m: 1, n: 64, k: 16, lhs_pow2_factor: 2, rhs_pow2_factor: 3, elem_lhs: Float(F32), elem_rhs: Float(F32), elem_out: Float(F32), matrix_layout_lhs: Contiguous, matrix_layout_rhs: Contiguous }, Analysis: MatmulAutotuneAnalysis { scale_global: Small, kind: VecMat }

piedshag · Aug 03 '25

I'm finding the same thing: WGPU on a MacBook Air is fast for the first ~100 inferences of a smallish LSTM model, but slows down to 1-2 s per inference for the remaining 900 of a 1,000-inference test. NdArray is much faster and doesn't slow down. I checked memory usage and it doesn't appear to be growing.
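
Roughly how I'm measuring it (a minimal sketch; run_inference is a stand-in for the real LSTM forward pass, which isn't shown here):

```rust
use std::time::Instant;

// Stand-in for one LSTM inference; the real model code is not shown here.
fn run_inference(i: usize) -> f32 {
    i as f32
}

fn main() {
    let total = 1_000;
    for i in 0..total {
        let start = Instant::now();
        let _out = run_inference(i);
        let ms = start.elapsed().as_secs_f64() * 1_000.0;
        // Print every 100th timing to see whether per-inference latency drifts up.
        if i % 100 == 0 {
            println!("inference {i}: {ms:.2} ms");
        }
    }
}
```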

chrishulbert · Aug 25 '25