GPUArrays.jl
GPUArrays.jl copied to clipboard
Avoid cartesian iteration where possible.
Re-land #454, which turned out to be problematic: it broke Flux.jl (https://github.com/FluxML/Flux.jl/issues/2214) and CUDA.jl (https://buildkite.com/julialang/cuda-dot-jl/builds/3750), and it causes excessive compilation with Transformers.jl (https://github.com/JuliaGPU/GPUArrays.jl/pull/454#issuecomment-1475116479).
TODO: look into these failures.
cc @maxwindiff
The error is from here:
Full log
GroupedConv Layer GPU grad test: Error During Test at /var/lib/buildkite-agent/builds/gpuci-14/julialang/flux-dot-jl/test/cuda/layers.jl:24
Got exception outside of a @test
Failed to compile PTX code (ptxas received signal 11)
Invocation arguments: --generate-line-info --verbose --gpu-name sm_80 --output-file /tmp/jl_QHhxFe.cubin /tmp/jl_wfwKli.ptx
If you think this is a bug, please file an issue and attach /tmp/jl_wfwKli.ptx
Stacktrace:
[1] error(s::String)
@ Base ./error.jl:33
[2] cufunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.Context)
@ CUDA ~/.cache/julia-buildkite-plugin/depots/d4264945-9bae-4dd2-a715-3cee20da2dbf/packages/CUDA/ZdCxS/src/compiler/execution.jl:435
[3] #219
@ ~/.cache/julia-buildkite-plugin/depots/d4264945-9bae-4dd2-a715-3cee20da2dbf/packages/CUDA/ZdCxS/src/compiler/execution.jl:354 [inlined]
[4] JuliaContext(f::CUDA.var"#219#220"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{GPUArrays.var"#broadcast_kernel#28", Tuple{CUDA.CuKernelContext, CuDeviceArray{Float32, 4, 1}, Val{CartesianIndex{4}[CartesianIndex(1, 1, 1, 1) CartesianIndex(1, 2, 1, 1) CartesianIndex(1, 3, 1, 1) .... CartesianIndex(26, 26, 25, 2)]}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4}, NTuple{4, Base.OneTo{Int64}}, typeof(+), Tuple{Base.Broadcast.Extruded{CuDeviceArray{Float32, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Base.Broadcast.Extruded{CuDeviceArray{Float32, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}}}, Int64}}; name::Nothing, always_inline::Bool, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
^ This is the log line with tons of CartesianIndex() entries; perhaps it is just poor stringification?
@ CUDA ~/.cache/julia-buildkite-plugin/depots/d4264945-9bae-4dd2-a715-3cee20da2dbf/packages/CUDA/ZdCxS/src/compiler/execution.jl:306
[8] cufunction
@ ~/.cache/julia-buildkite-plugin/depots/d4264945-9bae-4dd2-a715-3cee20da2dbf/packages/CUDA/ZdCxS/src/compiler/execution.jl:300 [inlined]
[9] macro expansion
@ ~/.cache/julia-buildkite-plugin/depots/d4264945-9bae-4dd2-a715-3cee20da2dbf/packages/CUDA/ZdCxS/src/compiler/execution.jl:102 [inlined]
[10] #launch_heuristic#243
@ ~/.cache/julia-buildkite-plugin/depots/d4264945-9bae-4dd2-a715-3cee20da2dbf/packages/CUDA/ZdCxS/src/gpuarrays.jl:17 [inlined]
[11] _copyto!
@ ~/.cache/julia-buildkite-plugin/depots/d4264945-9bae-4dd2-a715-3cee20da2dbf/packages/GPUArrays/7TiO1/src/host/broadcast.jl:96 [inlined]
[12] copyto!
@ ~/.cache/julia-buildkite-plugin/depots/d4264945-9bae-4dd2-a715-3cee20da2dbf/packages/GPUArrays/7TiO1/src/host/broadcast.jl:46 [inlined]
[13] copy
@ ~/.cache/julia-buildkite-plugin/depots/d4264945-9bae-4dd2-a715-3cee20da2dbf/packages/GPUArrays/7TiO1/src/host/broadcast.jl:37 [inlined]
[14] materialize
@ ./broadcast.jl:883 [inlined]
[15] broadcast(::typeof(+), ::CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, ::CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
@ Base.Broadcast ./broadcast.jl:821
[16] adjoint
@ ~/.cache/julia-buildkite-plugin/depots/d4264945-9bae-4dd2-a715-3cee20da2dbf/packages/Zygote/TSj5C/src/lib/broadcast.jl:82 [inlined]
[17] _pullback
@ ~/.cache/julia-buildkite-plugin/depots/d4264945-9bae-4dd2-a715-3cee20da2dbf/packages/ZygoteRules/AIbCs/src/adjoint.jl:65 [inlined]
[18] _pullback
@ /var/lib/buildkite-agent/builds/gpuci-14/julialang/flux-dot-jl/src/layers/conv.jl:202 [inlined]
[19] _pullback(ctx::Zygote.Context{true}, f::Conv{2, 4, typeof(identity), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, args::CuArray{Float32, 4, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.cache/julia-buildkite-plugin/depots/d4264945-9bae-4dd2-a715-3cee20da2dbf/packages/Zygote/TSj5C/src/compiler/interface2.jl:0
[20] _pullback
@ /var/lib/buildkite-agent/builds/gpuci-14/julialang/flux-dot-jl/test/cuda/layers.jl:39 [inlined]
[21] _pullback(::Zygote.Context{true}, ::var"#482#487"{Conv{2, 4, typeof(identity), CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}})
@ Zygote ~/.cache/julia-buildkite-plugin/depots/d4264945-9bae-4dd2-a715-3cee20da2dbf/packages/Zygote/TSj5C/src/compiler/interface2.jl:0
[22] pullback(f::Function, ps::Params{Zygote.Buffer{Any, Vector{Any}}})
@ Zygote ~/.cache/julia-buildkite-plugin/depots/d4264945-9bae-4dd2-a715-3cee20da2dbf/packages/Zygote/TSj5C/src/compiler/interface.jl:384
[23] macro expansion
@ /var/lib/buildkite-agent/builds/gpuci-14/julialang/flux-dot-jl/test/cuda/layers.jl:39 [inlined]
[24] macro expansion
@ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined]
[25] macro expansion
@ /var/lib/buildkite-agent/builds/gpuci-14/julialang/flux-dot-jl/test/cuda/layers.jl:27 [inlined]
[26] macro expansion
@ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined]
[27] gpu_gradtest(::String, ::Vector{Function}, ::Array{Float32, 4}, ::Tuple{Int64, Int64}, ::Vararg{Any, N} where N; test_cpu::Bool)
@ Main /var/lib/buildkite-agent/builds/gpuci-14/julialang/flux-dot-jl/test/cuda/layers.jl:23
[28] top-level scope
@ /var/lib/buildkite-agent/builds/gpuci-14/julialang/flux-dot-jl/test/cuda/layers.jl:93
[29] include(fname::String)
@ Base.MainInclude ./client.jl:444
[30] top-level scope
@ /var/lib/buildkite-agent/builds/gpuci-14/julialang/flux-dot-jl/test/cuda/runtests.jl:11
[31] include(fname::String)
@ Base.MainInclude ./client.jl:444
[32] macro expansion
@ /var/lib/buildkite-agent/builds/gpuci-14/julialang/flux-dot-jl/test/runtests.jl:53 [inlined]
[33] macro expansion
@ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined]
[34] macro expansion
@ /var/lib/buildkite-agent/builds/gpuci-14/julialang/flux-dot-jl/test/runtests.jl:52 [inlined]
[35] macro expansion
@ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined]
[36] top-level scope
@ /var/lib/buildkite-agent/builds/gpuci-14/julialang/flux-dot-jl/test/runtests.jl:16
[37] include(fname::String)
@ Base.MainInclude ./client.jl:444
[38] top-level scope
@ none:6
[39] eval
@ ./boot.jl:360 [inlined]
[40] exec_options(opts::Base.JLOptions)
@ Base ./client.jl:261
[41] _start()
@ Base ./client.jl:485
Unfortunately, I don't have a way to reproduce this.