Polyester.jl Using @threads works, using @batch has no effect

The following function

using LinearAlgebra, Polyester
function invMatU(X ::Matrix{tw}) ::Matrix{tw}  where {tw}
    m,n = size(X)
    Y = zeros(tw,m,n)
    _1 = one(tw)
    @inbounds Threads.@threads for j = n : -1 : 1  # backward-substitution; 
        if j≤m  
            Y[j,j] = inv(X[j,j]) 
        end
        @inbounds for i = min(j-1,m) : -1 : 1  
            Yij = X[i,j] * (j≤m ? Y[j,j] : _1)
            @inbounds for k = i+1 : min(j-1,m)    
                Yij += X[i,k]*Y[k,j] 
            end
            if i≤n  
                Y[i,j] = -Yij/X[i,i] 
            end 
        end 
    end
    return Y 
end;

computes the inverse of an upper-triangular matrix. If X is not square, it is assumed that it represents a larger square matrix, by extending (hcator vcat) by the identity matrix. We can see it works correctly, since

m,n=5,10; tw=Int; 
X=rand(tw.(0:10),m,n);  
triu!(X); 
for i=1:min(m,n)  X[i,i] = rand([-1,1]) end
@time Xi=invMatU(X);  
X_,Xi_=(vcat(t,Matrix{tw}(I,n,n)[m+1:end,:]) for t∈(X,Xi));  
display.((X,Xi,X_,Xi_,X_*Xi_==I));

returns

5×10 Matrix{Int64}:
 -1   2  0   1   9  5  4   7   7   1
  0  -1  0   6   6  2  2   0  10   1
  0   0  1  10   5  3  2  10   3   3
  0   0  0   1   1  3  9   0   8   7
  0   0  0   0  -1  0  5   8   7  10
5×10 Matrix{Int64}:
 -1  -2  0   13  -8  -30  -69  71  -21   -8
  0  -1  0    6   0  -16  -52   0  -38  -41
  0   0  1  -10  -5   27  113  30  112  117
  0   0  0    1   1   -3  -14  -8  -15  -17
  0   0  0    0  -1    0    5   8    7   10
10×10 Matrix{Int64}:
 -1   2  0   1   9  5  4   7   7   1
  0  -1  0   6   6  2  2   0  10   1
  0   0  1  10   5  3  2  10   3   3
  0   0  0   1   1  3  9   0   8   7
  0   0  0   0  -1  0  5   8   7  10
  0   0  0   0   0  1  0   0   0   0
  0   0  0   0   0  0  1   0   0   0
  0   0  0   0   0  0  0   1   0   0
  0   0  0   0   0  0  0   0   1   0
  0   0  0   0   0  0  0   0   0   1
10×10 Matrix{Int64}:
 -1  -2  0   13  -8  -30  -69  71  -21   -8
  0  -1  0    6   0  -16  -52   0  -38  -41
  0   0  1  -10  -5   27  113  30  112  117
  0   0  0    1   1   -3  -14  -8  -15  -17
  0   0  0    0  -1    0    5   8    7   10
  0   0  0    0   0    1    0   0    0    0
  0   0  0    0   0    0    1   0    0    0
  0   0  0    0   0    0    0   1    0    0
  0   0  0    0   0    0    0   0    1    0
  0   0  0    0   0    0    0   0    0    1
true

However, replacing Threads.@threads by Polyester.@batch results in matrix Xi being zero. So something isn't right here.

Btw, this was run with Polyester v0.7.10 on Julia 1.9.2 in Linux Mint 20.

Mar 21 '24 21:03 lampretl

Could you try it with

@batch _j = 0:n-1
  j = n - _j
  # rest of code
end

?

Mar 21 '24 23:03 chriselrod

Interesting. With your suggested edit, it works correctly.

Note also that with my original code, I occasionally get an error:

ERROR: LoadError: BoundsError: attempt to access 5×10 StrideArraysCore.PtrArray{Int64, 2, (1, 2), Tuple{Int64, Int64}, Tuple{Nothing, Nothing}, Tuple{Static.StaticInt{1}, Static.StaticInt{1}}} at index [0, 0]
Stacktrace:
  [1] throw_boundserror(A::StrideArraysCore.PtrArray{Int64, 2, (1, 2), Tuple{Int64, Int64}, Tuple{Nothing, Nothing}, Tuple{Static.StaticInt{1}, Static.StaticInt{1}}}, I::Tuple{Int64, Int64})
    @ Base ./abstractarray.jl:744
  [2] checkbounds
    @ ./abstractarray.jl:709 [inlined]
  [3] getindex
    @ ~/.julia/packages/StrideArraysCore/VyBzA/src/ptr_array.jl:935 [inlined]
  [4] macro expansion
    @ ~/Desktop/tmp.jl:10 [inlined]
  [5] #13
    @ ~/.julia/packages/Polyester/wJbZW/src/closure.jl:281 [inlined]
  [6] macro expansion
    @ ~/.julia/packages/Polyester/wJbZW/src/batch.jl:255 [inlined]
  [7] _batch_no_reserve
    @ ~/.julia/packages/Polyester/wJbZW/src/batch.jl:168 [inlined]
  [8] batch
    @ ~/.julia/packages/Polyester/wJbZW/src/batch.jl:343 [inlined]
  [9] macro expansion
    @ ~/.julia/packages/Polyester/wJbZW/src/closure.jl:426 [inlined]
 [10] invMatU(X::Matrix{Int64})
    @ Main ~/Desktop/tmp.jl:8
 [11] top-level scope
    @ ./timing.jl:273
 [12] include(fname::String)
    @ Base.MainInclude ./client.jl:478
 [13] top-level scope
    @ REPL[2]:1
in expression starting at /home/leon/Desktop/tmp.jl:29

Mar 22 '24 08:03 lampretl

Not handling step ranges with negative steps is a bug. But why use negative steps?

Threading means there is no guaranteed order, so it isn't iterating backwards. Just use for j in 1:n.

Mar 22 '24 13:03 chriselrod

Hah, you're right, reverse iteration is unnecessary anyway. In the context of multithreading, it doesn't even make sense, as you pointed out. However, if we computed in-place (gradually replacing X by its inverse), then we'd have to be doing it as j=n:-1:1, and parallelizing the function wouldn't be an option.

Thanks!

Mar 22 '24 13:03 lampretl