Scientific Computing, TU Berlin, WS 2019/2020, Lecture 28

Jürgen Fuhrmann, WIAS Berlin

GPU computing with Julia

  • Currently based on CUDA, but structurally open to other interfaces
  • No shader code: everything, including the GPU kernels, is written in Julia (see the small sketch after the package imports below)
  • See https://juliagpu.gitlab.io/CUDA.jl/tutorials/introduction/

Necessary packages

In [1]:
using CUDAdrv, CUDAnative, CuArrays
using LinearAlgebra
using BenchmarkTools
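
As a first taste of the "just use Julia" point above, ordinary broadcast syntax already runs on the GPU. A minimal round-trip sketch (not part of the original notebook; assumes a CUDA capable GPU and the packages loaded above):

# Move data to the GPU, compute with plain Julia broadcast syntax, copy the result back
a = CuArray(Float32[1, 2, 3])   # upload to the GPU
b = 2 .* a .+ 1                 # this broadcast is compiled to a GPU kernel
Array(b)                        # download; yields Float32[3.0, 5.0, 7.0]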

Simple function: add vector on CPU

In [2]:
function sequential_add!(y, x)
    for i in eachindex(y, x)
        @inbounds y[i] += x[i]
    end
    return nothing
end

function parallel_add!(y, x)
    Threads.@threads for i in eachindex(y, x)
        @inbounds y[i] += x[i]
    end
    return nothing
end
Out[2]:
parallel_add! (generic function with 1 method)

Create vectors on the CPU:

In [3]:
N=2_000_000
T=Float32

x=rand(T,N)
y=rand(T,N)
Out[3]:
2000000-element Array{Float32,1}:
 0.9940162 
 0.9140259 
 0.6063175 
 0.82776594
 0.40105784
 0.6527636 
 0.8994876 
 0.18601346
 0.46982193
 0.35957706
 0.9169599 
 0.61288166
 0.6123998 
 ⋮         
 0.15102232
 0.62706494
 0.5843556 
 0.834522  
 0.08142114
 0.6083752 
 0.8880435 
 0.03150463
 0.21834028
 0.53140306
 0.3586582 
 0.33130372

Create vectors on the GPU:

In [4]:
x_gpu = CuArray{T}(undef, N);
y_gpu = CuArray{T}(undef, N);

@btime begin
    copyto!($x_gpu,$x);
    copyto!($y_gpu,$y);
end
  2.458 ms (4 allocations: 15.26 MiB)
Out[4]:
2000000-element CuArray{Float32,1,Nothing}:
 0.9940162 
 0.9140259 
 0.6063175 
 0.82776594
 0.40105784
 0.6527636 
 0.8994876 
 0.18601346
 0.46982193
 0.35957706
 0.9169599 
 0.61288166
 0.6123998 
 ⋮         
 0.15102232
 0.62706494
 0.5843556 
 0.834522  
 0.08142114
 0.6083752 
 0.8880435 
 0.03150463
 0.21834028
 0.53140306
 0.3586582 
 0.33130372
  • CuArrays essentially provides a full vector library for linear algebra which can be controlled from the CPU
  • Scalar index access is only reasonable on the CPU
  • Think of it as "numpy on steroids"
  • Here we just write array broadcast code
In [5]:
function gpu_add_bcast!(y,x)
    CuArrays.@sync y .+= x
end
Out[5]:
gpu_add_bcast! (generic function with 1 method)
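
Broadcasting is not the only whole-array operation available: norms, inner products and reductions on CuArrays run on the GPU as well, all controlled from the CPU. A small sketch (not part of the original notebook), reusing x_gpu and y_gpu from above:

# All of these execute on the GPU and return their scalar or array results to the caller
nrm = norm(x_gpu)           # Euclidean norm, computed on the GPU
s   = dot(x_gpu, y_gpu)     # inner product
z   = 2 .* x_gpu .+ y_gpu   # fused broadcast, allocates a new CuArray
mx  = maximum(z)            # reduction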

Compare timings

In [6]:
@btime sequential_add!($x,$y);
@btime parallel_add!($x,$y);
@btime gpu_add_bcast!($x_gpu, $y_gpu);
  431.681 μs (0 allocations: 0 bytes)
  339.013 μs (30 allocations: 3.02 KiB)
  250.703 μs (57 allocations: 2.23 KiB)

We already know that this operation is limited by memory bandwidth on the CPU, so 4 threads do not give much of a speedup.
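
A back-of-the-envelope sketch (not part of the original notebook) makes the bandwidth argument concrete: the add reads x and y and writes y, so roughly 3·N·sizeof(T) ≈ 24 MB move per call, and all three timings above correspond to several tens of GB/s rather than to a compute limit.

# Effective memory bandwidth estimated from the timings printed above (machine specific)
bytes_moved = 3*N*sizeof(T)     # x read, y read, y written: ≈ 24 MB per call
for (name, t) in [("sequential_add!", 431.681e-6),
                  ("parallel_add!",   339.013e-6),
                  ("gpu_add_bcast!",  250.703e-6)]
    println(rpad(name, 17), round(bytes_moved/t/1e9, digits=1), " GB/s")
end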

  • If the high-level abstraction is not sufficient, we can write our own kernels:
In [7]:
function gpu_add1!(y,x)
    function _kernel!(y, x)
        for i = 1:length(y)
            @inbounds y[i] += x[i]
        end
        return nothing
    end
    CuArrays.@sync begin
        # default launch configuration: a single block with a single thread
        @cuda _kernel!(y,x)
    end
end
Out[7]:
gpu_add1! (generic function with 1 method)
In [8]:
@btime gpu_add1!($x_gpu, $y_gpu)
  126.427 ms (27 allocations: 912 bytes)

Linear indexing is incredibly slow here: launched this way, the kernel runs in a single GPU thread, so the loop is executed serially.

A more thorough adaptation to the CUDA execution model provides better performance: distribute the loop over the threads of one block.

In [9]:
function gpu_add2!(y, x;nthreads=256)
    function _kernel!(y, x)
        # one block of nthreads threads; each thread handles a strided subset of the indices
        index = threadIdx().x
        stride = blockDim().x
        for i = index:stride:length(y)
            @inbounds y[i] += x[i]
        end
        return nothing
    end
    CuArrays.@sync begin
        @cuda threads=nthreads _kernel!(y,x)
    end
end
Out[9]:
gpu_add2! (generic function with 1 method)
In [10]:
@btime gpu_add2!($x_gpu, $y_gpu)
  2.562 ms (37 allocations: 1.05 KiB)

Go even further and do proper blocking, distributing the work over multiple blocks of threads.

(see CUDA.jl tutorial)

In [11]:
function gpu_add3!(y, x;nthreads=256)
    function _kernel!(y, x)
        # global thread index and total number of threads in the grid
        index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
        stride = blockDim().x * gridDim().x
        for i = index:stride:length(y)
            @inbounds y[i] += x[i]
        end
        return
    end
    # launch enough blocks to cover the whole vector
    numblocks = ceil(Int, length(x)/nthreads)
    CuArrays.@sync begin
        @cuda threads=nthreads blocks=numblocks  _kernel!(y,x)
    end
end
Out[11]:
gpu_add3! (generic function with 1 method)
In [12]:
@btime gpu_add3!($x_gpu, $y_gpu)
  241.245 μs (42 allocations: 1.13 KiB)

Working on the abstraction level of whole arrays

  • Try an iterative solution on the GPU
  • Ignore all the complicated low-level stuff and just use CuArrays
In [13]:
using ExtendableSparse
using SparseArrays
using Printf
using IterativeSolvers

Implement two Jacobi preconditioners (one for the CPU, one for the GPU), based just on a diagonal vector, by adding methods to LinearAlgebra.ldiv!

In [14]:
# Jacobi preconditioner on the GPU: apply D^{-1} elementwise
function LinearAlgebra.ldiv!(b,D::CuVector,a)
    b.=a./D
end
# Jacobi preconditioner on the CPU
function LinearAlgebra.ldiv!(b,D::Vector,a)
    b.=a./D
end
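
A tiny sanity check (not part of the original notebook): applying the Jacobi preconditioner just solves D*b = a elementwise, i.e. b = a ./ D.

Dtest = [2.0, 4.0]
btest = similar(Dtest)
LinearAlgebra.ldiv!(btest, Dtest, [2.0, 8.0])   # btest is now [1.0, 2.0]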

Create a matrix with random coefficients and the problem data on a 3D rectangular grid with 512000 unknowns (we could have done finite element assembly here...)

In [15]:
n=80
N=n^3
t=Base.@elapsed begin
    M=ExtendableSparse.fdrand(n,n,n,matrixtype=ExtendableSparseMatrix).cscmatrix
    u_exact=rand(N)
    D=Vector(diag(M))
    f=M*u_exact
end
println("Creating matrix: $(t) s")
Creating matrix: 0.660224228 s

Copy the data onto the GPU ... yes, there are sparse matrices on the GPU as well!

In [16]:
t=Base.@elapsed begin
    M_gpu=CUSPARSE.CuSparseMatrixCSC(M)
    f_gpu=cu(f)
    D_gpu=cu(D)
    u_exact_gpu=cu(u_exact)
end
println("loading GPU: $(t) s")
loading GPU: 0.180644463 s

Run the direct solver on the CPU

In [17]:
# first run for compiling
u=M\f
t=Base.@elapsed begin
    u=M\f
end
println("Direct solution on CPU: $(t)s")
Direct solution on CPU: 28.396312306s

Use cg from IterativeSolvers.jl to solve the system with the Jacobi preconditioner

In [18]:
# first run for compiling
u,hist=cg(M,f,Pl=D, tol=1.0e-10,log=true)
t=Base.@elapsed begin
    u,hist=cg(M,f,Pl=D, tol=1.0e-10,log=true)
end
println("CG solution on CPU: $(t) s ($(hist.iters) iterations), error=$(norm(u_exact-u))")
CG solution on CPU: 1.590942921 s (293 iterations), error=1.0578957086032488e-5

Do the same on the GPU ... yes, it is the same cg, now dispatching on GPU arrays

In [19]:
# first run for compiling
u_gpu,hist=cg(M_gpu,f_gpu,Pl=D_gpu,tol=Float32(1.0e-10),log=true)
t=Base.@elapsed begin
    u_gpu,hist=cg(M_gpu,f_gpu,Pl=D_gpu,tol=Float32(1.0e-10),log=true)
end
println("CG solution on GPU: $(t) s ($(hist.iters) iterations), error=$(norm(u_exact_gpu-u_gpu))")
CG solution on GPU: 0.579733716 s (293 iterations), error=3.569082061320575e-5
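
Once the GPU solve is done, the result can be copied back to the CPU for postprocessing. A short sketch (not part of the original notebook), comparing against the CPU cg solution u:

u_from_gpu = Array(u_gpu)    # Vector{Float32} on the CPU
norm(u - u_from_gpu)         # difference to the double precision CPU solution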

This notebook was generated using Literate.jl.