问题描述
我试图用数值方法求解一维热方程:$$\frac{\partial u}{\partial t} = \nu\,\frac{\partial^2 u}{\partial x^2}$$
我使用的是有限差分法,但在 Julia 中使用 @threads 宏时遇到了一些麻烦。下面给出同一段代码的两个版本:第一个是单线程的,另一个使用了 @threads(除了 @threads 宏之外,两者完全相同)。
"""
    heatSecLoop(; T)

Integrate the 1-D heat equation `∂u/∂t = ν ∂²u/∂x²` up to time `T` with an explicit
finite-difference scheme, on a single thread.

The grid uses `Δx = 1e-3` on a domain of length `L = 1` and a time step `Δt = 1e-6`.
The initial condition is a unit spike at the middle grid point. The first and last
entries are never updated, so they stay at their initial value of zero.

# Keywords
- `T::Real`: final simulation time; the number of steps is `ceil(Int, T / Δt) - 1`.

# Returns
The solution vector `u` after the last time step.
"""
function heatSecLoop(; T::Real)
    println("start")
    L = 1
    ν = 0.5
    Δt = 1e-6
    Δx = 1e-3
    Nt = ceil(Int, T / Δt)
    Nx = ceil(Int, L / Δx + 2)
    u = zeros(Nx)
    u[round(Int, Nx / 2)] = 1
    # Diffusion number; hoisted because it does not change inside the loops.
    r = ν * Δt / (Δx^2)
    # Progress is reported roughly ten times. For Nt < 5 the interval would round to
    # zero and `t % 0` would throw a DivideError, so clamp it to at least one step.
    report_every = max(1, round(Int, Nt / 10))
    println("starting loop")
    for t in 1:Nt-1
        # Snapshot of the previous time level; this allocates a new vector every step.
        u_old = copy(u)
        for i in 2:Nx-1
            u[i] = u_old[i] + r * (u_old[i-1] - 2u_old[i] + u_old[i+1])
        end
        if t % report_every == 0
            println("time = " * string(round(t * Δt, digits=4)))
        end
    end
    println("done")
    return u
end
"""
    heatParLoop(; T)

Integrate the 1-D heat equation `∂u/∂t = ν ∂²u/∂x²` up to time `T` with an explicit
finite-difference scheme, running the spatial update under `Threads.@threads`.

The grid uses `Δx = 1e-3` on a domain of length `L = 1` and a time step `Δt = 1e-6`.
The initial condition is a unit spike at the middle grid point. The first and last
entries are never updated, so they stay at their initial value of zero.

# Keywords
- `T::Real`: final simulation time; the number of steps is `ceil(Int, T / Δt) - 1`.

# Returns
The solution vector `u` after the last time step.
"""
function heatParLoop(; T::Real)
    println("start")
    L = 1
    ν = 0.5
    Δt = 1e-6
    Δx = 1e-3
    Nt = ceil(Int, T / Δt)
    Nx = ceil(Int, L / Δx + 2)
    u = zeros(Nx)
    u[round(Int, Nx / 2)] = 1
    # Diffusion number; hoisted because it does not change inside the loops.
    r = ν * Δt / (Δx^2)
    # Progress is reported roughly ten times. For Nt < 5 the interval would round to
    # zero and `t % 0` would throw a DivideError, so clamp it to at least one step.
    report_every = max(1, round(Int, Nt / 10))
    println("starting loop")
    for t in 1:Nt-1
        # Snapshot of the previous time level; this allocates a new vector every step.
        u_old = copy(u)
        # Each iteration writes only u[i] and reads only u_old, so the iterations
        # are independent of one another.
        Threads.@threads for i in 2:Nx-1
            u[i] = u_old[i] + r * (u_old[i-1] - 2u_old[i] + u_old[i+1])
        end
        if t % report_every == 0
            println("time = " * string(round(t * Δt, digits=4)))
        end
    end
    println("done")
    return u
end
问题是单线程(顺序)版本反而比多线程版本更快。以下是计时结果(已先运行一轮以完成编译):
julia> Threads.nthreads()
2
julia> @time heatParLoop(T=1.0)
start
starting loop
time = 0.1
time = 0.2
time = 0.3
time = 0.4
time = 0.5
time = 0.6
time = 0.7
time = 0.8
time = 0.9
done
5.417182 seconds (12.14 M allocations: 9.125 GiB,6.59% gc time)
julia> @time heatSecLoop(T=1.0)
start
starting loop
time = 0.1
time = 0.2
time = 0.3
time = 0.4
time = 0.5
time = 0.6
time = 0.7
time = 0.8
time = 0.9
done
3.892801 seconds (1.00 M allocations: 7.629 GiB,8.06% gc time)
当然,热方程只是一个更复杂问题的简化示例。我还尝试过将 SharedArrays 等其他库与 Distributed 一起使用,但效果更差。
感谢您的帮助。
解决方法
这一现象似乎仍然成立,原因可能是:
- `Threads.@threads` 的开销;
- 也许在较小程度上,Julia 中的垃圾收集是单线程的,而这里的原始版本会产生大量垃圾。
此外,根据所链接的 Discourse 讨论帖中的建议,值得一提的是,LoopVectorization.jl 的 `@avx`(现已更名为 `@turbo`)宏现在有了多线程版本 `@tturbo`,它使用 Polyester.jl 提供的非常轻量级的线程。尽管线程开销仍然不可忽略,但它还是勉强取得了稍好一些的性能:
"""
    heatSecLoop(; T)

Integrate the 1-D heat equation `∂u/∂t = ν ∂²u/∂x²` up to time `T` with an explicit
finite-difference scheme, on a single thread, without allocating inside the time loop.

Two buffers are allocated once and swapped at every step. The grid uses `Δx = 1e-3`
on a domain of length `L = 1` and a time step `Δt = 1e-6`. The initial condition is a
unit spike at the middle grid point; the first and last entries stay at zero.

# Keywords
- `T::Real`: final simulation time; the number of steps is `ceil(Int, T / Δt) - 1`.

# Returns
The buffer holding the solution after the last time step.
"""
function heatSecLoop(; T::Real)
    println("start")
    L = 1
    ν = 0.5
    Δt = 1e-6
    Δx = 1e-3
    Nt = ceil(Int, T / Δt)
    Nx = ceil(Int, L / Δx + 2)
    u = zeros(Nx)
    u[round(Int, Nx / 2)] = 1
    # The update loop never writes indices 1 and Nx, so the second buffer must start
    # with zeros there as well. `similar(u)` would leave them uninitialised and the
    # stencil would read garbage after the first swap.
    u_old = zeros(Nx)
    # Diffusion number; hoisted because it does not change inside the loops.
    r = ν * Δt / (Δx^2)
    # Progress is reported roughly ten times. For Nt < 5 the interval would round to
    # zero and `t % 0` would throw a DivideError, so clamp it to at least one step.
    report_every = max(1, round(Int, Nt / 10))
    println("starting loop")
    for t in 1:Nt-1
        # Swap roles: the latest solution becomes the "old" level and the other
        # buffer is overwritten with the new level.
        u_old, u = u, u_old
        for i in 2:Nx-1
            u[i] = u_old[i] + r * (u_old[i-1] - 2u_old[i] + u_old[i+1])
        end
        if t % report_every == 0
            println("time = " * string(round(t * Δt, digits=4)))
        end
    end
    println("done")
    return u
end
"""
    heatVecLoop(; T)

Integrate the 1-D heat equation `∂u/∂t = ν ∂²u/∂x²` up to time `T` with an explicit
finite-difference scheme, applying the single-threaded `@turbo` macro to the spatial
update loop.

Requires LoopVectorization.jl to be loaded (`using LoopVectorization`). Two buffers
are allocated once and swapped at every step. The grid uses `Δx = 1e-3` on a domain
of length `L = 1` and a time step `Δt = 1e-6`. The initial condition is a unit spike
at the middle grid point; the first and last entries stay at zero.

# Keywords
- `T::Real`: final simulation time; the number of steps is `ceil(Int, T / Δt) - 1`.

# Returns
The buffer holding the solution after the last time step.
"""
function heatVecLoop(; T::Real)
    println("start")
    L = 1
    ν = 0.5
    Δt = 1e-6
    Δx = 1e-3
    Nt = ceil(Int, T / Δt)
    Nx = ceil(Int, L / Δx + 2)
    u = zeros(Nx)
    u[round(Int, Nx / 2)] = 1
    # The update loop never writes indices 1 and Nx, so the second buffer must start
    # with zeros there as well. `similar(u)` would leave them uninitialised and the
    # stencil would read garbage after the first swap.
    u_old = zeros(Nx)
    # Diffusion number; hoisted because it does not change inside the loops.
    r = ν * Δt / (Δx^2)
    # Progress is reported roughly ten times. For Nt < 5 the interval would round to
    # zero and `t % 0` would throw a DivideError, so clamp it to at least one step.
    report_every = max(1, round(Int, Nt / 10))
    println("starting loop")
    for t in 1:Nt-1
        # Swap roles: the latest solution becomes the "old" level and the other
        # buffer is overwritten with the new level.
        u_old, u = u, u_old
        @turbo for i in 2:Nx-1
            u[i] = u_old[i] + r * (u_old[i-1] - 2u_old[i] + u_old[i+1])
        end
        if t % report_every == 0
            println("time = " * string(round(t * Δt, digits=4)))
        end
    end
    println("done")
    return u
end
"""
    heatTVecLoop(; T)

Integrate the 1-D heat equation `∂u/∂t = ν ∂²u/∂x²` up to time `T` with an explicit
finite-difference scheme, applying the multithreaded `@tturbo` macro to the spatial
update loop.

Requires LoopVectorization.jl to be loaded (`using LoopVectorization`). Two buffers
are allocated once and swapped at every step. The grid uses `Δx = 1e-3` on a domain
of length `L = 1` and a time step `Δt = 1e-6`. The initial condition is a unit spike
at the middle grid point; the first and last entries stay at zero.

# Keywords
- `T::Real`: final simulation time; the number of steps is `ceil(Int, T / Δt) - 1`.

# Returns
The buffer holding the solution after the last time step.
"""
function heatTVecLoop(; T::Real)
    println("start")
    L = 1
    ν = 0.5
    Δt = 1e-6
    Δx = 1e-3
    Nt = ceil(Int, T / Δt)
    Nx = ceil(Int, L / Δx + 2)
    u = zeros(Nx)
    u[round(Int, Nx / 2)] = 1
    # The update loop never writes indices 1 and Nx, so the second buffer must start
    # with zeros there as well. `similar(u)` would leave them uninitialised and the
    # stencil would read garbage after the first swap.
    u_old = zeros(Nx)
    # Diffusion number; hoisted because it does not change inside the loops.
    r = ν * Δt / (Δx^2)
    # Progress is reported roughly ten times. For Nt < 5 the interval would round to
    # zero and `t % 0` would throw a DivideError, so clamp it to at least one step.
    report_every = max(1, round(Int, Nt / 10))
    println("starting loop")
    for t in 1:Nt-1
        # Swap roles: the latest solution becomes the "old" level and the other
        # buffer is overwritten with the new level.
        u_old, u = u, u_old
        @tturbo for i in 2:Nx-1
            u[i] = u_old[i] + r * (u_old[i-1] - 2u_old[i] + u_old[i+1])
        end
        if t % report_every == 0
            println("time = " * string(round(t * Δt, digits=4)))
        end
    end
    println("done")
    return u
end
julia> @time heatSecLoop(T=1.0)
start
starting loop
time = 0.1
time = 0.2
time = 0.3
time = 0.4
time = 0.5
time = 0.6
time = 0.7
time = 0.8
time = 0.9
done
1.786011 seconds (114 allocations: 22.094 KiB)
julia> @time heatVecLoop(T=1.0)
start
starting loop
time = 0.1
time = 0.2
time = 0.3
time = 0.4
time = 0.5
time = 0.6
time = 0.7
time = 0.8
time = 0.9
done
0.314305 seconds (114 allocations: 22.094 KiB)
julia> @time heatTVecLoop(T=1.0)
start
starting loop
time = 0.1
time = 0.2
time = 0.3
time = 0.4
time = 0.5
time = 0.6
time = 0.7
time = 0.8
time = 0.9
done
0.300656 seconds (114 allocations: 22.094 KiB)
自从这个问题最初被提出以来,单线程 `@turbo` 向量化版本的性能似乎也有了显著提高;而随着问题规模的增大,多线程 `@tturbo` 版本的性能优势可能会进一步扩大。