损失波动较大的线性SVM，有没有更好的梯度下降实现方式？

问题描述

我正在实现使用铰链损失（hinge loss）的SVM（线性SVM，软间隔），并尝试用梯度下降来最小化损失。
这是我目前用Julia编写的梯度下降代码：

# Gradient-descent training loop with optional patience-based early stopping.
# `n_cost_no_change` acts as the remaining patience: it is decremented whenever the new
# cost fails to beat `best_cost` by at least `tol`, and reset to `n_iter_no_change`
# whenever it does.
for i in 1:max_iter
    # patience exhausted -> stop (only relevant when early stopping is enabled)
    if n_cost_no_change <= 0 && early_stop
        break
    end

    learn!(X_data, Y_data, weights, learning_rate)  # compute gradient and update weights
    new_cost = cost(X_data, weights)                # compute loss after the update

    # without early stopping there is no bookkeeping to do
    early_stop || continue

    if best_cost === nothing || isnan(best_cost)
        # no usable reference value yet: take the current cost as the baseline
        best_cost = new_cost
    elseif new_cost > best_cost - tol
        # did not improve on the best cost by at least `tol`
        n_cost_no_change -= 1
    else
        # sufficient improvement: record it and restore the full patience
        best_cost = min(new_cost, best_cost)
        n_cost_no_change = n_iter_no_change
    end
end

在这里，tol设置为0.001，max_iter是1000，learning_rate是0.05，并且它们在训练过程中都是恒定的。

问题在于，每次迭代计算出的cost变化很大。
为了强制寻找全局最小值，我必须关闭early_stop并将max_iter设置为10000。否则，它将在几次迭代中提前停止，并输出错误的结果。

下面的图表显示了cost随迭代次数的变化情况：

这些迭代只是浪费时间。

我的猜测是，如果cost没有任何改进，我还需要在每次迭代中调整learning_rate。
问题是我不知道该如何更新learning_rate，才能使cost的方差减小，同时不会陷入局部最小值（如果存在），并找到线性SVM的最佳权重。

有人有建议吗？

更新

这是我的learn!函数：

"""
    learn!(X_data, Y_data, weights, alpha)

Perform one full-batch subgradient step on the soft-margin hinge-loss objective and
update `weights.w` and `weights.b` in place.

# Arguments
- `X_data`: sample matrix; must have size `(length(Y_data), length(weights.w))`.
- `Y_data`: one-dimensional array of labels, one per row of `X_data`.
- `weights`: model state; the fields `w`, `b` and `C` are read, `w` and `b` are written.
- `alpha`: step size that scales both gradients.

# Returns
`nothing`.
"""
function learn!(X_data::Array{T} where T<:Number,Y_data::Array{T} where T<:Number,weights::WeightsLinearSVM,alpha::AbstractFloat)
    @assert ndims(Y_data) == ndims(weights.w) == 1
    @assert size(X_data) == (size(Y_data)[1],size(weights.w)[1])
    scale = weights.C / size(X_data)[1]
    # mask of samples whose margin y * (x'w + b) is below 1; only these contribute
    # to the hinge part of the gradient
    active = (Y_data .* (X_data * weights.w .+ weights.b)) .< 1
    masked_labels = Y_data .* active
    # step for w: regularisation term (w itself) plus the scaled hinge term
    step_w = weights.w .+ scale .* vec(-masked_labels' * X_data)
    step_w .*= alpha
    weights.w .-= step_w
    # step for b: the bias has no regularisation term
    step_b = scale * sum(-masked_labels)
    step_b *= alpha
    weights.b -= step_b
    return nothing
end

cost函数：

"""
    cost(X_data, weights[, labels])

Compute the soft-margin linear SVM objective

    0.5 * w'w + C / n * sum(max(0, 1 - y_i * (x_i'w + b)))

where `n` is the number of rows of `X_data`.

# Arguments
- `X_data`: sample matrix; must have size `(length(labels), length(weights.w))`.
- `weights`: model state; the fields `w`, `b` and `C` are read.
- `labels`: one-dimensional array of labels, one per row of `X_data`. The original
  implementation read the labels from a global named `Y_data`; that global is kept as
  the default so the two-argument call keeps working, but passing the labels
  explicitly is preferred because it removes the hidden dependency on global state.

# Returns
The objective value as a floating-point number.
"""
function cost(X_data::Array{T} where T<:Number,weights::WeightsLinearSVM,labels::AbstractArray{<:Number}=Y_data)::AbstractFloat
    @assert ndims(labels) == ndims(weights.w) == 1
    @assert size(X_data) == (size(labels)[1],size(weights.w)[1])
    # regularisation term
    loss_w = 0.5 * (weights.w' * weights.w)
    # hinge term: only samples with margin below 1 contribute a positive amount
    margins = labels .* vec(X_data * weights.w .+ weights.b)
    loss_inner = max.(0.0, 1.0 .- margins)
    return loss_w + weights.C * sum(loss_inner) / size(X_data)[1]
end

解决方法

阅读这篇文章后，我尝试在梯度更新中使用动量（momentum）。

我新的learn!函数如下所示：

"""
    learn!(X_data, Y_data, weights, momentum, alpha; beta=0.9)

Perform one full-batch subgradient step with momentum on the soft-margin hinge-loss
objective. Both `weights` and `momentum` are updated in place:

    momentum = alpha * gradient + beta * momentum
    weights  = weights - momentum

# Arguments
- `X_data`: sample matrix; must have size `(length(Y_data), length(weights.w))`.
- `Y_data`: one-dimensional array of labels, one per row of `X_data`.
- `weights`: model state; the fields `w`, `b` and `C` are read, `w` and `b` are written.
- `momentum`: accumulated update carried between calls; its `w` and `b` fields are
  read and overwritten.
- `alpha`: step size that scales both gradients.

# Keywords
- `beta`: fraction of the previous update that is carried over. Defaults to `0.9`,
  the value that used to be hard-coded.

# Returns
`nothing`.

# Throws
- `ArgumentError` if `beta` is not in `[0, 1)`.
"""
function learn!(X_data::Array{T} where T<:Number,Y_data::Array{T} where T<:Number,weights::WeightsLinearSVM,momentum::WeightsLinearSVM,alpha::AbstractFloat;beta::Real=0.9)
    @assert ndims(Y_data) == ndims(weights.w) == 1
    @assert size(X_data) == (size(Y_data)[1],size(weights.w)[1])
    # beta >= 1 would never let old updates decay, so the accumulated step could grow without bound
    0 <= beta < 1 || throw(ArgumentError("beta must be in [0, 1), got $beta"))
    # compute deciding feature
    decide = (Y_data .* (X_data * weights.w .+ weights.b)) .< 1 # (? < 1) will be 1,otherwise 0
    # update w
    gradient_w = weights.w .+ (weights.C / size(X_data)[1]) .* vec(-(Y_data .* decide)' * X_data)
    gradient_w .= gradient_w .* alpha
    momentum.w .= gradient_w .+ (beta .* momentum.w)
    weights.w .= weights.w .- momentum.w
    # update b
    gradient_b = (weights.C / size(X_data)[1]) * sum(-(Y_data .* decide))
    gradient_b *= alpha
    momentum.b = gradient_b + (beta * momentum.b)
    weights.b = weights.b - momentum.b
    return nothing
end

这里是成本波动的比较。

使用动量之前：

使用动量后：

我对这一改进感到满意。之后可能还会尝试该文章中提到的其他优化算法。

gradient-descent julia machine-learning svm svm