omp simd 减少没有加速

问题描述

我正在尝试使用矢量化 (openmp simd) 来加速矩阵乘法。为了利用矢量化，我转置了第二个矩阵（以使变化最快的索引通过连续内存）。我正在 3000 x 3000 阵列上运行我的测试。因为我无法测量有与没有打开 mp pragma 时挂墙时间的差异，所以我想确认我实际上得到了我正在乘的单个数组的加速（事实并非如此）。因此，我插入了一些相同大小的虚拟数组，以检查是否通过 SIMD 获得了加速（至少在使用 reduce 子句时）。

现在我的问题是我的问题是什么阻碍了 SIMD 加速，我唯一的猜测是它必须是数组的二维，但我不完全明白为什么这会导致速度变慢。

或者我没有看到我的代码有其他问题吗？

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>

#include <time.h>

const int N = 3000;

struct timespec begin,end;

double **create_a() {
    double *a = (double *)aligned_alloc(32,sizeof(double) * N * N);
    double **a_indexed = (double **)aligned_alloc(32,sizeof(double *) * N);
    for (int i = 0; i<N; i++){
        a_indexed[i] = a+i*N;
    }

    for (int i = 0; i< N; i++) {
        for (int j = 0; j<N; j++) {
            a_indexed[i][j] = i * j;
        }
    }
    return a_indexed;
}

double **create_b(){
    double *b = (double *)aligned_alloc(32,sizeof(double) * N * N);
    double **b_indexed = (double **)aligned_alloc(32,sizeof(double *) * N);
    for (int i = 0; i<N; i++){
        b_indexed[i] = b+i*N;
    }

    for (int i = 0; i< N; i++) {
        for (int j = 0; j<N; j++) {
            b_indexed[i][j] = (i == j) ? 1:0;
        }
    }
    return b_indexed;
}

double **transpose( double **matrix) {
    double *t = (double *)aligned_alloc(32,sizeof(double) * N * N);
    double **t_indexed = (double **)aligned_alloc(32,sizeof(double *) * N);
    for (int i = 0; i<N; i++){
        t_indexed[i] = t+i*N;
    }

    for (int i = 0; i< N; i++) {
        for (int j = 0; j<N; j++) {
            t_indexed[i][j] = matrix[j][i];
        }
    }
    return t_indexed;
}

double **mul(double **a,double **b) {
    double **b_t = transpose(b);
    double *res = (double *)aligned_alloc(32,sizeof(double) * N * N);
    double **res_indexed = (double **)aligned_alloc(32,sizeof(double *) * N);
    for (int i = 0; i<N; i++){
        res_indexed[i] = res+i*N;
    }

    for (int row = 0; row< 1; row++) {
        for (int col = 0; col < 1; coL++) {
    double cell_res = 0;

    // problematic calculation that I can't get to speed up no matter what pragma I use
    clock_gettime(CLOCK_REALTIME,&begin);
    #pragma omp simd aligned(a,b_t:32) reduction(+:cell_res)
    for (int i = 0; i < N; i++) {
        cell_res += a[0][i] * b_t[0][i];
    }
    clock_gettime(CLOCK_REALTIME,&end);

    long seconds = end.tv_sec - begin.tv_sec;
    long nanoseconds = end.tv_nsec - begin.tv_nsec;
    double elapsed = seconds + nanoseconds*1e-9;

    printf("Time simd reduce measured: %.9f seconds.\n",elapsed);


    // dummy array measurements

    struct timespec begin2,end2;
    struct timespec begin3,end3;
    struct timespec begin4,end4;

    double *a2 = (double *)aligned_alloc(32,sizeof(double) * N);
    double *b2 = (double *)aligned_alloc(32,sizeof(double) * N);
    for (int i = 0; i < N ; i++){
        a2[i] = 1;
        b2[i] = 1;
    }

    // measurement with reduction is significantly faster than others
    clock_gettime(CLOCK_REALTIME,&begin2);
    #pragma omp simd aligned(a2,b2:32) reduction(+:cell_res)
    for (int i = 0; i < N; i++) {
        cell_res += a2[i] * b2[i];
    }

    clock_gettime(CLOCK_REALTIME,&end2);
    long seconds2 = end2.tv_sec - begin2.tv_sec;
    long nanoseconds2 = end2.tv_nsec - begin2.tv_nsec;
    double elapsed2 = seconds2 + nanoseconds2*1e-9;
    
    printf("time2 (simd reduction): %.9f seconds.\n",elapsed2);

    // no speedup compared to without simd (slightly faster than problematic calculation)
    clock_gettime(CLOCK_REALTIME,&begin3);
    #pragma omp simd aligned(a2,b2:32)
    for (int i = 0; i < N; i++) {
        cell_res += a2[i] * b2[i];
    }
    clock_gettime(CLOCK_REALTIME,&end3);

    long seconds3 = end3.tv_sec - begin3.tv_sec;
    long nanoseconds3 = end3.tv_nsec - begin3.tv_nsec;
    double elapsed3 = seconds3 + nanoseconds3*1e-9;
    printf("time3 (simd): %.9f seconds.\n",elapsed3);

    // no pragma (slightly faster than problematic calculation)
    clock_gettime(CLOCK_REALTIME,&begin4);
    for (int i = 0; i < N; i++) {
        cell_res += a2[i] * b2[i];
    }
    clock_gettime(CLOCK_REALTIME,&end4);

    long seconds4 = end4.tv_sec - begin4.tv_sec;
    long nanoseconds4 = end4.tv_nsec - begin4.tv_nsec;
    double elapsed4 = seconds4 + nanoseconds4*1e-9;
    printf("time4: %.9f seconds.\n",elapsed4);


    res_indexed[0][0] = cell_res;
        }
    }
    return res_indexed;
}


int main (int argc,char **argv) {
    //init a(i,j) = i * j
    double **a = create_a();

    //init b(i,j) = (i == j) ? 1:0;
    double **b = create_b();

    //multiply
    double **res = mul(a,b);
}

Time simd reduce measured: 0.000004188 seconds. // problematic
time2 (simd reduction): 0.000001762 seconds. // faster
time3 (simd): 0.000003475 seconds. //slightly faster
time4: 0.000003476 seconds. //slightly faster

解决方法

我已经在我的机器上测试了前两个循环，我可以重现相同的行为。

Time simd reduce measured: 0.000006000 seconds.
time2 (simd reduction): 0.000004000 seconds.

我的猜测是有两个问题：

第一个问题：

多个版本的执行时间之间的差异似乎更多地与缓存而不是向量化有关。因此，当您使用包含 3000 个元素（24 KB）的虚拟数组进行测试时：

double *a2 = (double *)aligned_alloc(32,sizeof(double) * N);
double *b2 = (double *)aligned_alloc(32,sizeof(double) * N);
for (int i = 0; i < N ; i++){
    a2[i] = 1;
    b2[i] = 1;
}

这些小数组在初始化期间加载到缓存中（即 a2[i] = 1）。另一方面，矩阵 a 和 b_t 的大小为 3000 x 3000（72 兆字节），这使得第一行不太可能完全在缓存中（尽管这取决于测试环境的缓存大小）。

我改变了以下循环：

// problematic calculation that I can't get to speed up no matter what pragma I use
clock_gettime(CLOCK_REALTIME,&begin);
#pragma omp simd aligned(a,b_t:32) reduction(+:cell_res)
for (int i = 0; i < N; i++) {
    cell_res += a[0][i] * b_t[0][i];
}
clock_gettime(CLOCK_REALTIME,&end);

通过将矩阵a和b_t的第一行分别打包成两个新矩阵a_2和b_2，即：

for(int i = 0; i < N; i++){
      a_2[0][i]  = a[0][i];
      bt_2[0][i] = b_t[0][i];
 }

// problematic calculation that I can't get to speed up no matter what pragma I use
clock_gettime(CLOCK_REALTIME,&begin);
#pragma omp simd aligned(a_2,bt_2) reduction(+:cell_res)
for (int i = 0; i < N; i++) {
    cell_res += a_2[0][i] * bt_2[0][i];
}

通过将这些矩阵打包成只有一行的较小矩阵，我可以将这些矩阵加载到缓存中，从而减少第一个版本的执行时间。新结果：

Time simd reduce measured: 0.000004000 seconds.
time2 (simd reduction): 0.000004000 seconds.

IMO 你不应该在同一个函数中测试所有这些循环，因为编译器可能会以不同的方式优化这些循环，然后存在缓存这些值的问题，等等。我会在单独的运行中测试它们。

现在我的问题是阻碍 SIMD 的问题是什么加速我唯一的猜测是它必须是二维的数组，但我不完全明白为什么这会导致速度变慢。

我还测试了将矩阵 a 和 b_t 的第一行直接打包到单独的一维数组（而不是矩阵）中，但结果完全相同。物有所值。现在您应该在您的环境中进行分析，即测试缓存未命中。

更重要的是，测试这个版本：

clock_gettime(CLOCK_REALTIME,&begin);
for (int i = 0; i < N; i++) {
    cell_res += a[0][i] * b_t[0][i];
}
clock_gettime(CLOCK_REALTIME,&end);

有和没有 #pragma omp simd aligned(a_2,bt_2:32) reduction(+:cell_res)，有和没有打包到数组，但分别测试所有这些版本。此外，针对不同的输入大小对它们进行测试。

第二个问题：

另一个问题是：

for (int i = 0; i < N; i++) {
        cell_res += a[0][i] * b_t[0][i];
}

受内存限制，因此使用 SIMD 获得收益的机会较少，不应将 expect much gains 与 SIMD 从双精度浮点积中使用。一种解决方法是将矩阵从双精度数更改为浮点数，从而将所需的内存带宽减少到一半，并使您可以执行的 SIMD 操作数量增加一倍。尽管如此，上述代码片段仍将受内存限制。尽管如此，您可能会获得一些收益，主要是当值在缓存中时。

在我的机器中，从 double 更改为 floats，即使不使用包装，SIMD 版本也比没有它的版本明显更快。这也可能是您遇到的问题。

c openmp parallel-processing performance performance performance simd