矩阵乘法
- 普通实现
- 7.46346s
#include <iostream>
#include <sys/time.h>
using namespace std;
// Matrix dimensions: a is kRows x kMid, b is kMid x kCols, c is kRows x kCols.
constexpr int kRows = 1000;
constexpr int kMid = 2000;
constexpr int kCols = 1500;

// Operands and result of c = a * b, kept at namespace scope (static storage).
float a[kRows][kMid], b[kMid][kCols], c[kRows][kCols] = {0.0};

/**
 * CPU baseline: fills a and b with deterministic values, computes c = a * b
 * with a plain triple loop, and prints the elapsed wall-clock time of the
 * multiplication in seconds.
 *
 * @return 0 on completion.
 */
int main() {
    for (int i = 0; i < kRows; ++i) {
        for (int j = 0; j < kMid; ++j) {
            a[i][j] = 1.1 + i / 1000.0;
        }
    }
    // Fill b. The original wrote to a[i][j] here, which indexed past the end
    // of a and left b entirely zero.
    for (int i = 0; i < kMid; ++i) {
        for (int j = 0; j < kCols; ++j) {
            b[i][j] = 1.11 + j / 1000.0;
        }
    }

    struct timeval start, end;
    gettimeofday(&start, nullptr);
    // i-j-k order: the innermost loop walks rows of b and c contiguously.
    for (int i = 0; i < kRows; ++i) {
        for (int j = 0; j < kMid; ++j) {
            const float a_ij = a[i][j];
            for (int k = 0; k < kCols; ++k) {
                c[i][k] += a_ij * b[j][k];
            }
        }
    }
    gettimeofday(&end, nullptr);

    // tv_usec may be smaller at the end than at the start; the signed
    // difference combined with the tv_sec difference still gives the right span.
    const double cost_time = (end.tv_sec - start.tv_sec)
                           + (end.tv_usec - start.tv_usec) / 1000000.0;
    std::cout << cost_time << std::endl;
    return 0;
}
- opencl实现
- 0.303606 seconds
- Host代码
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <CL/cl.h>
#include <sys/time.h>
#define MAX_SOURCE_SIZE (0x100000)
//data parallel
int main()
{
int i, j;
float *A;
float *B;
float *C;
const int row = 1000, mid = 2000, col = 1500;
A = (float *)malloc(row * mid * sizeof(float));
B = (float *)malloc(mid * col * sizeof(float));
C = (float *)malloc(row * col * sizeof(float));
/* Initialize input data */
printf("Initialize input data");
for (i = 0; i < row; i++) {
for (j = 0; j < mid; j++) {
A[i * mid + j] = 1.1 + i / 1000.0;
}
}
for(i = 0; i < mid; ++i) {
for(j = 0; j < col; ++j) {
B[i*col+j] = 1.11 + j / 1000.0;
}
}
printf("\n");
struct timeval start, end;
gettimeofday(&start, NULL);
cl_int ret;
/* 1.Get Platform information */
cl_platform_id platform_id = NULL;
cl_uint ret_num_platforms;
// 查询的最大数量,返回的平台列表,实际平台数
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
/* 2.Device information */
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
// 平台列表,查询的设备类型,查询的最大数量,返回的设备列表,实际设备数
ret = clGetdeviceids(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
/* 3.Create OpenCL Context */
cl_context context = NULL;
// properties,设备数,设备列表,pfn_notify,user_data,返回码
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
/* 4.Create command queue */
cl_command_queue command_queue = NULL;
// context,device_id,properties,返回码
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
/* 5.Create Buffer Object */
cl_mem Amobj = NULL;
cl_mem Bmobj = NULL;
cl_mem Cmobj = NULL;
// context,flag,大小,host指针,返回码
Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, row * mid * sizeof(float), NULL, &ret);
Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, mid * col * sizeof(float), NULL, &ret);
Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, row * col * sizeof(float), NULL, &ret);
/* 6.copy input data to the memory buffer */
// command_queue,buffer,是否阻塞,偏移,大小,主机数据,等待事件数,等待事件列表,事件命令
ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, row * mid * sizeof(float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, mid * col * sizeof(float), B, 0, NULL, NULL);
/* 7.Load kernel source file */
FILE *fp;
const char fileName[] = "../src/kernel.cl";
size_t source_size;
char *source_str;
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.cl");
exit(1);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
/* 8.Create kernel program from source file*/
cl_program program = NULL;
// context,count,kernel代码,代码长度,返回码
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
/* 9. build program*/
// program,设备数,device_id,options,pfn_notify,user_data
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
/* 10.Create data parallel OpenCL kernel */
cl_kernel kernel = NULL;
// program, 名字, 返回码
kernel = clCreateKernel(program, "dataParallel", &ret);
/* 11.Set OpenCL kernel arguments */
// kernel, 参数索引,参数大小,参数
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&Amobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&Bmobj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&Cmobj);
ret = clSetKernelArg(kernel, 3, sizeof(int), &row);
ret = clSetKernelArg(kernel, 4, sizeof(int), &mid);
ret = clSetKernelArg(kernel, 5, sizeof(int), &col);
/* 12.Execute OpenCL kernel as data parallel */
size_t global_item_size[2] = {row, col};
size_t local_item_size = 1;
// command_queue,kernel,数据维度,global_work_offset,global_item_size,local_item_size,等待事件数,等待事件列表,事件命令
ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_item_size, NULL, 0, NULL, NULL);
/* 13.wait for the commands to complete before reading back results */
ret = clFlush(command_queue);
ret = clFinish(command_queue);
/* 14.Transfer result to host */
// command_queue,buffer,是否阻塞,偏移,大小,主机数据,等待事件数,等待事件列表,事件命令
ret = clEnqueueReadBuffer(command_queue, Cmobj, CL_TRUE, 0, row * col * sizeof(float), C, 0, NULL, NULL);
// cost
gettimeofday(&end, NULL);
float cost_time = (end.tv_usec-start.tv_usec)/1000000.0 + end.tv_sec-start.tv_sec;
printf("cost : %f seconds\n", cost_time);
/* 15.Finalization */
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(Amobj);
ret = clReleaseMemObject(Bmobj);
ret = clReleaseMemObject(Cmobj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(source_str);
free(A);
free(B);
free(C);
return 0;
}
- kernel.cl
- 其实row用不着
/*
 * Each work-item computes one element of C = A * B.
 * Global id 0 selects the row of A / C and global id 1 selects the column of
 * B / C. A is indexed with row stride `mid`, B and C with row stride `col`.
 * `row` is accepted but never read by this kernel.
 */
__kernel void dataParallel(__global float* A, __global float* B, __global float* C,int row, int mid, int col)
{
    int r = get_global_id(0);
    int cIdx = get_global_id(1);

    // Walk along row r of A and down column cIdx of B.
    int aPos = r * mid;
    int bPos = cIdx;
    float acc = 0;
    for (int step = 0; step < mid; ++step) {
        acc += A[aPos] * B[bPos];
        aPos += 1;
        bPos += col;
    }

    C[r * col + cIdx] = acc;
}
20220824补充
- 安装opencl:
sudo apt install ocl-icd-* opencl-headers
sudo apt install clinfo
clinfo | grep "Device Type"
- CMakeLists:
cmake_minimum_required(VERSION 2.8.3)
project(cl)
add_compile_options(-std=c++11 -g)
include_directories(
#/usr/local/cuda-10.0/include
)
# OpenCL host program; the executable is named after the project.
add_executable(${PROJECT_NAME} src/main.cc)
target_link_libraries(${PROJECT_NAME}
OpenCL
)
# CPU baseline, built without OpenCL.
add_executable(slow src/slow.cc)