opencl学习二

矩阵乘法

  • 普通实现
  • 7.46346s
#include <iostream>
#include <sys/time.h>

using namespace std;

float a[1000][2000], b[2000][1500], c[1000][1500] = {0.0};

// Naive CPU baseline: fills a (1000x2000) and b (2000x1500), computes
// c = a * b with a triple loop and prints the elapsed wall-clock seconds.
// Returns 0.
int main() {
    // Every element in row i of a is 1.1 + i/1000.
    for(int i = 0; i < 1000; ++i) {
        for(int j = 0; j < 2000; ++j) {
            a[i][j] = 1.1 + i / 1000.0;
        }
    }
    // Every element in column j of b is 1.11 + j/1000.
    // (This loop must fill b; writing a[i][j] here would run past a's 1000
    // rows and leave b zero-filled.)
    for(int i = 0; i < 2000; ++i) {
        for(int j = 0; j < 1500; ++j) {
            b[i][j] = 1.11 + j / 1000.0;
        }
    }
    struct timeval start, end;
    gettimeofday(&start, NULL);
    // i-j-k order: the innermost loop walks b[j][*] and c[i][*] contiguously.
    for(int i = 0; i < 1000; ++i) {
        for(int j = 0; j < 2000; ++j) {
            for(int k = 0; k < 1500; ++k) {
                c[i][k] += a[i][j]*b[j][k];
            }
        }
    }
    gettimeofday(&end, NULL);
    // Subtract the seconds first so the microsecond fraction is not added to
    // a huge epoch value, and keep the result in double precision.
    double cost_time = (end.tv_sec - start.tv_sec)
                     + (end.tv_usec - start.tv_usec) / 1000000.0;
    cout << cost_time << endl;

    return 0;
}
  • opencl实现
  • 0.303606 seconds
  • Host代码
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <CL/cl.h>
#include <sys/time.h>
 
#define MAX_SOURCE_SIZE (0x100000)
//data parallel
int main()
{ 
	int i, j;
	float *A;
	float *B;
	float *C;
	const int row = 1000, mid = 2000, col = 1500;
 
	A = (float *)malloc(row * mid * sizeof(float));
	B = (float *)malloc(mid * col * sizeof(float));
	C = (float *)malloc(row * col * sizeof(float));
 
	/* Initialize input data */
	printf("Initialize input data");
	for (i = 0; i < row; i++) {
		for (j = 0; j < mid; j++) {
			A[i * mid + j] = 1.1 + i / 1000.0;
		}
	}
	for(i = 0; i < mid; ++i) {
		for(j = 0; j < col; ++j) {
			B[i*col+j] = 1.11 + j / 1000.0;
		}
	}
	printf("\n");
 
	struct timeval start, end;
    gettimeofday(&start, NULL);
    
    cl_int ret;
	/* 1.Get Platform information */
	cl_platform_id platform_id = NULL;
	cl_uint ret_num_platforms;
	// 查询的最大数量,返回的平台列表,实际平台数
	ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);

	/* 2.Device information */
	cl_device_id device_id = NULL;
	cl_uint ret_num_devices;
	// 平台列表,查询的设备类型,查询的最大数量,返回的设备列表,实际设备数
	ret = clGetdeviceids(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);

	/* 3.Create OpenCL Context */
	cl_context context = NULL;
	// properties,设备数,设备列表,pfn_notify,user_data,返回码
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

	/* 4.Create command queue */
	cl_command_queue command_queue = NULL;
	// context,device_id,properties,返回码
	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

	/* 5.Create Buffer Object */
	cl_mem Amobj = NULL;
	cl_mem Bmobj = NULL;
	cl_mem Cmobj = NULL;
	// context,flag,大小,host指针,返回码
	Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, row * mid * sizeof(float), NULL, &ret);
	Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, mid * col * sizeof(float), NULL, &ret);
	Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, row * col * sizeof(float), NULL, &ret);

	/* 6.copy input data to the memory buffer */
	// command_queue,buffer,是否阻塞,偏移,大小,主机数据,等待事件数,等待事件列表,事件命令
	ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, row * mid * sizeof(float), A, 0, NULL, NULL);
	ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, mid * col * sizeof(float), B, 0, NULL, NULL);

	/* 7.Load kernel source file */
	FILE *fp;
	const char fileName[] = "../src/kernel.cl";
	size_t source_size;
	char *source_str;
	fp = fopen(fileName, "r");
	if (!fp) {
		fprintf(stderr, "Failed to load kernel.cl");
		exit(1);
	}
	source_str = (char *)malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);

	/* 8.Create kernel program from source file*/
	cl_program program = NULL;
	// context,count,kernel代码代码长度,返回码
	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);

	/*  9. build program*/
	// program,设备数,device_id,options,pfn_notify,user_data
	ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

	/* 10.Create data parallel OpenCL kernel */
	cl_kernel kernel = NULL;
	// program, 名字, 返回码
	kernel = clCreateKernel(program, "dataParallel", &ret);

	/* 11.Set OpenCL kernel arguments */
	// kernel, 参数索引,参数大小,参数
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&Amobj);
	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&Bmobj);
	ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&Cmobj);
	ret = clSetKernelArg(kernel, 3, sizeof(int), &row);
	ret = clSetKernelArg(kernel, 4, sizeof(int), &mid);
	ret = clSetKernelArg(kernel, 5, sizeof(int), &col);
	
	/* 12.Execute OpenCL kernel as data parallel */
	size_t global_item_size[2] = {row, col};
	size_t local_item_size = 1;
	// command_queue,kernel,数据维度,global_work_offset,global_item_size,local_item_size,等待事件数,等待事件列表,事件命令
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_item_size, NULL, 0, NULL, NULL);

	/* 13.wait for the commands to complete before reading back results */
	ret = clFlush(command_queue);
	ret = clFinish(command_queue);
 
	/* 14.Transfer result to host */
	// command_queue,buffer,是否阻塞,偏移,大小,主机数据,等待事件数,等待事件列表,事件命令
	ret = clEnqueueReadBuffer(command_queue, Cmobj, CL_TRUE, 0, row * col * sizeof(float), C, 0, NULL, NULL);
	// cost
	gettimeofday(&end, NULL);
    float cost_time = (end.tv_usec-start.tv_usec)/1000000.0 + end.tv_sec-start.tv_sec;
	printf("cost : %f seconds\n", cost_time);
	
	/* 15.Finalization */
	ret = clReleaseKernel(kernel);
	ret = clReleaseProgram(program);
	ret = clReleaseMemObject(Amobj);
	ret = clReleaseMemObject(Bmobj);
	ret = clReleaseMemObject(Cmobj);
	ret = clReleaseCommandQueue(command_queue);
	ret = clReleaseContext(context);
 
	free(source_str);
 
	free(A);
	free(B);
	free(C);
	return 0;
 }

  • kernel.cl
  • 其实row用不着
/*
 * dataParallel: each work-item computes one element of C = A * B.
 *
 * A is read as a row-major matrix with `mid` columns, B and C as row-major
 * matrices with `col` columns. Global id 0 selects the row and global id 1
 * selects the column of the output element.
 *
 * `row` is accepted but not read by the kernel body.
 */
__kernel void dataParallel(__global float* A, __global float* B, __global float* C,int row, int mid, int col)
{
	const int r = get_global_id(0);
	const int c = get_global_id(1);
	const int a_base = r * mid;   /* start of row r in A */

	/* Dot product of row r of A with column c of B. */
	float acc = 0;
	int k = 0;
	while (k < mid) {
		acc += A[a_base + k] * B[k * col + c];
		++k;
	}

	C[r * col + c] = acc;
}

20220824补充

  • 安装opencl:`sudo apt install ocl-icd-* opencl-headers`;`sudo apt install clinfo`;然后用 `clinfo | grep "Device Type"` 检查设备类型
  • CMakeLists:

    # Build script for the two matrix-multiplication demos.
    cmake_minimum_required(VERSION 2.8.3)
    project(cl)
    add_compile_options(-std=c++11 -g)
    include_directories(
    #/usr/local/cuda-10.0/include
    )
    # OpenCL host program, named after the project and linked against the
    # OpenCL library.
    add_executable(${PROJECT_NAME} src/main.cc)
    target_link_libraries(${PROJECT_NAME}
    OpenCL
    )
    # Second executable built from src/slow.cc; it is not linked against OpenCL.
    add_executable(slow src/slow.cc)

相关文章

显卡天梯图2024最新版,显卡是电脑进行图形处理的重要设备,...
初始化电脑时出现问题怎么办,可以使用win系统的安装介质,连...
todesk远程开机怎么设置,两台电脑要在同一局域网内,然后需...
油猴谷歌插件怎么安装,可以通过谷歌应用商店进行安装,需要...
虚拟内存这个名词想必很多人都听说过,我们在使用电脑的时候...