问题描述
我在 OpenCL 中为 FPGA 板编写了一个简单的代码。我使用 DE10 nano 仅共享板和 Intel SDK 18.1 。主要问题是 Ram 消耗过多。 HTML 报告主要显示本地内存数组中的问题。在 ND 范围内核中,这个问题变得更糟!
另一个问题是:所有本地数组都有一个编译器警告:
(积极的编译器优化:将不必要的存储移除到本地内存)
顺便说一下,在循环分析选项卡中有 II : ~1 并且在详细信息窗格中提到:
(II 是由于以下可停止指令的近似值:加载操作 #no,存储操作 #no)。我怎样才能解决它并达到 II 的确切 1 ?!
代码:
#define IDX(i,j,n) ((i) * (n) + (j))
//#include<stdlib.h>
__kernel void PushKernel( uint column,__global int * restrict height,__global int * restrict excessFlow,__global int * restrict netFlowOutS,__global int * restrict netFlowInT,uint s,uint t,uint row,__global int * restrict residualFlow_up,__global int * restrict residualFlow_down,__global int * restrict residualFlow_right,__global int * restrict residualFlow_left)
{
const uint num_column=6;
const uint num_row=4;
int FlowOutS=*netFlowOutS;
int FlowInT=*netFlowInT;
uint source=s;
uint destination=t;
uint index;
__local int heights_horizontal_cache[6];
__local int excessFlow_horizontal_cache[6];
__local int excessFlow_horizontal_cache_temp[6];
__local int residualFlow_right_cache[6];
__local int residualFlow_left_cache[6];
__local int outS_cache;
//#pragma unroll
//#pragma loop_coalesce
#pragma ivdep
//#pragma ii 1
for(int i=0; i<num_row; i++){index=IDX(i,num_column);
#pragma unroll
#pragma ivdep
for(int j=0; j<num_column; j++){//index=IDX(i,num_column);
heights_horizontal_cache[j]=height[index+j];
excessFlow_horizontal_cache[j]=excessFlow[index+j];
excessFlow_horizontal_cache_temp[j]=0;
residualFlow_right_cache[j]=residualFlow_right[index+j];
residualFlow_left_cache[j]=residualFlow_left[index+j];
outS_cache=0;
}
//mem_fence(CLK_GLOBAL_MEM_FENCE);
///////////////////////////////////////////////////////////////////////push to right
//#pragma ivdep array (residualFlow_right_cache)
#pragma ivdep
#pragma unroll
for(int j=0; j<num_column-1; j++){
//index=IDX(i,num_column);
if(index+j != source && index+j != destination && excessFlow_horizontal_cache[j]>0 && residualFlow_right_cache[j]>0 && heights_horizontal_cache[j]==heights_horizontal_cache[j+1]+1){
int delta = min(excessFlow_horizontal_cache[j],residualFlow_right_cache[j]);
residualFlow_right_cache[j]-=delta;
residualFlow_left_cache[j+1]+=delta;
excessFlow_horizontal_cache[j]-=delta;
//excessFlow_horizontal_cache[j+1]+=delta;
excessFlow_horizontal_cache_temp[j+1]=delta;
if (IDX(i,j+1,num_column) == s) {
//FlowOutS-=delta;
outS_cache=delta;
}
else if (IDX(i,num_column) == t) {
FlowInT+=delta;}
}
///////////////////////////////////////////////////////////////////////results back to global
//mem_fence(CLK_GLOBAL_MEM_FENCE);
}
#pragma unroll
#pragma ivdep
for(int j=0; j<num_column; j++){
excessFlow_horizontal_cache[j]+=excessFlow_horizontal_cache_temp[j];
}
#pragma unroll
#pragma ivdep
for(int j=0; j<num_column; j++){
//index=IDX(i,num_column);
excessFlow[index+j]=excessFlow_horizontal_cache[j];
residualFlow_right[index+j]=residualFlow_right_cache[j];
residualFlow_left[index+j]=residualFlow_left_cache[j];
}
}
FlowOutS-=outS_cache;
*netFlowOutS=FlowOutS;
*netFlowInT=FlowInT;
}
这里是 HTML 报告:
HTML Report
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)