问题描述
我有一些使用 TBB 的代码,但不知为何,用 VTune 做线程分析时,我发现它实际使用的线程数与我初始化时指定的线程数不一致。下面是我所做事情的要点:
// Number of vertices: sizes the vnors array and the per-vertex lookup table in main().
#define NUM_VERTS 1189576
// Number of loops: sizes the lnors_weighted and mloop arrays in main().
#define NUM_LOOPS 7137492
// Slot count used when sizing each vertex's row of the loop lookup table.
#define LOOP_TABLE_SIZE 1000
// Loop record used by this snippet. Per the original note, the typedef was
// copied from elsewhere; its remaining members are elided in this post.
typedef struct MLoop {
/* Vertex index; main() uses it to index the per-vertex arrays. */
unsigned int v;
...
} MLoop;
int main()
{
srand(time(NULL)); // Initialization,should only be called once.
tbb::task_arena this_arena;
this_arena.initialize(64);
// let's set up our data
float(*vnors)[3] = (float(*)[3]) malloc(NUM_VERTS * sizeof(*vnors));
float(*lnors_weighted)[3] = (float(*)[3]) malloc(NUM_LOOPS * sizeof(*lnors_weighted));
// fill with random values,though
// at some point we may want to
// integrate OpenGL and some actual
// mesh data
for (int i = 0; i < NUM_VERTS; i++)
{
*vnors[i] = rand();
}
for (int i = 0; i < NUM_LOOPS; i++)
{
*lnors_weighted[i] = rand();
}
// build the loop table,this just points to entries in
// the vnors table
MLooP* mloop = (MLooP*)malloc(NUM_LOOPS * sizeof(MLoop));
for (int i = 0; i < NUM_LOOPS; i++)
{
mloop[i].v = (rand() % (NUM_VERTS));
}
int** vert_loop_lookup = NULL;
if (vert_loop_lookup == NULL) {
// let's make a lookup table with more contiguous memory access
vert_loop_lookup = (int**) malloc(NUM_VERTS * sizeof(int*));
for (int i = 0; i < NUM_VERTS; i++) {
// making an assumption here that a vert can be a part of up to 100 loops
// the real number I SUSPECT will be lower
vert_loop_lookup[i] = (int*) malloc(LOOP_TABLE_SIZE * sizeof(vert_loop_lookup[0]));
memset(vert_loop_lookup[i],-1,LOOP_TABLE_SIZE * sizeof(vert_loop_lookup[0]));
}
// this is just to track the maximum index for the
// loop entry for a given vertex,that way we avoid
// a second loop
int* index_counter = (int*) malloc(NUM_VERTS * sizeof(int));
memset(index_counter,NUM_VERTS * sizeof(index_counter[0]));
// fill up our new table
for (int lidx = 0; lidx < NUM_LOOPS; lidx++) {
// get the vert index
unsigned int vert_index = mloop[lidx].v;
int curr_loop_table_value = index_counter[vert_index];
vert_loop_lookup[vert_index][curr_loop_table_value] = lidx;
index_counter[vert_index]++;
}
free(index_counter);
}
this_arena.execute([&] {
// TBB here
tbb::parallel_for(tbb::blocked_range<int>(0,NUM_VERTS,1 /* Grain Size */),[&](tbb::blocked_range<int> r)
{
for (int i = r.begin(); i < r.end(); ++i)
{
// loop through the ... loops
// of these verts and do some
// accumulation
int* loop_table = vert_loop_lookup[i];
int curr_index = 0;
for (curr_index = 0; loop_table[curr_index] != -1; curr_index++) {
int lidx = loop_table[curr_index];
add_v3_v3(vnors[mloop[lidx].v],lnors_weighted[lidx]);
}
}
});
});
}
用 VTune 做线程分析时,我看到应用程序只使用了 23 个线程。工作量明明足以让更多线程参与,这是怎么回事?补充一下(如果有影响的话):我使用的是 3990X,因此应该有 128 个可用的逻辑核心。
解决方法
暂未找到能够解决该程序问题的有效方法,小编正在努力寻找和整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)