问题描述
我对 Thrust 比较陌生,正在尝试执行分段扫描。这是我的代码,您应该可以按原样运行:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <chrono>
// Sequential scan for CPU
float* test_seqScan(float* in,int s,int m) {
float* out = new float[s * m];
for (unsigned int i = 0; i < s; i++) {
out[i * m] = 0;
}
for (unsigned int i = 0; i < s; i++) {
for (unsigned int j = 1; j < m; j++) {
out[i * m + j] = out[i * m + j - 1] + in[i * m + j - 1];
}
}
return out;
}
void test_sumScan(thrust::device_vector<float> dev_in,thrust::device_vector<int> dev_keys,int m) {
// Allocate device memory for output
thrust::device_vector<float> dev_out(s * m);
thrust::exclusive_scan_by_key(thrust::device,dev_keys.begin(),dev_keys.end(),dev_in.begin(),dev_out.begin());
}
int main(){
int s = 100;
int m = 100000;
float* seq_in = new float[s * m];
for (int i = 0; i < s; i++) {
for (int j = 0; j < m; j++) {
seq_in[i * m + j] = j + 1;
}
}
thrust::host_vector<float> par_in(s * m);
for (int i = 0; i < s; i++) {
for (int j = 0; j < m; j++) {
par_in[i * m + j] = j + 1;
}
}
thrust::host_vector<int> keys(s * m);
for (int i = 0; i < s; i++) {
for (int j = 0; j < m; j++) {
keys[i * m + j] = i;
}
}
thrust::device_vector<float> dev_in = par_in;
thrust::device_vector<int> dev_keys = keys;
auto t1 = std::chrono::high_resolution_clock::now();
test_seqScan(seq_in,s,m);
auto t2 = std::chrono::high_resolution_clock::now();
auto duration1 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
std::cout << "Sequential duration: " << duration1 << "\n\n";
auto t3 = std::chrono::high_resolution_clock::now();
test_sumScan(dev_in,dev_keys,m);
auto t4 = std::chrono::high_resolution_clock::now();
auto duration2 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
std::cout << "Parallel duration: " << duration2 << "\n\n";
}
我的问题是,无论我将 s
和 m
设置为多小或多大,这两个代码片段都需要完全相同的时间来运行。我认为我做错了什么,但我不知道是什么;谁能指出这个问题?
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)