问题描述
使用不同版本的 opencv 有一些非常奇怪的结果。我看到 cv::calcHist 对于 3 通道 (RGB) 蒙版图像的性能差异很大(不使用 ipp,因为它只支持单通道)。 使用 3.4.2 平均花费 15 毫秒,而使用 4.5.1 花费 0.7 毫秒。您可能会怀疑 opencv 中 histogram.cpp 代码的性能有所改进,但并未进行太多更改。
所以我基本上将 cv::calcHist 4.5.1 代码复制到下面的测试应用程序中,除了 Mat 实现代码(删除了不必要的英特尔 ipp 代码)。
我做了一些性能分析,结果表明:使用 4.5.1 分配的内存时,cv::calcHist 基本上更快。
当然,Mat 分配没有太大区别。
为了验证这一点,我分别用 4.5.1 和 3.4.2 构建了下面的程序并比较性能:
// HistogramBenchmark.cpp : This file contains the 'main' function. Program execution begins and ends there.
//
#include <iostream>
#include <opencv2/imgproc.hpp>
#include <opencv2/core.hpp>
#include <iomanip>
#include <chrono>
#include <opencv2/highgui.hpp>
// Benchmark configuration: images per batch and the dimensions of each synthetic image.
#define BATCH_SIZE 30
#define WIDTH 164
#define HEIGHT 196

// Histogram entry point under test; defined further down in this file.
void mycalcHist(const cv::Mat* images,
                int nimages,
                const int* channels,
                cv::InputArray _mask,
                cv::OutputArray _hist,
                int dims,
                const int* histSize,
                const float** ranges,
                bool uniform,
                bool accumulate);

// 8-bit accumulation kernel called by mycalcHist; defined further down in this file.
void mycalcHist_8u(std::vector<uchar*>& _ptrs,
                   const std::vector<int>& _deltas,
                   cv::Size imsize,
                   cv::Mat& hist,
                   const float** _ranges,
                   const double* _uniranges,
                   bool uniform);

// Milliseconds with a floating-point representation, used when printing the mean latency.
using Ms = std::chrono::duration<double, std::milli>;

// Manual comparison helper; its only visible call site in main is commented out.
void testVersions();
int main()
{
//testVersions();
int channels[] = { 0,1,2 };
float histRanges[] = { 0,256 };
int histSize[] = { 10,10,10 };
const float* ranges[] = { histRanges,histRanges,histRanges };
std::vector<cv::Mat> crops;
std::vector<cv::Mat> masks;
cv::Mat histogram;
for (int i = 0; i < BATCH_SIZE; i++)
masks.push_back(cv::Mat());
for (int i = 0; i < BATCH_SIZE; i++)
crops.push_back(cv::Mat(HEIGHT,WIDTH,CV_8UC3) * 255);
// warm up
for (int i = 0; i < 20; i++)
{
for (int j = 0; j < BATCH_SIZE; j++)
mycalcHist(&crops[i],channels,masks[i],histogram,3,histSize,ranges,true,false);
}
std::chrono::steady_clock::duration latencySum{ 0 };
unsigned latencySamplesNum = 0;
std::ostringstream latencyStream;
// warm up
for (int i = 0; i < 200; i++)
{
std::chrono::steady_clock::time_point t0 = std::chrono::steady_clock::Now();
for (int j = 0; j < BATCH_SIZE; j++)
mycalcHist(&crops[j],masks[j],false);
latencySum += std::chrono::steady_clock::Now() - t0;
latencySamplesNum += 1;
}
latencyStream.str("");
latencyStream << std::fixed << std::setprecision(1)
<< (std::chrono::duration_cast<Ms>(latencySum) / latencySamplesNum).count() << " ms \n" << latencySamplesNum;
std::cout << "Mean pipeline latency: " << latencyStream.str() << '\n';
return 0;
}
void testVersions()
{
int channels[] = { 0,histRanges };
auto img1 = cv::imread(R"(C:\Users\christopher.eviParke\Documents\Configuration\IPU1\images\Reference\1\132571809612622068_Hist3_Fac2.04.bmp)");
cv::Mat img1_hist;
cv::cvtColor(img1,img1,cv::COLOR_BGR2RGB);
cv::calcHist(&img1,cv::Mat(),img1_hist,false);
auto img2 = cv::imread(R"(C:\Users\christopher.eviParke\Documents\Configuration\IPU1\images\Reference\2\132571827343906357_Hist3_Fac1.80.bmp)");
cv::Mat img2_hist;
cv::cvtColor(img2,img2,cv::COLOR_BGR2RGB);
cv::calcHist(&img2,img2_hist,false);
auto dist = cv::compareHist(img1_hist,cv::HISTCMP_BHATTACHARYYA);
return;
}
// Gathers everything the accumulation kernel needs to walk the input planes.
//
// images, nimages  source images; all must share the size and depth of images[0]
// channels         one global channel index per histogram dimension (counted
//                  across all images), or null to use single-channel image i
//                  for dimension i
// mask             optional single-channel mask; empty means no mask
// dims             number of histogram dimensions
// histSize         number of bins per dimension
// ranges           per-dimension bin boundaries, or null for implicit 8-bit ranges
// uniform          true when each ranges[i] holds just {low, high}
// ptrs     [out]   ptrs[i] = first sample of dimension i; ptrs[dims] = mask data or null
// deltas   [out]   deltas[2*i] = step between pixels, deltas[2*i+1] = padding at the
//                  end of a row, both in elements; for the mask slot they are
//                  1 and the full row step
// imsize   [out]   image size; collapsed to a single row when all buffers are continuous
// uniranges[out]   per dimension {scale, offset} such that bin = floor(v * scale + offset);
//                  left untouched for explicit non-uniform ranges
void myhistPrepareImages(const cv::Mat* images, int nimages, const int* channels,
                         const cv::Mat& mask, int dims, const int* histSize,
                         const float** ranges, bool uniform,
                         std::vector<uchar*>& ptrs, std::vector<int>& deltas,
                         cv::Size& imsize, std::vector<double>& uniranges)
{
    CV_Assert(images && nimages > 0 && dims > 0);
    CV_Assert(channels != 0 || nimages == dims);

    imsize = images[0].size();
    const int depth = images[0].depth();
    const int esz1 = static_cast<int>(images[0].elemSize1());
    bool isContinuous = true;

    // assign() rather than resize() so the mask slot is always null/zero when
    // no mask is given, even if the caller passes in reused vectors.
    ptrs.assign(dims + 1, nullptr);
    deltas.assign((dims + 1) * 2, 0);

    for (int i = 0; i < dims; i++)
    {
        int j = 0;
        int c = 0;
        if (!channels)
        {
            j = i;
            c = 0;
            CV_Assert(images[j].channels() == 1);
        }
        else
        {
            // Translate the global channel index into (image j, local channel c).
            c = channels[i];
            CV_Assert(c >= 0);
            for (j = 0; j < nimages; c -= images[j].channels(), j++)
                if (c < images[j].channels())
                    break;
            CV_Assert(j < nimages);
        }

        CV_Assert(images[j].size() == imsize && images[j].depth() == depth);
        if (!images[j].isContinuous())
            isContinuous = false;
        ptrs[i] = images[j].data + c * esz1;
        deltas[i * 2] = images[j].channels();
        deltas[i * 2 + 1] = static_cast<int>(images[j].step / esz1 - imsize.width * deltas[i * 2]);
    }

    if (!mask.empty())
    {
        CV_Assert(mask.size() == imsize && mask.channels() == 1);
        isContinuous = isContinuous && mask.isContinuous();
        ptrs[dims] = mask.data;
        deltas[dims * 2] = 1;
        deltas[dims * 2 + 1] = static_cast<int>(mask.step / mask.elemSize1());
    }

    if (isContinuous)
    {
        // No row padding anywhere: treat the whole image as one long row.
        imsize.width *= imsize.height;
        imsize.height = 1;
    }

    if (!ranges) // implicit uniform ranges for 8U
    {
        CV_Assert(depth == CV_8U);
        uniranges.resize(dims * 2);
        for (int i = 0; i < dims; i++)
        {
            uniranges[i * 2] = histSize[i] / 256.;
            uniranges[i * 2 + 1] = 0;
        }
    }
    else if (uniform)
    {
        uniranges.resize(dims * 2);
        for (int i = 0; i < dims; i++)
        {
            CV_Assert(ranges[i] && ranges[i][0] < ranges[i][1]);
            const double low = ranges[i][0];
            const double high = ranges[i][1];
            const double t = histSize[i] / (high - low);
            uniranges[i * 2] = t;
            uniranges[i * 2 + 1] = -t * low;
#if 0 // This should be true by math,but it is not accurate numerically
            CV_Assert(cvFloor(low * uniranges[i * 2] + uniranges[i * 2 + 1]) == 0);
            CV_Assert((high * uniranges[i * 2] + uniranges[i * 2 + 1]) < histSize[i]);
#endif
        }
    }
    else
    {
        // Explicit boundaries must be strictly increasing.
        for (int i = 0; i < dims; i++)
        {
            const size_t n = histSize[i];
            for (size_t k = 0; k < n; k++)
                CV_Assert(ranges[i][k] < ranges[i][k + 1]);
        }
    }
}

// Computes a dense histogram of 8-bit images into _hist (CV_32F).
//
// Parameters match the forward declaration near the top of this file:
// images/nimages/channels select the source planes, _mask optionally restricts
// the counted pixels (must be CV_8UC1 or empty), dims/histSize/ranges/uniform
// describe the bins, and accumulate keeps the existing contents of _hist when
// its buffer does not have to be reallocated.
//
// Only 8-bit input is supported because this copy has no kernels for other depths.
void mycalcHist(const cv::Mat* images, int nimages, const int* channels,
                cv::InputArray _mask, cv::OutputArray _hist, int dims,
                const int* histSize, const float** ranges, bool uniform, bool accumulate)
{
    CV_Assert(images && nimages > 0);
    CV_Assert(dims > 0 && histSize);

    // Remember the old buffer so a reallocation can be detected below.
    const uchar* const histdata = _hist.getMat().ptr();
    // Called unconditionally so a non-empty output of the wrong shape or type
    // is replaced as well.
    _hist.create(dims, histSize, CV_32F);
    cv::Mat hist = _hist.getMat();
    if (histdata != hist.data)
        accumulate = false; // fresh buffer: nothing to accumulate onto

    // Integer view over the same memory: counting is done in int32 and
    // converted back to float at the end.
    cv::Mat ihist = hist;
    ihist.flags = (ihist.flags & ~CV_MAT_TYPE_MASK) | CV_32S;

    if (!accumulate)
        hist = cv::Scalar(0.);
    else
        hist.convertTo(ihist, CV_32S);

    std::vector<uchar*> ptrs;
    std::vector<int> deltas;
    std::vector<double> uniranges;
    cv::Size imsize;

    cv::Mat mask = _mask.getMat();
    CV_Assert(mask.empty() || mask.type() == CV_8UC1);

    myhistPrepareImages(images, nimages, channels, mask, dims, hist.size, ranges,
                        uniform, ptrs, deltas, imsize, uniranges);
    const double* _uniranges = uniform ? uniranges.data() : 0;

    const int depth = images[0].depth();
    CV_Assert(depth == CV_8U);
    mycalcHist_8u(ptrs, deltas, imsize, ihist, ranges, _uniranges, uniform);

    ihist.convertTo(hist, CV_32F);
}
#define CV_CLAMP_INT(v,vmin,vmax) (v < vmin ? vmin : (vmax < v ? vmax : v))
void
mycalcHistLookupTables_8u(const cv::Mat& hist,const cv::SparseMat& shist,const double* uniranges,bool issparse,std::vector<size_t>& _tab)
{
static const size_t OUT_OF_RANGE = (size_t)1 << (sizeof(size_t) * 8 - 2);
const int low = 0,high = 256;
int i,j;
_tab.resize((high - low) * dims);
size_t* tab = &_tab[0];
if (uniform)
{
for (i = 0; i < dims; i++)
{
double a = uniranges[i * 2];
double b = uniranges[i * 2 + 1];
int sz = !issparse ? hist.size[i] : shist.size(i);
size_t step = !issparse ? hist.step[i] : 1;
double v_lo = ranges ? ranges[i][0] : 0;
double v_hi = ranges ? ranges[i][1] : 256;
for (j = low; j < high; j++)
{
int idx = cvFloor(j * a + b);
size_t written_idx = OUT_OF_RANGE;
if (j >= v_lo && j < v_hi)
{
idx = CV_CLAMP_INT(idx,sz - 1);
written_idx = idx * step;
}
tab[i * (high - low) + j - low] = written_idx;
}
}
}
else if (ranges)
{
for (i = 0; i < dims; i++)
{
int limit = std::min(cvCeil(ranges[i][0]),high);
int idx = -1,sz = !issparse ? hist.size[i] : shist.size(i);
size_t written_idx = OUT_OF_RANGE;
size_t step = !issparse ? hist.step[i] : 1;
for (j = low;;)
{
for (; j < limit; j++)
tab[i * (high - low) + j - low] = written_idx;
if ((unsigned)(++idx) < (unsigned)sz)
{
limit = std::min(cvCeil(ranges[i][idx + 1]),high);
written_idx = idx * step;
}
else
{
for (; j < high; j++)
tab[i * (high - low) + j - low] = OUT_OF_RANGE;
break;
}
}
}
}
}
void
mycalcHist_8u(std::vector<uchar*>& _ptrs,bool uniform)
{
static const size_t OUT_OF_RANGE = (size_t)1 << (sizeof(size_t) * 8 - 2);
uchar** ptrs = &_ptrs[0];
const int* deltas = &_deltas[0];
uchar* H = hist.ptr();
int x;
const uchar* mask = _ptrs[dims];
int mstep = _deltas[dims * 2 + 1];
std::vector<size_t> _tab;
mycalcHistLookupTables_8u(hist,cv::SparseMat(),_ranges,false,_tab);
const size_t* tab = &_tab[0];
int d0 = deltas[0],step0 = deltas[1],d1 = deltas[2],step1 = deltas[3],d2 = deltas[4],step2 = deltas[5];
const uchar* p0 = (const uchar*)ptrs[0];
const uchar* p1 = (const uchar*)ptrs[1];
const uchar* p2 = (const uchar*)ptrs[2];
for (; imsize.height--; p0 += step0,p1 += step1,p2 += step2,mask += mstep)
{
if (!mask)
for (x = 0; x < imsize.width; x++,p0 += d0,p1 += d1,p2 += d2)
{
size_t idx = tab[*p0] + tab[*p1 + 256] + tab[*p2 + 512];
if (idx < OUT_OF_RANGE)
++* (int*)(H + idx);
}
else
for (x = 0; x < imsize.width; x++,p2 += d2)
{
size_t idx;
if (mask[x] && (idx = tab[*p0] + tab[*p1 + 256] + tab[*p2 + 512]) < OUT_OF_RANGE)
++* (int*)(H + idx);
}
}
}
解决方法
经过大量挖掘之后,罪魁祸首竟然是这一行代码:
ihist.convertTo(hist,CV_32F);
它在 calcHist 的代码中被调用。
在 4.0 及以上版本中,该函数的实现有所不同。我没有深入研究,但我认为它基本上以某种方式避免了内存分配。所以我把 v4 的代码和辅助函数复制到了 v3 中,我的软件又可以实时运行了.... :)