
Extremely slow bilinear interpolation (compared to OpenCV)

本文关键字:OpenCV 相比 插值 双线性      更新时间:2023-10-16
/// Resizes `src` with bilinear interpolation, blending the 2x2 source
/// neighbourhood around each output pixel in floating point.
///
/// @tparam T     Pixel type. It is indexed per channel with operator[] and must
///               expose its channel count through cv::DataType<T>::channels.
/// @param src    Source image. Rows y+1 and columns x+1 are always read, so the
///               image presumably needs at least 2 rows and 2 columns - confirm
///               against callers.
/// @param dsize  Requested output size. When its area is 0 the output size is
///               derived from `dx` / `dy` instead.
/// @param dx     Horizontal scale factor, used only when dsize.area() == 0.
/// @param dy     Vertical scale factor, used only when dsize.area() == 0.
/// @return       The resized image.
template<typename T>
cv::Mat_<T> const bilinear_interpolation(cv::Mat_<T> const &src, cv::Size dsize,
                                         float dx, float dy)
{
    cv::Mat_<T> dst = dsize.area() == 0 ? cv::Mat_<T>(src.rows * dy, src.cols * dx) :
                                          cv::Mat_<T>(dsize);

    // Ratios map a destination coordinate to a source coordinate. Using
    // (size - 1) keeps the "+ 1" neighbour inside the source image.
    float const x_ratio = static_cast<float>((src.cols - 1)) / dst.cols;
    float const y_ratio = static_cast<float>((src.rows - 1)) / dst.rows;
    for(int row = 0; row != dst.rows; ++row)
    {
        int y = static_cast<int>(row * y_ratio);
        float const y_diff = (row * y_ratio) - y; //distance of the nearest pixel(y axis)
        float const y_diff_2 = 1 - y_diff;
        // Walk the destination row channel by channel through a raw pointer.
        auto *dst_ptr = &dst(row, 0)[0];
        for(int col = 0; col != dst.cols; ++col)
        {
            int x = static_cast<int>(col * x_ratio);
            float const x_diff = (col * x_ratio) - x; //distance of the nearest pixel(x axis)
            float const x_diff_2 = 1 - x_diff;
            // Weights of the four neighbours; they sum to 1.
            float const y2_cross_x2 = y_diff_2 * x_diff_2;
            float const y2_cross_x = y_diff_2 * x_diff;
            float const y_cross_x2 = y_diff * x_diff_2;
            float const y_cross_x = y_diff * x_diff;
            for(int channel = 0; channel != cv::DataType<T>::channels; ++channel)
            {
                *dst_ptr++ = y2_cross_x2 * src(y, x)[channel] +
                             y2_cross_x * src(y, x + 1)[channel] +
                             y_cross_x2 * src(y + 1, x)[channel] +
                             y_cross_x * src(y + 1, x + 1)[channel];
            }
        }
    }

    return dst;
}



timeEstimate<> time;
cv::Mat_<cv::Vec3b> const src = input;
bilinear_interpolation(src, cv::Size(), dx, dy);
std::cout << "bilinear" << std::endl;
timeEstimate<> time;
cv::Mat output = input.clone();
cv::resize(input, output, cv::Size(), dx, dy, cv::INTER_LINEAR);
std::cout << "bilinear cv" << std::endl;

编译器:mingw 4.6.2;操作系统:win7 64位;CPU:英特尔® i3-2330M(2.2G)


  1. OpenCV将调整大小实现为"可分离操作"。也就是说,它分为两个步骤:图像先水平拉伸,然后垂直拉伸。这种技术允许使用较少的算术运算来调整大小。

  2. 手动编码SSE优化。



操作系统:虚拟机中的Xubuntu 20;编译器:gcc 9.3.0;OpenCV版本:4.2.0;CPU:i3-6100u(2.3 GHz);源位图大小:512x512;目标位图大小:2048x2048


(此时,我切换到在Windows中使用Visual Studio 2013,为x64目标构建)。

将代码转换为使用定点算术将时间减少到30ms。定点运算很有用,因为它将数据保持为整数:输入和输出数据都是整数,必须将它们转换为浮点数再转换回来是昂贵的。如果我坚持使用GCC 9.3,我预计速度会更快,因为我通常发现它生成的代码比VS 2013更快。不管怎样,这是代码:

// One pixel, accessible either as a single packed `unsigned` word (`c`) or as
// four 8-bit channels stored in memory in the order b, g, r, a.
typedef union {
    unsigned int c;
    struct {
        unsigned char b;
        unsigned char g;
        unsigned char r;
        unsigned char a;
    };
} DfColour;
// Bitmap descriptor: the image dimensions plus a pointer to its pixel storage.
typedef struct _DfBitmap {
    int width;
    int height;
    DfColour *pixels;
} DfBitmap;
// Scales `src` into `dst` by `scale` using single-pass, fixed-point bilinear
// interpolation.
//
// src:   source bitmap; read only.
// dst:   destination bitmap. Its pixel buffer must already exist; rows are
//        addressed with dst->width as the stride, and scale*src->width by
//        scale*src->height pixels are written. Only the r, g and b channels
//        are stored; the alpha channel is left untouched.
// scale: scale factor applied to both axes.
//
// NOTE(review): the row below and the column to the right of each sampled
// source pixel are always read. The 255/256 factor in the ratios presumably
// keeps those reads inside the source for large images; small sources may still
// read one row/column past the end - confirm before using on tiny bitmaps.
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    // Source step per destination pixel, with 16 fractional bits.
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;

    // For every output pixel...
    for (int y = 0; y < dstH; y++) {
        // After the shift: high bits = source row, low 8 bits = blend weight.
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;

        DfColour *dstPixel = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];

        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        for (int x = 0; x < dstW; x++, dstPixel++) {
            // Perform bilinear interpolation on 2x2 src pixels.
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;

            unsigned r = 0, g = 0, b = 0;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[srcX];
            unsigned w = (weightX * weightY) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // Pixel 1,0 (one column to the right)
            srcPixel++;
            w = (weightX2 * weightY) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // Pixel 1,1 (one row down)
            srcPixel += src->width;
            w = (weightX2 * weightY2) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // Pixel 0,1 (back one column)
            srcPixel--;
            w = (weightX * weightY2) >> 8;
            r += srcPixel->r * w;
            g += srcPixel->g * w;
            b += srcPixel->b * w;

            // Drop the 8 fractional bits of the accumulated weights.
            dstPixel->r = r >> 8;
            dstPixel->g = g >> 8;
            dstPixel->b = b >> 8;
        }
    }
}

切换到更好的算法将时间减少到19.5ms。正如Andrey Kamaev的回答所说,更好的算法将垂直方向和水平方向的缩放拆分为两个单独的遍(pass)。目标位图用作第一遍输出的临时存储空间。第二遍中沿X方向的遍历是从后向前的,以避免覆盖即将需要的数据。这是代码:

// Scales `src` into `dst` by `scale` using separable (two-pass) fixed-point
// bilinear interpolation: a vertical pass followed by a horizontal pass.
//
// src:   source bitmap; read only.
// dst:   destination bitmap. Its pixel buffer must already exist and is also
//        used as scratch space: the vertical pass stores src->width pixels at
//        the start of every destination row. Only r, g and b are written.
// scale: scale factor applied to both axes.
//
// NOTE(review): because the vertical pass writes src->width pixels into rows
// with stride dst->width, and the horizontal pass expands them in place, this
// appears to be valid only for upscaling (scale >= 1) - confirm. The row below
// each sampled source row is always read, as in the single-pass version.
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    // Source step per destination pixel, with 16 fractional bits.
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;

    // Pass 1: vertical resize. Each destination row is a blend of two
    // neighbouring source rows, still at the source width.
    for (int y = 0; y < dstH; y++) {
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;

        DfColour *dstPixel = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];

        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        for (int x = 0; x < src->width; x++, dstPixel++) {
            unsigned r = 0, g = 0, b = 0;

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[x];
            r += srcPixel->r * weightY;
            g += srcPixel->g * weightY;
            b += srcPixel->b * weightY;

            // Pixel 1,0 (same column, one row down)
            srcPixel += src->width;
            r += srcPixel->r * weightY2;
            g += srcPixel->g * weightY2;
            b += srcPixel->b * weightY2;

            dstPixel->r = r >> 8;
            dstPixel->g = g >> 8;
            dstPixel->b = b >> 8;
        }
    }

    // Pass 2: horizontal resize, in place. X runs backwards so a pixel is never
    // overwritten before it has been read. x == 0 is skipped: it maps to source
    // column 0 with full weight, so it already holds its final value.
    for (int y = 0; y < dstH; y++) {
        DfColour *dstRow = &dst->pixels[y * dst->width];

        for (int x = dstW - 1; x; x--) {
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;

            unsigned r = 0, g = 0, b = 0;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &dstRow[srcX];
            r += srcPixel->r * weightX;
            g += srcPixel->g * weightX;
            b += srcPixel->b * weightX;

            // Pixel 0,1 (one column to the right)
            srcPixel++;
            r += srcPixel->r * weightX2;
            g += srcPixel->g * weightX2;
            b += srcPixel->b * weightX2;

            DfColour *dstPixel = &dstRow[x];
            dstPixel->r = r >> 8;
            dstPixel->g = g >> 8;
            dstPixel->b = b >> 8;
        }
    }
}


// Scales `src` into `dst` by `scale` using separable (two-pass) fixed-point
// bilinear interpolation, processing the red and blue channels together in one
// multiply.
//
// src:   source bitmap; read only.
// dst:   destination bitmap. Its pixel buffer must already exist and is also
//        used as scratch space by the vertical pass, which stores src->width
//        pixels at the start of every destination row.
// scale: scale factor applied to both axes.
//
// Red and blue are taken from the packed word with the mask 0xff00ff, which
// relies on b occupying bits 0-7 and r bits 16-23 of `c`. Writing `c = rb >> 8`
// replaces the whole pixel, so the alpha byte ends up 0 and the green byte
// holds leftover fraction bits until the following `g` store overwrites it.
//
// NOTE(review): appears to be valid only for upscaling (scale >= 1), and always
// reads the row below each sampled source row - confirm against callers.
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    // Source step per destination pixel, with 16 fractional bits.
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;

    // Pass 1: vertical resize at the source width.
    for (int y = 0; y < dstH; y++) {
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;

        DfColour *dstPixel = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];

        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        for (int x = 0; x < src->width; x++, dstPixel++) {
            unsigned rb = 0, g = 0;

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[x];
            rb += (srcPixel->c & 0xff00ff) * weightY;
            g += srcPixel->g * weightY;

            // Pixel 1,0 (same column, one row down)
            srcPixel += src->width;
            rb += (srcPixel->c & 0xff00ff) * weightY2;
            g += srcPixel->g * weightY2;

            // Order matters: the packed store first, then the green byte.
            dstPixel->c = rb >> 8;
            dstPixel->g = g >> 8;
        }
    }

    // Pass 2: horizontal resize, in place. X runs backwards so a pixel is never
    // overwritten before it has been read; x == 0 already holds its final value.
    for (int y = 0; y < dstH; y++) {
        DfColour *dstRow = &dst->pixels[y * dst->width];

        for (int x = dstW - 1; x; x--) {
            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;

            unsigned rb = 0, g = 0;
            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &dstRow[srcX];
            rb += (srcPixel->c & 0xff00ff) * weightX;
            g += srcPixel->g * weightX;

            // Pixel 0,1 (one column to the right)
            srcPixel++;
            rb += (srcPixel->c & 0xff00ff) * weightX2;
            g += srcPixel->g * weightX2;

            DfColour *dstPixel = &dstRow[x];
            dstPixel->c = rb >> 8;
            dstPixel->g = g >> 8;
        }
    }
}


// Scales `src` into `dst` by `scale` using separable fixed-point bilinear
// interpolation, running the vertical and the horizontal step back to back for
// each destination row.
//
// src:   source bitmap; read only.
// dst:   destination bitmap. Its pixel buffer must already exist; each row is
//        first filled with src->width vertically blended pixels and then
//        expanded in place to the destination width.
// scale: scale factor applied to both axes.
//
// Red and blue are processed together through the mask 0xff00ff on the packed
// word `c`; the `c = rb >> 8` store replaces the whole pixel (alpha becomes 0)
// and the green byte is written right after it.
//
// NOTE(review): appears to be valid only for upscaling (scale >= 1), and always
// reads the row below each sampled source row - confirm against callers.
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    // Source step per destination pixel, with 16 fractional bits.
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;

    for (int y = 0; y < dstH; y++) {
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = srcYAndWeight >> 8;

        DfColour *dstRow = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];

        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        // Vertical step: blend two source rows into the start of dstRow.
        for (int x = 0; x < src->width; x++) {
            unsigned rb = 0, g = 0;

            // Pixel 0,0
            DfColour *srcPixel = &srcRow[x];
            rb += (srcPixel->c & 0xff00ff) * weightY;
            g += srcPixel->g * weightY;

            // Pixel 1,0 (same column, one row down)
            srcPixel += src->width;
            rb += (srcPixel->c & 0xff00ff) * weightY2;
            g += srcPixel->g * weightY2;

            dstRow[x].c = rb >> 8;
            dstRow[x].g = g >> 8;
        }

        // Horizontal step, in place. X runs backwards so a pixel is never
        // overwritten before it has been read; x == 0 is already final.
        for (int x = dstW - 1; x; x--) {
            unsigned rb = 0, g = 0;

            int srcXAndWeight = (x * widthRatio) >> 8;
            int srcX = srcXAndWeight >> 8;

            unsigned weightX2 = srcXAndWeight & 0xFF;
            unsigned weightX = 256 - weightX2;

            // Pixel 0,0
            DfColour *srcPixel = &dstRow[srcX];
            rb += (srcPixel->c & 0xff00ff) * weightX;
            g += srcPixel->g * weightX;

            // Pixel 0,1 (one column to the right)
            srcPixel++;
            rb += (srcPixel->c & 0xff00ff) * weightX2;
            g += srcPixel->g * weightX2;

            dstRow[x].c = rb >> 8;
            dstRow[x].g = g >> 8;
        }
    }
}


// One entry of the horizontal look-up table: the source column an output
// column samples from (`srcX`) and the blend weights of that column
// (`weightX`) and of its right-hand neighbour (`weightX2`).
struct SrcXandWeights {
    uint8_t weightX, weightX2;
    uint16_t srcX;
};
// Scales `src` into `dst` by `scale` using separable fixed-point bilinear
// interpolation, with the per-column source index and weights precomputed once
// per call in a look-up table.
//
// src:   source bitmap; read only.
// dst:   destination bitmap. Its pixel buffer must already exist; each row is
//        first filled with src->width vertically blended pixels and then
//        expanded in place to the destination width.
// scale: scale factor applied to both axes.
//
// The table is kept in function-local statics and only grows; it is never
// released, and concurrent calls would share it. `srcX` is stored in 16 bits,
// so source columns above 65535 cannot be represented.
//
// NOTE(review): appears to be valid only for upscaling (scale >= 1), and always
// reads the row below each sampled source row - confirm against callers.
void bilinear_interpolation(DfBitmap *src, DfBitmap *dst, float scale) {
    // Source step per destination pixel, with 16 fractional bits.
    unsigned heightRatio = (double)(1<<8) * 255.0 / scale;
    unsigned widthRatio = (double)(1<<8) * 255.0 / scale;
    int dstH = scale * src->height;
    int dstW = scale * src->width;

    // Allocate look-up table.
    static SrcXandWeights *lut = nullptr;
    static int lutSize = 0;
    if (lutSize < dstW) {
        delete [] lut;
        lut = new SrcXandWeights [dstW];
        lutSize = dstW;
    }

    // Populate look-up table.
    for (int x = 0; x < dstW; x++) {
        int srcXAndWeight = (x * widthRatio) >> 8;
        lut[x].srcX = srcXAndWeight >> 8;
        lut[x].weightX2 = srcXAndWeight & 0xFF;
        // 255 rather than 256 so the weight fits in a uint8_t.
        lut[x].weightX = 255 - lut[x].weightX2;
    }

    for (int y = 0; y < dstH; y++) {
        int srcYAndWeight = (y * heightRatio) >> 8;
        int srcY = (srcYAndWeight) >> 8;

        DfColour *dstRow = &dst->pixels[y * dst->width];
        DfColour *srcRow = &src->pixels[srcY * src->width];

        unsigned weightY2 = srcYAndWeight & 0xFF;
        unsigned weightY = 256 - weightY2;

        // Vertical step: blend two source rows into the start of dstRow.
        for (int x = 0; x < src->width; x++) {
            // Pixel 0,0
            DfColour *srcPixel = &srcRow[x];
            unsigned rb = (srcPixel->c & 0xff00ff) * weightY;
            unsigned g = srcPixel->g * weightY;

            // Pixel 1,0 (same column, one row down)
            srcPixel += src->width;
            rb += (srcPixel->c & 0xff00ff) * weightY2;
            g += srcPixel->g * weightY2;

            dstRow[x].c = rb >> 8;
            dstRow[x].g = g >> 8;
        }

        // Horizontal step, in place. X runs backwards so a pixel is never
        // overwritten before it has been read; x == 0 is left as it is.
        for (int x = dstW - 1; x; x--) {
            SrcXandWeights *sw = lut + x;

            // Pixel 0,0
            DfColour *srcPixel = &dstRow[sw->srcX];
            unsigned rb = (srcPixel->c & 0xff00ff) * sw->weightX;
            unsigned g = srcPixel->g * sw->weightX;

            // Pixel 0,1 (one column to the right)
            srcPixel++;
            rb += (srcPixel->c & 0xff00ff) * sw->weightX2;
            g += srcPixel->g * sw->weightX2;

            dstRow[x].c = rb >> 8;
            dstRow[x].g = g >> 8;
        }
    }
}


我不知道还需要什么技巧才能降到4ms,尽管转换为真正的AVX SIMD实现可能是必要的。
