使用 GPU 上的推力库进行排序

Sorting with thrust library on gpu

本文关键字：排序 GPU 使用更新时间：2023-10-16
我从事立体视觉工作，但我对 te Thrust 库中的sort有问题。当我在我的内核函数中使用它时，应用程序会运行并出现错误，因为所有内核都没有在我的<<< >>>调用中启动，但是当我删除sort时一切都有效(但结果不好(。我已经搜索了替代方案，但除了 GPU 排序的推力之外，我什么也没找到。谢谢
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/core/core.hpp>
#include <iostream>
#include <time.h>
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <device_launch_parameters.h>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/core/cuda.hpp>
#include <algorithm>
#include <functional>
#include <array>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <opencv2/imgproc/imgproc.hpp>
using namespace std;
using namespace cv;
const int correlationWindow = 81;
const int widthWindow = (int)sqrt((float)correlationWindow);
const int searchWindow = 52;
__constant__ int widthWindow2 = 9;
__device__
void makeVector(float *mat, float *vec, int col, int x, int y) {
int ind = 0;
for (int i = x; i < x + widthWindow2; i++) {
for (int j = y; j < y + widthWindow2; j++) {
vec[ind] = mat[col * i + j];
ind++;
}
}
}
__device__
void disparityUpdate2(int i, int j, int col, int distance, float *d_disparity) {
d_disparity[col * i + j] = ((255 / searchWindow) * distance);
}
void resize(float *d_disparity, Mat &disparity) {
for (int i = 0; i < disparity.rows; i++) {
for (int j = 0; j < disparity.cols; j++) {
int ind = disparity.cols * i + j;
disparity.at<float>(i, j) = d_disparity[ind];
}
}
}
__global__
void computeSMAD2(int minX, float *d_mL, float *d_mR, float *dif, float *windowL, float *windowR, float  *d_disparity, int colmLO, int colmL, int seachWindow) {
int mini;
int pOiX(threadIdx.x + minX); // + minX
int pOiY(blockIdx.x + minX);
int newPoIx(max(minX, pOiX - searchWindow));
int newPoIy(pOiY); 
/*int pOiX(pox);
int pOiY(poy);
int newPoIx(npox);
int newPoIy(npoy); */
int minPoIx(newPoIx);
int smad = 0;
int bMax = (int)(correlationWindow / 2);
makeVector(d_mL, windowL, colmL, pOiY, pOiX); // ATTENTION ligne / colonne
makeVector(d_mR, windowR, colmL, newPoIy, newPoIx);
for (int h = 0; h < correlationWindow; h++) {
dif[h] = windowL[h] - windowR[h];
}
thrust::sort(thrust::seq, dif, dif + correlationWindow);
int median = dif[(correlationWindow - 1) / 2];
for (int h = 0; h < correlationWindow; h++) {
dif[h] = abs(dif[h] - median);
}
thrust::sort(thrust::seq, dif, dif + correlationWindow);
for (int i = 0; i <= bMax - 1; i++) {
smad = smad + pow(dif[i], 2);
}
mini = smad;
newPoIx++;
for (int i = newPoIx; i <= pOiX; i++) {
smad = 0;
makeVector(d_mR, windowR, colmL, newPoIy, i);
for (int h = 0; h < correlationWindow; h++) {
dif[h] = windowL[h] - windowR[h];
}
thrust::sort(thrust::seq, dif, dif + correlationWindow);
median = dif[(correlationWindow - 1) / 2];
for (int h = 0; h < correlationWindow; h++) {
dif[h] = abs(dif[h] - median);
}
thrust::sort(thrust::seq, dif, dif + correlationWindow);
for (int j = 0; j <= bMax - 1; j++) {
if (smad < mini) {
smad = smad + pow(dif[j], 2);
}
else {
break;
}
}
if (smad < mini) {
mini = smad;
minPoIx = i;
}
}
int distance = pOiX - minPoIx;
d_disparity[colmLO * (pOiY - minX) + (pOiX - minX)] = ((255 / searchWindow) * distance);
}
Mat SMAD2(int minX, Mat mLO, Mat mRO) {
Mat mL = Mat::zeros(mLO.rows + 2 * minX, mLO.cols + 2 * minX, CV_32FC1);
Mat mR = Mat::zeros(mLO.rows + 2 * minX, mLO.cols + 2 * minX, CV_32FC1);
Mat disparity = Mat::zeros(mRO.rows, mRO.cols, CV_32FC1);
mLO.copyTo(mL.rowRange(minX, mL.rows - minX).colRange(minX, mL.cols - minX));
mRO.copyTo(mR.rowRange(minX, mL.rows - minX).colRange(minX, mL.cols - minX));
float *d_mL, *windowL;
float *d_mR, *windowR;
float *dif;
float *d_disparity;
cudaMallocManaged(&dif, correlationWindow * sizeof(float));
cudaMallocManaged(&windowL, correlationWindow * sizeof(float));
cudaMallocManaged(&windowR, correlationWindow * sizeof(float));
cudaMallocManaged(&d_mL, mL.rows * mL.cols * sizeof(float));
cudaMallocManaged(&d_mR, mR.rows * mR.cols * sizeof(float));
cudaMallocManaged(&d_disparity, disparity.rows * disparity.cols * sizeof(float)); 
/*dif = new float[correlationWindow];
windowL = new float[correlationWindow];
windowR = new float[correlationWindow];
d_mL = new float[mL.rows * mL.cols];
d_mR = new float[mR.rows * mR.cols];
d_disparity = new float[disparity.rows * disparity.cols]; */

memcpy(d_mL, mL.data, mL.rows * mL.cols * sizeof(float));
memcpy(d_mR, mR.data, mR.rows * mR.cols * sizeof(float));
memcpy(d_disparity, disparity.data, disparity.rows * disparity.cols * sizeof(float));
int ind = 0;
int colmL = mL.cols;
int colmLO = mLO.cols;
int npox, npoy;
clock_t begin = clock();
computeSMAD2 <<<70, 50>>>(minX, d_mL, d_mR, dif, windowL, windowR, d_disparity, mLO.cols, mL.cols, searchWindow);
//computeSMAD2 <<<mLO.rows, mLO.cols>>>(minX, d_mL, d_mR, dif, windowL, windowR, d_disparity, mLO.cols, mL.cols, searchWindow);
cudaDeviceSynchronize();
/*
for (int poy = minX; poy < mR.rows - minX; poy++) {
for (int pox = minX; pox < mR.cols - minX; pox++) {
//////////////////////// DE GAUCHE A DROITE
npox = max(minX, pox - searchWindow);
npoy = poy;
computeSMAD2(minX, d_mL, d_mR, disparity, d_disparity, windowL, windowR, dif, colmLO, colmL, pox, poy, npox, npoy);
ind++;
}
} */
clock_t end = clock();
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
cout << "time " << elapsed_secs << endl;
cudaFree(dif);
cudaFree(windowL);
cudaFree(windowR);
cudaFree(d_mL);
cudaFree(d_mR);
cudaFree(d_disparity); 
return disparity;
}
int main(int argc, char* argv[]) {
int minX = (int)floor((float)(widthWindow / 2));
Mat mL2 = Mat::ones(70, 50, CV_8UC1) * 255;
Mat mR2 = Mat::zeros(70, 50, CV_8UC1);
Mat disparity = SMAD2(minX, mL2, mR2);
disparity.convertTo(disparity, CV_8UC1);
Mat im;
hconcat(mL2, mR2, im);
hconcat(im, disparity, im); imshow("test", im); waitKey(0);
return 0;
}
你的代码是错误的。如我所见，所有线程都会对同一个数组DIF进行排序。它导致内存冲突。