使用自定义运算符减少OpenMP SIMD
OpenMP SIMD reduction with custom operator
我想要使用#pragma omp simd
来加速以下循环:
#define N 1024
double* data = new double[N];
// Generate data, not important how.
double mean = 0.0
for (size_t i = 0; i < N; i++) {
mean += (data[i] - mean) / (i+1);
}
正如我所预料的,仅仅将#pragma omp simd
直接放在循环之前是没有影响的(我正在研究运行时间)。使用#pragma omp parallel for reduction(...)
和如下所示的自定义reducer,我可以很容易地处理多线程情况,但我如何在这里使用OpenMP SIMD?
我使用以下类来实现+和+=运算符,用于将double
添加到运行平均值以及组合两个运行平均值:
// Streaming (running) mean with a sample count, combinable across partial
// results. Update formulas from Pébay, Sandia report SAND2008-6212
// ("Formulas for Robust, One-Pass Parallel Computation of ... Moments").
class RunningMean {
private:
    double mean;    // running mean of the samples folded in so far
    size_t count;   // number of samples represented by `mean`
public:
    RunningMean(): mean(0), count(0) {}
    RunningMean(double m, size_t c): mean(m), count(c) {}
    // Merge two partial running means:
    //   mean(n+m) = (n*mean(n) + m*mean(m)) / (n+m)
    // const-correct (accepts temporaries / const operands), and guarded so
    // that combining two empty means yields an empty mean instead of 0/0 NaN.
    RunningMean operator+(const RunningMean& rhs) const {
        size_t c = this->count + rhs.count;
        if (c == 0) return RunningMean();
        double m = (this->mean*this->count + rhs.mean*rhs.count) / c;
        return RunningMean(m, c);
    }
    // Fold one sample into a copy of this running mean.
    RunningMean operator+(double rhs) const {
        size_t c = this->count + 1;
        double m = this->mean + (rhs - this->mean) / c;
        return RunningMean(m, c);
    }
    // In-place merge of another partial result; no-op when both are empty
    // (avoids division by zero).
    RunningMean& operator+=(const RunningMean& rhs) {
        if (this->count + rhs.count == 0) return *this;
        this->mean = this->mean*this->count + rhs.mean*rhs.count;
        this->count += rhs.count;
        this->mean /= this->count;
        return *this;
    }
    // In-place fold of one sample.
    RunningMean& operator+=(double rhs) {
        this->count++;
        this->mean += (rhs - this->mean) / this->count;
        return *this;
    }
    double getMean() const { return mean; }
    size_t getCount() const { return count; }
};
这个的数学公式来自http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf.对于多线程、非SIMD并行缩减,我执行以下操作:
// Declare a custom OpenMP reduction named "runningmean" over RunningMean.
// The combiner omp_out += omp_in merges two partial running means; each
// thread's private copy is default-constructed (mean = 0, count = 0).
#pragma omp declare reduction (runningmean : RunningMean : omp_out += omp_in)
RunningMean mean;
#pragma omp parallel for reduction(runningmean:mean)
for (size_t i = 0; i < N; i++)
mean += data[i];
这让我在使用8个线程的酷睿i7 2600k上获得了3.2X的加速。
如果我自己在没有OpenMP的情况下实现SIMD,我只需要在一个向量中保持4个均值,在另一个向量(假设使用AVX指令)中保持4计数,并使用矢量化版本的operator+(double rhs)
不断添加4元素双精度向量。完成后,我会使用operator+=
的数学运算将得到的4对均值和计数相加。如何指示OpenMP执行此操作?
问题是
mean += (data[i] - mean) / (i+1);
不容易接受SIMD。然而,通过仔细研究数学,可以毫不费力地将其矢量化。
关键公式是
mean(n+m) = (n*mean(n) + m*mean(m))/(n+m)
它表明了如何将 n 个数的均值与 m 个数的均值合并为 n+m 个数的均值。这一点可以在运算符定义 RunningMean operator+(RunningMean& rhs)
中看到。这就解释了并行代码工作的原因。我认为,如果我们对您的C++代码进行去卷积,这一点会更清楚:
// The custom reduction written out by hand: each thread accumulates a
// private running mean, then partial results are merged in a critical
// section using mean(n+m) = (n*mean(n) + m*mean(m)) / (n+m).
double mean = 0.0;
int count = 0;
#pragma omp parallel
{
double mean_private = 0.0;
int count_private = 0;
#pragma omp for nowait
for(size_t i=0; i<N; i++) {
count_private ++;
mean_private += (data[i] - mean_private)/count_private;
}
// nowait lets each thread start merging as soon as its chunk is done;
// the critical section serializes the merge. Note the order: weighted
// sum first, count updated, then the divide by the new total count.
#pragma omp critical
{
mean = (count_private*mean_private + count*mean);
count += count_private;
mean /= count;
}
}
但我们可以将相同的想法用于SIMD(并将它们组合在一起)。但是,让我们先做SIMD的唯一部分。使用AVX,我们可以同时处理四种并行方式。每个并行平均值将处理以下数据元素:
mean 1 data elements: 0, 4, 8, 12,...
mean 2 data elements: 1, 5, 9, 13,...
mean 3 data elements: 2, 6, 10, 14,...
mean 4 data elements: 3, 7, 11, 15,...
首先,我们在所有元素上循环,然后将四个平行和相加,然后除以四(因为每个和都在N/4个元素上运行)。
这是这个的代码
// AVX running mean: 4 interleaved running means (one per vector lane)
// and 4 running counts; lane j sees elements j, j+4, j+8, ...
double mean = 0.0;
__m256d mean4 = _mm256_set1_pd(0.0);
__m256d count4 = _mm256_set1_pd(0.0);
for(size_t i=0; i<N/4; i++) {
count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
__m256d t1 = _mm256_loadu_pd(&data[4*i]);
// Vectorized form of mean += (x - mean)/count, applied per lane.
__m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
mean4 = _mm256_add_pd(t2, mean4);
}
// Horizontal reduction: hadd yields [m0+m1, m0+m1, m2+m3, m2+m3];
// adding the high 128-bit half to the low half gives (m0+m1)+(m2+m3).
__m256d t1 = _mm256_hadd_pd(mean4,mean4);
__m128d t2 = _mm256_extractf128_pd(t1,1);
__m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
// Each lane saw exactly N/4 elements, so the plain average of the 4
// lane means equals the mean of the first 4*(N/4) elements.
mean = _mm_cvtsd_f64(t3)/4;
// Scalar remainder loop for the last N % 4 elements.
int count = 0;
double mean2 = 0;
for(size_t i=4*(N/4); i<N; i++) {
count++;
mean2 += (data[i] - mean2)/count;
}
// Weighted combine of the vector part and the remainder part.
mean = (4*(N/4)*mean + count*mean2)/N;
最后,我们可以将其与OpenMP相结合,以充分利用SIMD和MIMD(如)
// SIMD + OpenMP combined: every thread keeps 4 private vector-lane means
// (SIMD level) and merges its per-thread result in a critical section
// (MIMD level), exactly like the scalar expanded version above.
double mean = 0.0;
int count = 0;
#pragma omp parallel
{
double mean_private = 0.0;
int count_private = 0;
__m256d mean4 = _mm256_set1_pd(0.0);
__m256d count4 = _mm256_set1_pd(0.0);
#pragma omp for nowait
for(size_t i=0; i<N/4; i++) {
// count_private counts iterations (groups of 4 elements) this thread ran.
count_private++;
count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
__m256d t1 = _mm256_loadu_pd(&data[4*i]);
__m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
mean4 = _mm256_add_pd(t2, mean4);
}
// Horizontal reduction of the 4 lane means into this thread's scalar mean.
__m256d t1 = _mm256_hadd_pd(mean4,mean4);
__m128d t2 = _mm256_extractf128_pd(t1,1);
__m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
mean_private = _mm_cvtsd_f64(t3)/4;
// Merge per-thread partial means weighted by their iteration counts.
#pragma omp critical
{
mean = (count_private*mean_private + count*mean);
count += count_private;
mean /= count;
}
}
// Serial remainder loop for the last N % 4 elements, then final combine.
int count2 = 0;
double mean2 = 0;
for(size_t i=4*(N/4); i<N; i++) {
count2++;
mean2 += (data[i] - mean2)/count2;
}
mean = (4*(N/4)*mean + count2*mean2)/N;
下面是一个工作示例(使用-O3 -mavx -fopenmp
编译)
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
// Serial AVX running mean of data[0..N): 4 interleaved lane means merged
// at the end, plus a scalar loop for the N % 4 remainder.
double mean_simd(double *data, const int N) {
double mean = 0.0;
__m256d mean4 = _mm256_set1_pd(0.0);
__m256d count4 = _mm256_set1_pd(0.0);
// Lane j accumulates the running mean of elements j, j+4, j+8, ...
for(size_t i=0; i<N/4; i++) {
count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
__m256d t1 = _mm256_loadu_pd(&data[4*i]);
// Vectorized mean += (x - mean)/count, per lane.
__m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
mean4 = _mm256_add_pd(t2, mean4);
}
// Horizontal reduction: hadd gives [m0+m1, m0+m1, m2+m3, m2+m3]; adding
// the high 128-bit half to the low half yields (m0+m1)+(m2+m3).
__m256d t1 = _mm256_hadd_pd(mean4,mean4);
__m128d t2 = _mm256_extractf128_pd(t1,1);
__m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
// Lanes saw equal counts (N/4 each), so their plain average is the mean
// of the first 4*(N/4) elements.
mean = _mm_cvtsd_f64(t3)/4;
// Scalar remainder (N % 4 trailing elements).
int count = 0;
double mean2 = 0;
for(size_t i=4*(N/4); i<N; i++) {
count++;
mean2 += (data[i] - mean2)/count;
}
// Weighted combine of vector part and remainder part.
mean = (4*(N/4)*mean + count*mean2)/N;
return mean;
}
// SIMD + OpenMP running mean: each thread keeps 4 private AVX lane means,
// reduces them to one per-thread mean, then merges per-thread results in
// a critical section weighted by iteration counts.
double mean_simd_omp(double *data, const int N) {
double mean = 0.0;
int count = 0;
#pragma omp parallel
{
double mean_private = 0.0;
int count_private = 0;
__m256d mean4 = _mm256_set1_pd(0.0);
__m256d count4 = _mm256_set1_pd(0.0);
#pragma omp for nowait
for(size_t i=0; i<N/4; i++) {
// count_private = number of 4-element iterations this thread executed.
count_private++;
count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
__m256d t1 = _mm256_loadu_pd(&data[4*i]);
__m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
mean4 = _mm256_add_pd(t2, mean4);
}
// Horizontal reduction of the 4 lane means to this thread's scalar mean.
__m256d t1 = _mm256_hadd_pd(mean4,mean4);
__m128d t2 = _mm256_extractf128_pd(t1,1);
__m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
mean_private = _mm_cvtsd_f64(t3)/4;
// Merge: weighted sum first, update count, then divide by new total.
#pragma omp critical
{
mean = (count_private*mean_private + count*mean);
count += count_private;
mean /= count;
}
}
// Serial remainder loop (N % 4 trailing elements), then final combine.
int count2 = 0;
double mean2 = 0;
for(size_t i=4*(N/4); i<N; i++) {
count2++;
mean2 += (data[i] - mean2)/count2;
}
mean = (4*(N/4)*mean + count2*mean2)/N;
return mean;
}
// Driver: compares a straightforward reference mean against the SIMD and
// SIMD+OpenMP implementations.
int main() {
    const int N = 1001;   // deliberately not a multiple of 4 to exercise the remainder loops
    double data[N];
    for(int i=0; i<N; i++) data[i] = 1.0*rand()/RAND_MAX;
    // Reference mean; accumulate in double (the original float accumulator
    // lost precision relative to the double results it is compared with).
    double sum = 0; for(int i=0; i<N; i++) sum += data[i]; sum /= N;
    printf("mean          %f\n", sum);                          // fixed: "%fn" was missing the \n escape
    printf("mean_simd     %f\n", mean_simd(data, N));           // fixed: closing ')' was missing
    printf("mean_simd_omp %f\n", mean_simd_omp(data, N));
    return 0;
}
KISS的答案:只需计算循环外的平均值。并行化以下代码:
// Simplest approach: accumulate a plain sum, divide once at the end.
double sum = 0.0;
for (size_t i = 0; i < N; i++) {
    sum += data[i];
}
double mean = sum / N;
总和很容易并行化,但您不会看到SIMD优化的任何效果:它纯粹是内存绑定的,CPU只会等待来自内存的数据。如果N
和1024
一样小,那么并行化就没有什么意义了,同步开销会消耗掉所有的增益。
- OpenMP阵列性能较差
- OpenMP卸载说'fatal error: could not find accel/nvptx-none/mkoffload'
- 使用 GCC 卸载的 OpenMP 卸载失败,并出现"Ptx assembly aborted due to errors"
- OpenMP:并行更新数组总是需要减少数组吗
- 如何使用OpenMP并行这两个循环
- 从python调用openMP共享库时,未定义openMP函数
- 如何使用OpenMP并行化此矩阵时间矢量运算
- 如何使用OpenMP使这个循环并行
- 如何通过替换顺序代码的while循环来添加OpenMP for循环
- 查找最近配对时的OpenMP竞赛条件
- 使用输入打破 OpenMP 中的循环
- 为什么 openmp 的并行不适用于矢量化色彩空间转换?
- 使用 openMP simd 进行以下循环是否安全?
- openMP 的 SIMD 结构是否需要特定类型的硬件?
- 使用 openmp + SIMD 时没有加速
- c=c+a*b的OpenMP 4 simd矢量化
- OpenMP为内联函数声明SIMD
- 使用自定义运算符减少OpenMP SIMD
- SIMD指令缺少OpenMP if子句
- OpenMP奇怪的行为与SIMD线性和并行的线性指令