10维蒙特卡罗集成与openmp
10 dimensional Monte Carlo integration with openmp
我正在尝试用openmp学习并行化。我写了一个c++脚本,通过MC计算函数的10维积分:F = x1+ x2 + x3 +…+x10
现在我正试图将其转换为具有4个线程的openmp工作。我的串行代码给出了可理解的输出,所以我相信它工作得很好。这是我的序列号:我想要每4^k次迭代输出N=样本点的个数。
/* compile with
$ g++ -o monte ND_MonteCarlo.cpp
$ ./monte N
unsigned long long int for i, N
Maximum value for UNSIGNED LONG LONG INT 18446744073709551615
*/
#include <iostream>
#include <fstream>
#include <iomanip>
#include <cmath>
#include <cstdlib>
#include <ctime>
using namespace std;
//define multivariate function F(x1, x2, ...xk)
double f(double x[], int n)
{
double y;
int j;
y = 0.0;
for (j = 0; j < n; j = j+1)
{
y = y + x[j];
}
y = y;
return y;
}
//define function for Monte Carlo Multidimensional integration
double int_mcnd(double(*fn)(double[],int),double a[], double b[], int n, int m)
{
double r, x[n], v;
int i, j;
r = 0.0;
v = 1.0;
// step 1: calculate the common factor V
for (j = 0; j < n; j = j+1)
{
v = v*(b[j]-a[j]);
}
// step 2: integration
for (i = 1; i <= m; i=i+1)
{
// calculate random x[] points
for (j = 0; j < n; j = j+1)
{
x[j] = a[j] + (rand()) /( (RAND_MAX/(b[j]-a[j])));
}
r = r + fn(x,n);
}
r = r*v/m;
return r;
}
double f(double[], int);
double int_mcnd(double(*)(double[],int), double[], double[], int, int);
int main(int argc, char **argv)
{
/* define how many integrals */
const int n = 10;
double b[n] = {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,5.0};
double a[n] = {-5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0,-5.0};
double result, mean;
int m;
unsigned long long int i, N;
// initial seed value (use system time)
srand(time(NULL));
cout.precision(6);
cout.setf(ios::fixed | ios::showpoint);
// current time in seconds (begin calculations)
time_t seconds_i;
seconds_i = time (NULL);
m = 4; // initial number of intervals
// convert command-line input to N = number of points
N = atoi( argv[1] );
for (i=0; i <=N/pow(4,i); i++)
{
result = int_mcnd(f, a, b, n, m);
mean = result/(pow(10,10));
cout << setw(30) << m << setw(30) << result << setw(30) << mean <<endl;
m = m*4;
}
// current time in seconds (end of calculations)
time_t seconds_f;
seconds_f = time (NULL);
cout << endl << "total elapsed time = " << seconds_f - seconds_i << " seconds" << endl << endl;
return 0;
}
和输出:
N integral mean_integral
4 62061079725.185936 6.206108
16 33459275100.477665 3.345928
64 -2204654740.788784 -0.220465
256 4347440045.990804 0.434744
1024 -1265056243.116922 -0.126506
4096 681660387.953380 0.068166
16384 -799507050.896809 -0.079951
65536 -462592561.594820 -0.046259
262144 50902035.836772 0.005090
1048576 -91104861.129695 -0.009110
4194304 3746742.588701 0.000375
16777216 -32967862.853915 -0.003297
67108864 17730924.602974 0.001773
268435456 -416824.977687 -0.00004
1073741824 2843188.477219 0.000284
但是我认为我的并行代码根本不起作用。我知道我在做一些愚蠢的事情。因为我的线程数是4,我想把结果除以4,输出是荒谬的。
下面是相同代码的并行版本:
/* compile with
$ g++ -fopenmp -Wunknown-pragmas -std=c++11 -o mcOMP parallel_ND_MonteCarlo.cpp -lm
$ ./mcOMP N
unsigned long long int for i, N
Maximum value for UNSIGNED LONG LONG INT 18446744073709551615
*/
#include <iostream>
#include <fstream>
#include <iomanip>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <omp.h>
using namespace std;
//define multivariate function F(x1, x2, ...xk)
double f(double x[], int n)
{
double y;
int j;
y = 0.0;
for (j = 0; j < n; j = j+1)
{
y = y + x[j];
}
y = y;
return y;
}
//define function for Monte Carlo Multidimensional integration
double int_mcnd(double(*fn)(double[],int),double a[], double b[], int n, int m)
{
double r, x[n], v;
int i, j;
r = 0.0;
v = 1.0;
// step 1: calculate the common factor V
#pragma omp for
for (j = 0; j < n; j = j+1)
{
v = v*(b[j]-a[j]);
}
// step 2: integration
#pragma omp for
for (i = 1; i <= m; i=i+1)
{
// calculate random x[] points
for (j = 0; j < n; j = j+1)
{
x[j] = a[j] + (rand()) /( (RAND_MAX/(b[j]-a[j])));
}
r = r + fn(x,n);
}
r = r*v/m;
return r;
}
double f(double[], int);
double int_mcnd(double(*)(double[],int), double[], double[], int, int);
int main(int argc, char **argv)
{
/* define how many integrals */
const int n = 10;
double b[n] = {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0};
double a[n] = {-5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0,-5.0};
double result, mean;
int m;
unsigned long long int i, N;
int NumThreads = 4;
// initial seed value (use system time)
srand(time(NULL));
cout.precision(6);
cout.setf(ios::fixed | ios::showpoint);
// current time in seconds (begin calculations)
time_t seconds_i;
seconds_i = time (NULL);
m = 4; // initial number of intervals
// convert command-line input to N = number of points
N = atoi( argv[1] );
#pragma omp parallel private(result, mean) shared(N, m) num_threads(NumThreads)
for (i=0; i <=N/pow(4,i); i++)
{
result = int_mcnd(f, a, b, n, m);
mean = result/(pow(10,10));
#pragma omp master
cout << setw(30) << m/4 << setw(30) << result/4 << setw(30) << mean/4 <<endl;
m = m*4;
}
// current time in seconds (end of calculations)
time_t seconds_f;
seconds_f = time (NULL);
cout << endl << "total elapsed time = " << seconds_f - seconds_i << " seconds" << endl << endl;
return 0;
}
我只想让主线程输出这些值。我用:
编译g++ -fopenmp -Wunknown-pragmas -std=c++11 -o mcOMP parallel_ND_MonteCarlo.cpp -lm
非常感谢您对修复代码的帮助和建议。非常感谢。
让我们看看你的程序是做什么的。在omp parallel
,您的线程被生成,它们将并行执行剩余的代码。操作,比如:
m = m * 4;
是未定义的(并且通常没有意义,因为它们每次迭代执行四次)。
此外,当这些线程遇到omp for
时,它们将共享循环的工作,即每个迭代将仅由某个线程执行一次。由于int_mcnd
是在parallel
区域内执行的,所以它的所有局部变量都是私有的。您的代码中没有结构来实际收集这些私有结果(result
和mean
也是私有的)。
正确的方法是使用一个带有reduction
子句的并行for循环,表明有一个变量(r
/v
)在循环的整个执行过程中被聚合。
为了实现这一点,需要将缩减变量声明为共享的,在并行区域的范围之外。最简单的解决方案是移动int_mcnd
内部的并行区域。这也避免了m
的竞争条件。
还有一个障碍:rand
使用全局状态,至少我的实现是锁定的。由于大部分时间都花在了rand
上,所以代码的可伸缩性非常糟糕。解决方案是通过rand_r
使用显式的线程私有状态。(参见这个问题)。
double int_mcnd(double (*fn)(double[], int), double a[], double b[], int n, int m)
{
// Reduction variables need to be shared
double r = 0.0;
double v = 1.0;
#pragma omp parallel
// All variables declared inside are private
{
// step 1: calculate the common factor V
#pragma omp for reduction(* : v)
for (int j = 0; j < n; j = j + 1)
{
v = v * (b[j] - a[j]);
}
// step 2: integration
unsigned int private_seed = omp_get_thread_num();
#pragma omp for reduction(+ : r)
for (int i = 1; i <= m; i = i + 1)
{
// Note: X MUST be private, otherwise, you have race-conditions again
double x[n];
// calculate random x[] points
for (int j = 0; j < n; j = j + 1)
{
x[j] = a[j] + (rand_r(&private_seed)) / ((RAND_MAX / (b[j] - a[j])));
}
r = r + fn(x, n);
}
}
r = r * v / m;
return r;
}
double f(double[], int);
double int_mcnd(double (*)(double[], int), double[], double[], int, int);
int main(int argc, char** argv)
{
/* define how many integrals */
const int n = 10;
double b[n] = { 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0 };
double a[n] = { -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0 };
int m;
unsigned long long int i, N;
int NumThreads = 4;
// initial seed value (use system time)
srand(time(NULL));
cout.precision(6);
cout.setf(ios::fixed | ios::showpoint);
// current time in seconds (begin calculations)
time_t seconds_i;
seconds_i = time(NULL);
m = 4; // initial number of intervals
// convert command-line input to N = number of points
N = atoi(argv[1]);
for (i = 0; i <= N / pow(4, i); i++)
{
double result = int_mcnd(f, a, b, n, m);
double mean = result / (pow(10, 10));
cout << setw(30) << m << setw(30) << result << setw(30) << mean << endl;
m = m * 4;
}
// current time in seconds (end of calculations)
time_t seconds_f;
seconds_f = time(NULL);
cout << endl << "total elapsed time = " << seconds_f - seconds_i << " seconds" << endl << endl;
return 0;
}
注意,我删除了除以4的除法,并且输出是在并行区域之外完成的。结果应该与串行版本相似(当然随机性除外)。
使用-O3
,我在16核系统上观察到完美的16倍加速。
注释:
尽可能在本地声明变量
如果线程开销是一个问题,您可以将并行区域移到外部,但是您需要更仔细地考虑并行执行,并为共享缩减变量找到解决方案。考虑到蒙特卡罗代码令人尴尬的并行特性,您可以通过删除omp for
指令来更紧密地坚持最初的解决方案——这意味着每个线程执行所有循环迭代。然后可以手动汇总结果变量并打印出来。但我真的不明白这有什么意义
我就不一一细说了,不过我会给出一些参考
以这部分代码为例:
// step 1: calculate the common factor V
#pragma omp for
for (j = 0; j < n; j = j+1)
{
v = v*(b[j]-a[j]);
}
如果你看变量v,有一个明显的竞态条件。也就是说,你必须将v声明为线程私有(可能称之为local_v),然后通过缩减操作将所有值收集到一个global_v值中。
一般情况下,我会建议您查找openmp的竞争条件,关键区域,共享和私有内存的概念。
- OpenMP阵列性能较差
- 将公共但非静态的成员函数与ALGLIB集成
- OpenMP卸载说'fatal error: could not find accel/nvptx-none/mkoffload'
- 使用 GCC 卸载的 OpenMP 卸载失败,并出现"Ptx assembly aborted due to errors"
- 将IBM Rhapsody模型集成到VS 2019中
- 从R调用C++函数并对其进行集成时出错
- OpenMP:并行更新数组总是需要减少数组吗
- 如何使用OpenMP并行这两个循环
- 从python调用openMP共享库时,未定义opnMP函数
- 如何使用OpenMP并行化此矩阵时间矢量运算
- 如何使用OpenMP使这个循环并行
- 如何通过替换顺序代码的while循环来添加OpenMP for循环
- 查找最近配对时的OpenMP竞赛条件
- 使用输入打破 OpenMP 中的循环
- 为什么 openmp 的并行不适用于矢量化色彩空间转换?
- 在 openmp 中,omp_get_thread_num是否绑定到物理线程?
- 在C++中使用并行化的预期速度是多少(不是 OpenMp,而是 <thread>)
- OpenMP 加上unordered_map<字符串、双字符串的缩减>
- 通过集成与OpenMP并行计算
- 10维蒙特卡罗集成与openmp