RBM：在代码中使用 OpenACC 后仍未获得性能提升

RBM: no improvement from OpenACC on the code yet

本文关键字：OpenACC 代码 RBM 更新时间：2023-10-16

RBM 算法是开源算法，源代码可在此处找到：https://github.com/yusugomori/deeplearning/tree/master/master/cpp

我尝试用多种不同的方式通过 OpenACC 对它进行加速，但顺序（串行）代码仍然更快。因此，您能告诉我应该怎么做（哪些部分需要改进）才能获得明显的加速吗？

#include <iostream>
#include <math.h>
#include "utils.h"
#include "RBM.h"
using namespace std;
using namespace utils;

/**
 * Builds an RBM for a training set of `size` samples with n_v visible and
 * n_h hidden units.
 *
 * Each of w / hb / vb may be NULL. A NULL weight matrix is allocated here as
 * n_hidden rows of n_visible doubles and filled with uniform(-a, a) where
 * a = 1 / n_visible; NULL bias vectors are allocated and zero-filled.
 * Non-NULL arguments are stored as given (the pointer is kept, no copy).
 */
RBM::RBM(int size, int n_v, int n_h, double **w, double *hb, double *vb) {
    N = size;
    n_visible = n_v;
    n_hidden = n_h;

    if (w == NULL) {
        W = new double*[n_hidden];
        for (int i = 0; i < n_hidden; i++) W[i] = new double[n_visible];
        double a = 1.0 / n_visible;
        for (int i = 0; i < n_hidden; i++) {
            for (int j = 0; j < n_visible; j++) {
                W[i][j] = uniform(-a, a);
            }
        }
    } else {
        W = w;
    }

    if (hb == NULL) {
        hbias = new double[n_hidden];
        for (int i = 0; i < n_hidden; i++) hbias[i] = 0;
    } else {
        hbias = hb;
    }

    if (vb == NULL) {
        vbias = new double[n_visible];
        for (int i = 0; i < n_visible; i++) vbias[i] = 0;
    } else {
        vbias = vb;
    }

    // Create the device copy of the object only once every member has been
    // assigned; copying it earlier captured W/hbias/vbias before they were set.
#pragma acc enter data copyin(this)
}
/**
 * Issues the OpenACC exit-data delete for the weight matrix and the object,
 * then frees the host arrays: every weight row, the row table, and both bias
 * vectors.
 */
RBM::~RBM() {
#pragma acc exit data delete ( W[0:n_hidden][0:n_visible],this )
    int row = 0;
    while (row < n_hidden) {
        delete[] W[row];
        ++row;
    }
    delete[] W;
    delete[] hbias;
    delete[] vbias;
}

void RBM::contrastive_divergence(int *input, double lr, int k) {
double *ph_mean = new double[n_hidden];
int *ph_sample = new int[n_hidden];
double *nv_means = new double[n_visible];
int *nv_samples = new int[n_visible];
double *nh_means = new double[n_hidden];
int *nh_samples = new int[n_hidden];
/* CD-k */
sample_h_given_v(input, ph_mean, ph_sample);
for(int step=0; step<k; step++) {
if(step == 0) {
gibbs_hvh(ph_sample, nv_means, nv_samples, nh_means, nh_samples);
} else {
gibbs_hvh(nh_samples, nv_means, nv_samples, nh_means, nh_samples);
}
}
for(int i=0; i<n_hidden; i++) {
for(int j=0; j<n_visible; j++) {
// W[i][j] += lr * (ph_sample[i] * input[j] - nh_means[i] * nv_samples[j]) / N;
W[i][j] += lr * (ph_mean[i] * input[j] - nh_means[i] * nv_samples[j]) / N;
}
hbias[i] += lr * (ph_sample[i] - nh_means[i]) / N;
}
for(int i=0; i<n_visible; i++) {
vbias[i] += lr * (input[i] - nv_samples[i]) / N;
}
delete[] ph_mean;
delete[] ph_sample;
delete[] nv_means;
delete[] nv_samples;
delete[] nh_means;
delete[] nh_samples;
}
/**
 * For every hidden unit i, writes propup(v0_sample, W[i], hbias[i]) to
 * mean[i] and binomial(1, mean[i]) to sample[i].
 */
void RBM::sample_h_given_v(int *v0_sample, double *mean, int *sample) {
    int unit = 0;
    while (unit < n_hidden) {
        const double activation = propup(v0_sample, W[unit], hbias[unit]);
        mean[unit] = activation;
        sample[unit] = binomial(1, activation);
        ++unit;
    }
}
/**
 * For every visible unit i, writes propdown(h0_sample, i, vbias[i]) to
 * mean[i] and binomial(1, mean[i]) to sample[i].
 */
void RBM::sample_v_given_h(int *h0_sample, double *mean, int *sample) {
    int unit = 0;
    while (unit < n_visible) {
        const double activation = propdown(h0_sample, unit, vbias[unit]);
        mean[unit] = activation;
        sample[unit] = binomial(1, activation);
        ++unit;
    }
}
/**
 * Returns sigmoid(b + sum over j of w[j] * v[j]) for j in [0, n_visible).
 *
 * @param v  visible-unit values, n_visible entries
 * @param w  one weight row, n_visible entries
 * @param b  bias added to the weighted sum before the sigmoid
 */
double RBM::propup(int *v, double *w, double b) {
    double pre_sigmoid_activation = 0.0;

    // `present` is not a clause of `enter data`, so the former
    // "enter data present(this)" line was not a valid directive and is gone.
    // v and w are made available to the device for this kernel only.
#pragma acc data copyin(v[0:n_visible], w[0:n_visible])
    {
#pragma acc parallel
        {
#pragma acc loop reduction(+:pre_sigmoid_activation)
            for (int j = 0; j < n_visible; j++) {
                pre_sigmoid_activation += w[j] * v[j];
            }
        }
    }

    pre_sigmoid_activation += b;
    return sigmoid(pre_sigmoid_activation);
}
/**
 * Returns sigmoid(b + sum over j of W[j][i] * h[j]) for j in [0, n_hidden).
 *
 * @param h  hidden-unit values, n_hidden entries
 * @param i  index of the visible unit (column of W) being computed
 * @param b  bias added to the weighted sum before the sigmoid
 */
double RBM::propdown(int *h, int i, double b) {
    double pre_sigmoid_activation = 0.0;

    // Two directives were dropped here:
    //  - "enter data present(this)": `present` is not an `enter data` clause.
    //  - "enter data copyin(W...)": it ran on every call with no matching
    //    exit, so after the first call the device kept its original copy of W
    //    while the host copy was being trained, giving stale results.
    // A structured region transfers the current W and h for this kernel and
    // releases them afterwards. NOTE(review): correct but transfer-heavy.
#pragma acc data copyin(W[0:n_hidden][0:n_visible], h[0:n_hidden])
    {
#pragma acc parallel
        {
#pragma acc loop reduction(+:pre_sigmoid_activation)
            for (int j = 0; j < n_hidden; j++) {
                pre_sigmoid_activation += W[j][i] * h[j];
            }
        }
    }

    pre_sigmoid_activation += b;
    return sigmoid(pre_sigmoid_activation);
}
// One hidden -> visible -> hidden Gibbs step.
//   h0_sample            : hidden sample the step starts from
//   nv_means, nv_samples : outputs of sample_v_given_h for that hidden sample
//   nh_means, nh_samples : outputs of sample_h_given_v for nv_samples
void RBM::gibbs_hvh(int *h0_sample, double *nv_means, int *nv_samples, 
        double *nh_means, int *nh_samples) {
// The visible sample must be produced first: the second call reads it.
sample_v_given_h(h0_sample, nv_means, nv_samples);
sample_h_given_v(nv_samples, nh_means, nh_samples);
}
/**
 * Reconstructs a visible vector through the hidden layer.
 *
 * @param v                visible-unit values, n_visible entries
 * @param reconstructed_v  output, n_visible entries; entry i receives
 *                         sigmoid(vbias[i] + sum over j of W[j][i] * h[j]),
 *                         where h[j] = propup(v, W[j], hbias[j])
 */
void RBM::reconstruct(int *v, double *reconstructed_v) {
    double *h = new double[n_hidden];

    for (int i = 0; i < n_hidden; i++) {
        h[i] = propup(v, W[i], hbias[i]);
    }

    for (int i = 0; i < n_visible; i++) {
        // The accumulator is only meaningful inside one iteration.
        double pre_sigmoid_activation = 0.0;
        for (int j = 0; j < n_hidden; j++) {
            pre_sigmoid_activation += W[j][i] * h[j];
        }
        pre_sigmoid_activation += vbias[i];
        reconstructed_v[i] = sigmoid(pre_sigmoid_activation);
    }

    delete[] h;
}  // this closing brace was missing, leaving the function body unterminated
//----------------------------------------------------The main
/**
 * Demo driver: trains an RBM on six hard-coded 6-bit samples for 1000 epochs
 * with CD-1, then prints the reconstruction of two test vectors, one per line.
 */
void test_rbm() {
    srand(0);

    const double learning_rate = 0.1;
    const int training_epochs = 1000;
    const int k = 1;

    const int train_N = 6;
    const int test_N = 2;
    const int n_visible = 6;
    const int n_hidden = 3;

    // Training set.
    int train_X[6][6] = {
        {1, 1, 1, 0, 0, 0},
        {1, 0, 1, 0, 0, 0},
        {1, 1, 1, 0, 0, 0},
        {0, 0, 1, 1, 1, 0},
        {0, 0, 1, 0, 1, 0},
        {0, 0, 1, 1, 1, 0}
    };

    // NULL for all three arrays lets the constructor allocate them.
    RBM rbm(train_N, n_visible, n_hidden, NULL, NULL, NULL);

    // Training: one CD update per sample, per epoch.
    for (int epoch = 0; epoch < training_epochs; ++epoch) {
        for (int sample = 0; sample < train_N; ++sample) {
            rbm.contrastive_divergence(train_X[sample], learning_rate, k);
        }
    }

    // Test set and its reconstruction buffer.
    int test_X[2][6] = {
        {1, 1, 0, 0, 0, 0},
        {0, 0, 0, 1, 1, 0}
    };
    double reconstructed_X[2][6];

    for (int t = 0; t < test_N; ++t) {
        double *row = reconstructed_X[t];
        rbm.reconstruct(test_X[t], row);
        for (int col = 0; col < n_visible; ++col) {
            printf("%.5f ", row[col]);
        }
        cout << endl;
    }
}



// Program entry point: runs the RBM demo and returns 0.
int main() {
    test_rbm();
    return 0;
}  // this closing brace was missing from the listing

您的代码中有一些错误，这些错误导致了不正确的结果。我在下面对它们做了更正。

至于性能，您的代码没有足够的并行性来胜过串行代码。您并行化的循环计算量很小、使用了归约（reduction），而且循环长度非常短。要看到相对于主机的加速，您需要使用大得多的规模（长度达到数千），并且最好把 gang 级别的并行放到更外层的循环上。我尝试过这样做，但 binomial 例程存在依赖（对 rand 的调用），这种依赖阻止了 "sample_[vh]_given_[vh]" 中循环的并行化。

#include <iostream>
#include <math.h>
#include "utils.h"
#include "RBM.h"
using namespace std;
using namespace utils;
/**
 * Builds an RBM for `size` training samples with n_v visible and n_h hidden
 * units, then places the object and its three arrays on the device.
 *
 * NULL for w / hb / vb means "allocate here": weights are filled row by row
 * with uniform(-1/n_visible, 1/n_visible) and biases are zeroed. Non-NULL
 * pointers are stored as given.
 */
RBM::RBM(int size, int n_v, int n_h, double **w, double *hb, double *vb) {
        N = size;
        n_visible = n_v;
        n_hidden = n_h;

        if(w != NULL) {
                W = w;
        } else {
                const double bound = 1.0 / n_visible;
                W = new double*[n_hidden];
                for(int row=0; row<n_hidden; row++) {
                        W[row] = new double[n_visible];
                }
                for(int row=0; row<n_hidden; row++) {
                        double *cells = W[row];
                        for(int col=0; col<n_visible; col++) {
                                cells[col] = uniform(-bound, bound);
                        }
                }
        }

        // `new T[n]()` value-initialises, i.e. every bias starts at zero.
        hbias = (hb != NULL) ? hb : new double[n_hidden]();
        vbias = (vb != NULL) ? vb : new double[n_visible]();

#pragma acc enter data copyin (this,W[0:n_hidden][0:n_visible],hbias[0:n_hidden],vbias[0:n_visible])
}
/**
 * Issues the OpenACC exit-data delete for both bias vectors, the weight
 * matrix and the object, then frees the host arrays.
 */
RBM::~RBM() {
#pragma acc exit data delete ( hbias[0:n_hidden],vbias[0:n_visible],W[0:n_hidden][0:n_visible],this )
        int row = 0;
        while(row < n_hidden) {
                delete[] W[row];
                ++row;
        }
        delete[] W;
        delete[] hbias;
        delete[] vbias;
}
void RBM::contrastive_divergence(int *input, double lr, int k) {
        double *ph_mean = new double[n_hidden];
        int *ph_sample = new int[n_hidden];
        double *nv_means = new double[n_visible];
        int *nv_samples = new int[n_visible];
        double *nh_means = new double[n_hidden];
        int *nh_samples = new int[n_hidden];
        /* CD-k */
        sample_h_given_v(input, ph_mean, ph_sample);
        for(int step=0; step<k; step++) {
                if(step == 0) {
                        gibbs_hvh(ph_sample, nv_means, nv_samples, nh_means, nh_samples);
                } else {
                        gibbs_hvh(nh_samples, nv_means, nv_samples, nh_means, nh_samples);
                }
        }
        for(int i=0; i<n_hidden; i++) {
                for(int j=0; j<n_visible; j++) {
                        // W[i][j] += lr * (ph_sample[i] * input[j] - nh_means[i] * nv_samples[j]) / N;
                        W[i][j] += lr * (ph_mean[i] * input[j] - nh_means[i] * nv_samples[j]) / N;
                }
                hbias[i] += lr * (ph_sample[i] - nh_means[i]) / N;
        }
        for(int i=0; i<n_visible; i++) {
                vbias[i] += lr * (input[i] - nv_samples[i]) / N;
        }
#pragma acc update device(vbias[0:n_visible],hbias[0:n_hidden],W[0:n_hidden][0:n_visible])
        delete[] ph_mean;
        delete[] ph_sample;
        delete[] nv_means;
        delete[] nv_samples;
        delete[] nh_means;
        delete[] nh_samples;
}
/**
 * For every hidden unit i, writes propup(v0_sample, W[i], hbias[i]) to
 * mean[i] and binomial(1, mean[i]) to sample[i]. The visible vector is
 * copied to the device once for the whole loop.
 */
void RBM::sample_h_given_v(int *v0_sample, double *mean, int *sample) {
#pragma acc data copyin(v0_sample[0:n_visible])
        {
                int unit = 0;
                while(unit < n_hidden) {
                        const double activation = propup(v0_sample, W[unit], hbias[unit]);
                        mean[unit] = activation;
                        sample[unit] = binomial(1, activation);
                        ++unit;
                }
        }
}
/**
 * For every visible unit i, writes propdown(h0_sample, i, vbias[i]) to
 * mean[i] and binomial(1, mean[i]) to sample[i]. The hidden vector is
 * copied to the device once for the whole loop.
 *
 * @param h0_sample  hidden-unit values, n_hidden entries
 * @param mean       output, n_visible entries
 * @param sample     output, n_visible entries
 */
void RBM::sample_v_given_h(int *h0_sample, double *mean, int *sample) {
        // propdown reads h[0 .. n_hidden), so the extent copied in must be
        // n_hidden. It was n_visible, which reads past the end of the host
        // buffer when n_visible > n_hidden and copies too little otherwise.
#pragma acc data copyin(h0_sample[0:n_hidden])
        {
                for(int i=0; i<n_visible; i++) {
                        mean[i] = propdown(h0_sample, i, vbias[i]);
                        sample[i] = binomial(1, mean[i]);
                }
        }
}
/**
 * Returns sigmoid(b + sum over j of w[j] * v[j]) for j in [0, n_visible).
 * The sum is an OpenACC parallel reduction; the `present` clause means w and
 * v must already be on the device when this is called.
 */
double RBM::propup(int *v, double *w, double b) {
        double weighted_sum = 0.0;
#pragma acc parallel present(w,v)
        {
#pragma acc loop reduction(+:weighted_sum)
                for(int col=0; col<n_visible; col++) {
                        weighted_sum += w[col] * v[col];
                }
        }
        return sigmoid(weighted_sum + b);
}
/**
 * Returns sigmoid(b + sum over j of W[j][i] * h[j]) for j in [0, n_hidden).
 * The sum is an OpenACC parallel reduction; the `present` clause means W and
 * h must already be on the device when this is called.
 */
double RBM::propdown(int *h, int i, double b) {
        double weighted_sum = 0.0;
#pragma acc parallel present(W,h)
        {
#pragma acc loop reduction(+:weighted_sum)
                for(int row=0; row<n_hidden; row++) {
                        weighted_sum += W[row][i] * h[row];
                }
        }
        return sigmoid(weighted_sum + b);
}
// One hidden -> visible -> hidden Gibbs step.
//   h0_sample            : hidden sample the step starts from
//   nv_means, nv_samples : outputs of sample_v_given_h for that hidden sample
//   nh_means, nh_samples : outputs of sample_h_given_v for nv_samples
void RBM::gibbs_hvh(int *h0_sample, double *nv_means, int *nv_samples, 
                double *nh_means, int *nh_samples) {
        // The visible sample must be produced first: the second call reads it.
        sample_v_given_h(h0_sample, nv_means, nv_samples);
        sample_h_given_v(nv_samples, nh_means, nh_samples);
}
void RBM::reconstruct(int *v, double *reconstructed_v) {
        double *h = new double[n_hidden];
        double pre_sigmoid_activation;
#pragma acc data copyin(v[0:n_visible])
        {
                for(int i=0; i<n_hidden; i++) {
                        h[i] = propup(v, W[i], hbias[i]);
                }
                for(int i=0; i<n_visible; i++) {
                        pre_sigmoid_activation = 0.0;
                        for(int j=0; j<n_hidden; j++) {
                                pre_sigmoid_activation += W[j][i] * h[j];
                        }
                        pre_sigmoid_activation += vbias[i];
                        reconstructed_v[i] = sigmoid(pre_sigmoid_activation);
                }
        }
        delete[] h;
}
//----------------------------------------------------The main
/**
 * Demo driver: trains an RBM on six hard-coded 6-bit samples for 1000 epochs
 * with CD-1, then prints the reconstruction of two test vectors, one per line.
 */
void test_rbm() {
        srand(0);

        const double learning_rate = 0.1;
        const int training_epochs = 1000;
        const int k = 1;
        const int train_N = 6;
        const int test_N = 2;
        const int n_visible = 6;
        const int n_hidden = 3;

        // Training set.
        int train_X[6][6] = {
                {1, 1, 1, 0, 0, 0},
                {1, 0, 1, 0, 0, 0},
                {1, 1, 1, 0, 0, 0},
                {0, 0, 1, 1, 1, 0},
                {0, 0, 1, 0, 1, 0},
                {0, 0, 1, 1, 1, 0}
        };

        // NULL for all three arrays lets the constructor allocate them.
        RBM rbm(train_N, n_visible, n_hidden, NULL, NULL, NULL);

        // Training: one CD update per sample, per epoch.
        for(int epoch=0; epoch<training_epochs; ++epoch) {
                for(int sample=0; sample<train_N; ++sample) {
                        rbm.contrastive_divergence(train_X[sample], learning_rate, k);
                }
        }

        // Test set and its reconstruction buffer.
        int test_X[2][6] = {
                {1, 1, 0, 0, 0, 0},
                {0, 0, 0, 1, 1, 0}
        };
        double reconstructed_X[2][6];

        for(int t=0; t<test_N; ++t) {
                double *row = reconstructed_X[t];
                rbm.reconstruct(test_X[t], row);
                for(int col=0; col<n_visible; ++col) {
                        printf("%20.15f ", row[col]);
                }
                cout << endl;
        }
}
// Program entry point: runs the RBM demo and returns 0.
int main() {
        test_rbm();
        return 0;
}

您可以在内核函数内使用一个有界的二项分布（以 int 数组的形式），这样就不再需要调用 rand()。