如何使用 OpenMP 并行化这个循环？

How can I parallelize this loop with OpenMP?

本文关键字：mp 并行循环更新时间：2023-10-16

我不知道如何并行化这些循环：其中有很多相互依赖的变量，这让我很困惑。你能帮忙指点一下吗？第一个循环是：

for (int a = 0; a < sigmaLen; ++a) {
        int f = freq[a];
        if (f >= sumFreqLB)
            if (updateRemainingDistances(s, a, pos))
                if (prunePassed(pos + 1)) {
                    lmer[pos] = a;
                    enumerateStrings(pos + 1, sumFreqLB - f);
                }
    }

第二个是:

// Fills, for every k from L down to 1, the entries
// dist[k - 1][pairOffset .. pairOffset + i - 1] from the matching entries of
// dist[k]: each value grows by one when the newest stacked string (index
// i = stackSz - 1) and string j hold different characters in column k - 1.
// Row k - 1 is derived from row k, so the k iterations must run in this order.
void preprocessLowerBounds() {
    int i = stackSz - 1;
    // i * (i - 1) / 2: start of the slots used for the pairs (j, i), j < i.
    int pairOffset = (i * (i - 1)) >> 1;
    for (int k = L; k; --k) {
        int *dsn = dist[k] + pairOffset;      // source row (already final)
        int *ds = dist[k - 1] + pairOffset;   // destination row
        int *s = colS[k - 1];
        char ci = s[i];
        for (int j = 0; j < i; ++j) {
            char cj = s[j];
            *ds++ = (*dsn++) + (ci != cj);
        }
    }
}

另一个是:

    // Processes the items of row `rowNumber`: either hands the whole row to
    // bruteForceIt(), or pushes each item with addString(), runs the checks
    // below, optionally recurses into row rowNumber + 1, and pops the item
    // again with removeLastString().
    // NOTE(review): the calls below pass template arguments
    // <hasPreprocessedPairs, useQ>, but no template header is visible here, and
    // the function's closing brace is missing -- the excerpt looks truncated.
    void enumerateSubStrings(int rowNumber, int remainQTolerance) {
    int nItems = rowSize[rowNumber][stackSz];
    if (shouldGenerateNeighborhood(rowNumber, nItems)) {
        bruteForceIt(rowNumber, nItems);
    } else {
        indexType *row = rowItem[rowNumber];
        for (int j = 0; j < nItems; ++j) {
            indexType ind = row[j];
            // Push: addString() increments stackSz and updates the per-column tables.
            addString(lmers + ind);
            // Must follow addString(): it reads stackSz - 1 as the newest string.
            preprocessLowerBounds();
            uint threshold = maxLB[stackSz] - addMaxFreq();
            if (hasSolution(0, threshold)) {
                if (getValid<hasPreprocessedPairs, useQ>(rowNumber + 1,
                        (stackSz <= 2 ? n : smallN), threshold + LminusD,
                        ind, remainQTolerance)) {
                    enumerateSubStrings<hasPreprocessedPairs, useQ>(
                            rowNumber + 1, remainQTolerance);
                }
            }
            // Pop: undoes the addString() above before trying the next item.
            removeLastString();
        }
    }

// Pushes string `t` onto the stack. For each of the L columns it records the
// character in colS[column][stackSz], increments that character's count in
// colFreq, and stores in colMf[stackSz + 1] the previous column maximum plus
// one when the character's count before the increment equalled that maximum.
// colMaxFreq is then pointed at the new row and stackSz is incremented.
void addString(const char *t) {
    int *nextMaxFreq = colMf[stackSz + 1];
    int column = 0;
    while (column < L) {
        const int symbol = t[column];
        colS[column][stackSz] = symbol;
        // Post-increment: keep the count as it was before this string arrived.
        const auto countBefore = colFreq[column][symbol]++;
        nextMaxFreq[column] =
                colMaxFreq[column] + (colMaxFreq[column] == countBefore);
        ++column;
    }
    colMaxFreq = nextMaxFreq;
    ++stackSz;
}

// For every k from L down to 1, derives the slice of dist[k - 1] starting at
// pairOffset from the same slice of dist[k]: entry j (j < stackSz - 1) is the
// value from row k plus one when the newest stacked string and string j hold
// different characters in column k - 1.
void preprocessLowerBounds() {
    const int newest = stackSz - 1;
    const int pairOffset = (newest * (newest - 1)) >> 1;
    for (int k = L; k != 0; --k) {
        const int *source = dist[k] + pairOffset;
        int *target = dist[k - 1] + pairOffset;
        const int *column = colS[k - 1];
        // Stored values are narrowed to char before comparing, as the
        // comparison is defined on characters.
        const char newestChar = column[newest];
        for (int j = 0; j < newest; ++j) {
            const char otherChar = column[j];
            target[j] = source[j] + (newestChar != otherChar);
        }
    }
}
// Pops the most recently added string: decrements stackSz, lowers the count
// of the popped string's character in each of the L columns, and points
// colMaxFreq back at the colMf row for the reduced stack size.
void removeLastString() {
    --stackSz;
    int column = 0;
    while (column < L) {
        const auto symbol = colS[column][stackSz];
        colFreq[column][symbol] -= 1;
        ++column;
    }
    colMaxFreq = colMf[stackSz];
}

好的，要用 OpenMP 并行化循环，基本上要遵循两条原则：第一，绝不要让不同线程写入同一个内存位置；第二，绝不要依赖读取可能被其他线程修改的内存区域。在第一个循环中，只有 lmer 变量被修改，其余操作都只是读取变量，而且我假设这些变量不会被代码的其他部分修改，因此第一个循环可以写成如下形式：

// OpenMP version of the candidate loop.
//  * `parallel for` is required: a bare `omp for` outside a parallel region
//    is executed by a single thread, so nothing would run concurrently.
//  * `s` and `pos` are `firstprivate`, so each thread starts with a copy of the
//    current values; plain `private` would leave the copies uninitialised.
//    (If they are already function-local and only read, the clause is
//    harmless but unnecessary.)
//  * The loop variable `a` is declared in the for statement and is private
//    automatically; it must not be named in a clause.
//  * `sumFreqLB` and `freq` are only read here, so they stay shared.
// NOTE(review): this is safe only if updateRemainingDistances(), prunePassed()
// and enumerateStrings() touch no shared mutable state -- confirm before use.
// If this loop lives inside the recursive enumerateStrings(), inner levels
// form nested parallel regions, which OpenMP serialises by default.
#pragma omp parallel for firstprivate(s, pos)
for (int a = 0; a < sigmaLen; ++a) {
    const int f = freq[a];
    if (f >= sumFreqLB)
        if (updateRemainingDistances(s, a, pos))
            if (prunePassed(pos + 1)) {
                // The critical section only serialises the store itself. If
                // `lmer` is shared, another thread can still overwrite
                // lmer[pos] before enumerateStrings() reads it; a per-thread
                // copy of `lmer` is needed for correct results.
                #pragma omp critical
                {
                    lmer[pos] = a;
                }
                enumerateStrings(pos + 1, sumFreqLB - f);
            }
}

在第二个循环中，我不太明白你的 for 循环是怎么写的，不过它应该没有问题，因为其中只有读操作，并且只修改线程局部的变量。

你必须确保 updateRemainingDistances、prunePassed 和 enumerateStrings 这几个函数不使用静态变量或全局变量。

在下面的函数中，大部分操作都是读操作（只要没有线程修改这些变量，就可以由多个线程同时进行），写操作则只发生在局部的内存位置，因此只需改变 for 循环的写法，使 OpenMP 能够识别该循环即可。

// OpenMP version of preprocessLowerBounds().
//
// Row k - 1 of `dist` is computed from row k, so the k loop carries a
// dependency from one iteration to the next and cannot be split across
// threads. The j loop is independent (each j reads dsn[j] and writes only
// ds[j]), so that is the loop that is work-shared.
//
// One parallel region encloses the whole k loop: every thread runs all k
// iterations, and the `omp for` splits the j range among them. The implicit
// barrier at the end of each `omp for` guarantees that row k - 1 is complete
// before any thread reads it in the next k iteration.
// NOTE(review): the work per call is only L * (stackSz - 1) additions; measure
// whether the threading overhead pays off for realistic sizes.
void preprocessLowerBounds() {
    const int i = stackSz - 1;
    const int pairOffset = (i * (i - 1)) >> 1;
    #pragma omp parallel
    for (int k = L; k; --k) {
        // Declared inside the parallel region, so private to each thread.
        int *dsn = dist[k] + pairOffset;
        int *ds = dist[k - 1] + pairOffset;
        int *s = colS[k - 1];
        const char ci = s[i];
        #pragma omp for
        for (int j = 0; j < i; ++j) {
            const char cj = s[j];
            ds[j] = dsn[j] + (ci != cj);
        }
    }
}

在最后一个函数中，你调用了许多函数，而我不知道它们的源代码，因此无法判断它们是否可以并行化。下面的示例展示了哪些写法是错误的：

// Global container appended to by notParalelizable_1().
std::vector<int> myVector;

// Appends `i` to the global myVector.
// Not safe to call from several threads at once: concurrent push_back calls on
// the same std::vector are a data race, and even if they were serialised the
// resulting element order would depend on thread scheduling.
void notParalelizable_1(int i) {
    myVector.push_back(i);
}
// Adds `i` to a function-local static accumulator. The static is a single
// object shared by every caller, so unsynchronised concurrent calls race on it.
void notParalelizable_2(int i) {
    static int A = 0;
    A += i;
}
// Global accumulator updated by notParalelizable_3().
int varGlobal = 0;

// Adds `i` to the global varGlobal. The read-modify-write is not atomic, so
// unsynchronised concurrent calls race on the global.
void notParalelizable_3(int i) {
    varGlobal += i;
}
// Touches only its parameter and a local variable, both of which exist
// separately for every call, so concurrent calls cannot interfere.
void oneFunctionParalelizable(int i)
{
    const int B{i};
    static_cast<void>(B);  // the local exists only to illustrate the point
}
// Demonstration driver: four loops that call the helper functions, followed by
// one correct and one incorrect inline example.
//
// Every loop uses `#pragma omp parallel for`. A bare `#pragma omp for` placed
// outside a parallel region is executed by one thread only, so it would never
// exhibit the races these examples are meant to show.
//
// The loops marked INTENTIONALLY BROKEN contain data races on purpose; they
// illustrate what not to do and must not be copied into real code.
int main()
{
    // INTENTIONALLY BROKEN: myVector is modified by several threads at once.
    // Beyond the elements no longer being stored in ascending order, the
    // concurrent push_back calls are a data race that can corrupt the
    // container or crash at run time.
    #pragma omp parallel for
    for (int i = 0; i < 10; i++)
    {
        notParalelizable_1(i);
    }

    // INTENTIONALLY BROKEN: the static variable A is modified by several
    // threads at once.
    #pragma omp parallel for
    for (int i = 0; i < 10; i++)
    {
        notParalelizable_2(i);
    }

    // INTENTIONALLY BROKEN: varGlobal is modified by several threads at once.
    #pragma omp parallel for
    for (int i = 0; i < 10; i++)
    {
        notParalelizable_3(i);
    }

    // Correct: the function only touches its own parameter and local.
    #pragma omp parallel for
    for (int i = 0; i < 10; i++)
    {
        oneFunctionParalelizable(i);
    }

    // Correct: each iteration writes to a different array element.
    // A stack array is used so that nothing has to be freed afterwards.
    int vector[10];
    #pragma omp parallel for
    for (int i = 0; i < 10; i++)
    {
        vector[i] = i;
    }

    // INTENTIONALLY BROKEN: k is shared and updated by several threads, so its
    // final value is unreliable. Adding `reduction(+:k)` to the directive is
    // the usual way to make such a sum correct.
    int k = 2;
    #pragma omp parallel for
    for (int i = 0; i < 10; i++)
    {
        k = k + i;
    }
    return 0;
}