如何并行化矩阵排序以进行循环
How to parallelize matrix sorting for loop?
我正在尝试使用OpenMP并行化 for(){...}
循环,它使用openMP,它采用了许多"行" N
em>表" N*M
并按升序排列每行。
我添加了 #pragma omp parallel
, #pragma omp for schedule
指令,但看不到任何更改,就好像它完全无能为力。
这是完整程序:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <iostream>
double GetTime() {
struct timeval clock;
gettimeofday(&clock, NULL);
double rez = (double)clock.tv_sec+(double)clock.tv_usec/1000000;
return rez;
}
void genMatrix(int *A, int N, int M) {
// Generate matrix
for (int i=0; i<N; i++) {
for (int j=0; j<M; j++) A[i*M+j] = (int)((double)rand()/RAND_MAX*99) + 1;
}
}
int main() {
srand(time(NULL));
int N = 4800;
int M = 6000;
int *A = new int[N*M];
int t, n;
genMatrix(A, N, M);
double t_start = GetTime();
#pragma omp parallel
{
#pragma omp for schedule
for (int k=0; k<N; k++) {
for (int i=0; i<M-1; i++) {
for (int j=0; j<M-1; j++) {
if (A[k*M+j] > A[k*M+j+1]) {
t = A[k*M+j];
A[k*M+j] = A[k*M+j+1];
A[k*M+j+1] = t;
}}}}}
double t_load = GetTime();
// Print matrix
// for (int i=0; i<N; i++) {
// for (int j=0; j<M; j++) {
// printf("%3d", A[i*M+j]);
// }
// printf("n");
// }
printf("Load time: %.2fn", t_load - t_start);
system("pause");
}
在这种情况下,如何与OpenMP添加并行化?
我应该如何添加并行化?另外,不知道为什么,但是当尝试打印矩阵 A
带有大数字
(例如int N = 480;int M = 600;
)时,某些值未分类。
是打印问题吗?
有三个不同的东西,正弦qua-non,go omp parallel
:
a ) - 算法必须正确
B ) - 算法必须有效地使用资源
c ) - 算法在附加费用上的支出要少于omp
修复A
)和在B
上进行了一些轻微的实验之后)和C
):
可能很快就会意识到,对于 b )和 c )的成本对于 rand()
处理较高,以使任何好处更高从任何天真或更智能的矩阵覆盖映射到资源上(在这里,奇异引擎,由于任何类型的并发都必须重新传播rand()
-SOURCE的新状态 - 所有并发用途中的狂欢节目,费用方式超出同时操作的矩阵覆盖范围内传递的多(加上矩阵的幼稚缓存线不知不觉中也无济于事)。
最佳结果(不优化myOMP_SCHEDULE_CHUNKS
):
/*
-O3 private( ..., i, j ) omp single
MATRIX.RAND time: 3 191 [us] 3 446 [us] 3 444 [us] 3 384 [us] 3 173 [us]
MATRIX.SORT time: 96 270 [us] 98 401 [us] 98 423 [us] 95 911 [us] 101 019 [us] @( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
*/
全局视图:
/* COMPILE:: -fopenmp
*
* MAY SHELL:: $ export OMP_NUM_THREADS = 3
* $ export OMP_DISPLAY_ENV = 1
* https://stackoverflow.com/questions/47495916/how-to-parallelize-matrix-sorting-for-loop
*/
#include <omp.h>
#define myOMP_SCHEDULE_CHUNKS 5 // OMP schedule( static, chunk ) ~ better cache-line depletion
#define myOMP_THREADS 4
/*
$ ./OMP_matrix_SORT
MATRIX.RAND time: 187 744 [us] 234 729 [us] 174 535 [us] 254 273 [us] 122 983 [us]
MATRIX.SORT time: 1 911 310 [us] 1 898 494 [us] 2 026 455 [us] 1 978 631 [us] 1 911 231 [us] @( 3 ) = OMP_THREADS
MATRIX.RAND time: 6 166 [us] 6 977 [us] 6 722 [us]
MATRIX.SORT time: 2 448 608 [us] 2 264 572 [us] 2 355 366 [us] @( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
MATRIX.RAND time: 6 918 [us] 17 551 [us] 7 194 [us]
MATRIX.SORT time: 1 774 883 [us] 1 809 002 [us] 1 786 494 [us] @( 1 ) = OMP_THREADS
MATRIX.RAND time: 7 321 [us] 7 337 [us] 6 698 [us]
MATRIX.SORT time: 2 152 945 [us] 1 900 149 [us] 1 883 638 [us] @( 1 ) = OMP_THREADS
MATRIX.RAND time: 54 198 [us] 67 290 [us] 52 123 [us]
MATRIX.SORT time: 759 248 [us] 769 580 [us] 760 759 [us] 812 875 [us] @( 3 ) = OMP_THREADS
MATRIX.RAND time: 7 054 [us] 6 414 [us] 6 435 [us] 6 426 [us]
MATRIX.SORT time: 687 021 [us] 760 917 [us] 674 496 [us] 705 629 [us] @( 3 ) = OMP_THREADS
-O3
MATRIX.RAND time: 5 890 [us] 6 147 [us] 6 081 [us] 5 796 [us] 6 143 [us]
MATRIX.SORT time: 148 381 [us] 152 664 [us] 184 922 [us] 155 236 [us] 169 442 [us] @( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
-O3 private( ..., i, j )
MATRIX.RAND time: 6 410 [us] 6 111 [us] 6 903 [us] 5 831 [us] 6 224 [us]
MATRIX.SORT time: 129 787 [us] 129 836 [us] 195 299 [us] 136 111 [us] 161 117 [us] @( 4 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
MATRIX.RAND time: 6 349 [us] 6 532 [us] 6 104 [us] 6 213 [us]
MATRIX.SORT time: 151 202 [us] 152 542 [us] 160 403 [us] 180 375 [us] @( 3 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
MATRIX.RAND time: 6 745 [us] 5 834 [us] 5 791 [us] 7 164 [us] 6 535 [us]
MATRIX.SORT time: 214 590 [us] 214 563 [us] 209 610 [us] 205 940 [us] 230 787 [us] @( 2 ) = OMP_THREADS in [ 5 ] OMP_SCHEDULE_CHUNKS
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <iostream>
long GetTime() { // double GetTime()
struct timeval clock;
gettimeofday( &clock, NULL );
return (long)clock.tv_sec * 1000000 // in [us] ( go (long) instead of float )
+ (long)clock.tv_usec; //
/* double rez = (double)clock.tv_sec
* + (double)clock.tv_usec / 1000000;
* // + (double)clock.tv_usec * 0.000001; // NEVER DIV
return rez;
*/
}
void genMatrix( int *A, int N, int M ) { // Generate matrix
register int i, iM, j;
#pragma omp parallel
for ( i = 0; i < N; i++ ) {
iM = i * M;
/* for ( register int i = 0; i < N; i++ ) {
register int iM = i * M;
*/
// #pragma omp parallel // 234 729 [us]
// for ( register int j = 0; j < M; j++ )
// #pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS ) // 122 983 [us] @( 3 ) = OMP_THREADS ~~~ v/s 6 698 [us] @( 1 ) = OMP_THREADS
// // v/s 5 796 [us] @ NON-OMP
#pragma omp single // ~~ 3 191 [us]
for ( int j = 0; j < M; j++ )
A[iM +j] = (int)( (double)rand() / RAND_MAX * 99 ) + 1;
// A[i*M+j] = (int)( (double)rand() / RAND_MAX * 99 ) + 1;
}
}
int main() {
srand( time( NULL ) );
int N = 480; // 4800; ~ 100x faster outputs
int M = 600; // 6000;
int Mb1 = M - 1;
int *A = new int[N*M];
omp_set_num_threads( myOMP_THREADS );
long long int t_start = GetTime();
genMatrix( A, N, M );
long long int t_load = GetTime();
printf( "MATRIX.RAND time: %lld [us]n", t_load - t_start );
register int thisB,
this1,
next1,
t, i, j;
t_start = GetTime(); // double t_start = GetTime();
// for ( register int k = 0; k < N; k++ ) {
// #pragma omp parallel
// #pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS ) // schedule( type, chunk ):
// #pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS ) private( thisB, this1, next1, t ) // schedule( type, chunk ):
#pragma omp parallel for schedule( static, myOMP_SCHEDULE_CHUNKS ) private( thisB, this1, next1, t, i, j ) // schedule( type, chunk ):
for ( int k = 0; k < N; k++ ) {
thisB = k*M;
if ( omp_get_num_threads() != myOMP_THREADS ) {
printf( "INF: myOMP_THREADS ( == %d ) do not match the number of executed ones ( == %d ) ", myOMP_THREADS, omp_get_num_threads() );
}
//--------------------------------------------------// -------------SORT ROW-k
// for ( register int i = 0; i < Mb1; i++ ) { // < M-1; i++ ) {
// for ( register int j = 0; j < Mb1; j++ ) { // < M-1; j++ ) {
for ( i = 0; i < Mb1; i++ ) {
for ( j = 0; j < Mb1; j++ ) {
this1 = thisB + j,
next1 = this1 + 1;
if ( A[this1] > A[next1] ){ // A[k*M+j ] > A[k*M+j+1] ) {
t = A[this1]; // t = A[k*M+j ];
A[this1] = A[next1]; // A[k*M+j ] = A[k*M+j+1];
A[next1] = t; // A[k*M+j+1] = t;
}
}
}
//--------------------------------------------------// -------------SORT ROW-k
}
t_load = GetTime(); // double t_load = GetTime();
/* Print matrix
//
for ( int i = 0; i < N; i++ ) {
for ( int j = 0; j < M; j++ ) {
printf( "%3d", A[i*M+j] );
}
printf("n");
}
//
*/
printf( "MATRIX.SORT time: %lld [us] @( %d ) = OMP_THREADS in [ %d ] OMP_SCHEDULE_CHUNKSn",
t_load - t_start,
myOMP_THREADS,
myOMP_SCHEDULE_CHUNKS
);
// system( "pause" );
}
相关文章:
- 气泡排序未正确循环
- C++ - 如何保持半随机数组在循环中排序
- 按升序对数组进行排序嵌套循环问题
- 如何并行化矩阵排序以进行循环
- 气泡排序循环,将一个数字更改为 0
- 选择排序 - 循环过早停止
- 尝试对字符串数组进行排序,但我似乎陷入了循环
- 循环中对向量进行排序
- 在 for 循环中重新排序测试条件:编译器错误?
- 单排序链表-无限循环
- 如何在不使用循环的情况下进行排序
- 提高循环缓冲区或堆排序性能
- 创建一个C++预排序迭代器,为循环提供树节点
- 使用 while 循环和 for 循环进行排序
- 计数排序无限循环
- 计数排序卡在 for 循环中
- 有什么更有效的方法来确保我不会在这个快速排序函数中陷入无限循环?(C++)
- 将"for"循环更改为单个"for"循环(排序)
- Qt foreach循环排序与QList的for循环排序
- 使用if循环排序时出现意外结果-糟糕的算法