
Speeding up large amounts of array related computation, visual studio

本文关键字:计算 数组 加速 visualstudio      更新时间:2023-10-16


// Baseline from the question: for every pixel (i, j), add
// template_t[h] * image[i][j] to template_image[i][j] once per template entry.
// NOTE(review): `....` elides the middle of the initializer, and
// template_length is not defined anywhere in this excerpt.
int template_t[] = {1, 2, 3, 4, 5, 6, ...., 125};
int image[3200][5600];
int template_image[3200][5600];
for(int i = 0; i < 3200; i++) {
    for(int j = 0; j < 5600; j++) {
        // iterate over template to find template value per pixel
        // image[i][j] does not change inside this loop; only template_t[h] varies
        for(int h = 0; h < template_length; h++)
            template_image[i][j] += template_t[h] * image[i][j];



  • SIMD指令?然而,我似乎找不到任何在visualstudio中编写SIMD特定代码的资源
  • 并行化——尽管我已经将整个执行本身并行化了,所以程序基于X个内核运行X个实例。程序的输入是大量的图像文件,因此这些X实例都将处理单独的文件




对于gcc,我们还可以把template_t各元素的求和提到循环之外(loop hoisting),从而得到更好的代码。在这种情况下,即使使用int而不是unsigned int,gcc也能在编译时完成求和。



// Aligning the arrays makes gcc's asm output *MUCH* shorter: no fully-unrolled
// prologue/epilogue for handling unaligned elements.
#define DIM1 320
#define DIM2 1000
alignas(32) unsigned int image[DIM1][DIM2];
alignas(32) unsigned int template_image[DIM1][DIM2];

// The table is const so the compiler is free to sum it at compile time.
// The value 8 appears twice on purpose; the entries add up to 215.
static const unsigned int template_multipliers[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 10, 11, 12, 13, 125};
static const int template_length = sizeof(template_multipliers) / sizeof(template_multipliers[0]);

/*
 * Adds (sum of template_multipliers) * image[i][j] to template_image[i][j]
 * for every pixel of the DIM1 x DIM2 arrays.
 *
 * The multiplier sum is recomputed per pixel in the source on purpose: it does
 * not depend on i or j, so the optimizer can hoist it out of both loops.
 * Arithmetic is unsigned, so overflow wraps instead of being undefined.
 */
void loop_hoisted(void) {
  for (int i = 0; i < DIM1; i++) {
    for (int j = 0; j < DIM2; j++) {
      // Pixel-independent sum of all template multipliers.
      unsigned int tmp = 0;
      for (int h = 0; h < template_length; h++) {
        tmp += template_multipliers[h];
      }
      // One multiply per pixel instead of one per template entry.
      template_image[i][j] += tmp * image[i][j];
    }
  }
}

使用 -O3 -fverbose-asm -march=haswell 编译时,gcc 5.3会自动向量化这个内部循环:

# gcc inner loop: ymm1 = set1(215) = sum of template_multipliers
    vpmulld ymm0, ymm1, YMMWORD PTR [rcx+rax] # vect__16.10, tmp115, MEM[base: vectp_image.8_4, index: ivtmp.18_90, offset: 0B]
    vpaddd  ymm0, ymm0, YMMWORD PTR [rdx+rax]   # vect__17.12, vect__16.10, MEM[base: vectp_template_image.5_84, index: ivtmp.18_90, offset: 0B]
    vmovdqa YMMWORD PTR [rdx+rax], ymm0       # MEM[base: vectp_template_image.5_84, index: ivtmp.18_90, offset: 0B], vect__17.12
    add     rax, 32   # ivtmp.18,
    cmp     rax, 4000 # ivtmp.18,
    jne     .L2       #,

在Intel Haswell上,这个内部循环共有9个融合域uop,因为pmulld在Haswell及更高版本上是2个uop(即使使用单寄存器寻址模式,其内存操作数也无法微融合)。这意味着循环每3个时钟周期只能运行一次迭代。gcc本可以节省2个uop(从而达到每2个时钟周期一次迭代):对目标数组使用指针递增,对源数组使用 [dst + (src - dst)] 形式的双寄存器寻址模式(反正源操作数本来就无法微融合)。


    // Variant that keeps the multiply inside the template loop: start from the
    // current output pixel, add template_multipliers[h] * image[i][j] for each
    // h, then store the result back once.
    unsigned int tmp = template_image[i][j];
    for(int h = 0; h < template_length; h++)
        tmp += template_multipliers[h] * image[i][j];
    template_image[i][j] = tmp;
.L8:  # ymm4 is a vector of set1(198)
    vmovdqa ymm2, YMMWORD PTR [rcx+rax]       # vect__22.42, MEM[base: vectp_image.41_73, index: ivtmp.56_108, offset: 0B]
    vpaddd  ymm1, ymm2, YMMWORD PTR [rdx+rax]   # vect__1.47, vect__22.42, MEM[base: vectp_template_image.38_94, index: ivtmp.56_108, offset: 0B]
    vpmulld ymm0, ymm2, ymm4  # vect__114.43, vect__22.42, tmp110
    vpslld  ymm3, ymm2, 3       # vect__72.45, vect__22.42,
    vpaddd  ymm0, ymm1, ymm0    # vect__2.48, vect__1.47, vect__114.43
    vpaddd  ymm0, ymm0, ymm3    # vect__29.49, vect__2.48, vect__72.45
    vpaddd  ymm0, ymm0, ymm3    # vect_tmp_115.50, vect__29.49, vect__72.45
    vmovdqa YMMWORD PTR [rdx+rax], ymm0       # MEM[base: vectp_template_image.38_94, index: ivtmp.56_108, offset: 0B], vect_tmp_115.50
    add     rax, 32   # ivtmp.56,
    cmp     rax, 4000 # ivtmp.56,
    jne     .L8       #,




    // Read the output pixel and the input pixel into locals, accumulate in p,
    // and write the result back once after the template loop.
    int p = template_image[i][j], p2 = image[i][j];
    // iterate over template to find template value per pixel
    for(int h = 0; h < template_length; h++)
        p += template_t[h] * p2;
    // Store to template_image, the array p was loaded from.
    template_image[i][j] = p;


for (h..
    template_image[i][j] += template_t[h] * image[i][j];


template_image[i][j] += CT * image[i][j];


// Factoring image[i][j] out of the template loop leaves the SUM of the
// template values as the constant multiplier (a product would be wrong, and
// 125! does not fit in any integer type anyway).
// NOTE(review): assumes template_t holds exactly 1, 2, ..., 125; recompute CT
// if the real table differs.
#define CT (125 * (125 + 1) / 2) // 1 + 2 + ... + 125 = 7875
int image[3200][5600];
int template_image[3200][5600];
for(int i = 0; i < 3200; i++) {
    for(int j = 0; j < 5600; j++) {
        // One multiply-add per pixel replaces the whole template loop.
        template_image[i][j] += CT * image[i][j];
    }
}

这可以在CCD_ 16上并行化。