计算两个被解释为 4 个字节的整数之间的范数

Compute norm between two integers interpreted as 4 bytes

本文关键字：整数、两个、范数、之间、计算、字节、解释　更新时间：2023-10-16

我想写一个函数 norm2，其计算方式如下：

/*
 * Squared difference (a - b)^2 of two unsigned values.
 * norm2() only ever passes single bytes (0..255), so the product fits
 * comfortably in 32 bits.
 *
 * Defined ahead of norm2() so the call below sees a prototype; calling an
 * undeclared function is not valid in C99 and later.
 */
uint32_t sqd(uint32_t a, uint32_t b) {
  /* Subtract the smaller from the larger so the unsigned difference never wraps. */
  uint32_t x = (a > b) ? a - b : b - a;
  return x*x;
}

/*
 * Treats a and b as four unsigned bytes each and returns the sum of the
 * squared differences of corresponding bytes (the squared Euclidean
 * distance between the two 4-byte vectors).
 */
uint32_t norm2(uint32_t a, uint32_t b) {
  return sqd( a        & 0xFF,  b        & 0xFF)
       + sqd((a >>  8) & 0xFF, (b >>  8) & 0xFF)
       + sqd((a >> 16) & 0xFF, (b >> 16) & 0xFF)
       + sqd((a >> 24) & 0xFF, (b >> 24) & 0xFF);
}

在 GCC 下最快的实现方法是什么？例如使用汇编、SSE 或类似技术。

使用 SSE 在几条指令中完成整个操作非常简单：

#include <immintrin.h>
#include <stdint.h>
/*
 * Treats a and b as four unsigned bytes each and returns the sum of the
 * squared differences of corresponding bytes.
 *
 * Only SSE2 intrinsics are used, so no extra target flags are needed on
 * x86-64.
 */
uint32_t norm2(uint32_t a, uint32_t b) {
    const __m128i vec_zero = _mm_setzero_si128();
    /* Interleave with zero: the four bytes become 16-bit lanes 0..3, the rest are 0. */
    __m128i vec_a = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a), vec_zero);
    __m128i vec_b = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)b), vec_zero);
    /* 16-bit differences lie in [-255, 255]; the sign vanishes when squared. */
    __m128i vec_diff = _mm_sub_epi16(vec_a, vec_b);
    /* madd multiplies lane-wise and adds adjacent pairs into 32-bit lanes:
       lane 0 = d0^2 + d1^2, lane 1 = d2^2 + d3^2, lanes 2 and 3 = 0. */
    __m128i vec_dsq = _mm_madd_epi16(vec_diff, vec_diff);
    /* Shift the register right by 4 bytes so lane 1 lands on lane 0, then add.
       Only lane 0 is returned, so a full horizontal add is unnecessary. */
    __m128i vec_sum = _mm_add_epi32(vec_dsq, _mm_srli_si128(vec_dsq, 4));
    return (uint32_t)_mm_cvtsi128_si32(vec_sum);
}

我们在这里所做的是：用零向量对 a 和 b 进行"解包"，把每个字节扩展成 16 位整数组成的向量。然后将两者相减（按 16 位整数运算，避免溢出的风险），再对差值求平方并累加（按 32 位整数运算，同样避免溢出风险）。

我没有安装 GCC 进行测试，但上面的代码用 clang 生成了近乎最优的汇编；对于如此简单的任务，没有必要手写汇编。

如果可以一次读取 4 组 a 和 b，那么对 4 元组进行操作是最简洁、优雅、高效的做法，因为这样能更充分地利用各条指令的全部通道，使计算的每一部分都对结果有贡献。以下解决方案最多用到 SSSE3。当然，您最好把这段代码从函数中提取出来，预先初始化常量，并根据周围代码的结构找到把数据装入 __m128i 的最有效方法。

// Computes four byte-wise squared distances at once.
//
// a, b, and out must all point to 4 integers (assumes unsigned is 32 bits wide).
// For each i in 0..3, out[i] receives the sum over the four bytes of a[i] and b[i]
// of (byte of a[i] - byte of b[i])^2.
//
// Only SSE2 intrinsics are used. In practice this should probably not be a function:
// the mask constant can be initialized outside of a loop, and a and b can be loaded
// directly from memory into __m128i registers.
void norm2x4(const unsigned *a, const unsigned *b, unsigned *out) {
  // 0x00FF in every 16-bit lane: keeps the low byte of each half-word
  __m128i const low_byte = _mm_set1_epi16(0x00FF);
  // unaligned loads; these can be adapted to aligned loads if you ensure an aligned buffer
  __m128i const va = _mm_loadu_si128((const __m128i *)a);
  __m128i const vb = _mm_loadu_si128((const __m128i *)b);
  // Split every integer into its even bytes (0 and 2) and odd bytes (1 and 3),
  // each zero-extended to 16 bits so the subtraction below cannot overflow.
  __m128i const a_even = _mm_and_si128(va, low_byte);
  __m128i const b_even = _mm_and_si128(vb, low_byte);
  __m128i const a_odd  = _mm_srli_epi16(va, 8);
  __m128i const b_odd  = _mm_srli_epi16(vb, 8);
  // don't care if a - b, or b - a: the square removes the sign
  __m128i const d_even = _mm_sub_epi16(a_even, b_even);
  __m128i const d_odd  = _mm_sub_epi16(a_odd, b_odd);
  // madd squares each 16-bit lane and adds the two lanes of each 32-bit integer,
  // so every 32-bit lane holds d0^2 + d2^2 (even) or d1^2 + d3^2 (odd) of one input
  __m128i const sq_even = _mm_madd_epi16(d_even, d_even);
  __m128i const sq_odd  = _mm_madd_epi16(d_odd, d_odd);
  // the lanes already line up with the inputs, so a plain add yields the 4 results
  __m128i const result = _mm_add_epi32(sq_even, sq_odd);
  // store the result to output; this can be adapted to an aligned store if you ensure an aligned buffer
  _mm_storeu_si128((__m128i *)out, result);
}

一个无分支版本（因为 square(-x) == square(x)）：

/*
 * Branch-free squared difference (a - b)^2, relying on
 * square(-x) == square(x).
 *
 * The arithmetic is done in uint32_t: unsigned wrap-around is well defined,
 * whereas the signed forms of a - b and x * x are undefined behaviour on
 * overflow. For inputs whose true result fits in 32 bits the value is
 * unchanged, because the result is taken modulo 2^32 either way.
 */
uint32_t sqd(int32_t a, int32_t b) {
  uint32_t x = (uint32_t)a - (uint32_t)b;
  return x * x;
}
/*
 * Sums sqd() over the four byte positions of a and b, walking from the
 * least significant byte to the most significant one.
 */
uint32_t norm2(uint32_t a, uint32_t b) {
  uint32_t total = 0;
  for (unsigned shift = 0; shift < 32; shift += 8) {
    /* Isolate the byte at this position in each operand. */
    uint32_t byte_a = (a >> shift) & 0xFF;
    uint32_t byte_b = (b >> shift) & 0xFF;
    total += sqd(byte_a, byte_b);
  }
  return total;
}