有效地交错位
Interleave bits efficiently
我需要使uint64_t
的2 uint32_t
交错位:如果A=a0a1a2...a31
和B=b0b1...b31
,我需要C= a0b0a1b1...a31b31
。有没有一种方法可以有效地做到这一点?到目前为止,我只有32次迭代的for
循环的朴素方法,其中每次迭代执行C|=((A&(1<<i))<<i)|((B&(1<<i))<<(i+1))
。
我猜应该有一些数学技巧,比如将A和B乘以一些特殊的数字,这会导致它们的位在得到的64位数字中与零交错,这样只剩下or
这些乘积。但是我找不到这样的乘数。
另一种可能的方法是编译器内部指令或汇编指令,但我不知道这样的
NathanOliver的链接提供了16位->32位的实现:
static const unsigned int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF};
static const unsigned int S[] = {1, 2, 4, 8};
unsigned int x; // Interleave lower 16 bits of x and y, so the bits of x
unsigned int y; // are in the even positions and bits from y in the odd;
unsigned int z; // z gets the resulting 32-bit Morton Number.
// x and y must initially be less than 65536.
x = (x | (x << S[3])) & B[3];
x = (x | (x << S[2])) & B[2];
x = (x | (x << S[1])) & B[1];
x = (x | (x << S[0])) & B[0];
y = [the same thing on y]
z = x | (y << 1);
- 将x的低8位保留在原来的位置。将高8位向上移动8位;
- 分成两半,做同样的事情,这次留下低4位对,并将其他4位向上移动;
。它的过程如下:
0000 0000 0000 0000 abcd efgh ijkl mnop
-> 0000 0000 abcd efgh 0000 0000 ijkl mnop
-> 0000 abcd 0000 efgh 0000 ijkl 0000 mnop
-> 00ab 00cd 00ef 00gh 00ij 00kl 00mn 00op
-> 0a0b 0c0d 0e0f 0g0h 0i0j 0k0l 0m0n 0o0p
然后将两个输入组合在一起。
根据我之前的评论,将其扩展到64位,只需添加16的初始移位和0x0000ffff0000ffff
的掩码,要么因为您可以直观地遵循模式,要么作为分而治之的步骤,将32位问题变成两个不重叠的16位问题,然后使用16位解决方案。
对于较大的整数,值得一提的是用于有限域乘法(无进位乘法)的clmul x86扩展。整数与零的交错相当于整数与自身的无进位乘法,这是一个单独的ALU指令。
一个简短的,预先计算的数组查找计数作为一个"数学技巧"吗?
预计算一个256个uint16_t
s的数组:
static const uint16_t lookup[256]={0x0000, 0x0001, 0x0005 ..., 0x5555};
我们可以将两个8位的值交错,并很容易地得出一个16位的值:
uint16_t interleave(uint8_t a, uint8_t b)
{
return (lookup[a] << 1) | lookup[b];
}
如何将其扩展为将两个32位值交织成一个64位值应该是显而易见的:对组成uint32_t
的四个字节中的每个字节调用此方法四次,然后将<<
和|
一起调用结果。贿赂编译器来内联整个东西,最终的结果应该是相当快速和便宜的。
由于RAM现在很便宜,您可能也想考虑一个包含65536个uint32_t
s的预计算表。
为了使saolof的答案具体化,下面是一个使用CLMUL指令集的实现,在每次调用中穿插两对uint32_t
:
#include <immintrin.h>
#include <stdint.h>
typedef struct {
uint32_t x;
uint32_t y;
} uint32_2;
static inline void interleave_clmul(uint32_2 *input, uint64_t *out) {
__m128i xy = _mm_load_si128((const __m128i *)input);
xy = _mm_shuffle_epi32(xy, 0b11011000);
// Two carryless multiplies
__m128i p2 = _mm_clmulepi64_si128(xy, xy, 0x11);
__m128i p1 = _mm_clmulepi64_si128(xy, xy, 0x00);
// Bitwise interleave the results
p2 = _mm_slli_epi16(p2, 1);
__m128i p3 = _mm_or_si128(p1, p2);
_mm_storeu_si128((__m128i *)out, p3);
}
编译成以下内容:
interleave_clmul(uint32_2*, unsigned long*):
vpshufd xmm0, xmmword ptr [rdi], 216 # xmm0 = mem[0,2,1,3]
vpclmulqdq xmm1, xmm0, xmm0, 17
vpclmulqdq xmm0, xmm0, xmm0, 0
vpaddw xmm1, xmm1, xmm1
vpor xmm0, xmm0, xmm1
vmovdqu xmmword ptr [rsi], xmm0
ret
如果你的数据没有对齐,用_mm_loadu_si128
替换_mm_load_si128
——无论如何,在x86上,未对齐的加载并不会慢很多。在吞吐量方面,该系统比使用pdep
指令的相应实现更快。
Naive
Total rdtsc: 1295559857, iterations: 1000, count: 10000
clmul-based
Total rdtsc: 17751716, iterations: 1000, count: 10000
pdep-based
Total rdtsc: 26123417, iterations: 1000, count: 10000
pdep-based unrolled
Total rdtsc: 24281811, iterations: 1000, count: 10000
Turbo boost被禁用;CPU是一个1.60 GHz的基准时钟Kaby Lake。结果似乎是一致的。(在其他架构上的结果会很好。)测试代码(有些混乱):
#include <stdio.h>
#include <inttypes.h>
#include <string.h>
#include <stdlib.h>
#include <immintrin.h>
// rdtscp
#include <x86intrin.h>
typedef struct uint32_2 {
uint32_t x;
uint32_t y;
} uint32_2;
uint32_2* generate_pairs(const int count) {
uint32_2* p = malloc(count * sizeof(uint32_2));
uint32_t r = 401923091;
#define RNG r *= 52308420; r += 2304;
for (int i = 0; i < count; ++i) {
p[i].x = r;
RNG RNG
p[i].y = r;
RNG RNG
}
return p;
}
void interleave_naive(uint64_t* dst, uint32_2* src, int count) {
for (int i = 0; i < count; ++i) {
struct uint32_2 s = src[i];
uint32_t x = s.x, y = s.y;
uint64_t result = 0;
for (int k = 0; k < 32; ++k) {
if (x & ((uint32_t)1 << k)) {
result |= (uint64_t)1 << (2 * k);
}
if (y & ((uint32_t)1 << k)) {
result |= (uint64_t)1 << (2 * k + 1);
}
}
dst[i] = result;
}
}
void interleave_pdep(uint64_t* dst, uint32_2* src, int count) {
for (int i = 0; i < count; ++i) {
struct uint32_2 s = src[i];
uint32_t x = s.x, y = s.y;
uint64_t result = _pdep_u64(x, 0x5555555555555555) | _pdep_u64(y, 0xaaaaaaaaaaaaaaaa);
dst[i] = result;
}
}
void interleave_pdep_unrolled(uint64_t* dst, uint32_2* src, int count) {
for (int i = 0; i < count; i += 2) {
struct uint32_2 s1 = src[i];
struct uint32_2 s2 = src[i + 1];
uint32_t x1 = s1.x, y1 = s1.y;
uint32_t x2 = s2.x, y2 = s2.y;
uint64_t result1 = _pdep_u64(x1, 0x5555555555555555) | _pdep_u64(y1, 0xaaaaaaaaaaaaaaaa);
uint64_t result2 = _pdep_u64(x2, 0x5555555555555555) | _pdep_u64(y2, 0xaaaaaaaaaaaaaaaa);
dst[i] = result1;
dst[i + 1] = result2;
}
}
void interleave_clmul(uint64_t* dst, uint32_2* src, int count) {
uint32_2* end = src + count;
uint64_t* out = dst;
for (uint32_2* p = src; p < end; p += 2, out += 2) {
__m128i xy = _mm_load_si128((const __m128i *) p);
xy = _mm_shuffle_epi32(xy, 0b11011000);
__m128i p2 = _mm_clmulepi64_si128(xy, xy, 0x11);
__m128i p1 = _mm_clmulepi64_si128(xy, xy, 0x00);
p2 = _mm_slli_epi16(p2, 1);
__m128i p3 = _mm_or_si128(p1, p2);
_mm_store_si128((__m128i *)out, p3);
}
}
#define ITERATIONS 1000
void time_inv(uint32_2* pairs, int count, void (*interleave) (uint64_t*, uint32_2*, int)) {
uint64_t* result = malloc(count * sizeof(uint64_t));
uint64_t* reference_result = malloc(count * sizeof(uint64_t));
interleave_naive(reference_result, pairs, count);
// Induce page faults
memset(result, 0, count * sizeof(uint64_t));
unsigned _;
int64_t start_rdtsc = __rdtscp(&_);
for (int i = 0; i < ITERATIONS; ++i) {
interleave(result, pairs, count);
}
int64_t end_rdtsc = __rdtscp(&_);
for (int i = 0; i < count; ++i) {
if (reference_result[i] != result[i]) {
fprintf(stderr, "Incorrect value at index %d; got %" PRIx64 ", should be %" PRIx64 "n", i, result[i], reference_result[i]);
abort();
}
}
printf("Total rdtsc: %" PRId64 ", iterations: %d, count: %dn", end_rdtsc - start_rdtsc, ITERATIONS, count);
free(result);
}
int main() {
const int count = 10000;
uint32_2* pairs = generate_pairs(count);
printf("Naiven");
time_inv(pairs, count, interleave_naive);
printf("clmul-basedn");
time_inv(pairs, count, interleave_clmul);
printf("pdep-basedn");
time_inv(pairs, count, interleave_pdep);
printf("pdep-based unrolledn");
time_inv(pairs, count, interleave_pdep_unrolled);
free(pairs);
}
用gcc bleh.c -o bleh -O2 -march=native
编译
每个实现的性能统计如下。CLMUL似乎做1.5c/对,瓶颈端口5争用2 pclmulqdq
和1 vpshufd
,而pdep实现瓶颈端口1,在pdep
上执行,导致2c/对:
clmul-based
Total rdtsc: 1774895925, total milliseconds: 985.575000, iterations: 100000, count: 10000
Performance counter stats for './interleave':
556,602,052 uops_dispatched_port.port_0 (49.87%)
1,556,592,314 cycles (49.86%)
469,021,017 uops_dispatched_port.port_1 (49.86%)
472,968,452 uops_dispatched_port.port_2 (50.08%)
519,804,531 uops_dispatched_port.port_3 (50.13%)
499,980,587 uops_dispatched_port.port_4 (50.14%)
1,509,928,584 uops_dispatched_port.port_5 (50.14%)
1,484,649,884 uops_dispatched_port.port_6 (49.92%)
pdep-based
Total rdtsc: 2588637876, total milliseconds: 1438.065000, iterations: 100000, count: 10000
Performance counter stats for './interleave':
745,844,862 uops_dispatched_port.port_0 (50.02%)
2,289,048,624 cycles (50.02%)
2,033,116,738 uops_dispatched_port.port_1 (50.02%)
1,508,870,090 uops_dispatched_port.port_2 (50.02%)
1,498,920,409 uops_dispatched_port.port_3 (49.98%)
1,056,089,339 uops_dispatched_port.port_4 (49.98%)
843,399,033 uops_dispatched_port.port_5 (49.98%)
1,414,062,891 uops_dispatched_port.port_6 (49.98%)
pdep-based unrolled
Total rdtsc: 2387027127, total milliseconds: 1325.857000, iterations: 100000, count: 10000
Performance counter stats for './interleave':
532,577,450 uops_dispatched_port.port_0 (49.64%)
2,099,782,071 cycles (49.94%)
2,004,347,972 uops_dispatched_port.port_1 (50.24%)
1,532,203,395 uops_dispatched_port.port_2 (50.54%)
1,467,988,364 uops_dispatched_port.port_3 (50.36%)
1,701,095,132 uops_dispatched_port.port_4 (50.06%)
543,597,866 uops_dispatched_port.port_5 (49.76%)
930,460,812 uops_dispatched_port.port_6 (49.46%)
- 有效地使用std::unordered_map来插入或增加键的值
- 如何有效地在 std::vector 中插入一对?
- 有效地计算多维数组的累积和?
- 如何有效地计算将单位立方体映射到自身的反射和旋转?
- 有效地将大数存储为 2 的幂用于路径问题
- 如何在C++中写入 1000 个文件时有效地缓冲
- 如何有效地找到数组中三元组和的最小差异?
- 如何在C++中有效地将数字值重新分配给字符数组
- C++有效地找到向量中第一个最接近的匹配值?
- 如何有效地操作满足给定谓词的向量中的所有项目?
- 有效地将数据加载到 std::vector 中<char>
- 如何在使用 cin 请求 int 时有效地使用户输入万无一失?
- C++:有效地将Sha256摘要放入OpenSSL Bignum?
- 如何有效地收集给定数组中的重复元素?
- 如何有效地修剪和合并四叉树中的节点?
- 可以有效地转换 std::any 与 std::any_cast
- 只需要知道我在c ++中打印模式的方式是否有效,或者有另一种方法可以有效地做到这一点
- 如何使用包含内部类的类实例有效地从内部类访问成员?
- 当表示为对象的一维向量时,有效地旋转 NxM 矩阵 (C++)
- 有效地交错位