将0x1234转换为0x11223344

Convert 0x1234 to 0x11223344

本文关键字:0x11223344 转换 0x1234      更新时间:2023-10-16

如何以高性能方式将十六进制数0x1234扩展到0x11223344?

unsigned int c = 0x1234, b;
/* Move each nibble of c to the low half of its destination byte, OR-ing the
 * overlapping masks so every nibble also lands in the high half:
 * 0x1234 -> 0x11223344. */
b = (c & 0xff) << 4 | c & 0xf | (c & 0xff0) << 8
    | (c & 0xff00) << 12 | (c & 0xf000) << 16;
/* %p is for pointers (undefined behavior with unsigned int); use %x.
 * The original "\n" was also lost in transcription. */
printf("0x%x -> 0x%x\n", c, b);

输出:

0x1234 -> 0x11223344

我需要这个进行颜色转换。用户以 0xARGB 的形式提供他们的数据,我需要将其转换为0xAARRGGBB.是的,可能有数百万个,因为每个都可能是一个像素。1000x1000 像素等于 100 万像素。

实际情况甚至更加复杂,因为单个 32 位值同时包含前景色和背景色。所以0xARGBargb变成:[ 0xAARRGGBB, 0xaarrggbb ]

哦,是的,还有一件事,在实际应用程序中我也否定 alpha,因为在 OpenGL 中0xFF是不透明的,而0x00是最透明的,这在大多数情况下很不方便,因为通常你只需要一个RGB部分,透明度被认为是不存在的。

这可以使用SSE2完成,如下所示:

void ExpandSSE2(unsigned __int64 in, unsigned __int64 &outLo, unsigned __int64 &outHi) {
__m128i const mask = _mm_set1_epi16((short)0xF00F);
__m128i const mul0 = _mm_set1_epi16(0x0011);
__m128i const mul1 = _mm_set1_epi16(0x1000);
__m128i       v;
v = _mm_cvtsi64_si128(in); // Move the 64-bit value to a 128-bit register
v = _mm_unpacklo_epi8(v, v);  // 0x12   -> 0x1212
v = _mm_and_si128(v, mask);   // 0x1212 -> 0x1002
v = _mm_mullo_epi16(v, mul0); // 0x1002 -> 0x1022
v = _mm_mulhi_epu16(v, mul1); // 0x1022 -> 0x0102
v = _mm_mullo_epi16(v, mul0); // 0x0102 -> 0x1122
outLo = _mm_extract_epi64(v, 0);
outHi = _mm_extract_epi64(v, 1);
}

当然,您希望将函数的内脏放在内部循环中并提取常量。您还需要跳过 x64 寄存器并将值直接加载到 128 位 SSE 寄存器中。有关如何执行此操作的示例,请参阅下面性能测试中的 SSE2 实现。

其核心是五个指令,一次对四个颜色值执行操作。因此,每个颜色值只有大约 1.25 条指令。还应该注意的是,SSE2 在 x64 可用的任何地方都可用。

此处各种解决方案的性能测试有些人提到,知道什么更快的唯一方法是运行代码,这是无可争辩的。因此,我将一些解决方案编译为性能测试,以便我们可以将苹果与苹果进行比较。我选择的解决方案我觉得与其他解决方案有很大不同,足以需要测试。所有解决方案都从内存中读取,对数据进行操作,然后写回内存。在实践中,当输入数据中没有另一个完整的 16 字节需要处理时,某些 SSE 解决方案将需要额外注意对齐和处理情况。我测试的代码是x64,使用在4+ GHz Core i7上运行的Visual Studio 2013发布。

以下是我的结果:

ExpandOrig:               56.234 seconds  // From asker's original question
ExpandSmallLUT:           30.209 seconds  // From Dmitry's answer
ExpandLookupSmallOneLUT:  33.689 seconds  // from Dmitry's answer
ExpandLookupLarge:        51.312 seconds  // A straightforward lookup table
ExpandAShelly:            43.829 seconds  // From AShelly's answer
ExpandAShellyMulOp:       43.580 seconds  // AShelly's answer with an optimization
ExpandSSE4:               17.854 seconds  // My original SSE4 answer
ExpandSSE4Unroll:         17.405 seconds  // My original SSE4 answer with loop unrolling
ExpandSSE2:               17.281 seconds  // My current SSE2 answer
ExpandSSE2Unroll:         17.152 seconds  // My current SSE2 answer with loop unrolling

在上面的测试结果中,您将看到我包含了提问者的代码,三个查找表实现,包括 Dmitry 答案中提出的小查找表实现。AShelly的解决方案也包括在内,以及我所做的优化版本(可以消除操作)。我包括了我最初的 SSE4 实现,以及我后来制作的高级 SSE2 版本(现在反映为答案),以及两者的展开版本,因为它们是这里最快的,我想看看展开速度有多快。我还包括了 AShelly 答案的 SSE4 实现。

到目前为止,我必须宣布自己是赢家。但来源在下面,所以任何人都可以在他们的平台上进行测试,并将他们自己的解决方案包含在测试中,看看他们是否制作了一个更快的解决方案。

#define DATA_SIZE_IN  ((unsigned)(1024 * 1024 * 128))
#define DATA_SIZE_OUT ((unsigned)(2 * DATA_SIZE_IN))
#define RERUN_COUNT   500
#include <cstdlib>
#include <cstring>     // memcmp (used by the verification in main)
#include <ctime>
#include <iostream>
#include <utility>
#include <emmintrin.h> // SSE2
#include <tmmintrin.h> // SSSE3
#include <smmintrin.h> // SSE4
/* The asker's original scalar version: expand each 16-bit 0xWXYZ in
 * [in, past) to 32-bit 0xWWXXYYZZ, one 32-bit load (two inputs) per
 * iteration.  past - in must be a positive multiple of 4; out receives
 * twice as many bytes as were read. */
void ExpandOrig(unsigned char const *in, unsigned char const *past, unsigned char *out) {
  do {
    unsigned word = *(unsigned const *)in; /* two packed 16-bit inputs */
    unsigned lo = word & 0x0000FFFFu;
    unsigned hi = word >> 16;
    /* Overlapping masks shift every nibble into both halves of its
     * destination byte. */
    lo = (lo & 0x000Fu)
       | (lo & 0x00FFu) << 4
       | (lo & 0x0FF0u) << 8
       | (lo & 0xFF00u) << 12
       | (lo & 0xF000u) << 16;
    hi = (hi & 0x000Fu)
       | (hi & 0x00FFu) << 4
       | (hi & 0x0FF0u) << 8
       | (hi & 0xFF00u) << 12
       | (hi & 0xF000u) << 16;
    *(unsigned *)(out)     = lo;
    *(unsigned *)(out + 4) = hi;
    in  += 4;
    out += 8;
  } while (in != past);
}
/* 256-entry tables: LutLo maps one input byte 0xWX to the low 16 bits of the
 * expanded value (0x0000WWXX), LutHi to the high 16 bits (0xWWXX0000). */
unsigned LutLo[256],
         LutHi[256];
/* Fill LutLo: 0xWX -> 0x0000WWXX. */
void MakeLutLo(void) {
  for (unsigned b = 0; b < 256; ++b) {
    unsigned e = ((b & 0xF0u) << 4) | (b & 0x0Fu); /* 0xWX   -> 0x0W0X */
    LutLo[b] = e | (e << 4);                       /* 0x0W0X -> 0xWWXX */
  }
}
/* Fill LutHi: 0xWX -> 0xWWXX0000. */
void MakeLutHi(void) {
  for (unsigned b = 0; b < 256; ++b) {
    unsigned e = ((b & 0xF0u) << 20) | ((b & 0x0Fu) << 16); /* -> 0x0W0X0000 */
    LutHi[b] = e | (e << 4);                                /* -> 0xWWXX0000 */
  }
}
/* Table-driven expansion: one 256-entry lookup per input byte.
 * MakeLutLo() and MakeLutHi() must have been called first. */
void ExpandLookupSmall(unsigned char const *in, unsigned char const *past, unsigned char *out) {
  do {
    unsigned word = *(unsigned const *)in; /* two packed 16-bit inputs */
    unsigned lo = word & 0x0000FFFFu;
    unsigned hi = word >> 16;
    *(unsigned *)(out)     = LutHi[lo >> 8] | LutLo[lo & 0xFF];
    *(unsigned *)(out + 4) = LutHi[hi >> 8] | LutLo[hi & 0xFF];
    in  += 4;
    out += 8;
  } while (in != past);
}
/* Like ExpandLookupSmall, but uses only LutLo: the high 16 bits are the same
 * byte-expansion pattern shifted into the upper word. */
void ExpandLookupSmallOneLUT(unsigned char const *in, unsigned char const *past, unsigned char *out) {
  do {
    unsigned word = *(unsigned const *)in;
    unsigned lo = word & 0x0000FFFFu;
    unsigned hi = word >> 16;
    *(unsigned *)(out)     = (LutLo[lo >> 8] << 16) | LutLo[lo & 0xFF];
    *(unsigned *)(out + 4) = (LutLo[hi >> 8] << 16) | LutLo[hi & 0xFF];
    in  += 4;
    out += 8;
  } while (in != past);
}
/* 65536-entry table mapping a full 16-bit input directly to its 32-bit
 * expansion.  Built from the two small tables, so MakeLutLo() and
 * MakeLutHi() must run first. */
unsigned LutLarge[256 * 256];
void MakeLutLarge(void) {
  for (unsigned w = 0; w < (256 * 256); ++w)
    LutLarge[w] = LutHi[w >> 8] | LutLo[w & 0xFF];
}
/* One 65536-entry lookup per 16-bit input value; MakeLutLarge() must have
 * been called first. */
void ExpandLookupLarge(unsigned char const *in, unsigned char const *past, unsigned char *out) {
  do {
    unsigned word = *(unsigned const *)in;
    *(unsigned *)(out)     = LutLarge[word & 0x0000FFFFu];
    *(unsigned *)(out + 4) = LutLarge[word >> 16];
    in  += 4;
    out += 8;
  } while (in != past);
}
/* AShelly's multiply-based nibble spread.  The two multiplies place copies
 * of the odd/even nibbles one byte apart; the masks keep one copy per byte;
 * the final x + 16*x (== x * 0x11) duplicates each nibble within its byte. */
void ExpandAShelly(unsigned char const *in, unsigned char const *past, unsigned char *out) {
  do {
    unsigned word = *(unsigned const *)in;
    unsigned lo = word & 0x0000FFFFu;
    unsigned hi = word >> 16;
    unsigned rlo = (((lo & 0x0F0Fu) * 0x0101u) & 0x000F000Fu)
                 + (((lo & 0xF0F0u) * 0x1010u) & 0x0F000F00u);
    unsigned rhi = (((hi & 0x0F0Fu) * 0x0101u) & 0x000F000Fu)
                 + (((hi & 0xF0F0u) * 0x1010u) & 0x0F000F00u);
    rlo += rlo * 0x10u;
    rhi += rhi * 0x10u;
    *(unsigned *)(out)     = rlo;
    *(unsigned *)(out + 4) = rhi;
    in  += 4;
    out += 8;
  } while (in != past);
}
/* ExpandAShelly with the final add folded into a single * 0x11 multiply. */
void ExpandAShellyMulOp(unsigned char const *in, unsigned char const *past, unsigned char *out) {
  do {
    unsigned word = *(unsigned const *)in;
    unsigned lo = word & 0x0000FFFFu;
    unsigned hi = word >> 16;
    *(unsigned *)(out)     = ((((lo & 0x0F0Fu) * 0x0101u) & 0x000F000Fu)
                           + (((lo & 0xF0F0u) * 0x1010u) & 0x0F000F00u)) * 0x11u;
    *(unsigned *)(out + 4) = ((((hi & 0x0F0Fu) * 0x0101u) & 0x000F000Fu)
                           + (((hi & 0xF0F0u) * 0x1010u) & 0x0F000F00u)) * 0x11u;
    in  += 4;
    out += 8;
  } while (in != past);
}
// Expand 16 input bytes (eight 16-bit 0xWXYZ values) to 32 output bytes per
// iteration.  Requires SSE4.1 (_mm_blendv_epi8).  `in` and `out` must be
// 16-byte aligned and past - in a positive multiple of 16.
void ExpandSSE4(unsigned char const *in, unsigned char const *past, unsigned char *out) {
__m128i const mask0 = _mm_set1_epi16((short)0x8000),
mask1 = _mm_set1_epi8(0x0F),
mul = _mm_set1_epi16(0x0011);
__m128i       u, v, w, x;
do {
// Read input into low 8 bytes of u and v
u = _mm_load_si128((__m128i const*)in);
v = _mm_unpackhi_epi8(u, u);      // Expand each single byte to two bytes
u = _mm_unpacklo_epi8(u, u);      // Do it again for v
w = _mm_srli_epi16(u, 4);         // Copy the value into w and shift it right half a byte
x = _mm_srli_epi16(v, 4);         // Do it again for v
u = _mm_blendv_epi8(u, w, mask0); // Select odd bytes from w, and even bytes from v, giving the desired value in the upper nibble of each byte
v = _mm_blendv_epi8(v, x, mask0); // Do it again for v
u = _mm_and_si128(u, mask1);      // Clear the all the upper nibbles
v = _mm_and_si128(v, mask1);      // Do it again for v
u = _mm_mullo_epi16(u, mul);      // Multiply each 16-bit value by 0x0011 to duplicate the lower nibble in the upper nibble of each byte
v = _mm_mullo_epi16(v, mul);      // Do it again for v
// Write output
_mm_store_si128((__m128i*)(out     ), u);
_mm_store_si128((__m128i*)(out + 16), v);
in  += 16;
out += 32;
} while (in != past);
}
// ExpandSSE4 unrolled 4x: 64 input bytes -> 128 output bytes per iteration.
// Same algorithm and alignment requirements as ExpandSSE4; past - in must be
// a positive multiple of 64.
void ExpandSSE4Unroll(unsigned char const *in, unsigned char const *past, unsigned char *out) {
__m128i const mask0  = _mm_set1_epi16((short)0x8000),
mask1  = _mm_set1_epi8(0x0F),
mul    = _mm_set1_epi16(0x0011);
__m128i       u0, v0, w0, x0,
u1, v1, w1, x1,
u2, v2, w2, x2,
u3, v3, w3, x3;
do {
// Read input into low 8 bytes of u and v
u0 = _mm_load_si128((__m128i const*)(in     ));
u1 = _mm_load_si128((__m128i const*)(in + 16));
u2 = _mm_load_si128((__m128i const*)(in + 32));
u3 = _mm_load_si128((__m128i const*)(in + 48));
v0 = _mm_unpackhi_epi8(u0, u0);      // Expand each single byte to two bytes
u0 = _mm_unpacklo_epi8(u0, u0);      // Do it again for v
v1 = _mm_unpackhi_epi8(u1, u1);      // Do it again
u1 = _mm_unpacklo_epi8(u1, u1);      // Again for u1
v2 = _mm_unpackhi_epi8(u2, u2);      // Again for v1
u2 = _mm_unpacklo_epi8(u2, u2);      // Again for u2
v3 = _mm_unpackhi_epi8(u3, u3);      // Again for v2
u3 = _mm_unpacklo_epi8(u3, u3);      // Again for u3
w0 = _mm_srli_epi16(u0, 4);          // Copy the value into w and shift it right half a byte
x0 = _mm_srli_epi16(v0, 4);          // Do it again for v
w1 = _mm_srli_epi16(u1, 4);          // Again for u1
x1 = _mm_srli_epi16(v1, 4);          // Again for v1
w2 = _mm_srli_epi16(u2, 4);          // Again for u2
x2 = _mm_srli_epi16(v2, 4);          // Again for v2
w3 = _mm_srli_epi16(u3, 4);          // Again for u3
x3 = _mm_srli_epi16(v3, 4);          // Again for v3
u0 = _mm_blendv_epi8(u0, w0, mask0); // Select even bytes from w, and odd bytes from v, giving the desired value in the upper nibble of each byte
v0 = _mm_blendv_epi8(v0, x0, mask0); // Do it again for v
u1 = _mm_blendv_epi8(u1, w1, mask0); // Again for u1
v1 = _mm_blendv_epi8(v1, x1, mask0); // Again for v1
u2 = _mm_blendv_epi8(u2, w2, mask0); // Again for u2
v2 = _mm_blendv_epi8(v2, x2, mask0); // Again for v2
u3 = _mm_blendv_epi8(u3, w3, mask0); // Again for u3
v3 = _mm_blendv_epi8(v3, x3, mask0); // Again for v3
u0 = _mm_and_si128(u0, mask1);       // Clear the all the upper nibbles
v0 = _mm_and_si128(v0, mask1);       // Do it again for v
u1 = _mm_and_si128(u1, mask1);       // Again for u1
v1 = _mm_and_si128(v1, mask1);       // Again for v1
u2 = _mm_and_si128(u2, mask1);       // Again for u2
v2 = _mm_and_si128(v2, mask1);       // Again for v2
u3 = _mm_and_si128(u3, mask1);       // Again for u3
v3 = _mm_and_si128(v3, mask1);       // Again for v3
u0 = _mm_mullo_epi16(u0, mul);       // Multiply each 16-bit value by 0x0011 to duplicate the lower nibble in the upper nibble of each byte
v0 = _mm_mullo_epi16(v0, mul);       // Do it again for v
u1 = _mm_mullo_epi16(u1, mul);       // Again for u1
v1 = _mm_mullo_epi16(v1, mul);       // Again for v1
u2 = _mm_mullo_epi16(u2, mul);       // Again for u2
v2 = _mm_mullo_epi16(v2, mul);       // Again for v2
u3 = _mm_mullo_epi16(u3, mul);       // Again for u3
v3 = _mm_mullo_epi16(v3, mul);       // Again for v3
// Write output
_mm_store_si128((__m128i*)(out      ), u0);
_mm_store_si128((__m128i*)(out +  16), v0);
_mm_store_si128((__m128i*)(out +  32), u1);
_mm_store_si128((__m128i*)(out +  48), v1);
_mm_store_si128((__m128i*)(out +  64), u2);
_mm_store_si128((__m128i*)(out +  80), v2);
_mm_store_si128((__m128i*)(out +  96), u3);
_mm_store_si128((__m128i*)(out + 112), v3);
in  += 64;
out += 128;
} while (in != past);
}
// SSE2-only loop version: 16 input bytes -> 32 output bytes per iteration.
// Same mask/multiply trick as the standalone ExpandSSE2 above (five vector
// ops per 16 bytes).  `in`/`out` must be 16-byte aligned; past - in a
// positive multiple of 16.
void ExpandSSE2(unsigned char const *in, unsigned char const *past, unsigned char *out) {
__m128i const mask = _mm_set1_epi16((short)0xF00F),
mul0 = _mm_set1_epi16(0x0011),
mul1 = _mm_set1_epi16(0x1000);
__m128i       u, v;
do {
// Read input into low 8 bytes of u and v
u = _mm_load_si128((__m128i const*)in);
v = _mm_unpackhi_epi8(u, u);      // Expand each single byte to two bytes
u = _mm_unpacklo_epi8(u, u);      // Do it again for v
u = _mm_and_si128(u, mask);       // Keep one copy of each nibble: 0x1212 -> 0x1002
v = _mm_and_si128(v, mask);
u = _mm_mullo_epi16(u, mul0);     // 0x1002 -> 0x1022
v = _mm_mullo_epi16(v, mul0);
u = _mm_mulhi_epu16(u, mul1);     // This can also be done with a right shift of 4 bits, but this seems to measure faster
v = _mm_mulhi_epu16(v, mul1);
u = _mm_mullo_epi16(u, mul0);     // 0x0102 -> 0x1122
v = _mm_mullo_epi16(v, mul0);
// write output
_mm_store_si128((__m128i*)(out     ), u);
_mm_store_si128((__m128i*)(out + 16), v);
in  += 16;
out += 32;
} while (in != past);
}
// ExpandSSE2 unrolled 2x: 32 input bytes -> 64 output bytes per iteration.
// Same algorithm and alignment requirements; past - in must be a positive
// multiple of 32.
void ExpandSSE2Unroll(unsigned char const *in, unsigned char const *past, unsigned char *out) {
__m128i const mask = _mm_set1_epi16((short)0xF00F),
mul0 = _mm_set1_epi16(0x0011),
mul1 = _mm_set1_epi16(0x1000);
__m128i       u0, v0,
u1, v1;
do {
// Read input into low 8 bytes of u and v
u0 = _mm_load_si128((__m128i const*)(in     ));
u1 = _mm_load_si128((__m128i const*)(in + 16));
v0 = _mm_unpackhi_epi8(u0, u0);      // Expand each single byte to two bytes
u0 = _mm_unpacklo_epi8(u0, u0);      // Do it again for v
v1 = _mm_unpackhi_epi8(u1, u1);      // Do it again
u1 = _mm_unpacklo_epi8(u1, u1);      // Again for u1
u0 = _mm_and_si128(u0, mask);
v0 = _mm_and_si128(v0, mask);
u1 = _mm_and_si128(u1, mask);
v1 = _mm_and_si128(v1, mask);
u0 = _mm_mullo_epi16(u0, mul0);
v0 = _mm_mullo_epi16(v0, mul0);
u1 = _mm_mullo_epi16(u1, mul0);
v1 = _mm_mullo_epi16(v1, mul0);
u0 = _mm_mulhi_epu16(u0, mul1);
v0 = _mm_mulhi_epu16(v0, mul1);
u1 = _mm_mulhi_epu16(u1, mul1);
v1 = _mm_mulhi_epu16(v1, mul1);
u0 = _mm_mullo_epi16(u0, mul0);
v0 = _mm_mullo_epi16(v0, mul0);
u1 = _mm_mullo_epi16(u1, mul0);
v1 = _mm_mullo_epi16(v1, mul0);
// write output
_mm_store_si128((__m128i*)(out     ), u0);
_mm_store_si128((__m128i*)(out + 16), v0);
_mm_store_si128((__m128i*)(out + 32), u1);
_mm_store_si128((__m128i*)(out + 48), v1);
in  += 32;
out += 64;
} while (in != past);
}
// AShelly's multiply-based algorithm vectorized.  Despite processing the same
// data layout as the SSE2 versions, this needs SSE4.1 because of
// _mm_mullo_epi32 (packed 32-bit multiply).  16 input bytes -> 32 output
// bytes per iteration; 16-byte alignment required.
void ExpandAShellySSE4(unsigned char const *in, unsigned char const *past, unsigned char *out) {
__m128i const zero      = _mm_setzero_si128(),
v0F0F     = _mm_set1_epi32(0x0F0F),
vF0F0     = _mm_set1_epi32(0xF0F0),
v0101     = _mm_set1_epi32(0x0101),
v1010     = _mm_set1_epi32(0x1010),
v000F000F = _mm_set1_epi32(0x000F000F),
v0F000F00 = _mm_set1_epi32(0x0F000F00),
v0011 = _mm_set1_epi32(0x0011);
__m128i       u, v, w, x;
do {
// Read in data; widen each 16-bit value to 32 bits
u = _mm_load_si128((__m128i const*)in);
v = _mm_unpackhi_epi16(u, zero);
u = _mm_unpacklo_epi16(u, zero);
// original source: ((((a & 0xF0F) * 0x101) & 0xF000F) + (((a & 0xF0F0) * 0x1010) & 0xF000F00)) * 0x11;
w = _mm_and_si128(u, v0F0F);
x = _mm_and_si128(v, v0F0F);
u = _mm_and_si128(u, vF0F0);
v = _mm_and_si128(v, vF0F0);
w = _mm_mullo_epi32(w, v0101); // _mm_mullo_epi32 is what makes this require SSE4 instead of SSE2
x = _mm_mullo_epi32(x, v0101);
u = _mm_mullo_epi32(u, v1010);
v = _mm_mullo_epi32(v, v1010);
w = _mm_and_si128(w, v000F000F);
x = _mm_and_si128(x, v000F000F);
u = _mm_and_si128(u, v0F000F00);
v = _mm_and_si128(v, v0F000F00);
u = _mm_add_epi32(u, w);
v = _mm_add_epi32(v, x);
u = _mm_mullo_epi32(u, v0011);
v = _mm_mullo_epi32(v, v0011);
// write output
_mm_store_si128((__m128i*)(out     ), u);
_mm_store_si128((__m128i*)(out + 16), v);
in  += 16;
out += 32;
} while (in != past);
}
int main() {
unsigned char *const indat   = new unsigned char[DATA_SIZE_IN ],
*const outdat0 = new unsigned char[DATA_SIZE_OUT],
*const outdat1 = new unsigned char[DATA_SIZE_OUT],
*      curout  = outdat0,
*      lastout = outdat1,
*      place;
unsigned             start,
stop;
place = indat + DATA_SIZE_IN - 1;
do {
*place = (unsigned char)rand();
} while (place-- != indat);
MakeLutLo();
MakeLutHi();
MakeLutLarge();
for (unsigned testcount = 0; testcount < 1000; ++testcount) {
// Solution posted by the asker
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandOrig(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandOrig:ttt" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
// Dmitry's small lookup table solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandLookupSmall(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSmallLUT:ttt" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// Dmitry's small lookup table solution using only one lookup table
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandLookupSmallOneLUT(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandLookupSmallOneLUT:t" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// Large lookup table solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandLookupLarge(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandLookupLarge:tt" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// AShelly's Interleave bits by Binary Magic Numbers solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandAShelly(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandAShelly:ttt" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// AShelly's Interleave bits by Binary Magic Numbers solution optimizing out an addition
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandAShellyMulOp(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandAShellyMulOp:tt" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// My SSE4 solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandSSE4(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSSE4:ttt" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// My SSE4 solution unrolled
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandSSE4Unroll(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSSE4Unroll:tt" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// My SSE2 solution
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandSSE2(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSSE2:ttt" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// My SSE2 solution unrolled
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandSSE2Unroll(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandSSE2Unroll:tt" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
// AShelly's Interleave bits by Binary Magic Numbers solution implemented using SSE2
start = clock();
for (unsigned rerun = 0; rerun < RERUN_COUNT; ++rerun)
ExpandAShellySSE4(indat, indat + DATA_SIZE_IN, curout);
stop = clock();
std::cout << "ExpandAShellySSE4:tt" << (((stop - start) / 1000) / 60) << ':' << (((stop - start) / 1000) % 60) << ":." << ((stop - start) % 1000) << std::endl;
std::swap(curout, lastout);
if (memcmp(outdat0, outdat1, DATA_SIZE_OUT))
std::cout << "INCORRECT OUTPUT" << std::endl;
}
delete[] indat;
delete[] outdat0;
delete[] outdat1;
return 0;
}

注意:

我最初在这里有一个 SSE4 实现。我找到了一种使用 SSE2 实现这一点的方法,这更好,因为它可以在更多平台上运行。SSE2 实现速度也更快。因此,顶部介绍的解决方案现在是 SSE2 实现,而不是 SSE4 实现。SSE4 实现仍然可以在性能测试或编辑历史记录中看到。

我不确定最有效的方法是什么,但这有点短:

#include <stdio.h>
/* Dmitri's three-step expansion: 0x1234 -> 0x11223344. */
int main()
{
    unsigned x = 0x1234;
    x = (x << 8) | x;                               /* spread the two bytes apart      */
    x = ((x & 0x00f000f0) << 4) | (x & 0x000f000f); /* one nibble per byte: 0x01020304 */
    x = (x << 4) | x;                               /* duplicate nibbles:   0x11223344 */
    printf("0x1234 -> 0x%08x\n", x); /* "\n" was garbled to "n" in transcription */
    return 0;
}

如果您需要按照编辑中的建议重复且非常快速地执行此操作,则可以考虑生成查找表并改用它。 以下函数动态分配和初始化此类表:

/* Allocate and fill a 65536-entry table with tbl[0xWXYZ] == 0xWWXXYYZZ.
 * Returns NULL on allocation failure; the caller owns (and frees) the table. */
unsigned *makeLookupTable(void)
{
    unsigned *tbl = malloc(sizeof(unsigned) * 65536);
    if (tbl == NULL)
        return NULL;
    for (int idx = 0; idx < 65536; idx++) {
        unsigned v = (unsigned)idx;
        v |= v << 8;                                    /* spread bytes apart   */
        v = ((v & 0x00f000f0) << 4) | (v & 0x000f000f); /* one nibble per byte  */
        v |= v << 4;                                    /* duplicate nibbles    */
        /* Uncomment next line to invert the high byte as mentioned in the edit. */
        /* v = v ^ 0xff000000; */
        tbl[idx] = v;
    }
    return tbl;
}

之后,每次转换都像这样:

result = lookuptable[input];

..或者也许:

result = lookuptable[input & 0xffff];

或者,可以使用更小、更友好的缓存查找表(或对)对高字节和低字节各进行一次查找(如注释中@LưuVĩnhPhúc所述)。 在这种情况下,表生成代码可能是:

/* Allocate and fill a 256-entry table with tbl[0xWX] == 0x0000WWXX (the low
 * half of the expansion).  Returns NULL on allocation failure. */
unsigned *makeLookupTableLow(void)
{
    unsigned *tbl = malloc(sizeof(unsigned) * 256);
    if (tbl == NULL)
        return NULL;
    for (int idx = 0; idx < 256; idx++) {
        unsigned v = (unsigned)idx;
        v = ((v & 0xf0) << 4) | (v & 0x0f); /* 0xWX -> 0x0W0X */
        v |= v << 4;                        /* -> 0xWWXX      */
        tbl[idx] = v;
    }
    return tbl;
}

。和可选的第二个表:

/* Allocate and fill a 256-entry table with tbl[0xWX] == 0xWWXX0000 (the high
 * half of the expansion).  Returns NULL on allocation failure. */
unsigned *makeLookupTableHigh(void)
{
    unsigned *tbl = malloc(sizeof(unsigned) * 256);
    if (tbl == NULL)
        return NULL;
    for (int idx = 0; idx < 256; idx++) {
        unsigned v = (unsigned)idx;
        v = ((v & 0xf0) << 20) | ((v & 0x0f) << 16); /* -> 0x0W0X0000 */
        v |= v << 4;                                 /* -> 0xWWXX0000 */
        /* uncomment next line to invert high byte */
        /* v = v ^ 0xff000000; */
        tbl[idx] = v;
    }
    return tbl;
}

。并转换包含两个表的值:

result = hightable[input >> 8] | lowtable[input & 0xff];

。或者用一个(只是上面的低表):

result = (lowtable[input >> 8] << 16) | lowtable[input & 0xff];
result ^= 0xff000000; /* to invert high byte */

如果值的上半部分(alpha?)变化不大,即使是单个大表也可能表现良好,因为连续查找在表中会更靠近。


我拿了@Apriori发布的性能测试代码,做了一些调整,并为他最初没有包含的其他响应添加了测试......然后编译了具有不同设置的三个版本。 一种是启用了 SSE4.1 的 64 位代码,编译器可以利用 SSE 进行优化......然后是两个 32 位版本,一个带有 SSE,一个没有。 尽管这三者都运行在同一个相当新的处理器上,但结果显示了最佳解决方案如何根据处理器功能而变化:

64b SSE4.1  32b SSE4.1  32b no SSE
-------------------------- ----------  ----------  ----------
ExpandOrig           time:  3.502 s     3.501 s     6.260 s
ExpandLookupSmall    time:  3.530 s     3.997 s     3.996 s
ExpandLookupLarge    time:  3.434 s     3.419 s     3.427 s
ExpandIsalamon       time:  3.654 s     3.673 s     8.870 s
ExpandIsalamonOpt    time:  3.784 s     3.720 s     8.719 s
ExpandChronoKitsune  time:  3.658 s     3.463 s     6.546 s
ExpandEvgenyKluev    time:  6.790 s     7.697 s    13.383 s
ExpandIammilind      time:  3.485 s     3.498 s     6.436 s
ExpandDmitri         time:  3.457 s     3.477 s     5.461 s
ExpandNitish712      time:  3.574 s     3.800 s     6.789 s
ExpandAdamLiss       time:  3.673 s     5.680 s     6.969 s
ExpandAShelly        time:  3.524 s     4.295 s     5.867 s
ExpandAShellyMulOp   time:  3.527 s     4.295 s     5.852 s
ExpandSSE4           time:  3.428 s
ExpandSSE4Unroll     time:  3.333 s
ExpandSSE2           time:  3.392 s
ExpandSSE2Unroll     time:  3.318 s
ExpandAShellySSE4    time:  3.392 s

可执行文件在 64 位 Linux 上用 gcc 4.8.1 编译,分别使用 `-m64 -O3 -march=core2 -msse4.1`、`-m32 -O3 -march=core2 -msse4.1` 和 `-m32 -O3 -march=core2 -mno-sse`。 对于 32 位构建,省略了 @Apriori 的 SSE 测试(在启用 SSE 的情况下在 32 位上崩溃,显然在禁用 SSE 的情况下不起作用)。

所做的调整包括使用实际图像数据而不是随机值(具有透明背景的物体照片),这大大提高了大型查找表的性能,但对其他查找表几乎没有影响。

从本质上讲,当 SSE 不可用(或未使用)时,查找表以压倒性优势获胜......否则,手动编码的 SSE 解决方案胜出。 然而,同样值得注意的是,当编译器可以使用SSE进行优化时,大多数位操作解决方案几乎与手动编码的SSE一样快 - 仍然慢,但只是略有。

这是另一个尝试,使用八个操作:

/* AShelly's eight-operation version: the multiplies place copies of the
 * odd/even nibbles a byte apart, the masks keep one copy per byte, and
 * b + 16*b duplicates each nibble within its byte. */
b = (((c & 0x0F0F) * 0x0101) & 0x00F000F) +
    (((c & 0xF0F0) * 0x1010) & 0xF000F00);
b += b * 0x10;
printf("%x\n", b); /* Shows '0x11223344'; "\n" was garbled to "n" */

*请注意,这篇文章最初包含完全不同的代码,基于Sean Anderson的bithacks页面中二进制幻数的交错位。 但这并不是OP所要求的。所以它已经删除了。下面的大多数评论都提到了该缺失的版本。

我想将此链接添加到答案池中,因为我认为在谈论优化时,记住我们正在运行的硬件以及为所述平台编译代码的技术非常重要。

博客文章使用 CPU 管道是关于研究优化一组 CPU 流水线的代码。它实际上展示了一个例子,他试图将数学简化到最少的实际数学运算,但就时间而言,它远非最佳解决方案。我在这里看到过几个答案,它们可能是正确的,也可能不是。知道的唯一方法是实际测量特定代码片段从开始到结束的时间,与其他代码片段相比。阅读此博客;这是非常有趣的。

我想我应该提到,在这种特殊情况下,除非我真的尝试过多次尝试,否则我不会在这里放置任何代码,并且实际上通过多次尝试变得特别快。

我认为 Dimitri 建议的查找表方法是一个不错的选择,但我建议更进一步,在编译时生成表; 在编译时完成工作显然会减少执行时间。

首先,我们使用任何建议的方法创建一个编译时值:

// Compile-time building blocks for a constexpr lookup table entry:
// transform(0xWXYZ) == 0xWWXXYYZZ, evaluated entirely at compile time.
// Step 1: spread the two input bytes apart (0x1234 -> 0x123634; the middle
// garbage is masked away by transform2).
constexpr unsigned int transform1(unsigned int x)
{
return ((x << 8) | x);
}
// Step 2: keep one nibble per output byte (-> 0x01020304).
constexpr unsigned int transform2(unsigned int x)
{
return (((x & 0x00f000f0) << 4) | (x & 0x000f000f));
}
// Step 3: duplicate each nibble within its byte (-> 0x11223344).
constexpr unsigned int transform3(unsigned int x)
{
return ((x << 4) | x);
}
constexpr unsigned int transform(unsigned int x)
{
return transform3(transform2(transform1(x)));
}
// Dimitri version, using constexprs
template <unsigned int argb> struct aarrggbb_dimitri
{
static const unsigned int value = transform(argb);
};
// Adam Liss version: each nibble times a 0x...11 constant lands two copies
// of the nibble in adjacent output nibbles.
template <unsigned int argb> struct aarrggbb_adamLiss
{
static const unsigned int value =
(argb & 0xf000) * 0x11000 +
(argb & 0x0f00) * 0x01100 +
(argb & 0x00f0) * 0x00110 +
(argb & 0x000f) * 0x00011;
};

然后,我们使用任何可用的方法创建编译时查找表,我希望使用 C++14 整数序列,但我不知道 OP 将使用哪个编译器。所以另一种可能的方法是使用一个非常丑陋的宏:

#define EXPAND16(x) aarrggbb<x + 0>::value, \
                    aarrggbb<x + 1>::value, \
                    aarrggbb<x + 2>::value, \
                    aarrggbb<x + 3>::value, \
                    aarrggbb<x + 4>::value, \
                    aarrggbb<x + 5>::value, \
                    aarrggbb<x + 6>::value, \
                    ... and so on

#define EXPAND EXPAND16(0),    \
               EXPAND16(0x10), \
               EXPAND16(0x20), \
               EXPAND16(0x30), \
               EXPAND16(0x40), \
               ... and so on

... and so on

在此处查看演示。

PS:亚当·利斯的方法可以在没有C++11的情况下使用。

如果乘法很便宜并且可以使用 64 位算术,您可以使用以下代码:

uint64_t x = 0x1234;
x *= 0x0001000100010001ull; // replicate the 16-bit value into all four lanes
x &= 0xF0000F0000F0000Full; // keep a different nibble in each lane
x *= 0x0000001001001001ull; // sum of shifted copies gathers the nibbles
x &= 0xF0F0F0F000000000ull; // keep one nibble per output position
x = (x >> 36) * 0x11;       // align and duplicate each nibble: 0x11223344
std::cout << std::hex << x << '\n'; // '\n' was garbled to 'n' in transcription

事实上,它使用了与AShelly最初尝试相同的想法。

这有效并且可能更容易理解,但是位操作非常便宜,我不会太担心效率。

#include <stdio.h>
#include <stdlib.h>
/* Multiply-based expansion: each masked nibble times a 0x...11-style constant
 * places two copies of that nibble in adjacent output nibbles. */
int main() { /* was `void main()`: not standard C; main must return int */
    unsigned int c = 0x1234, b;
    b = (c & 0xf000) * 0x11000 + (c & 0x0f00) * 0x01100 +
        (c & 0x00f0) * 0x00110 + (c & 0x000f) * 0x00011;
    printf("%x -> %x\n", c, b); /* "\n" was garbled to "n" in transcription */
    return 0;
}

假设您希望始终将0xWXYZ转换为0xWWXXYYZZ,我相信以下解决方案会比您建议的解决方案快一点:

unsigned int c = 0x1234;     
// Place each nibble in the low half of its destination byte: -> 0x01020304
unsigned int b = (c & 0xf) | ((c & 0xf0) << 4) |
((c & 0xf00) << 8) | ((c & 0xf000) << 12);
b |= (b << 4); // duplicate each nibble into the high half: -> 0x11223344

请注意,从解决方案中保存了一个&(and)操作

另一种方法是:

// Copy one nibble of input_val into two adjacent nibbles of the result.
// nible_pos is the caller's running mask: 0 on the first call (reset to the
// lowest nibble), then advanced one nibble to the left per call.
DWORD OrVal(DWORD & nible_pos, DWORD input_val, DWORD temp_val, int shift)
{
    nible_pos = (nible_pos == 0) ? 0x0000000F : (nible_pos << 4);
    DWORD nible = input_val & nible_pos;
    // OR the nibble (still at its source position) into the output twice,
    // side by side, starting `shift` bits up.
    return temp_val | (nible << shift) | (nible << (shift + 4));
}
// Expand 0xWXYZ -> 0xWWXXYYZZ by handing each of the four nibbles to OrVal.
DWORD Converter2(DWORD input_val)
{
    DWORD nible_pos = 0x00000000;
    DWORD temp_val  = 0x00000000;
    for (int shift = 0; shift <= 12; shift += 4)
        temp_val = OrVal(nible_pos, input_val, temp_val, shift);
    return temp_val;
}
DWORD val2 = Converter2(0x1234); // == 0x11223344

优化版本(快 3 倍):

DWORD Converter3(DWORD input_val) {
    DWORD nible_pos = 0;
    DWORD temp_val = 0;
    int shift = 0;
    DWORD bit_nible[4] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
    for ( ; shift < 16; shift += 4) {
        if (nible_pos == 0)
            nible_pos = 0x0000000F;
        else
            nible_pos = nible_pos << 4;
        DWORD nible = input_val & nible_pos;
        temp_val |= (nible << shift);
        temp_val |= (nible << (shift + 4));
    }
    return temp_val;
}

也许这可以更简单和高效。

unsigned int g = 0x1234;
unsigned int ans = 0;
/* Shift each nibble into the HIGH half of its destination byte: 0x10203040 */
ans = ( ( g & 0xf000 ) << 16) + ( (g & 0xf00 ) << 12)
    + ( ( g & 0xf0 ) << 8) + ( ( g & 0xf ) << 4);
ans = ( ans | ans >> 4 ); /* duplicate into the low halves: 0x11223344 */
/* %p is for pointers (undefined behavior with unsigned int); use %x.
 * The "\n" was also garbled to "n" in transcription. */
printf("0x%x -> 0x%x\n", g, ans);
/* Expand 0x0000ARGB to 0xAARRGGBB. */
unsigned long transform(unsigned long n)
{
    /* 0x0000ARGB -> 0x00AR00GB (split byte pairs) -> 0x0AR00GB0 (make room
     * for a duplicate nibble on each side of each pair). */
    unsigned long spread = (((n & 0xff00) << 8) | (n & 0x00ff)) << 4;
    /* Copy A and G one nibble to the left:  0x0AR00GB0 -> 0xAAR0GGB0 */
    spread |= (spread & 0x0f000f00L) << 4;
    /* Copy R and B one nibble to the right: 0xAAR0GGB0 -> 0xAARRGGBB */
    spread |= (spread & 0x00f000f0L) >> 4;
    return spread;
}

alpha 和红色分量被移动到它们所属的较高 2 个字节中,然后结果向左移动 4 位,导致每个分量都恰好在它需要的位置。

对于 0AR0 0GB0 的形式,位掩码和左移组合与当前值进行 OR'ed。这会将 A 和 G 分量复制到它们左侧的位置。对 R 和 B 组件执行相同的操作,只是方向相反。

如果你要为 OpenGL 执行此操作,我建议你使用 glTexImage*D 系列函数,并把 type 参数设置为 GL_UNSIGNED_SHORT_4_4_4_4。你的 OpenGL 驱动程序应该完成剩下的工作。关于透明度反转,你可以随时通过 glBlendFunc 和 glBlendEquation 函数操作混合。

而其他人则在核心优化上运行......

将此作为您最好的选择:

// String-level "expansion": double every hex digit after the "0x" prefix,
// e.g. "0xACED" -> "0xAACCEEDD".  Purely textual; no numeric conversion.
std::string toAARRGGBB(const std::string &argb)
{
    std::string ret("0x");
    // Skip the two-character "0x" prefix, then emit each digit twice.
    for (std::string::size_type i = 2; i < argb.length(); ++i)
        ret.append(2, argb[i]);
    return ret;
}
int main()
{
// "0xACED" -> "0xAACCEEDD" via string digit-doubling (result unused here)
std::string argb = toAARRGGBB("0xACED"); //!!!
}

哈哈