如何使用 SSE 将 __m128i 寄存器乘以浮点因子?
How to multiply __m128i register by float factor using SSE?
我在将两个寄存器相乘(或只是将寄存器乘以 float 常量)时遇到问题。一个寄存器是 __m128i 类型,包含 16 个像素中某一个 RGBA 颜色通道的值(这 16 个像素的数组作为参数传入 C++ DLL)。我想将此寄存器乘以常量以获得此通道的灰度值,并对存储在 __m128i 寄存器中的其他通道也执行此操作。
我认为使用 SIMD 将图像转换为灰度的一个好主意是使用此算法。
fY(R, G, B) = R x 0.29891 + G x 0.58661 + B x 0.11448
我有以下代码,现在它只是将图像分解为各个通道,再把它们重新打包在一起,作为 src 向量返回。现在我需要把它改成输出灰度 :) src 变量是指向 unsigned char 数组的指针。
// Fragment from the question: splits 16 source bytes into per-channel vectors and
// recombines them unchanged. No grayscale weighting is applied yet.
// Load 16 bytes (unaligned) starting at src[srcIndex].
__m128i vecSrc = _mm_loadu_si128((__m128i*) &src[srcIndex]);
// Per-channel selectors for data widened to 16-bit lanes: a 1 keeps the lane, a 0 clears it.
// Each mask keeps one lane out of every group of four (one group per pixel).
// The R/G/B/A naming assumes RGBA byte order in src -- TODO confirm against the caller.
__m128i maskR = _mm_setr_epi16(1, 0, 0, 0, 1, 0, 0, 0);
__m128i maskG = _mm_setr_epi16(0, 1, 0, 0, 0, 1, 0, 0);
__m128i maskB = _mm_setr_epi16(0, 0, 1, 0, 0, 0, 1, 0);
__m128i maskA = _mm_setr_epi16(0, 0, 0, 1, 0, 0, 0, 1);
// Creating factors.
// NOTE(review): factorR/factorG/factorB are never used anywhere in this fragment.
// NOTE(review): 0.58661 * 0x10000 is about 38444, which does not fit in a 16-bit signed short,
// so the cast for factorG is out of range; the other two values (about 19589 and 7502) fit.
const __m128i factorR = _mm_set1_epi16((short)(0.29891 * 0x10000)); // 8 lanes, each the R weight scaled by 0x10000.
const __m128i factorG = _mm_set1_epi16((short)(0.58661 * 0x10000)); // 8 lanes, each the G weight scaled by 0x10000.
const __m128i factorB = _mm_set1_epi16((short)(0.11448 * 0x10000)); // 8 lanes, each the B weight scaled by 0x10000.
__m128i zero = _mm_setzero_si128();
// Widen the bytes to 16-bit lanes: the low 8 bytes via zero-extension,
// the high 8 bytes by interleaving them with zero bytes.
__m128i vectSrcLowInHighPart = _mm_cvtepu8_epi16(vecSrc);
__m128i vectSrcHighInHighPart = _mm_unpackhi_epi8(vecSrc, zero);
// Multiply the widened LOW 8 bytes by each 1/0 channel mask, isolating one channel per vector (_L = low half).
__m128i vecR_L = _mm_mullo_epi16(vectSrcLowInHighPart, maskR);
__m128i vecG_L = _mm_mullo_epi16(vectSrcLowInHighPart, maskG);
__m128i vecB_L = _mm_mullo_epi16(vectSrcLowInHighPart, maskB);
__m128i vecA_L = _mm_mullo_epi16(vectSrcLowInHighPart, maskA);
// Same channel isolation for the widened HIGH 8 bytes (_H = high half).
__m128i vecR_H = _mm_mullo_epi16(vectSrcHighInHighPart, maskR);
__m128i vecG_H = _mm_mullo_epi16(vectSrcHighInHighPart, maskG);
__m128i vecB_H = _mm_mullo_epi16(vectSrcHighInHighPart, maskB);
__m128i vecA_H = _mm_mullo_epi16(vectSrcHighInHighPart, maskA);
// Byte-shuffle controls used to narrow 16-bit lanes back to bytes: the even indices pick the
// low byte of every 16-bit lane, and 0x80 entries produce a zero byte.
// maskLo places the 8 picked bytes in the lower half of the result, maskHi in the upper half.
__m128i maskLo = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
__m128i maskHi = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
// Pack the low and high halves of each channel into one 16 x 8-bit register per channel.
__m128i R = _mm_or_si128(_mm_shuffle_epi8(vecR_L, maskLo), _mm_shuffle_epi8(vecR_H, maskHi));
__m128i G = _mm_or_si128(_mm_shuffle_epi8(vecG_L, maskLo), _mm_shuffle_epi8(vecG_H, maskHi));
__m128i B = _mm_or_si128(_mm_shuffle_epi8(vecB_L, maskLo), _mm_shuffle_epi8(vecB_H, maskHi));
__m128i A = _mm_or_si128(_mm_shuffle_epi8(vecA_L, maskLo), _mm_shuffle_epi8(vecA_H, maskHi));
// Sum the four channel vectors. Their non-zero bytes sit at disjoint positions, so the sum
// reassembles the original 16 bytes.
__m128i resultVect = _mm_add_epi8(_mm_add_epi8(R, G), _mm_add_epi8(B, A));
// Write the result back to src.
// NOTE(review): _mm_storel_epi64 stores only the low 64 bits (8 bytes), although 16 bytes were loaded above.
_mm_storel_epi64((__m128i*)&src[srcIndex], resultVect);
感谢您的帮助!这是我的第一个使用 SIMD(SSE)指令的程序。
根据对我问题的评论,我创建了一个解决方案。还有一个项目,我正在学习当我使用 SSE 指令时寄存器是如何工作的。
// Prints `msg`, then the 16 bytes of `registerToprint` as unsigned decimal values,
// one per line, starting with the lowest byte (byte 0). The byte list is framed by the
// literal marker lines "//// LO ////" (before) and "//// HI ////" (after).
//
// registerToprint - 128-bit value interpreted as 16 x uint8.
// msg             - heading written on its own line before the dump.
void printRegister(__m128i registerToprint, const string &msg) {
    unsigned char bytes[16] = { 0 };
    // Use the unaligned store: a plain unsigned char array is not guaranteed to be
    // 16-byte aligned, which _mm_store_si128 would require.
    _mm_storeu_si128((__m128i*)bytes, registerToprint);
    cout << msg << endl;
    cout << "//// LO ////" << endl;
    for (int i = 0; i < 16; i++)
        cout << dec << (unsigned int)bytes[i] << endl;
    cout << "//// HI ////" << endl;
}
int main()
{
    // Sample input: 16 bytes, i.e. four pixels with channel order B, G, R, A.
    unsigned char tab[] = { 100,200,250,255, 101,201,251,255, 102,202,252,255, 103,203,253,255 };
    // Pointer + start index stand in for the parameters a DLL entry point would receive.
    unsigned char *src = tab;
    int srcIndex = 0;

    // Load all 16 bytes (unaligned load) beginning at src[srcIndex].
    const __m128i pixels = _mm_loadu_si128((__m128i*) &src[srcIndex]);

    // Regroup the interleaved bytes by channel. _mm_set_epi8 lists bytes from the highest
    // down to the lowest, so the result is (Hi) AAAA RRRR GGGG BBBB (Lo).
    const __m128i toPlanar = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
    const __m128i planar = _mm_shuffle_epi8(pixels, toPlanar);

    // Isolate one channel at a time in the lowest 4 bytes: the left byte-shift moves the
    // wanted group to the top (dropping everything above it), the right byte-shift by 12
    // brings it down to the bottom and zero-fills the rest.
    const __m128i blue8  = _mm_srli_si128(_mm_slli_si128(planar, 12), 12);
    const __m128i green8 = _mm_srli_si128(_mm_slli_si128(planar, 8), 12);
    const __m128i red8   = _mm_srli_si128(_mm_slli_si128(planar, 4), 12);

    // Zero-extend the bytes to 16-bit lanes so they can be multiplied.
    const __m128i blue16  = _mm_cvtepu8_epi16(blue8);
    const __m128i green16 = _mm_cvtepu8_epi16(green8);
    const __m128i red16   = _mm_cvtepu8_epi16(red8);

    // Channel weights as fixed-point integers: weight * 32768, rounded by adding 0.5.
    const __m128i weightR = _mm_set1_epi16((short)(0.2989*32768.0 + 0.5));
    const __m128i weightG = _mm_set1_epi16((short)(0.5870*32768.0 + 0.5));
    const __m128i weightB = _mm_set1_epi16((short)(0.1140*32768.0 + 0.5));

    // Weighted contribution of every channel, then their per-lane sum.
    const __m128i blueTerm  = _mm_mulhrs_epi16(blue16, weightB);
    const __m128i greenTerm = _mm_mulhrs_epi16(green16, weightG);
    const __m128i redTerm   = _mm_mulhrs_epi16(red16, weightR);
    const __m128i gray16 = _mm_add_epi16(_mm_add_epi16(blueTerm, greenTerm), redTerm);

    // Replicate the low byte of each of the first four 16-bit lanes into four consecutive
    // output bytes, so every 4-byte pixel slot carries that pixel's gray value.
    const __m128i spread = _mm_setr_epi8(0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6);
    const __m128i grayPixels = _mm_shuffle_epi8(gray16, spread);

    printRegister(grayPixels, "Gray");
}
工作原理
unsigned char tab[] 包含 16 个 uint8 元素,用于填充一个 128 位寄存器。此数组模拟 4 个通道按 BGRA 排列的像素(16 字节,每个像素 4 字节)。
void printRegister(__m128i registerToprint, const string &msg);
此函数用于在控制台中以十进制打印作为参数传入的寄存器的值。
如果有人想测试它,可以在 GitHub 上找到完整的项目:完整的项目演示(GitHub)。我希望所有注释都是正确的,如果不是,请纠正我 :) 感谢您的支持。
相关文章:
- 无法将结构注册为增强几何体3D点
- 如何使用AngelScript注册SFML Vector2运算符
- 在遍历处理程序的向量时注册和注销处理程序
- 有没有任务栏API可以立即应用注册表更改
- 使用QJsEngine在Qt中注册自定义类型
- 检查注册表项是否链接到(或副本)另一个注册表项
- 如何使用 TStyleManager::UnRegisterStyle() 取消注册样式
- WINAPI 注册应用程序重新启动时不清除打开的套接字
- SFML 碰撞永远不会在我的系统中注册
- 尝试从C++访问 UWP 的电子邮件邮件类会导致"REGDB_E_CLASSNOTREG类未注册"错误
- 用于创建/注册虚拟存储设备的 IOKit 驱动程序
- 如何添加预防措施以绕过未注册Microsoft.ACE.OLEDB.12.0?
- C++ SSE 内部函数:将结果存储在变量中
- 进程外 EXE 的免注册 COM
- 如何注册Qt C++对象以在QML中使用它
- 在Qt中注册自定义元类型的别名类型
- 禁用地址共享注册表不起作用
- 如何从注册表项中提取配置单元和注册表名称
- 如何使用SSE将__m128i注册乘以浮点因子?
- 向 SSE 注册管理