__m256d TRANSPOSE4 Equivalent?
__m256d TRANSPOSE4 Equivalent?
Intel 已经提供了 _MM_TRANSPOSE4_PS 宏,用于转置由 4 个 __m128 向量组成的 4x4 矩阵。我想对 __m256d 做同样的事情。然而,我不知道怎样用 _mm256_shuffle_pd 以同样的方式实现。
_MM_TRANSPOSE4_PS代码
/*
 * _MM_TRANSPOSE4_PS(row0, row1, row2, row3)
 *
 * Transposes, in place, the 4x4 single-precision matrix whose rows are the
 * four __m128 values row0..row3. After expansion, rowN holds what used to be
 * column N of the matrix.
 *
 * Each argument must be a modifiable __m128 lvalue; every argument is
 * evaluated more than once, so do not pass expressions with side effects.
 * The expansion declares locals named tmp0..tmp3, so the arguments must not
 * use those names.
 *
 * Every line of a multi-line macro except the last needs a trailing
 * backslash; without them the macro body would be just "{".
 *
 * Stage 1 pairs up the low / high halves of neighbouring rows.
 * Stage 2 picks the even / odd elements of those pairs.
 */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) {                         \
    __m128 tmp3, tmp2, tmp1, tmp0;                                          \
    tmp0 = _mm_shuffle_ps((row0), (row1), 0x44); /* r0[0] r0[1] r1[0] r1[1] */ \
    tmp2 = _mm_shuffle_ps((row0), (row1), 0xEE); /* r0[2] r0[3] r1[2] r1[3] */ \
    tmp1 = _mm_shuffle_ps((row2), (row3), 0x44); /* r2[0] r2[1] r3[0] r3[1] */ \
    tmp3 = _mm_shuffle_ps((row2), (row3), 0xEE); /* r2[2] r2[3] r3[2] r3[3] */ \
    (row0) = _mm_shuffle_ps(tmp0, tmp1, 0x88);   /* column 0 */             \
    (row1) = _mm_shuffle_ps(tmp0, tmp1, 0xDD);   /* column 1 */             \
    (row2) = _mm_shuffle_ps(tmp2, tmp3, 0x88);   /* column 2 */             \
    (row3) = _mm_shuffle_ps(tmp2, tmp3, 0xDD);   /* column 3 */             \
}
下面是我在需要用到它的循环中,对 _MM_TRANSPOSE4_PD 所做的尝试:
// Attempted 4x4 double-precision transpose, as posted in the question.
// Starting at offset i, each iteration reads four vectors of four doubles
// from m2data, each m2.col() elements after the previous one, shuffles them,
// and writes the four results to consecutive slots of buffer.
// m2, m2data, buffer, counter and i are declared outside this fragment.
//
// NOTE(review): the shuffle immediates below (0x44/0xEE/0x88/0xDD) are the
// ones used by the single-precision _MM_TRANSPOSE4_PS macro. According to
// Intel's documentation, _mm256_shuffle_pd takes a 4-bit control (one bit
// per output element) and only selects within each 128-bit half, so these
// values are not expected to yield a transpose here -- confirm against the
// Intrinsics Guide.
for (int copy = i; copy < m2.size();)
{
// The for-header has no increment: copy is advanced by m2.col() after each
// of the four loads, i.e. by four rows per iteration.
// NOTE(review): _mm256_load_pd is the aligned load; presumably every
// m2data + copy address must be 32-byte aligned -- verify for the caller.
__m256d row0 = _mm256_load_pd(m2data + copy);
copy += m2.col();
__m256d row1 = _mm256_load_pd(m2data + copy);
copy += m2.col();
__m256d row2 = _mm256_load_pd(m2data + copy);
copy += m2.col();
__m256d row3 = _mm256_load_pd(m2data + copy);
copy += m2.col();
__m256d tmp3, tmp2, tmp1, tmp0;
// Stage 1: combine row pairs (0,1) and (2,3).
tmp0 = _mm256_shuffle_pd(row0,row1, 0x44);
tmp2 = _mm256_shuffle_pd(row0,row1, 0xEE);
tmp1 = _mm256_shuffle_pd(row2,row3, 0x44);
tmp3 = _mm256_shuffle_pd(row2,row3, 0xEE);
// Stage 2: combine the stage-1 temporaries back into row0..row3.
row0 = _mm256_shuffle_pd(tmp0, tmp1, 0x88);
row1 = _mm256_shuffle_pd(tmp0, tmp1, 0xDD);
row2 = _mm256_shuffle_pd(tmp2, tmp3, 0x88);
row3 = _mm256_shuffle_pd(tmp2, tmp3, 0xDD);
// Each store targets buffer + counter and then bumps counter by one.
// The element type of buffer is not visible here (presumably a 32-byte
// vector type, given the cast to double* -- TODO confirm).
_mm256_store_pd(reinterpret_cast<double*>(buffer + counter++),row0);
_mm256_store_pd(reinterpret_cast<double*>(buffer + counter++),row1);
_mm256_store_pd(reinterpret_cast<double*>(buffer + counter++),row2);
_mm256_store_pd(reinterpret_cast<double*>(buffer + counter++),row3);
}
这是我找到的解决方案的宏等价物。
/*
 * _MM_TRANSPOSE4_PD(row0, row1, row2, row3)
 *
 * Transposes, in place, the 4x4 double-precision matrix whose rows are the
 * four __m256d values row0..row3 (AVX counterpart of _MM_TRANSPOSE4_PS).
 * After expansion, rowN holds what used to be column N of the matrix.
 *
 * Each argument must be a modifiable __m256d lvalue; every argument is
 * evaluated more than once, so do not pass expressions with side effects.
 * The expansion declares locals named tmp0..tmp3, so the arguments must not
 * use those names.
 *
 * Every line of a multi-line macro except the last needs a trailing
 * backslash; without them the macro would expand to nothing.
 *
 * Stage 1: _mm256_shuffle_pd works inside each 128-bit half, so it can only
 *          interleave the even (0x0) or odd (0xF) elements of two rows.
 * Stage 2: _mm256_permute2f128_pd moves whole 128-bit halves, gluing the two
 *          low halves (0x20) or the two high halves (0x31) together.
 */
#define _MM_TRANSPOSE4_PD(row0,row1,row2,row3)                                   \
{                                                                                \
    __m256d tmp3, tmp2, tmp1, tmp0;                                              \
    tmp0 = _mm256_shuffle_pd((row0),(row1), 0x0); /* r0[0] r1[0] r0[2] r1[2] */  \
    tmp2 = _mm256_shuffle_pd((row0),(row1), 0xF); /* r0[1] r1[1] r0[3] r1[3] */  \
    tmp1 = _mm256_shuffle_pd((row2),(row3), 0x0); /* r2[0] r3[0] r2[2] r3[2] */  \
    tmp3 = _mm256_shuffle_pd((row2),(row3), 0xF); /* r2[1] r3[1] r2[3] r3[3] */  \
    (row0) = _mm256_permute2f128_pd(tmp0, tmp1, 0x20); /* column 0 */            \
    (row1) = _mm256_permute2f128_pd(tmp2, tmp3, 0x20); /* column 1 */            \
    (row2) = _mm256_permute2f128_pd(tmp0, tmp1, 0x31); /* column 2 */            \
    (row3) = _mm256_permute2f128_pd(tmp2, tmp3, 0x31); /* column 3 */            \
}
相关文章:
- C++ equivalent to Java Map getOrDefault?
- Equivalent of OpenCv c++ api Vec2f,norm, acos in emgucv csha
- Win32 equivalent of getgid
- vfprintf_unlocked() equivalent?
- What is the std::chrono::time_point equivalent of std::numer
- AVX equivalent for _mm_movelh_ps
- python equivalent of std::chrono::steady_clock::now();
- C++ Equivalent of Java Map<String, Object>
- TrackPopupMenu equivalent in OS X
- C - 上衣与特征与C -Tran:张量收缩的速度差异
- Is strcpy equivalent to strcpy_s
- C++ equivalent of C# SpinWait.SpinUntil
- C++ equivalent to Fortran Namelist
- OpenGL equivalent of DirectX renderstates
- Arduino equivalent to VB WITH
- C equivalent to C++ decltype
- MPI-Parallel HDF5: H5Pset_fapl_mpio equivalent in C++
- Linux equivalent of Solaris walkcontext
- C++ equivalent to C# Encoding.ASCII.GetBytes()
- C++ equivalent of tailq