AVX2在Haswell上比SSE慢
AVX2 slower than SSE on Haswell
我有以下代码(正常,SSE和AVX):
// Returns max over k of lhs[k] + ghs[k], 4 int32 lanes per iteration (SSE).
// Assumes 16-byte-aligned storage (aligned_vector) and that ghs.size() is a
// multiple of 4 -- TODO confirm: a remainder would read past the end.
// The running maximum starts at 0, so the result is only correct when at
// least one sum is non-negative -- presumably guaranteed by the caller.
int testSSE(const aligned_vector & ghs, const aligned_vector & lhs) {
int result[4] __attribute__((aligned(16))) = {0};
__m128i vresult = _mm_set1_epi32(0);
__m128i v1, v2, vmax;
// Vertical pass: per-lane running maximum of the element-wise sums.
for (int k = 0; k < ghs.size(); k += 4) {
v1 = _mm_load_si128((__m128i *) & lhs[k]);
v2 = _mm_load_si128((__m128i *) & ghs[k]);
vmax = _mm_add_epi32(v1, v2);
vresult = _mm_max_epi32(vresult, vmax);
}
// Horizontal pass: spill the 4 lanes to memory, then reduce with scalar code.
_mm_store_si128((__m128i *) result, vresult);
int mymax = result[0];
for (int k = 1; k < 4; k++) {
if (result[k] > mymax) {
mymax = result[k];
}
}
return mymax;
}
// Returns max over k of ghs[k] + lhs[k], 8 int32 lanes per iteration (AVX2).
// Assumes 32-byte-aligned storage and that ghs.size() is a multiple of 8 --
// TODO confirm: a remainder would read past the end.
// As in testSSE, the running maximum starts at 0, so at least one sum must
// be non-negative for the result to be correct.
int testAVX(const aligned_vector & ghs, const aligned_vector & lhs) {
int result[8] __attribute__((aligned(32))) = {0};
__m256i vresult = _mm256_set1_epi32(0);
__m256i v1, v2, vmax;
// Vertical pass: per-lane running maximum of the element-wise sums.
for (int k = 0; k < ghs.size(); k += 8) {
v1 = _mm256_load_si256((__m256i *) & ghs[ k]);
v2 = _mm256_load_si256((__m256i *) & lhs[k]);
vmax = _mm256_add_epi32(v1, v2);
vresult = _mm256_max_epi32(vresult, vmax);
}
// Horizontal pass: a full 256-bit store followed by a scalar search over
// the 8 lanes -- this store/reload is a fixed per-call overhead.
_mm256_store_si256((__m256i *) result, vresult);
int mymax = result[0];
for (int k = 1; k < 8; k++) {
if (result[k] > mymax) {
mymax = result[k];
}
}
return mymax;
}
// Scalar baseline: returns the largest element-wise sum lhs[k] + ghs[k]
// (0 if every sum is negative or the vectors are empty).
int testNormal(const aligned_vector & ghs, const aligned_vector & lhs) {
    int best = 0;
    for (int k = 0; k < ghs.size(); k++) {
        const int sum = lhs[k] + ghs[k];
        if (sum > best) {
            best = sum;
        }
    }
    return best;
}
用下面的代码测试所有这些函数:
// Benchmarks testNormal/testSSE/testAVX on 4096 random elements and prints
// per-variant timings plus speedup ratios relative to the scalar version.
// NOTE(review): randomNodeID is defined elsewhere -- presumably returns a
// non-negative value below its argument; verify.
void alignTestSSE() {
aligned_vector lhs;
aligned_vector ghs;
int mySize = 4096;
int FinalResult;
int nofTestCases = 1000;
// time2 is declared but never used.
double time, time1, time2, time3;
// lhs2/ghs2 receive copies of the data but are never read afterwards (dead).
vector<int> lhs2;
vector<int> ghs2;
lhs.resize(mySize);
ghs.resize(mySize);
lhs2.resize(mySize);
ghs2.resize(mySize);
// Fixed seed so every run benchmarks identical data.
srand(1);
for (int k = 0; k < mySize; k++) {
lhs[k] = randomNodeID(1000000);
lhs2[k] = lhs[k];
ghs[k] = randomNodeID(1000000);
ghs2[k] = ghs[k];
}
/* Warming UP */
// Untimed passes so caches and branch predictors settle before measuring.
for (int k = 0; k < nofTestCases; k++) {
FinalResult = testNormal(lhs, ghs);
}
for (int k = 0; k < nofTestCases; k++) {
FinalResult = testSSE(lhs, ghs);
}
for (int k = 0; k < nofTestCases; k++) {
FinalResult = testAVX(lhs, ghs);
}
cout << "===========================" << endl;
// Timed SSE run.
time = timestamp();
for (int k = 0; k < nofTestCases; k++) {
FinalResult = testSSE(lhs, ghs);
}
time = timestamp() - time;
time1 = time;
cout << "SSE took " << time << " s" << endl;
cout << "SSE Result: " << FinalResult << endl;
// Timed AVX run.
time = timestamp();
for (int k = 0; k < nofTestCases; k++) {
FinalResult = testAVX(lhs, ghs);
}
time = timestamp() - time;
time3 = time;
cout << "AVX took " << time << " s" << endl;
cout << "AVX Result: " << FinalResult << endl;
// Timed scalar run; `time` then holds the scalar timing used as baseline.
time = timestamp();
for (int k = 0; k < nofTestCases; k++) {
FinalResult = testNormal(lhs, ghs);
}
time = timestamp() - time;
cout << "Normal took " << time << " s" << endl;
cout << "Normal Result: " << FinalResult << endl;
// NOTE(review): speedup is a dimensionless ratio; the " s" suffix printed
// here is misleading.
cout << "SpeedUP SSE= " << time / time1 << " s" << endl;
cout << "SpeedUP AVX= " << time / time3 << " s" << endl;
cout << "===========================" << endl;
ghs.clear();
lhs.clear();
}
其中计时函数为：
// Wall-clock time in seconds since the epoch, with microsecond resolution.
inline double timestamp() {
    struct timeval now;
    gettimeofday(&now, NULL);
    return (double) now.tv_sec + now.tv_usec / 1000000.;
}
和
typedef vector<int, aligned_allocator<int, sizeof (int)> > aligned_vector;
是使用https://gist.github.com/donny-dont/1471329
的AlignedAllocator的对齐向量。我的机器是Intel i7-4771（Haswell），运行最新的Ubuntu 14.04 64位，gcc 4.8.2，一切都是最新的。我用-march=native -mtune=native -O3 -m64编译。
结果:
SSE took 0.000375986 s
SSE Result: 1982689
AVX took 0.000459909 s
AVX Result: 1982689
Normal took 0.00315714 s
Normal Result: 1982689
SpeedUP SSE= 8.39696 s
SpeedUP AVX= 6.8647 s
这表明完全相同的代码在AVX2上比SSE慢22%。是我做错了什么还是这是正常的行为?
我将您的代码转换为更普通的c++(普通数组,没有向量等),清理并在禁用自动向量化的情况下进行测试,并获得合理的结果:
#include <iostream>
using namespace std;
#include <sys/time.h>
#include <cstdlib>
#include <cstdint>
#include <immintrin.h>
// Current wall-clock time in seconds (microsecond resolution), used to
// bracket the benchmark loops below.
inline double timestamp() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    double seconds = double(tv.tv_sec);
    return seconds + tv.tv_usec / 1000000.;
}
int testSSE(const int32_t * ghs, const int32_t * lhs, size_t n) {
int result[4] __attribute__((aligned(16))) = {0};
__m128i vresult = _mm_set1_epi32(0);
__m128i v1, v2, vmax;
for (int k = 0; k < n; k += 4) {
v1 = _mm_load_si128((__m128i *) & lhs[k]);
v2 = _mm_load_si128((__m128i *) & ghs[k]);
vmax = _mm_add_epi32(v1, v2);
vresult = _mm_max_epi32(vresult, vmax);
}
_mm_store_si128((__m128i *) result, vresult);
int mymax = result[0];
for (int k = 1; k < 4; k++) {
if (result[k] > mymax) {
mymax = result[k];
}
}
return mymax;
}
int testAVX(const int32_t * ghs, const int32_t * lhs, size_t n) {
int result[8] __attribute__((aligned(32))) = {0};
__m256i vresult = _mm256_set1_epi32(0);
__m256i v1, v2, vmax;
for (int k = 0; k < n; k += 8) {
v1 = _mm256_load_si256((__m256i *) & ghs[k]);
v2 = _mm256_load_si256((__m256i *) & lhs[k]);
vmax = _mm256_add_epi32(v1, v2);
vresult = _mm256_max_epi32(vresult, vmax);
}
_mm256_store_si256((__m256i *) result, vresult);
int mymax = result[0];
for (int k = 1; k < 8; k++) {
if (result[k] > mymax) {
mymax = result[k];
}
}
return mymax;
}
// Scalar reference: returns the maximum of lhs[k] + ghs[k] over k in [0, n).
// The running maximum starts at 0, so the result is correct only when at
// least one sum is non-negative (matching the SIMD variants' contract).
int testNormal(const int32_t * ghs, const int32_t * lhs, size_t n) {
    int max = 0;
    // size_t counter fixes the original signed/unsigned comparison with n;
    // tempMax is scoped to the loop where it is used.
    for (size_t k = 0; k < n; k++) {
        int tempMax = lhs[k] + ghs[k];
        if (max < tempMax) {
            max = tempMax;
        }
    }
    return max;
}
// Benchmarks the three variants on 4096 random ints: untimed warm-up passes,
// then one timed pass per variant; prints timings, results, and speedups.
// NOTE(review): arc4random is a BSD/libbsd API -- not available with plain
// glibc before 2.36; verify the target platform.
void alignTestSSE() {
int n = 4096;
int normalResult, sseResult, avxResult;
int nofTestCases = 1000;
double time, normalTime, sseTime, avxTime;
// Stack arrays, over-aligned to 32 bytes so both the SSE (16-byte) and
// AVX (32-byte) aligned loads are valid.
int lhs[n] __attribute__ ((aligned(32)));
int ghs[n] __attribute__ ((aligned(32)));
for (int k = 0; k < n; k++) {
lhs[k] = arc4random();
ghs[k] = arc4random();
}
/* Warming UP */
// Untimed passes so caches and branch predictors settle before measuring.
for (int k = 0; k < nofTestCases; k++) {
normalResult = testNormal(lhs, ghs, n);
}
for (int k = 0; k < nofTestCases; k++) {
sseResult = testSSE(lhs, ghs, n);
}
for (int k = 0; k < nofTestCases; k++) {
avxResult = testAVX(lhs, ghs, n);
}
// Timed scalar run (baseline for the speedup ratios below).
time = timestamp();
for (int k = 0; k < nofTestCases; k++) {
normalResult = testNormal(lhs, ghs, n);
}
normalTime = timestamp() - time;
// Timed SSE run.
time = timestamp();
for (int k = 0; k < nofTestCases; k++) {
sseResult = testSSE(lhs, ghs, n);
}
sseTime = timestamp() - time;
// Timed AVX run.
time = timestamp();
for (int k = 0; k < nofTestCases; k++) {
avxResult = testAVX(lhs, ghs, n);
}
avxTime = timestamp() - time;
// All three printed results should be identical -- a cheap sanity check
// that the SIMD variants agree with the scalar one.
cout << "===========================" << endl;
cout << "Normal took " << normalTime << " s" << endl;
cout << "Normal Result: " << normalResult << endl;
cout << "SSE took " << sseTime << " s" << endl;
cout << "SSE Result: " << sseResult << endl;
cout << "AVX took " << avxTime << " s" << endl;
cout << "AVX Result: " << avxResult << endl;
cout << "SpeedUP SSE= " << normalTime / sseTime << endl;
cout << "SpeedUP AVX= " << normalTime / avxTime << endl;
cout << "===========================" << endl;
}
// Entry point: runs the scalar/SSE/AVX benchmark once.
int main()
{
alignTestSSE();
return 0;
}
测试:$ clang++ -Wall -mavx2 -O3 -fno-vectorize SO_avx.cpp && ./a.out
===========================
Normal took 0.00324106 s
Normal Result: 2143749391
SSE took 0.000527859 s
SSE Result: 2143749391
AVX took 0.000221968 s
AVX Result: 2143749391
SpeedUP SSE= 6.14002
SpeedUP AVX= 14.6015
===========================
我建议您在启用-fno-vectorize（如果使用g++，则为-fno-tree-vectorize）的情况下尝试上面的代码，看看是否得到类似的结果。如果是，那么您可以逐步回退到您的原始代码，找出不一致的来源。
在我的机器(core i7-4900M)上,基于Paul R的更新代码,使用g++ 4.8.2,使用100,000次迭代而不是1000次,我有以下结果:
g++ -Wall -mavx2 -O3 -std=c++11 test_avx.cpp && ./a.exe
SSE took 508,029 us
AVX took 1,308,075 us
Normal took 297,017 us
g++ -Wall -mavx2 -O3 -std=c++11 -fno-tree-vectorize test_avx.cpp && ./a.exe
SSE took 509,029 us
AVX took 1,307,075 us
Normal took 3,436,197 us
GCC在优化"Normal"代码方面做得非常出色。然而,"AVX"代码的缓慢性能可以用下面的行来解释,它需要一个完整的256位存储(哎哟!),然后是对8个整数的最大搜索。
_mm256_store_si256((__m256i *) result, vresult);
int mymax = result[0];
for (int k = 1; k < 8; k++) {
if (result[k] > mymax) {
mymax = result[k];
}
}
return mymax;
对于最大值8,最好继续使用AVX intrinsic。我可以提出以下修改
v1 = _mm256_permute2x128_si256(vresult,vresult,1); // from ABCD-EFGH to ????-ABCD
vresult = _mm256_max_epi32(vresult, v1);
v1 = _mm256_permute4x64_epi64(vresult,1); // from ????-ABCD to ????-??AB
vresult = _mm256_max_epi32(vresult, v1);
v1 = _mm256_shuffle_epi32(vresult,1); // from ????-???AB to ????-???A
vresult = _mm256_max_epi32(vresult, v1);
// no _mm256_extract_epi32 => need extra step
__m128i vres128 = _mm256_extracti128_si256(vresult,0);
return _mm_extract_epi32(vres128,0);
为了公平的比较,我也更新了SSE代码,我有:
SSE took 483,028 us
AVX took 258,015 us
Normal took 307,017 us
AVX时间减少到原来的约1/5（约快5倍）！
手动循环展开可以加快SSE/AVX代码。
我的i5-5300U上的原始版本:
Normal took 0.347 s
Normal Result: 2146591543
AVX took 0.409 s
AVX Result: 2146591543
SpeedUP AVX= 0.848411
手动循环展开后:
Normal took 0.375 s
Normal Result: 2146591543
AVX took 0.297 s
AVX Result: 2146591543
SpeedUP AVX= 1.26263
相关文章:
- 为什么 SDL 在 Mac 上比 Linux 慢得多?
- R 中的算术在数字上比整数更快。这是怎么回事?
- 在VC2015U3上,std::regex比boost::regex慢得多
- 可以在 macOS 上启用的最低支持的 SSE 标志是什么?
- 在 Linux 上,目标文件比在 macOS 或 Windows 上大 2.5 倍
- 为什么我的程序在 1 个线程上运行得比在 8 个线程上运行得更快.C++
- 在 GPU 上计算积分图像真的比在 CPU 上更快吗?
- 特征:矩阵行的计算规范比在向量上迭代计算它们慢
- SSE矢量操作在双型型上
- 为什么泄漏内存比在动态数组上执行 delete[] 慢
- 为什么 tanh 在我的机器上比 exp 快?
- 为什么 Windows udp 接收套接字上的超时总是比 SO_RCVTIMEO 设置的长 500 毫秒
- g++在多个文件上比使用谷歌模拟的单片单个文件慢得多
- 为什么我的SSE代码比本机C++代码慢
- AVX,SSE总和比gcc自动矢量化慢
- 为什么我的代码在Linux上比在RTOS上更快
- 二进制文件在Windows上比在Linux上要小
- 可执行文件在Wine上比在Windows上运行得快——为什么?
- 为什么 Eigen's Cholesky 分解在 Linux 上比在 Windows 上快得多?
- AVX2在Haswell上比SSE慢