优化 32 位架构上的可移植 128 位整数移位

Optimizing portable 128-bit integer shifts on a 32-bit architecture

本文关键字:可移植 整数 优化      更新时间:2023-10-16

在业余时间,我一直在开发一个实用程序库,其中包括对有符号/无符号 128 位整数的支持。在某些情况下,该库使用 CPU 调度来利用 SIMD 指令,但仍需要一个可移植的回退实现,以便它可以在其他任何平台上运行。最近,我实现了 128 位移位的可移植回退。它工作正常并且运行得相当快,但没有我希望的那么快,尤其是在 32 位架构上。

以下是具有所有相关类型和功能的剥离版本(为完整起见,包括 64 位版本):

// Fixed-width integer aliases.
using UInt32 = uint32_t;
using Int32 = int32_t;
using UInt64 = uint64_t;
using Int64 = int64_t;

// Yields a mask with every bit set (0xFFFFFFFF) when value is non-zero,
// and 0 when value is zero.
UInt32 AllOrNothingMask32(Int32 value)
{
    // 1 for a non-zero input, 0 otherwise.
    const UInt32 nonZero = static_cast<UInt32>(value != 0);
    // Unsigned wrap-around turns 0 - 1 into 0xFFFFFFFF; 0 - 0 stays 0.
    return UInt32{0} - nonZero;
}
/// Portable 128-bit unsigned integer built from native machine words.
///
/// Storage is selected at compile time through the CPU_TYPE / CPU_SIZE
/// configuration macros (defined outside this block): four 32-bit words on
/// 32-bit targets, two 64-bit words on 64-bit targets. The members are
/// declared in the order that matches the target's endianness.
struct alignas(16) UInt128
{
    // Ensure the layout matches the architecture.
    // LE = little endian
    // BE = big endian
#if CPU_TYPE == CPU_LE32
    UInt32 mLow;
    UInt32 mLowMid;
    UInt32 mHighMid;
    UInt32 mHigh;
#elif CPU_TYPE == CPU_BE32
    UInt32 mHigh;
    UInt32 mHighMid;
    UInt32 mLowMid;
    UInt32 mLow;
#elif CPU_TYPE == CPU_LE64
    UInt64 mLow;
    UInt64 mHigh;
#elif CPU_TYPE == CPU_BE64
    UInt64 mHigh;
    UInt64 mLow;
#endif

    /// Default construction performs no initialization of the words.
    UInt128() = default;
    UInt128& operator=(const UInt128& other) = default;

    /// Builds the value from four 32-bit words, most significant first.
    UInt128(UInt32 high, UInt32 highMid, UInt32 lowMid, UInt32 low) :
#if CPU_SIZE == CPU_32BIT
        mLow(low),
        mLowMid(lowMid),
        mHighMid(highMid),
        mHigh(high) { }
#elif CPU_SIZE == CPU_64BIT
        mLow((UInt64(lowMid) << 32) | low),
        mHigh((UInt64(high) << 32) | highMid) { }
#endif

    /// Builds the value from two 64-bit halves, most significant first.
    UInt128(UInt64 high, UInt64 low) :
#if CPU_SIZE == CPU_32BIT
        mLow(UInt32(low)),
        mLowMid(UInt32(low >> 32)),
        mHighMid(UInt32(high)),
        mHigh(UInt32(high >> 32)) { }
#elif CPU_SIZE == CPU_64BIT
        mLow(low),
        mHigh(high) { }
#endif

    /// Word-wise equality. Returns true when every word matches.
    // The word-size switch uses CPU_SIZE like the rest of the struct; CPU_TYPE
    // carries the endianness-specific values (CPU_LE32, CPU_BE32, ...).
    bool operator==(const UInt128& other) const noexcept
    {
#if CPU_SIZE == CPU_32BIT
        return mLow == other.mLow &&
               mLowMid == other.mLowMid &&
               mHighMid == other.mHighMid &&
               mHigh == other.mHigh;
#elif CPU_SIZE == CPU_64BIT
        return mLow == other.mLow &&
               mHigh == other.mHigh;
#endif
    }

    /// Shifts the value left in place and returns *this.
    /// Only the low 7 bits of `shift` are used (shift count modulo 128).
    UInt128& operator<<=(Int32 shift) noexcept
    {
        // Shift is modulo 128, effectively clamping it between 0-127.
        shift &= 0x7F;
#if CPU_SIZE == CPU_32BIT
        // Snapshot the words so each result is computed from the original value.
        auto low = mLow;
        auto lowMid = mLowMid;
        auto highMid = mHighMid;
        auto high = mHigh;
        if (shift == 0) {
            return *this;
        } else if (shift < 32) {
            // 1-31 bits: every word keeps its place and receives the bits
            // carried out of the word below it. rshift is 1-31 here.
            auto rshift = 32 - shift;
            mLow = (low << shift);
            mLowMid = (lowMid << shift) | (low >> rshift);
            mHighMid = (highMid << shift) | (lowMid >> rshift);
            mHigh = (high << shift) | (highMid >> rshift);
        } else if (shift < 64) {
            // 32-63 bits: move up one whole word, then shift by the remainder.
            auto lshift = (shift - 32);
            // When lshift == 0 the complementary count would be 32, which is
            // not a valid shift for a 32-bit word. It is masked to 0 and the
            // carried-in term is cleared through rshiftMask instead.
            auto rshift = (32 - lshift) & 0x1F;
            auto rshiftMask = AllOrNothingMask32(rshift);
            mLow = 0;
            mLowMid = (low << lshift);
            mHighMid = (lowMid << lshift) | ((low >> rshift) & rshiftMask);
            mHigh = (highMid << lshift) | ((lowMid >> rshift) & rshiftMask);
        } else if (shift < 96) {
            // 64-95 bits: move up two whole words, then shift by the remainder.
            auto lshift = (shift - 64);
            auto rshift = (32 - lshift) & 0x1F;
            auto rshiftMask = AllOrNothingMask32(rshift);
            mLow = 0;
            mLowMid = 0;
            mHighMid = (low << lshift);
            mHigh = (lowMid << lshift) | ((low >> rshift) & rshiftMask);
        } else {
            // 96-127 bits: only the lowest word contributes to the result.
            mLow = 0;
            mLowMid = 0;
            mHighMid = 0;
            mHigh = (low << (shift - 96));
        }
#elif CPU_SIZE == CPU_64BIT
        auto low = mLow,
             high = mHigh;
        if (shift == 0) {
            return *this;
        } else if (shift < 64) {
            // 1-63 bits: the high half receives the bits carried out of the low half.
            mLow = (low << shift);
            mHigh = (high << shift) | (low >> (64 - shift));
        } else {
            // 64-127 bits: only the low half contributes to the result.
            mLow = 0;
            mHigh = (low << (shift - 64));
        }
#endif
        return *this;
    }

    /// Shifts the value right in place (zero-filling) and returns *this.
    /// Only the low 7 bits of `shift` are used (shift count modulo 128).
    UInt128& operator>>=(Int32 shift) noexcept
    {
        // Shift is modulo 128, effectively clamping it between 0-127.
        shift &= 0x7F;
#if CPU_SIZE == CPU_32BIT
        // Snapshot the words so each result is computed from the original value.
        auto low = mLow,
             lowMid = mLowMid,
             highMid = mHighMid,
             high = mHigh;
        if (shift == 0) {
            return *this;
        } else if (shift < 32) {
            // 1-31 bits: every word keeps its place and receives the bits
            // carried out of the word above it. rshift is 1-31 here.
            auto rshift = 32 - shift;
            mLow = (low >> shift) | (lowMid << rshift);
            mLowMid = (lowMid >> shift) | (highMid << rshift);
            mHighMid = (highMid >> shift) | (high << rshift);
            mHigh = (high >> shift);
        } else if (shift < 64) {
            // 32-63 bits: move down one whole word, then shift by the remainder.
            auto rshift = (shift - 32);
            // When rshift == 0 the complementary count would be 32, which is
            // not a valid shift for a 32-bit word. It is masked to 0 and the
            // carried-in term is cleared through lshiftMask instead.
            auto lshift = (32 - rshift) & 0x1F;
            auto lshiftMask = AllOrNothingMask32(lshift);
            mLow = (lowMid >> rshift) | ((highMid << lshift) & lshiftMask);
            mLowMid = (highMid >> rshift) | ((high << lshift) & lshiftMask);
            mHighMid = (high >> rshift);
            mHigh = 0;
        } else if (shift < 96) {
            // 64-95 bits: move down two whole words, then shift by the remainder.
            auto rshift = (shift - 64);
            auto lshift = (32 - rshift) & 0x1F;
            auto lshiftMask = AllOrNothingMask32(lshift);
            mLow = (highMid >> rshift) | ((high << lshift) & lshiftMask);
            mLowMid = (high >> rshift);
            mHighMid = 0;
            mHigh = 0;
        } else {
            // 96-127 bits: only the highest word contributes to the result.
            mLow = (high >> (shift - 96));
            mLowMid = 0;
            mHighMid = 0;
            mHigh = 0;
        }
#elif CPU_SIZE == CPU_64BIT
        auto low = mLow,
             high = mHigh;
        if (shift == 0) {
            return *this;
        } else if (shift < 64) {
            // 1-63 bits: the low half receives the bits carried out of the high half.
            mLow = (low >> shift) | (high << (64 - shift));
            mHigh = (high >> shift);
        } else {
            // 64-127 bits: only the high half contributes to the result.
            mLow = (high >> (shift - 64));
            mHigh = 0;
        }
#endif
        return *this;
    }
};

相关的 32 位程序集输出相当长,因此除非要求,否则我将省略它。

当编译时不知道 shift 参数时,主要瓶颈显然是分支。可以做些什么来消除分支,或者就此而言,可以利用哪些便携式技巧来加速分支?

更新 1

添加了上面示例中缺少的复制赋值运算符。 对于那些感兴趣的人,这里是单元测试。我使用 Catch 因为它很简单。

// Left shift lookup table, from 0-127.
// Expected values for the left-shift test case: entry i is compared against
// UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100) after `<<= i`.
// Entry 0 is the unshifted base value; there are 128 entries (shift counts 0-127).
const UInt128 gLeftShiftLut128[] = {
UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100),
UInt128(0xFFDDBB9977553310, 0xEECCAA8866442200),
UInt128(0xFFBB7732EEAA6621, 0xDD995510CC884400),
UInt128(0xFF76EE65DD54CC43, 0xBB32AA2199108800),
UInt128(0xFEEDDCCBBAA99887, 0x7665544332211000),
UInt128(0xFDDBB9977553310E, 0xECCAA88664422000),
UInt128(0xFBB7732EEAA6621D, 0xD995510CC8844000),
UInt128(0xF76EE65DD54CC43B, 0xB32AA21991088000),
UInt128(0xEEDDCCBBAA998877, 0x6655443322110000),
UInt128(0xDDBB9977553310EE, 0xCCAA886644220000),
UInt128(0xBB7732EEAA6621DD, 0x995510CC88440000),
UInt128(0x76EE65DD54CC43BB, 0x32AA219910880000),
UInt128(0xEDDCCBBAA9988776, 0x6554433221100000),
UInt128(0xDBB9977553310EEC, 0xCAA8866442200000),
UInt128(0xB7732EEAA6621DD9, 0x95510CC884400000),
UInt128(0x6EE65DD54CC43BB3, 0x2AA2199108800000),
UInt128(0xDDCCBBAA99887766, 0x5544332211000000),
UInt128(0xBB9977553310EECC, 0xAA88664422000000),
UInt128(0x7732EEAA6621DD99, 0x5510CC8844000000),
UInt128(0xEE65DD54CC43BB32, 0xAA21991088000000),
UInt128(0xDCCBBAA998877665, 0x5443322110000000),
UInt128(0xB9977553310EECCA, 0xA886644220000000),
UInt128(0x732EEAA6621DD995, 0x510CC88440000000),
UInt128(0xE65DD54CC43BB32A, 0xA219910880000000),
UInt128(0xCCBBAA9988776655, 0x4433221100000000),
UInt128(0x9977553310EECCAA, 0x8866442200000000),
UInt128(0x32EEAA6621DD9955, 0x10CC884400000000),
UInt128(0x65DD54CC43BB32AA, 0x2199108800000000),
UInt128(0xCBBAA99887766554, 0x4332211000000000),
UInt128(0x977553310EECCAA8, 0x8664422000000000),
UInt128(0x2EEAA6621DD99551, 0xCC8844000000000),
UInt128(0x5DD54CC43BB32AA2, 0x1991088000000000),
UInt128(0xBBAA998877665544, 0x3322110000000000),
UInt128(0x77553310EECCAA88, 0x6644220000000000),
UInt128(0xEEAA6621DD995510, 0xCC88440000000000),
UInt128(0xDD54CC43BB32AA21, 0x9910880000000000),
UInt128(0xBAA9988776655443, 0x3221100000000000),
UInt128(0x7553310EECCAA886, 0x6442200000000000),
UInt128(0xEAA6621DD995510C, 0xC884400000000000),
UInt128(0xD54CC43BB32AA219, 0x9108800000000000),
UInt128(0xAA99887766554433, 0x2211000000000000),
UInt128(0x553310EECCAA8866, 0x4422000000000000),
UInt128(0xAA6621DD995510CC, 0x8844000000000000),
UInt128(0x54CC43BB32AA2199, 0x1088000000000000),
UInt128(0xA998877665544332, 0x2110000000000000),
UInt128(0x53310EECCAA88664, 0x4220000000000000),
UInt128(0xA6621DD995510CC8, 0x8440000000000000),
UInt128(0x4CC43BB32AA21991, 0x880000000000000),
UInt128(0x9988776655443322, 0x1100000000000000),
UInt128(0x3310EECCAA886644, 0x2200000000000000),
UInt128(0x6621DD995510CC88, 0x4400000000000000),
UInt128(0xCC43BB32AA219910, 0x8800000000000000),
UInt128(0x9887766554433221, 0x1000000000000000),
UInt128(0x310EECCAA8866442, 0x2000000000000000),
UInt128(0x621DD995510CC884, 0x4000000000000000),
UInt128(0xC43BB32AA2199108, 0x8000000000000000),
UInt128(0x8877665544332211, 0x0),
UInt128(0x10EECCAA88664422, 0x0),
UInt128(0x21DD995510CC8844, 0x0),
UInt128(0x43BB32AA21991088, 0x0),
UInt128(0x8776655443322110, 0x0),
UInt128(0xEECCAA886644220 , 0x0),
UInt128(0x1DD995510CC88440, 0x0),
UInt128(0x3BB32AA219910880, 0x0),
UInt128(0x7766554433221100, 0x0),
UInt128(0xEECCAA8866442200, 0x0),
UInt128(0xDD995510CC884400, 0x0),
UInt128(0xBB32AA2199108800, 0x0),
UInt128(0x7665544332211000, 0x0),
UInt128(0xECCAA88664422000, 0x0),
UInt128(0xD995510CC8844000, 0x0),
UInt128(0xB32AA21991088000, 0x0),
UInt128(0x6655443322110000, 0x0),
UInt128(0xCCAA886644220000, 0x0),
UInt128(0x995510CC88440000, 0x0),
UInt128(0x32AA219910880000, 0x0),
UInt128(0x6554433221100000, 0x0),
UInt128(0xCAA8866442200000, 0x0),
UInt128(0x95510CC884400000, 0x0),
UInt128(0x2AA2199108800000, 0x0),
UInt128(0x5544332211000000, 0x0),
UInt128(0xAA88664422000000, 0x0),
UInt128(0x5510CC8844000000, 0x0),
UInt128(0xAA21991088000000, 0x0),
UInt128(0x5443322110000000, 0x0),
UInt128(0xA886644220000000, 0x0),
UInt128(0x510CC88440000000, 0x0),
UInt128(0xA219910880000000, 0x0),
UInt128(0x4433221100000000, 0x0),
UInt128(0x8866442200000000, 0x0),
UInt128(0x10CC884400000000, 0x0),
UInt128(0x2199108800000000, 0x0),
UInt128(0x4332211000000000, 0x0),
UInt128(0x8664422000000000, 0x0),
UInt128(0xCC8844000000000 , 0x0),
UInt128(0x1991088000000000, 0x0),
UInt128(0x3322110000000000, 0x0),
UInt128(0x6644220000000000, 0x0),
UInt128(0xCC88440000000000, 0x0),
UInt128(0x9910880000000000, 0x0),
UInt128(0x3221100000000000, 0x0),
UInt128(0x6442200000000000, 0x0),
UInt128(0xC884400000000000, 0x0),
UInt128(0x9108800000000000, 0x0),
UInt128(0x2211000000000000, 0x0),
UInt128(0x4422000000000000, 0x0),
UInt128(0x8844000000000000, 0x0),
UInt128(0x1088000000000000, 0x0),
UInt128(0x2110000000000000, 0x0),
UInt128(0x4220000000000000, 0x0),
UInt128(0x8440000000000000, 0x0),
UInt128(0x880000000000000 , 0x0),
UInt128(0x1100000000000000, 0x0),
UInt128(0x2200000000000000, 0x0),
UInt128(0x4400000000000000, 0x0),
UInt128(0x8800000000000000, 0x0),
UInt128(0x1000000000000000, 0x0),
UInt128(0x2000000000000000, 0x0),
UInt128(0x4000000000000000, 0x0),
UInt128(0x8000000000000000, 0x0),
UInt128(0x0, 0x0),
UInt128(0x0, 0x0),
UInt128(0x0, 0x0),
UInt128(0x0, 0x0),
UInt128(0x0, 0x0),
UInt128(0x0, 0x0),
UInt128(0x0, 0x0),
UInt128(0x0, 0x0)
};
// Right shift lookup table, from 0-127.
// Expected values for the right-shift test case: entry i is compared against
// UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100) after `>>= i`.
// Entry 0 is the unshifted base value; there are 128 entries (shift counts 0-127).
const UInt128 gRightShiftLut128[] = {
UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100),
UInt128(0x7FF76EE65DD54CC4, 0x3BB32AA219910880),
UInt128(0x3FFBB7732EEAA662, 0x1DD995510CC88440),
UInt128(0x1FFDDBB997755331, 0xEECCAA886644220),
UInt128(0xFFEEDDCCBBAA998, 0x8776655443322110),
UInt128(0x7FF76EE65DD54CC, 0x43BB32AA21991088),
UInt128(0x3FFBB7732EEAA66, 0x21DD995510CC8844),
UInt128(0x1FFDDBB99775533, 0x10EECCAA88664422),
UInt128(0xFFEEDDCCBBAA99, 0x8877665544332211),
UInt128(0x7FF76EE65DD54C, 0xC43BB32AA2199108),
UInt128(0x3FFBB7732EEAA6, 0x621DD995510CC884),
UInt128(0x1FFDDBB9977553, 0x310EECCAA8866442),
UInt128(0xFFEEDDCCBBAA9, 0x9887766554433221),
UInt128(0x7FF76EE65DD54, 0xCC43BB32AA219910),
UInt128(0x3FFBB7732EEAA, 0x6621DD995510CC88),
UInt128(0x1FFDDBB997755, 0x3310EECCAA886644),
UInt128(0xFFEEDDCCBBAA, 0x9988776655443322),
UInt128(0x7FF76EE65DD5, 0x4CC43BB32AA21991),
UInt128(0x3FFBB7732EEA, 0xA6621DD995510CC8),
UInt128(0x1FFDDBB99775, 0x53310EECCAA88664),
UInt128(0xFFEEDDCCBBA, 0xA998877665544332),
UInt128(0x7FF76EE65DD, 0x54CC43BB32AA2199),
UInt128(0x3FFBB7732EE, 0xAA6621DD995510CC),
UInt128(0x1FFDDBB9977, 0x553310EECCAA8866),
UInt128(0xFFEEDDCCBB, 0xAA99887766554433),
UInt128(0x7FF76EE65D, 0xD54CC43BB32AA219),
UInt128(0x3FFBB7732E, 0xEAA6621DD995510C),
UInt128(0x1FFDDBB997, 0x7553310EECCAA886),
UInt128(0xFFEEDDCCB, 0xBAA9988776655443),
UInt128(0x7FF76EE65, 0xDD54CC43BB32AA21),
UInt128(0x3FFBB7732, 0xEEAA6621DD995510),
UInt128(0x1FFDDBB99, 0x77553310EECCAA88),
UInt128(0xFFEEDDCC, 0xBBAA998877665544),
UInt128(0x7FF76EE6, 0x5DD54CC43BB32AA2),
UInt128(0x3FFBB773, 0x2EEAA6621DD99551),
UInt128(0x1FFDDBB9, 0x977553310EECCAA8),
UInt128(0xFFEEDDC, 0xCBBAA99887766554),
UInt128(0x7FF76EE, 0x65DD54CC43BB32AA),
UInt128(0x3FFBB77, 0x32EEAA6621DD9955),
UInt128(0x1FFDDBB, 0x9977553310EECCAA),
UInt128(0xFFEEDD, 0xCCBBAA9988776655),
UInt128(0x7FF76E, 0xE65DD54CC43BB32A),
UInt128(0x3FFBB7, 0x732EEAA6621DD995),
UInt128(0x1FFDDB, 0xB9977553310EECCA),
UInt128(0xFFEED, 0xDCCBBAA998877665),
UInt128(0x7FF76, 0xEE65DD54CC43BB32),
UInt128(0x3FFBB, 0x7732EEAA6621DD99),
UInt128(0x1FFDD, 0xBB9977553310EECC),
UInt128(0xFFEE, 0xDDCCBBAA99887766),
UInt128(0x7FF7, 0x6EE65DD54CC43BB3),
UInt128(0x3FFB, 0xB7732EEAA6621DD9),
UInt128(0x1FFD, 0xDBB9977553310EEC),
UInt128(0xFFE, 0xEDDCCBBAA9988776),
UInt128(0x7FF, 0x76EE65DD54CC43BB),
UInt128(0x3FF, 0xBB7732EEAA6621DD),
UInt128(0x1FF, 0xDDBB9977553310EE),
UInt128(0xFF, 0xEEDDCCBBAA998877),
UInt128(0x7F, 0xF76EE65DD54CC43B),
UInt128(0x3F, 0xFBB7732EEAA6621D),
UInt128(0x1F, 0xFDDBB9977553310E),
UInt128(0xF, 0xFEEDDCCBBAA99887),
UInt128(0x7, 0xFF76EE65DD54CC43),
UInt128(0x3, 0xFFBB7732EEAA6621),
UInt128(0x1, 0xFFDDBB9977553310),
UInt128(0x0, 0xFFEEDDCCBBAA9988),
UInt128(0x0, 0x7FF76EE65DD54CC4),
UInt128(0x0, 0x3FFBB7732EEAA662),
UInt128(0x0, 0x1FFDDBB997755331),
UInt128(0x0, 0xFFEEDDCCBBAA998),
UInt128(0x0, 0x7FF76EE65DD54CC),
UInt128(0x0, 0x3FFBB7732EEAA66),
UInt128(0x0, 0x1FFDDBB99775533),
UInt128(0x0, 0xFFEEDDCCBBAA99),
UInt128(0x0, 0x7FF76EE65DD54C),
UInt128(0x0, 0x3FFBB7732EEAA6),
UInt128(0x0, 0x1FFDDBB9977553),
UInt128(0x0, 0xFFEEDDCCBBAA9),
UInt128(0x0, 0x7FF76EE65DD54),
UInt128(0x0, 0x3FFBB7732EEAA),
UInt128(0x0, 0x1FFDDBB997755),
UInt128(0x0, 0xFFEEDDCCBBAA),
UInt128(0x0, 0x7FF76EE65DD5),
UInt128(0x0, 0x3FFBB7732EEA),
UInt128(0x0, 0x1FFDDBB99775),
UInt128(0x0, 0xFFEEDDCCBBA),
UInt128(0x0, 0x7FF76EE65DD),
UInt128(0x0, 0x3FFBB7732EE),
UInt128(0x0, 0x1FFDDBB9977),
UInt128(0x0, 0xFFEEDDCCBB),
UInt128(0x0, 0x7FF76EE65D),
UInt128(0x0, 0x3FFBB7732E),
UInt128(0x0, 0x1FFDDBB997),
UInt128(0x0, 0xFFEEDDCCB),
UInt128(0x0, 0x7FF76EE65),
UInt128(0x0, 0x3FFBB7732),
UInt128(0x0, 0x1FFDDBB99),
UInt128(0x0, 0xFFEEDDCC),
UInt128(0x0, 0x7FF76EE6),
UInt128(0x0, 0x3FFBB773),
UInt128(0x0, 0x1FFDDBB9),
UInt128(0x0, 0xFFEEDDC),
UInt128(0x0, 0x7FF76EE),
UInt128(0x0, 0x3FFBB77),
UInt128(0x0, 0x1FFDDBB),
UInt128(0x0, 0xFFEEDD),
UInt128(0x0, 0x7FF76E),
UInt128(0x0, 0x3FFBB7),
UInt128(0x0, 0x1FFDDB),
UInt128(0x0, 0xFFEED),
UInt128(0x0, 0x7FF76),
UInt128(0x0, 0x3FFBB),
UInt128(0x0, 0x1FFDD),
UInt128(0x0, 0xFFEE),
UInt128(0x0, 0x7FF7),
UInt128(0x0, 0x3FFB),
UInt128(0x0, 0x1FFD),
UInt128(0x0, 0xFFE),
UInt128(0x0, 0x7FF),
UInt128(0x0, 0x3FF),
UInt128(0x0, 0x1FF),
UInt128(0x0, 0xFF),
UInt128(0x0, 0x7F),
UInt128(0x0, 0x3F),
UInt128(0x0, 0x1F),
UInt128(0x0, 0xF),
UInt128(0x0, 0x7),
UInt128(0x0, 0x3),
UInt128(0x0, 0x1)
};
// Checks UInt128::operator<<= against gLeftShiftLut128 for every shift count
// from 0 to 127. The loop starts at 0 so that the shift == 0 early-return path
// is exercised too (entry 0 of the table is the unshifted base value).
TEST_CASE("UInt128 left shift produces correct results.") {
    auto base = UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100);
    for (auto i = 0; i <= 127; i++) {
        // Shift a fresh copy each iteration so results do not accumulate.
        auto sample = base;
        sample <<= i;
        INFO("i = " << i);
        REQUIRE(sample == gLeftShiftLut128[i]);
    }
}
// Checks UInt128::operator>>= against gRightShiftLut128 for every shift count
// from 0 to 127.
TEST_CASE("UInt128 right shift produces correct results.") {
    const UInt128 base(0xFFEEDDCCBBAA9988, 0x7766554433221100);
    for (int i = 0; i < 128; ++i) {
        // Work on a fresh copy so every iteration shifts the original value.
        UInt128 sample(base);
        sample >>= i;
        INFO("i = " << i);
        REQUIRE(sample == gRightShiftLut128[i]);
    }
}

我没有对它进行基准测试,但像这样的东西是无分支的:

// Left shift for the 32-bit layouts without conditional branches.
// The four words are first shifted by the sub-word bit count and placed in an
// eight-word window next to four zero words; the whole-word part of the shift
// then picks which four consecutive window words are copied back into *this.
inline
UInt128& UInt128::operator<<=(Int32 shift) noexcept
{
    // Bits 0-4 of the count: shift within a word.
    const auto bits = shift & 31;
    // The carry shift is applied as ">> 1 >> (31 - bits)" so that no single
    // shift ever uses a count of 32, even when bits == 0.
    const auto carry = 31 - bits;
    // Bits 5-6 of the count: number of whole words to move (0-3).
    const auto words = (shift >> 5) & 3;
    UInt32 window[8] = {
#if CPU_TYPE == CPU_LE32
        0, 0, 0, 0,
        (mLow << bits),
        (mLowMid << bits) | ((mLow >> 1) >> carry),
        (mHighMid << bits) | ((mLowMid >> 1) >> carry),
        (mHigh << bits) | ((mHighMid >> 1) >> carry)
#elif CPU_TYPE == CPU_BE32
        (mHigh << bits) | ((mHighMid >> 1) >> carry),
        (mHighMid << bits) | ((mLowMid >> 1) >> carry),
        (mLowMid << bits) | ((mLow >> 1) >> carry),
        (mLow << bits),
        0, 0, 0, 0
#endif
    };
    // Little endian: the zero padding sits in front, so step back into it.
    // Otherwise the padding follows the value, so step forward into it.
#if CPU_TYPE == CPU_LE32
    const auto first = 4 - words;
#else
    const auto first = words;
#endif
    memcpy(this, &window[first], 16);
    return *this;
}
// Right shift for the 32-bit layouts without conditional branches.
// The four words are first shifted by the sub-word bit count and placed in an
// eight-word window next to four zero words; the whole-word part of the shift
// then picks which four consecutive window words are copied back into *this.
inline
UInt128& UInt128::operator>>=(Int32 shift) noexcept
{
    // Bits 0-4 of the count: shift within a word.
    auto rshift = shift & 31;
    // The carry shift is applied as "<< (31 - rshift) << 1" so that no single
    // shift ever uses a count of 32, even when rshift == 0.
    auto lshift = 31 - rshift;
    // Member names follow the UInt128 declaration (mLowMid / mHighMid).
    UInt32 parts[8] = {
#if CPU_TYPE == CPU_LE32
        (mLow >> rshift) | ((mLowMid << lshift) << 1),
        (mLowMid >> rshift) | ((mHighMid << lshift) << 1),
        (mHighMid >> rshift) | ((mHigh << lshift) << 1),
        (mHigh >> rshift),
        0, 0, 0, 0
#elif CPU_TYPE == CPU_BE32
        0, 0, 0, 0,
        (mHigh >> rshift),
        (mHighMid >> rshift) | ((mHigh << lshift) << 1),
        (mLowMid >> rshift) | ((mHighMid << lshift) << 1),
        (mLow >> rshift) | ((mLowMid << lshift) << 1)
#endif
    };
    // Bits 5-6 of the count select the whole-word offset into the window.
    // Big endian: the zero padding sits in front, so step back into it.
    // Otherwise the padding follows the value, so step forward into it.
    memcpy(this, &parts[
#if CPU_TYPE == CPU_BE32
        4 -
#endif
        (shift >> 5 & 3)], 16);
    return *this;
}

我认为使用数组更好地实现UInt128,其中字节序不是问题,例如,

alignas (16) uint32_t data[4];或:alignas (16) uint64_t data[2];

请注意,对于在堆上创建的对象,不能保证对齐;尽管某些 ABI 确实具有 16 字节的最小对齐方式。您可以用 alignof(std::max_align_t) 进行检查。否则,您将需要为 SIMD(例如 SSE)替换全局的 operator new 和 operator delete 函数。

对于 uint32_t 实现,您可以将移位拆分为"字"移位和"位"移位。此外,使用有符号的 Int32 作为移位计数是没有意义的......

// Left shift for a UInt128 stored as `uint32_t data[4]`, where data[0] is the
// least significant word. The count is split into a whole-word part, handled
// by the switch, and a sub-word bit part, applied to each word.
inline UInt128 &
UInt128::operator <<= (uint32_t shift) noexcept
{
    shift &= 0x7f;                        // count is taken modulo 128
    const auto wordShift = shift >> 5;    // whole words to move: shift / 32
    const auto bitShift = shift & 0x1f;   // remaining bits: shift % 32

    // Complementary right-shift count, reduced to 0-31 so that it is never a
    // full-width shift when bitShift == 0.
    const uint32_t carryShift = (32 - bitShift) & (32 - 1);
    // 0xffffffff when bitShift != 0, otherwise 0x0: drops the carried-in bits
    // when there is no sub-word shift.
    const uint32_t carryMask = uint32_t{0} - static_cast<uint32_t>(bitShift != 0);

    // Snapshot the words so every result is built from the original value.
    const uint32_t w0 = data[0];
    const uint32_t w1 = data[1];
    const uint32_t w2 = data[2];
    const uint32_t w3 = data[3];

    // One result word: `upper` shifted left, plus the bits carried in from `lower`.
    const auto merge = [=](uint32_t upper, uint32_t lower) -> uint32_t {
        return (upper << bitShift) | ((lower >> carryShift) & carryMask);
    };

    switch (wordShift)
    {
    case 0:
        data[3] = merge(w3, w2);
        data[2] = merge(w2, w1);
        data[1] = merge(w1, w0);
        data[0] = (w0 << bitShift);
        break;
    case 1:
        data[3] = merge(w2, w1);
        data[2] = merge(w1, w0);
        data[1] = (w0 << bitShift);
        data[0] = 0;
        break;
    case 2:
        data[3] = merge(w1, w0);
        data[2] = (w0 << bitShift);
        data[1] = 0;
        data[0] = 0;
        break;
    case 3:
        data[3] = (w0 << bitShift);
        data[2] = 0;
        data[1] = 0;
        data[0] = 0;
        break;
    }
    return *this;
}

我很确定我在这里的数据索引是正确的。如果shift是编译时常量,编译器应该能够非常积极地优化此代码。

右移就留给你来实现。需要注意的是,右移时必须按从低到高的顺序更新 data 中的各个字,以免在读取某个字之前就将其覆盖。除此之外,它基本上只是交换 shl 和 shr 角色的简单练习。uint64_t 数据的版本应该非常简单。