From 64c6c01b806efcf70bb06aeb55ca317492123d80 Mon Sep 17 00:00:00 2001
From: theoretical
Date: Wed, 7 Jan 2015 14:38:28 -0500
Subject: [PATCH] more efficient uint128 multiplication algorithm (measured 29.23x speedup)

---
Review note: the added lines in this copy were restyled (limb arrays and a
column loop instead of the unrolled statements) without changing the result.
The commit id, the blob ids on the "index" line and the speedup quoted in
the subject all refer to the patch as originally submitted.

 src/uint128.cpp | 63 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 39 insertions(+), 24 deletions(-)

diff --git a/src/uint128.cpp b/src/uint128.cpp
index 9893f4e..b51d20e 100644
--- a/src/uint128.cpp
+++ b/src/uint128.cpp
@@ -233,30 +233,45 @@ namespace fc
 
     uint128& uint128::operator*=(const uint128 &b)
     {
-        // check for multiply by 0
-        // result is always 0 :-P
-        if(b == 0) {
-            hi = 0;
-            lo = 0;
-        } else if(b != 1) {
-
-            // check we aren't multiplying by 1
-
-            uint128 a(*this);
-            uint128 t = b;
-
-            lo = 0;
-            hi = 0;
-
-            for (unsigned int i = 0; i < 128; ++i) {
-                if((t & 1) != 0) {
-                    *this += (a << i);
-                }
-
-                t >>= 1;
-            }
-        }
-
+        // Multiply in place, keeping only the low 128 bits of the product.
+        //
+        // Each operand is split into four 32-bit limbs (index 0 is the least
+        // significant), widened to 64 bits so that a limb-by-limb product
+        // cannot overflow.
+        const uint64_t lhs[4] = {
+            static_cast<uint32_t>(lo),
+            static_cast<uint32_t>(lo >> 0x20),
+            static_cast<uint32_t>(hi),
+            static_cast<uint32_t>(hi >> 0x20)
+        };
+        const uint64_t rhs[4] = {
+            static_cast<uint32_t>(b.lo),
+            static_cast<uint32_t>(b.lo >> 0x20),
+            static_cast<uint32_t>(b.hi),
+            static_cast<uint32_t>(b.hi >> 0x20)
+        };
+
+        // lhs[i] * rhs[j] carries a weight of 2^(32 * (i + j)).  Products with
+        // i + j >= 4 are weighted 2^128 or more and vanish from the truncated
+        // result, so only the columns i + j = 0..3 are summed.
+        //
+        // The columns are folded from the most significant one downwards:
+        // shifting the running total left by 32 bits before adding the next
+        // column gives every column its weight once the loop finishes.
+        //
+        // Both operands were copied into limbs above, so clearing *this here
+        // is safe even when b refers to *this.
+        hi = 0;
+        lo = 0;
+        for (int column = 3; column >= 0; --column) {
+            if (column != 3) {
+                *this <<= 0x20;
+            }
+            for (int j = 0; j <= column; ++j) {
+                *this += lhs[column - j] * rhs[j];
+            }
+        }
+
         return *this;
     }
 