From 4c17aeb1805213760b0d0a17d49324b7073f510b Mon Sep 17 00:00:00 2001 From: Patrick Niklaus Date: Wed, 11 Jun 2014 01:28:31 +0200 Subject: [PATCH] Removed SSE code in RangeTable to rely on compiler optimization --- DataStructures/RangeTable.h | 67 ++++--------------------------------- 1 file changed, 7 insertions(+), 60 deletions(-) diff --git a/DataStructures/RangeTable.h b/DataStructures/RangeTable.h index db94840a3..1345cfd44 100644 --- a/DataStructures/RangeTable.h +++ b/DataStructures/RangeTable.h @@ -6,11 +6,6 @@ #include #include -#if defined(__GNUC__) && defined(__SSE2__) -#define OSRM_USE_SSE -#include -#endif - #include "SharedMemoryFactory.h" #include "SharedMemoryVectorWrapper.h" @@ -40,16 +35,8 @@ template class RangeTable { public: - union BlockT - { - unsigned char uint8_blocks[BLOCK_SIZE]; -#ifdef OSRM_USE_SSE - static_assert(BLOCK_SIZE % 16 == 0, - "If SSE instructions are enabled, only multiples of 16 are supported as BLOCK_SIZE"); - __m128i uint128_blocks[BLOCK_SIZE/16]; -#endif - }; + typedef std::array BlockT; typedef typename ShM::vector BlockContainerT; typedef typename ShM::vector OffsetContainerT; typedef decltype(boost::irange(0u,0u)) RangeT; @@ -98,7 +85,7 @@ public: } else { - block.uint8_blocks[block_idx - 1] = last_length; + block[block_idx - 1] = last_length; block_sum += last_length; } @@ -134,7 +121,7 @@ public: while (block_idx != 0) { - block.uint8_blocks[block_idx - 1] = last_length; + block[block_idx - 1] = last_length; last_length = 0; block_idx = (block_idx + 1) % (BLOCK_SIZE + 1); } @@ -167,7 +154,7 @@ public: if (internal_idx < BLOCK_SIZE) { // note internal_idx - 1 is the *current* index for uint8_blocks - end_idx = begin_idx + block.uint8_blocks[internal_idx]; + end_idx = begin_idx + block[internal_idx]; } else { @@ -191,54 +178,14 @@ private: unsigned sum_lengths; }; -#ifdef OSRM_USE_SSE -// For blocksize 16 we can use SSE instructions -// FIXME only implemented for non-shared memory -template<> -unsigned 
RangeTable<16>::PrefixSumAtIndex(int index, const BlockT& block) const -{ - union OffsetT - { - unsigned short u16[8]; - __m128i u128; - }; - OffsetT offsets; - - // converts lower 8 bytes to 8 shorts - offsets.u128 = _mm_unpacklo_epi8(block.uint128_blocks[0], _mm_set1_epi8(0)); - offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 2)); - if (index < 2) - return offsets.u16[index]; - offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 4)); - if (index < 4) - return offsets.u16[index]; - offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 8)); - - if (index < 8) - return offsets.u16[index]; - unsigned temp = offsets.u16[7]; - index -= 8; - - // converts upper 8 bytes to 8 shorts - offsets.u128 = _mm_unpackhi_epi8(block.uint128_blocks[0], _mm_set1_epi8(0)); - offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 2)); - if (index < 2) - return (temp + offsets.u16[index]); - offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 4)); - if (index < 4) - return (temp + offsets.u16[index]); - offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 8)); - - return (temp + offsets.u16[index]); -} -#endif - template unsigned RangeTable::PrefixSumAtIndex(int index, const BlockT& block) const { + // this loop looks inefficient, but a modern compiler + // will emit nice SIMD here, at least for sensible block sizes. (I checked.) unsigned sum = 0; for (int i = 0; i <= index; i++) - sum += block.uint8_blocks[i]; + sum += block[i]; return sum; }