Removed SSE code in RangeTable to rely on compiler optimization

2014-06-11 01:28:31 +02:00 · 2014-06-11 01:28:31 +02:00 · 4c17aeb180
commit 4c17aeb180
parent ef60ae652c
1 changed files with 7 additions and 60 deletions
--- a/DataStructures/RangeTable.h
+++ b/DataStructures/RangeTable.h
@ -6,11 +6,6 @@
 #include <fstream>
 #include <vector>
 #if defined(__GNUC__) && defined(__SSE2__)
 #define OSRM_USE_SSE
 #include <xmmintrin.h>
 #endif
 #include "SharedMemoryFactory.h"
 #include "SharedMemoryVectorWrapper.h"
@ -40,16 +35,8 @@ template<unsigned BLOCK_SIZE, bool USE_SHARED_MEMORY>
 class RangeTable
 {
 public:
    // Storage for a single block: BLOCK_SIZE one-byte entries.
    // When OSRM_USE_SSE is defined, the same bytes are additionally exposed as
    // an array of 128-bit SSE values, which is why BLOCK_SIZE must then be a
    // multiple of 16 (enforced at compile time by the static_assert below).
    union BlockT
    {
        // byte-wise view of the block, one entry per slot
        unsigned char uint8_blocks[BLOCK_SIZE];
 #ifdef OSRM_USE_SSE
        static_assert(BLOCK_SIZE % 16 == 0,
        "If SSE instructions are enabled, only multiples of 16 are supported as BLOCK_SIZE");
        // vector view of the same storage, 16 bytes per element
        __m128i uint128_blocks[BLOCK_SIZE/16];
 #endif
    };
    typedef std::array<unsigned char, BLOCK_SIZE> BlockT;
    typedef typename ShM<BlockT, USE_SHARED_MEMORY>::vector   BlockContainerT;
    typedef typename ShM<unsigned, USE_SHARED_MEMORY>::vector OffsetContainerT;
    typedef decltype(boost::irange(0u,0u))                    RangeT;
@ -98,7 +85,7 @@ public:
            }
            else
            {
-                block.uint8_blocks[block_idx - 1] = last_length;
+                block[block_idx - 1] = last_length;
                block_sum += last_length;
            }
@ -134,7 +121,7 @@ public:
        while (block_idx != 0)
        {
-            block.uint8_blocks[block_idx - 1] = last_length;
+            block[block_idx - 1] = last_length;
            last_length = 0;
            block_idx = (block_idx + 1) % (BLOCK_SIZE + 1);
        }
@ -167,7 +154,7 @@ public:
        if (internal_idx < BLOCK_SIZE)
        {
            // note internal_idx - 1 is the *current* index for uint8_blocks
-            end_idx = begin_idx + block.uint8_blocks[internal_idx];
+            end_idx = begin_idx + block[internal_idx];
        }
        else
        {
@ -191,54 +178,14 @@ private:
    unsigned sum_lengths;
 };
 #ifdef OSRM_USE_SSE
 // For blocksize 16 we can use SSE instructions
 // FIXME only implemented for non-shared memory
 //
 // Returns the inclusive prefix sum block[0] + ... + block[index] of a
 // 16-byte block. The bytes are widened to 16-bit lanes, eight at a time, and
 // summed with a log-step scan (shift by 1, 2, then 4 lanes and add), returning
 // as soon as the lane for `index` holds its final value.
 //
 // NOTE(review): `index` is not range-checked; the code appears to assume
 // 0 <= index < 16 -- confirm against callers.
 // NOTE(review): RangeTable is declared with two template parameters in this
 // file; `RangeTable<16>` presumably relies on a default for
 // USE_SHARED_MEMORY declared elsewhere -- confirm.
 template<>
 unsigned RangeTable<16>::PrefixSumAtIndex(int index, const BlockT& block) const
 {
    // Lets the eight 16-bit lanes of an SSE value be read individually.
    // The value is written through u128 and read back through u16.
    union OffsetT
    {
        unsigned short  u16[8];
        __m128i         u128;
    };
    OffsetT offsets;
    // converts lower 8 bytes to 8 shorts
    offsets.u128 = _mm_unpacklo_epi8(block.uint128_blocks[0], _mm_set1_epi8(0));
    // scan step 1: add the lane one position below (byte shift of 2 == one lane);
    // lanes 0 and 1 now hold their final prefix sums
    offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 2));
    if (index < 2)
        return offsets.u16[index];
    // scan step 2: add the lane two positions below; lanes 0..3 are now final
    offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 4));
    if (index < 4)
        return offsets.u16[index];
    // scan step 3: add the lane four positions below; all eight lanes are final
    offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 8));
    if (index < 8)
        return offsets.u16[index];
    // total of the lower eight bytes, carried over into the upper half
    unsigned temp = offsets.u16[7];
    // re-base index so it addresses a lane within the upper half
    index -= 8;
    // converts upper 8 bytes to 8 shorts
    offsets.u128 = _mm_unpackhi_epi8(block.uint128_blocks[0], _mm_set1_epi8(0));
    // same three scan steps as above, applied to the upper eight lanes
    offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 2));
    if (index < 2)
        return (temp + offsets.u16[index]);
    offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 4));
    if (index < 4)
        return (temp + offsets.u16[index]);
    offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 8));
    // no further check here: any remaining index is read directly as a lane
    return (temp + offsets.u16[index]);
 }
 #endif
 // Returns the inclusive prefix sum block[0] + ... + block[index].
 // A negative index yields 0 because the loop body never runs.
 // NOTE(review): index is not checked against BLOCK_SIZE here; callers
 // presumably guarantee index < BLOCK_SIZE -- confirm.
 template<unsigned BLOCK_SIZE, bool USE_SHARED_MEMORY>
 unsigned RangeTable<BLOCK_SIZE, USE_SHARED_MEMORY>::PrefixSumAtIndex(int index, const BlockT& block) const
 {
    // this loop looks inefficient, but a modern compiler
    // will emit nice SIMD here, at least for sensible block sizes. (I checked.)
    unsigned sum = 0;
    for (int i = 0; i <= index; i++)
-        sum += block.uint8_blocks[i];
+        sum += block[i];
    return sum;
 }