Removed SSE code in RangeTable to rely on compiler optimization
This commit is contained in:
parent
ef60ae652c
commit
4c17aeb180
@ -6,11 +6,6 @@
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
|
||||
#if defined(__GNUC__) && defined(__SSE2__)
|
||||
#define OSRM_USE_SSE
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include "SharedMemoryFactory.h"
|
||||
#include "SharedMemoryVectorWrapper.h"
|
||||
|
||||
@ -40,16 +35,8 @@ template<unsigned BLOCK_SIZE, bool USE_SHARED_MEMORY>
|
||||
class RangeTable
|
||||
{
|
||||
public:
|
||||
// One block of BLOCK_SIZE single-byte entries. When OSRM_USE_SSE is defined the
// same storage is also exposed as an array of 128-bit SSE values, so the bytes
// can be handed to SSE intrinsics in chunks of 16.
union BlockT
|
||||
{
|
||||
// byte-wise view: one unsigned char per entry
unsigned char uint8_blocks[BLOCK_SIZE];
|
||||
#ifdef OSRM_USE_SSE
|
||||
// the 128-bit view below only covers the block when BLOCK_SIZE is a multiple of 16
static_assert(BLOCK_SIZE % 16 == 0,
|
||||
"If SSE instructions are enabled, only multiples of 16 are supported as BLOCK_SIZE");
|
||||
// SSE view: BLOCK_SIZE/16 vectors of 16 bytes each, overlaying uint8_blocks
__m128i uint128_blocks[BLOCK_SIZE/16];
|
||||
#endif
|
||||
};
|
||||
|
||||
typedef std::array<unsigned char, BLOCK_SIZE> BlockT;
|
||||
typedef typename ShM<BlockT, USE_SHARED_MEMORY>::vector BlockContainerT;
|
||||
typedef typename ShM<unsigned, USE_SHARED_MEMORY>::vector OffsetContainerT;
|
||||
typedef decltype(boost::irange(0u,0u)) RangeT;
|
||||
@ -98,7 +85,7 @@ public:
|
||||
}
|
||||
else
|
||||
{
|
||||
block.uint8_blocks[block_idx - 1] = last_length;
|
||||
block[block_idx - 1] = last_length;
|
||||
block_sum += last_length;
|
||||
}
|
||||
|
||||
@ -134,7 +121,7 @@ public:
|
||||
|
||||
while (block_idx != 0)
|
||||
{
|
||||
block.uint8_blocks[block_idx - 1] = last_length;
|
||||
block[block_idx - 1] = last_length;
|
||||
last_length = 0;
|
||||
block_idx = (block_idx + 1) % (BLOCK_SIZE + 1);
|
||||
}
|
||||
@ -167,7 +154,7 @@ public:
|
||||
if (internal_idx < BLOCK_SIZE)
|
||||
{
|
||||
// note internal_idx - 1 is the *current* index for uint8_blocks
|
||||
end_idx = begin_idx + block.uint8_blocks[internal_idx];
|
||||
end_idx = begin_idx + block[internal_idx];
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -191,54 +178,14 @@ private:
|
||||
unsigned sum_lengths;
|
||||
};
|
||||
|
||||
#ifdef OSRM_USE_SSE
|
||||
// For blocksize 16 we can use SSE instructions
|
||||
// FIXME only implemented for non-shared memory
|
||||
// SSE specialization of PrefixSumAtIndex for 16-entry blocks.
// Returns the sum of the byte entries 0..index (inclusive) of `block`.
// The 16 bytes are processed as two halves of 8; each half is widened to
// 16-bit lanes and prefix-summed with a shift-and-add scan (shift by 1, 2 and
// then 4 lanes), returning as soon as the requested lane is final.
// NOTE(review): there is no bounds check; this presumably expects
// 0 <= index < 16 -- confirm against callers.
// NOTE(review): results are read back through a union member other than the
// one last written; this code is only compiled under __GNUC__ -- confirm that
// relying on the compiler's union type-punning behavior is intended.
template<>
|
||||
unsigned RangeTable<16>::PrefixSumAtIndex(int index, const BlockT& block) const
|
||||
{
|
||||
// scratch register that can be read either as one 128-bit vector or as
// eight 16-bit lanes
union OffsetT
|
||||
{
|
||||
unsigned short u16[8];
|
||||
__m128i u128;
|
||||
};
|
||||
OffsetT offsets;
|
||||
|
|
||||
// converts lower 8 bytes to 8 shorts
// (interleaving with a zero vector zero-extends each byte to 16 bits)
|
||||
offsets.u128 = _mm_unpacklo_epi8(block.uint128_blocks[0], _mm_set1_epi8(0));
|
||||
// scan step 1: add the vector shifted up by 2 bytes (one lane);
// lanes 0 and 1 now hold their final prefix sums
offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 2));
|
||||
if (index < 2)
|
||||
return offsets.u16[index];
|
||||
// scan step 2: add the vector shifted up by 4 bytes (two lanes);
// lanes 0..3 are now final
offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 4));
|
||||
if (index < 4)
|
||||
return offsets.u16[index];
|
||||
// scan step 3: add the vector shifted up by 8 bytes (four lanes);
// all 8 lanes are now final
offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 8));
|
||||
|
|
||||
if (index < 8)
|
||||
return offsets.u16[index];
|
||||
// lane 7 is the total of the lower 8 bytes; it is carried into the upper half
unsigned temp = offsets.u16[7];
|
||||
// re-base index so it addresses a lane within the upper half
index -= 8;
|
||||
|
|
||||
// converts upper 8 bytes to 8 shorts
|
||||
offsets.u128 = _mm_unpackhi_epi8(block.uint128_blocks[0], _mm_set1_epi8(0));
|
||||
// same three-step scan as above, now on the upper half
offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 2));
|
||||
if (index < 2)
|
||||
return (temp + offsets.u16[index]);
|
||||
offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 4));
|
||||
if (index < 4)
|
||||
return (temp + offsets.u16[index]);
|
||||
offsets.u128 = _mm_add_epi16(offsets.u128, _mm_slli_si128(offsets.u128, 8));
|
||||
|
|
||||
return (temp + offsets.u16[index]);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Generic PrefixSumAtIndex: returns the sum of the entries of `block` at
// positions 0 through `index`, inclusive (0 if index is negative).
// NOTE(review): no upper bound check on index; presumably callers pass
// index < BLOCK_SIZE -- confirm.
template<unsigned BLOCK_SIZE, bool USE_SHARED_MEMORY>
|
||||
unsigned RangeTable<BLOCK_SIZE, USE_SHARED_MEMORY>::PrefixSumAtIndex(int index, const BlockT& block) const
|
||||
{
|
||||
// this loop looks inefficient, but a modern compiler
|
||||
// will emit nice SIMD here, at least for sensible block sizes. (I checked.)
|
||||
unsigned sum = 0;
|
||||
for (int i = 0; i <= index; i++)
|
||||
// NOTE(review): this excerpt looks like a rendered diff with its +/- markers
// stripped; the two accumulation statements below appear to be the removed
// (union member) and added (direct indexing) versions of the same line --
// confirm against the repository before treating both as live code.
sum += block.uint8_blocks[i];
|
||||
sum += block[i];
|
||||
|
|
||||
return sum;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user