Make the serialized bit-packing of vector<bool> match the in-memory layout of vector_view<bool>, so that the data can be mmapped directly.
This commit is contained in:
parent
b1791d1ab3
commit
d80318f8ea
@ -9,6 +9,7 @@
|
||||
#include "storage/shared_datatype.hpp"
|
||||
#include "storage/tar.hpp"
|
||||
|
||||
#include <boost/assert.hpp>
|
||||
#include <boost/function_output_iterator.hpp>
|
||||
#include <boost/iterator/function_input_iterator.hpp>
|
||||
|
||||
@ -30,22 +31,37 @@ namespace serialization
|
||||
namespace detail
|
||||
{
|
||||
// packBits: packs `count` consecutive bools taken from `data` into a single
// BlockT and returns it.
//
// This is a diff listing, so the signature and the packing loop each appear
// twice: once using `index` and once using `base_index`.
//  - `index` variant: shifts `value` left before OR-ing in each element, so
//    the first element ends up in the highest of the `count` used bits.
//  - `base_index` variant: stores data[base_index + bit] in bit position
//    `bit`, i.e. the first element lands in the least-significant bit. This is
//    the position vector_view<bool> tests with `static_cast<Word>(1) << offset`.
//
// Template parameters:
//   T      - container whose value_type is bool (enforced by static_assert).
//   BlockT - unsigned integral type that receives the bits.
// Parameters:
//   data              - container the bools are read from.
//   index/base_index  - position of the first bool to pack.
//   count             - number of bools to pack; asserted to be no larger than
//                       the number of bits in BlockT.
template <typename T, typename BlockT = unsigned char>
|
||||
inline BlockT packBits(const T &data, std::size_t index, std::size_t count)
|
||||
inline BlockT packBits(const T &data, std::size_t base_index, const std::size_t count)
|
||||
{
|
||||
static_assert(std::is_same<typename T::value_type, bool>::value, "value_type is not bool");
|
||||
static_assert(std::is_unsigned<BlockT>::value, "BlockT must be unsigned type");
|
||||
static_assert(std::is_integral<BlockT>::value, "BlockT must be an integral type");
|
||||
static_assert(CHAR_BIT == 8, "Non-8-bit bytes not supported, sorry!");
|
||||
// The requested number of bits must fit into one block.
BOOST_ASSERT(sizeof(BlockT) * CHAR_BIT >= count);
|
||||
|
||||
// Note: if this packing is changed, be sure to update vector_view<bool>
|
||||
// as well, so that on-disk and in-memory layouts match.
|
||||
BlockT value = 0;
|
||||
// `index` variant: each iteration shifts the accumulated bits up by one and
// appends the next element in the lowest bit.
for (std::size_t bit = 0; bit < count; ++bit, ++index)
|
||||
value = (value << 1) | data[index];
|
||||
// `base_index` variant: element number `bit` is written to bit position `bit`.
for (std::size_t bit = 0; bit < count; ++bit)
|
||||
{
|
||||
// The ternary makes the shifted operand a BlockT, so the shift is carried
// out at BlockT width (after integer promotion) instead of at the width a
// bare bool would be promoted to.
value |= (data[base_index + bit] ? BlockT{1} : BlockT{0}) << bit;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
// unpackBits: writes `count` bools, decoded from the bits of `value`, into
// `data` starting at the given position.
//
// This is a diff listing, so the signature and the decoding loop each appear
// twice: once using `index` and once using `base_index`.
//  - `index` variant: tests the bit selected by `mask` (bit count - 1) and
//    shifts `value` left after every element, so the highest of the `count`
//    used bits is written to the first element.
//  - `base_index` variant: data[base_index + bit] receives bit number `bit`
//    of `value`, so the least-significant bit is written to the first element.
//
// Template parameters:
//   T      - container whose value_type is bool (enforced by static_assert).
//   BlockT - type of the packed block; the static_asserts below require an
//            unsigned integral type.
// Parameters:
//   data              - container the bools are written to; the elements are
//                       assigned through operator[], so they must already exist.
//   index/base_index  - position of the first bool to write.
//   count             - number of bools to decode.
//   value             - block holding the packed bits.
template <typename T, typename BlockT = unsigned char>
|
||||
inline void unpackBits(T &data, std::size_t index, std::size_t count, BlockT value)
|
||||
inline void
|
||||
unpackBits(T &data, const std::size_t base_index, const std::size_t count, const BlockT value)
|
||||
{
|
||||
static_assert(std::is_same<typename T::value_type, bool>::value, "value_type is not bool");
|
||||
// `index` variant: the mask selects the highest of the `count` bits in use.
const BlockT mask = BlockT{1} << (count - 1);
|
||||
for (std::size_t bit = 0; bit < count; value <<= 1, ++bit, ++index)
|
||||
data[index] = value & mask;
|
||||
static_assert(std::is_unsigned<BlockT>::value, "BlockT must be unsigned type");
|
||||
static_assert(std::is_integral<BlockT>::value, "BlockT must be an integral type");
|
||||
static_assert(CHAR_BIT == 8, "Non-8-bit bytes not supported, sorry!");
|
||||
// The requested number of bits must fit into one block.
BOOST_ASSERT(sizeof(BlockT) * CHAR_BIT >= count);
|
||||
// `base_index` variant: bit number `bit` of `value` becomes element number
// `bit`; any non-zero masked result converts to true.
for (std::size_t bit = 0; bit < count; ++bit)
|
||||
{
|
||||
data[base_index + bit] = value & (BlockT{1} << bit);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename VectorT>
|
||||
@ -55,15 +71,16 @@ void readBoolVector(tar::FileReader &reader, const std::string &name, VectorT &d
|
||||
data.resize(count);
|
||||
std::uint64_t index = 0;
|
||||
|
||||
constexpr std::uint64_t WORD_BITS = CHAR_BIT * sizeof(std::uint64_t);
|
||||
using BlockType = std::uint64_t;
|
||||
constexpr std::uint64_t BLOCK_BITS = CHAR_BIT * sizeof(BlockType);
|
||||
|
||||
const auto decode = [&](const std::uint64_t block) {
|
||||
auto read_size = std::min<std::size_t>(count - index, WORD_BITS);
|
||||
unpackBits<VectorT, std::uint64_t>(data, index, read_size, block);
|
||||
index += WORD_BITS;
|
||||
const auto decode = [&](const BlockType block) {
|
||||
auto read_size = std::min<std::size_t>(count - index, BLOCK_BITS);
|
||||
unpackBits<VectorT, BlockType>(data, index, read_size, block);
|
||||
index += BLOCK_BITS;
|
||||
};
|
||||
|
||||
reader.ReadStreaming<std::uint64_t>(name, boost::make_function_output_iterator(decode));
|
||||
reader.ReadStreaming<BlockType>(name, boost::make_function_output_iterator(decode));
|
||||
}
|
||||
|
||||
template <typename VectorT>
|
||||
@ -73,19 +90,20 @@ void writeBoolVector(tar::FileWriter &writer, const std::string &name, const Vec
|
||||
writer.WriteElementCount64(name, count);
|
||||
std::uint64_t index = 0;
|
||||
|
||||
constexpr std::uint64_t WORD_BITS = CHAR_BIT * sizeof(std::uint64_t);
|
||||
using BlockType = std::uint64_t;
|
||||
constexpr std::uint64_t BLOCK_BITS = CHAR_BIT * sizeof(BlockType);
|
||||
|
||||
// FIXME on old boost version the function_input_iterator does not work with lambdas
|
||||
// so we need to wrap it in a function here.
|
||||
const std::function<std::uint64_t()> encode_function = [&]() -> std::uint64_t {
|
||||
auto write_size = std::min<std::size_t>(count - index, WORD_BITS);
|
||||
auto packed = packBits<VectorT, std::uint64_t>(data, index, write_size);
|
||||
index += WORD_BITS;
|
||||
const std::function<BlockType()> encode_function = [&]() -> BlockType {
|
||||
auto write_size = std::min<std::size_t>(count - index, BLOCK_BITS);
|
||||
auto packed = packBits<VectorT, BlockType>(data, index, write_size);
|
||||
index += BLOCK_BITS;
|
||||
return packed;
|
||||
};
|
||||
|
||||
std::uint64_t number_of_blocks = (count + WORD_BITS - 1) / WORD_BITS;
|
||||
writer.WriteStreaming<std::uint64_t>(
|
||||
std::uint64_t number_of_blocks = (count + BLOCK_BITS - 1) / BLOCK_BITS;
|
||||
writer.WriteStreaming<BlockType>(
|
||||
name,
|
||||
boost::make_function_input_iterator(encode_function, boost::infinite()),
|
||||
number_of_blocks);
|
||||
|
@ -195,7 +195,10 @@ template <> class vector_view<bool>
|
||||
{
|
||||
BOOST_ASSERT_MSG(index < m_size, "invalid size");
|
||||
const std::size_t bucket = index / WORD_BITS;
|
||||
// Note: ordering of bits here should match packBits in storage/serialization.hpp
|
||||
// so that directly mmap-ing data is possible
|
||||
const auto offset = index % WORD_BITS;
|
||||
BOOST_ASSERT(WORD_BITS > offset);
|
||||
return m_ptr[bucket] & (static_cast<Word>(1) << offset);
|
||||
}
|
||||
|
||||
@ -224,11 +227,23 @@ template <> class vector_view<bool>
|
||||
{
|
||||
BOOST_ASSERT(index < m_size);
|
||||
const auto bucket = index / WORD_BITS;
|
||||
// Note: ordering of bits here should match packBits in storage/serialization.hpp
|
||||
// so that directly mmap-ing data is possible
|
||||
const auto offset = index % WORD_BITS;
|
||||
BOOST_ASSERT(WORD_BITS > offset);
|
||||
return reference{m_ptr + bucket, static_cast<Word>(1) << offset};
|
||||
}
|
||||
|
||||
template <typename T> friend void swap(vector_view<T> &, vector_view<T> &) noexcept;
|
||||
|
||||
// Stream insertion for vector_view<bool>: writes every element of `rhs`, in
// index order, to `os`. Elements are separated by a single space, with no
// leading or trailing separator, and each one is inserted as whatever
// rhs.at(i) returns. Returns `os` so that insertions can be chained.
friend std::ostream &operator<<(std::ostream &os, const vector_view<bool> &rhs)
|
||||
{
|
||||
for (std::size_t i = 0; i < rhs.size(); ++i)
|
||||
{
|
||||
// Emit the separator before every element except the first.
os << (i > 0 ? " " : "") << rhs.at(i);
|
||||
}
|
||||
return os;
|
||||
}
|
||||
};
|
||||
|
||||
// Both vector_view<T> and the vector_view<bool> specializations share this impl.
|
||||
|
@ -1,11 +1,15 @@
|
||||
#include "storage/serialization.hpp"
|
||||
|
||||
#include "util/vector_view.hpp"
|
||||
|
||||
#include "../common/range_tools.hpp"
|
||||
#include "../common/temporary_file.hpp"
|
||||
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <boost/test/unit_test.hpp>
|
||||
|
||||
#include <random>
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(serialization)
|
||||
|
||||
using namespace osrm;
|
||||
@ -15,20 +19,48 @@ BOOST_AUTO_TEST_CASE(pack_test)
|
||||
{
|
||||
std::vector<bool> v = {0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1};
|
||||
|
||||
BOOST_CHECK_EQUAL(storage::serialization::detail::packBits(v, 0, 8), 0x2e);
|
||||
BOOST_CHECK_EQUAL(storage::serialization::detail::packBits(v, 5, 7), 0x65);
|
||||
BOOST_CHECK_EQUAL(storage::serialization::detail::packBits(v, 6, 8), 0x95);
|
||||
BOOST_CHECK_EQUAL(storage::serialization::detail::packBits(v, 0, 8), 0x74);
|
||||
BOOST_CHECK_EQUAL(storage::serialization::detail::packBits(v, 5, 7), 0x53);
|
||||
BOOST_CHECK_EQUAL(storage::serialization::detail::packBits(v, 6, 8), 0xa9);
|
||||
BOOST_CHECK_EQUAL(storage::serialization::detail::packBits(v, 11, 1), 0x01);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(vector_view_pack_test)
|
||||
{
|
||||
// Verifies that the bit layout produced by packBits is the layout
|
||||
// that vector_view<bool> expects when it reads the same memory.
|
||||
|
||||
// 1. Generate a random bool vector that spans several uint64_t words
|
||||
// A fixed seed keeps the generated bit sequence reproducible between runs.
constexpr unsigned RANDOM_SEED = 42;
|
||||
std::mt19937 g(RANDOM_SEED);
|
||||
std::uniform_int_distribution<> binary_distribution(0, 1);
|
||||
// 150 bits = two full 64-bit words plus 22 bits of a third one.
std::vector<bool> v(150);
|
||||
for (std::size_t i = 0; i < v.size(); ++i)
|
||||
v[i] = binary_distribution(g) == 1;
|
||||
|
||||
// 2. Pack the vector into a contiguous array of uint64_t words
|
||||
std::uint64_t data[3];
|
||||
data[0] = storage::serialization::detail::packBits<decltype(v), std::uint64_t>(v, 0, 64);
|
||||
data[1] = storage::serialization::detail::packBits<decltype(v), std::uint64_t>(v, 64, 64);
|
||||
// The last word is only partially used: 150 - 128 = 22 bits.
data[2] = storage::serialization::detail::packBits<decltype(v), std::uint64_t>(v, 128, 22);
|
||||
|
||||
// 3. Make a vector_view of that memory, and see if the bit sequence is
|
||||
// interpreted correctly by vector_view
|
||||
util::vector_view<bool> view(data, v.size());
|
||||
for (std::size_t index = 0; index < v.size(); ++index)
|
||||
{
|
||||
BOOST_CHECK_EQUAL(v[index], view[index]);
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that unpackBits decodes packed blocks into the expected bool
// sequence. The decoded ranges overlap ([0,8), [5,12), [6,14) and [11,12)), so
// every block has to agree with `expected` on the elements it covers.
//
// This is a diff listing, so two sets of calls appear:
//  - bare literals (0x2e, 0x65, 0x95, 0x01) encode the ranges with the first
//    element in the highest used bit;
//  - `u`-suffixed literals (0x74u, 0x53u, 0xa9u, 0x01u) encode the same ranges
//    with the first element in the least-significant bit.
BOOST_AUTO_TEST_CASE(unpack_test)
|
||||
{
|
||||
std::vector<bool> v(14), expected = {0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1};
|
||||
|
||||
storage::serialization::detail::unpackBits(v, 0, 8, 0x2e);
|
||||
storage::serialization::detail::unpackBits(v, 5, 7, 0x65);
|
||||
storage::serialization::detail::unpackBits(v, 6, 8, 0x95);
|
||||
storage::serialization::detail::unpackBits(v, 11, 1, 0x01);
|
||||
// BlockT is deduced from the literal; the `u` suffix makes it an unsigned
// type, which the "BlockT must be unsigned type" static_assert in unpackBits
// requires.
storage::serialization::detail::unpackBits(v, 0, 8, 0x74u);
|
||||
storage::serialization::detail::unpackBits(v, 5, 7, 0x53u);
|
||||
storage::serialization::detail::unpackBits(v, 6, 8, 0xa9u);
|
||||
storage::serialization::detail::unpackBits(v, 11, 1, 0x01u);
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(v.begin(), v.end(), expected.begin(), expected.end());
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user