Match serialized bit-packing for vector<bool> to match in-memory layout for vector_view<bool> so that data can be directly mmapped.

This commit is contained in:
Daniel Patterson 2018-10-26 23:37:36 -07:00
parent b1791d1ab3
commit d80318f8ea
No known key found for this signature in database
GPG Key ID: 19C12BE1725A028B
3 changed files with 92 additions and 27 deletions

View File

@ -9,6 +9,7 @@
#include "storage/shared_datatype.hpp"
#include "storage/tar.hpp"
#include <boost/assert.hpp>
#include <boost/function_output_iterator.hpp>
#include <boost/iterator/function_input_iterator.hpp>
@ -30,22 +31,37 @@ namespace serialization
namespace detail
{
template <typename T, typename BlockT = unsigned char>
inline BlockT packBits(const T &data, std::size_t index, std::size_t count)
inline BlockT packBits(const T &data, std::size_t base_index, const std::size_t count)
{
static_assert(std::is_same<typename T::value_type, bool>::value, "value_type is not bool");
static_assert(std::is_unsigned<BlockT>::value, "BlockT must be unsigned type");
static_assert(std::is_integral<BlockT>::value, "BlockT must be an integral type");
static_assert(CHAR_BIT == 8, "Non-8-bit bytes not supported, sorry!");
BOOST_ASSERT(sizeof(BlockT) * CHAR_BIT >= count);
// Note: if this packing is changed, be sure to update vector_view<bool>
// as well, so that on-disk and in-memory layouts match.
BlockT value = 0;
for (std::size_t bit = 0; bit < count; ++bit, ++index)
value = (value << 1) | data[index];
for (std::size_t bit = 0; bit < count; ++bit)
{
value |= (data[base_index + bit] ? BlockT{1} : BlockT{0}) << bit;
}
return value;
}
template <typename T, typename BlockT = unsigned char>
inline void unpackBits(T &data, std::size_t index, std::size_t count, BlockT value)
inline void
unpackBits(T &data, const std::size_t base_index, const std::size_t count, const BlockT value)
{
static_assert(std::is_same<typename T::value_type, bool>::value, "value_type is not bool");
const BlockT mask = BlockT{1} << (count - 1);
for (std::size_t bit = 0; bit < count; value <<= 1, ++bit, ++index)
data[index] = value & mask;
static_assert(std::is_unsigned<BlockT>::value, "BlockT must be unsigned type");
static_assert(std::is_integral<BlockT>::value, "BlockT must be an integral type");
static_assert(CHAR_BIT == 8, "Non-8-bit bytes not supported, sorry!");
BOOST_ASSERT(sizeof(BlockT) * CHAR_BIT >= count);
for (std::size_t bit = 0; bit < count; ++bit)
{
data[base_index + bit] = value & (BlockT{1} << bit);
}
}
template <typename VectorT>
@ -55,15 +71,16 @@ void readBoolVector(tar::FileReader &reader, const std::string &name, VectorT &d
data.resize(count);
std::uint64_t index = 0;
constexpr std::uint64_t WORD_BITS = CHAR_BIT * sizeof(std::uint64_t);
using BlockType = std::uint64_t;
constexpr std::uint64_t BLOCK_BITS = CHAR_BIT * sizeof(BlockType);
const auto decode = [&](const std::uint64_t block) {
auto read_size = std::min<std::size_t>(count - index, WORD_BITS);
unpackBits<VectorT, std::uint64_t>(data, index, read_size, block);
index += WORD_BITS;
const auto decode = [&](const BlockType block) {
auto read_size = std::min<std::size_t>(count - index, BLOCK_BITS);
unpackBits<VectorT, BlockType>(data, index, read_size, block);
index += BLOCK_BITS;
};
reader.ReadStreaming<std::uint64_t>(name, boost::make_function_output_iterator(decode));
reader.ReadStreaming<BlockType>(name, boost::make_function_output_iterator(decode));
}
template <typename VectorT>
@ -73,19 +90,20 @@ void writeBoolVector(tar::FileWriter &writer, const std::string &name, const Vec
writer.WriteElementCount64(name, count);
std::uint64_t index = 0;
constexpr std::uint64_t WORD_BITS = CHAR_BIT * sizeof(std::uint64_t);
using BlockType = std::uint64_t;
constexpr std::uint64_t BLOCK_BITS = CHAR_BIT * sizeof(BlockType);
// FIXME on old boost version the function_input_iterator does not work with lambdas
// so we need to wrap it in a function here.
const std::function<std::uint64_t()> encode_function = [&]() -> std::uint64_t {
auto write_size = std::min<std::size_t>(count - index, WORD_BITS);
auto packed = packBits<VectorT, std::uint64_t>(data, index, write_size);
index += WORD_BITS;
const std::function<BlockType()> encode_function = [&]() -> BlockType {
auto write_size = std::min<std::size_t>(count - index, BLOCK_BITS);
auto packed = packBits<VectorT, BlockType>(data, index, write_size);
index += BLOCK_BITS;
return packed;
};
std::uint64_t number_of_blocks = (count + WORD_BITS - 1) / WORD_BITS;
writer.WriteStreaming<std::uint64_t>(
std::uint64_t number_of_blocks = (count + BLOCK_BITS - 1) / BLOCK_BITS;
writer.WriteStreaming<BlockType>(
name,
boost::make_function_input_iterator(encode_function, boost::infinite()),
number_of_blocks);

View File

@ -195,7 +195,10 @@ template <> class vector_view<bool>
{
BOOST_ASSERT_MSG(index < m_size, "invalid size");
const std::size_t bucket = index / WORD_BITS;
// Note: ordering of bits here should match packBits in storage/serialization.hpp
// so that directly mmap-ing data is possible
const auto offset = index % WORD_BITS;
BOOST_ASSERT(WORD_BITS > offset);
return m_ptr[bucket] & (static_cast<Word>(1) << offset);
}
@ -224,11 +227,23 @@ template <> class vector_view<bool>
{
BOOST_ASSERT(index < m_size);
const auto bucket = index / WORD_BITS;
// Note: ordering of bits here should match packBits in storage/serialization.hpp
// so that directly mmap-ing data is possible
const auto offset = index % WORD_BITS;
BOOST_ASSERT(WORD_BITS > offset);
return reference{m_ptr + bucket, static_cast<Word>(1) << offset};
}
template <typename T> friend void swap(vector_view<T> &, vector_view<T> &) noexcept;
friend std::ostream &operator<<(std::ostream &os, const vector_view<bool> &rhs)
{
for (std::size_t i = 0; i < rhs.size(); ++i)
{
os << (i > 0 ? " " : "") << rhs.at(i);
}
return os;
}
};
// Both vector_view<T> and the vector_view<bool> specializations share this impl.

View File

@ -1,11 +1,15 @@
#include "storage/serialization.hpp"
#include "util/vector_view.hpp"
#include "../common/range_tools.hpp"
#include "../common/temporary_file.hpp"
#include <boost/filesystem.hpp>
#include <boost/test/unit_test.hpp>
#include <random>
BOOST_AUTO_TEST_SUITE(serialization)
using namespace osrm;
@ -15,20 +19,48 @@ BOOST_AUTO_TEST_CASE(pack_test)
{
std::vector<bool> v = {0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1};
BOOST_CHECK_EQUAL(storage::serialization::detail::packBits(v, 0, 8), 0x2e);
BOOST_CHECK_EQUAL(storage::serialization::detail::packBits(v, 5, 7), 0x65);
BOOST_CHECK_EQUAL(storage::serialization::detail::packBits(v, 6, 8), 0x95);
BOOST_CHECK_EQUAL(storage::serialization::detail::packBits(v, 0, 8), 0x74);
BOOST_CHECK_EQUAL(storage::serialization::detail::packBits(v, 5, 7), 0x53);
BOOST_CHECK_EQUAL(storage::serialization::detail::packBits(v, 6, 8), 0xa9);
BOOST_CHECK_EQUAL(storage::serialization::detail::packBits(v, 11, 1), 0x01);
}
BOOST_AUTO_TEST_CASE(vector_view_pack_test)
{
// Verifies that the packing generated by packBits matches
// what vector_view<bool> expects
// 1. Generate a random bool vector that covers several uint64_t bytes
constexpr unsigned RANDOM_SEED = 42;
std::mt19937 g(RANDOM_SEED);
std::uniform_int_distribution<> binary_distribution(0, 1);
std::vector<bool> v(150);
for (std::size_t i = 0; i < v.size(); ++i)
v[i] = binary_distribution(g) == 1;
// 2. Pack the vector into a contiguous set of bytes
std::uint64_t data[3];
data[0] = storage::serialization::detail::packBits<decltype(v), std::uint64_t>(v, 0, 64);
data[1] = storage::serialization::detail::packBits<decltype(v), std::uint64_t>(v, 64, 64);
data[2] = storage::serialization::detail::packBits<decltype(v), std::uint64_t>(v, 128, 22);
// 3. Make a vector_view of that memory, and see if the bit sequence is
// interpreted correctly by vector_view
util::vector_view<bool> view(data, v.size());
for (std::size_t index = 0; index < v.size(); ++index)
{
BOOST_CHECK_EQUAL(v[index], view[index]);
}
}
BOOST_AUTO_TEST_CASE(unpack_test)
{
std::vector<bool> v(14), expected = {0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1};
storage::serialization::detail::unpackBits(v, 0, 8, 0x2e);
storage::serialization::detail::unpackBits(v, 5, 7, 0x65);
storage::serialization::detail::unpackBits(v, 6, 8, 0x95);
storage::serialization::detail::unpackBits(v, 11, 1, 0x01);
storage::serialization::detail::unpackBits(v, 0, 8, 0x74u);
storage::serialization::detail::unpackBits(v, 5, 7, 0x53u);
storage::serialization::detail::unpackBits(v, 6, 8, 0xa9u);
storage::serialization::detail::unpackBits(v, 11, 1, 0x01u);
BOOST_CHECK_EQUAL_COLLECTIONS(v.begin(), v.end(), expected.begin(), expected.end());
}