Match serialized bit-packing for vector<bool> to match in-memory layout for vector_view<bool> so that data can be directly mmapped.

This commit is contained in:
Daniel Patterson
2018-10-26 23:37:36 -07:00
parent b1791d1ab3
commit d80318f8ea
3 changed files with 92 additions and 27 deletions
+38 -20
View File
@@ -9,6 +9,7 @@
#include "storage/shared_datatype.hpp"
#include "storage/tar.hpp"
#include <boost/assert.hpp>
#include <boost/function_output_iterator.hpp>
#include <boost/iterator/function_input_iterator.hpp>
@@ -30,22 +31,37 @@ namespace serialization
namespace detail
{
template <typename T, typename BlockT = unsigned char>
inline BlockT packBits(const T &data, std::size_t index, std::size_t count)
inline BlockT packBits(const T &data, std::size_t base_index, const std::size_t count)
{
static_assert(std::is_same<typename T::value_type, bool>::value, "value_type is not bool");
static_assert(std::is_unsigned<BlockT>::value, "BlockT must be unsigned type");
static_assert(std::is_integral<BlockT>::value, "BlockT must be an integral type");
static_assert(CHAR_BIT == 8, "Non-8-bit bytes not supported, sorry!");
BOOST_ASSERT(sizeof(BlockT) * CHAR_BIT >= count);
// Note: if this packing is changed, be sure to update vector_view<bool>
// as well, so that on-disk and in-memory layouts match.
BlockT value = 0;
for (std::size_t bit = 0; bit < count; ++bit, ++index)
value = (value << 1) | data[index];
for (std::size_t bit = 0; bit < count; ++bit)
{
value |= (data[base_index + bit] ? BlockT{1} : BlockT{0}) << bit;
}
return value;
}
template <typename T, typename BlockT = unsigned char>
inline void unpackBits(T &data, std::size_t index, std::size_t count, BlockT value)
inline void
unpackBits(T &data, const std::size_t base_index, const std::size_t count, const BlockT value)
{
static_assert(std::is_same<typename T::value_type, bool>::value, "value_type is not bool");
const BlockT mask = BlockT{1} << (count - 1);
for (std::size_t bit = 0; bit < count; value <<= 1, ++bit, ++index)
data[index] = value & mask;
static_assert(std::is_unsigned<BlockT>::value, "BlockT must be unsigned type");
static_assert(std::is_integral<BlockT>::value, "BlockT must be an integral type");
static_assert(CHAR_BIT == 8, "Non-8-bit bytes not supported, sorry!");
BOOST_ASSERT(sizeof(BlockT) * CHAR_BIT >= count);
for (std::size_t bit = 0; bit < count; ++bit)
{
data[base_index + bit] = value & (BlockT{1} << bit);
}
}
template <typename VectorT>
@@ -55,15 +71,16 @@ void readBoolVector(tar::FileReader &reader, const std::string &name, VectorT &d
data.resize(count);
std::uint64_t index = 0;
constexpr std::uint64_t WORD_BITS = CHAR_BIT * sizeof(std::uint64_t);
using BlockType = std::uint64_t;
constexpr std::uint64_t BLOCK_BITS = CHAR_BIT * sizeof(BlockType);
const auto decode = [&](const std::uint64_t block) {
auto read_size = std::min<std::size_t>(count - index, WORD_BITS);
unpackBits<VectorT, std::uint64_t>(data, index, read_size, block);
index += WORD_BITS;
const auto decode = [&](const BlockType block) {
auto read_size = std::min<std::size_t>(count - index, BLOCK_BITS);
unpackBits<VectorT, BlockType>(data, index, read_size, block);
index += BLOCK_BITS;
};
reader.ReadStreaming<std::uint64_t>(name, boost::make_function_output_iterator(decode));
reader.ReadStreaming<BlockType>(name, boost::make_function_output_iterator(decode));
}
template <typename VectorT>
@@ -73,19 +90,20 @@ void writeBoolVector(tar::FileWriter &writer, const std::string &name, const Vec
writer.WriteElementCount64(name, count);
std::uint64_t index = 0;
constexpr std::uint64_t WORD_BITS = CHAR_BIT * sizeof(std::uint64_t);
using BlockType = std::uint64_t;
constexpr std::uint64_t BLOCK_BITS = CHAR_BIT * sizeof(BlockType);
// FIXME on old boost version the function_input_iterator does not work with lambdas
// so we need to wrap it in a function here.
const std::function<std::uint64_t()> encode_function = [&]() -> std::uint64_t {
auto write_size = std::min<std::size_t>(count - index, WORD_BITS);
auto packed = packBits<VectorT, std::uint64_t>(data, index, write_size);
index += WORD_BITS;
const std::function<BlockType()> encode_function = [&]() -> BlockType {
auto write_size = std::min<std::size_t>(count - index, BLOCK_BITS);
auto packed = packBits<VectorT, BlockType>(data, index, write_size);
index += BLOCK_BITS;
return packed;
};
std::uint64_t number_of_blocks = (count + WORD_BITS - 1) / WORD_BITS;
writer.WriteStreaming<std::uint64_t>(
std::uint64_t number_of_blocks = (count + BLOCK_BITS - 1) / BLOCK_BITS;
writer.WriteStreaming<BlockType>(
name,
boost::make_function_input_iterator(encode_function, boost::infinite()),
number_of_blocks);
+15
View File
@@ -195,7 +195,10 @@ template <> class vector_view<bool>
{
BOOST_ASSERT_MSG(index < m_size, "invalid size");
const std::size_t bucket = index / WORD_BITS;
// Note: ordering of bits here should match packBits in storage/serialization.hpp
// so that directly mmap-ing data is possible
const auto offset = index % WORD_BITS;
BOOST_ASSERT(WORD_BITS > offset);
return m_ptr[bucket] & (static_cast<Word>(1) << offset);
}
@@ -224,11 +227,23 @@ template <> class vector_view<bool>
{
BOOST_ASSERT(index < m_size);
const auto bucket = index / WORD_BITS;
// Note: ordering of bits here should match packBits in storage/serialization.hpp
// so that directly mmap-ing data is possible
const auto offset = index % WORD_BITS;
BOOST_ASSERT(WORD_BITS > offset);
return reference{m_ptr + bucket, static_cast<Word>(1) << offset};
}
template <typename T> friend void swap(vector_view<T> &, vector_view<T> &) noexcept;
friend std::ostream &operator<<(std::ostream &os, const vector_view<bool> &rhs)
{
for (std::size_t i = 0; i < rhs.size(); ++i)
{
os << (i > 0 ? " " : "") << rhs.at(i);
}
return os;
}
};
// Both vector_view<T> and the vector_view<bool> specializations share this impl.