Use differential encoding for name offsets

Each name is represented as an integer range in a vector of chars.
Instead of storing the absolute offset inside this array, we can store
only the offset relative to the previous entry (the string size). By doing this we reduce
the number of bytes needed to store an offset from 4 bytes to 1 byte (if we
set a maximum string length of 255).
This is, however, slower, since the absolute offset must be computed on
each query by summing up all previous lengths. To limit the
performance impact, we only do this for blocks of a certain size (16).
This commit is contained in:
Patrick Niklaus
2014-06-07 17:50:02 +02:00
parent d27ac27bc7
commit f90ce77da4
2 changed files with 263 additions and 21 deletions
+8 -21
View File
@@ -395,32 +395,19 @@ void ExtractionContainers::PrepareData(const std::string &output_file_name,
std::string name_file_streamName = (output_file_name + ".names");
boost::filesystem::ofstream name_file_stream(name_file_streamName, std::ios::binary);
// write number of names
const unsigned number_of_names = name_list.size() + 1;
name_file_stream.write((char *)&(number_of_names), sizeof(unsigned));
std::vector<unsigned> name_lengths;
for (const std::string &temp_string : name_list)
{
name_lengths.push_back(temp_string.length());
}
// compute total number of chars
unsigned total_number_of_chars = 0;
for (const std::string &temp_string : name_list)
{
total_number_of_chars += temp_string.length();
}
// write total number of chars
name_file_stream.write((char *)&(total_number_of_chars), sizeof(unsigned));
// write prefix sums
unsigned name_lengths_prefix_sum = 0;
for (const std::string &temp_string : name_list)
{
name_file_stream.write((char *)&(name_lengths_prefix_sum), sizeof(unsigned));
name_lengths_prefix_sum += temp_string.length();
}
// duplicate on purpose!
name_file_stream.write((char *)&(name_lengths_prefix_sum), sizeof(unsigned));
RangeTable<> table(name_lengths);
name_file_stream << table;
// write all chars consecutively
for (const std::string &temp_string : name_list)
{
const unsigned string_length = temp_string.length();
const unsigned string_length = std::min(temp_string.length(), 255lu);
name_file_stream.write(temp_string.c_str(), string_length);
}