From 5b782a783a3d8549dbeb6c89ce75b87e99d8aec0 Mon Sep 17 00:00:00 2001 From: rparanjpe Date: Tue, 8 Dec 2015 23:31:58 -0800 Subject: [PATCH 1/2] Use a std::vector in place of stxxl::vector for the names list -For large datasets with very many unique names, stxxl::vector can corrupt data. Technically, we should only be using stxxl::vectors with POD. Other types might lead to strange/unpredictable behavior as we noticed here. -See http://algo2.iti.kit.edu/dementiev/stxxl/trunk/FAQ.html --- extractor/extraction_containers.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/extractor/extraction_containers.hpp b/extractor/extraction_containers.hpp index 09a5ef189..ab09664a6 100644 --- a/extractor/extraction_containers.hpp +++ b/extractor/extraction_containers.hpp @@ -64,14 +64,13 @@ class ExtractionContainers using STXXLNodeIDVector = stxxl::vector; using STXXLNodeVector = stxxl::vector; using STXXLEdgeVector = stxxl::vector; - using STXXLStringVector = stxxl::vector; using STXXLRestrictionsVector = stxxl::vector; using STXXLWayIDStartEndVector = stxxl::vector; STXXLNodeIDVector used_node_id_list; STXXLNodeVector all_nodes_list; STXXLEdgeVector all_edges_list; - STXXLStringVector name_list; + std::vector name_list; STXXLRestrictionsVector restrictions_list; STXXLWayIDStartEndVector way_start_end_id_list; std::unordered_map external_to_internal_node_id_map; From da91d342f7816f00859971b77cacddda4c9cadee Mon Sep 17 00:00:00 2001 From: rparanjpe Date: Wed, 9 Dec 2015 10:01:45 -0800 Subject: [PATCH 2/2] name_list --> name_char_data and name_offsets -Use stxxl vectors with char and unsigned int containers -Write out the entire character vector to file -Cap the names at length 255 during the parsing so we reduce the amount of memory used by stxxl vectors and we can do a direct writing of the character vector to .names --- extractor/extraction_containers.cpp | 29 ++++++++++++----------------- extractor/extraction_containers.hpp | 3 ++- 
extractor/extractor_callbacks.cpp | 15 +++++++++++++-- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/extractor/extraction_containers.cpp b/extractor/extraction_containers.cpp index 725ae25ce..52afdf36d 100644 --- a/extractor/extraction_containers.cpp +++ b/extractor/extraction_containers.cpp @@ -55,7 +55,8 @@ ExtractionContainers::ExtractionContainers() { // Check if stxxl can be instantiated stxxl::vector dummy_vector; - name_list.push_back(""); + name_char_data.push_back('\0'); + name_offsets.push_back(1); } ExtractionContainers::~ExtractionContainers() @@ -64,7 +65,8 @@ ExtractionContainers::~ExtractionContainers() used_node_id_list.clear(); all_nodes_list.clear(); all_edges_list.clear(); - name_list.clear(); + name_char_data.clear(); + name_offsets.clear(); restrictions_list.clear(); way_start_end_id_list.clear(); } @@ -116,27 +118,20 @@ void ExtractionContainers::WriteNames(const std::string& names_file_name) const boost::filesystem::ofstream name_file_stream(names_file_name, std::ios::binary); unsigned total_length = 0; - std::vector name_lengths; - for (const std::string &temp_string : name_list) + + for (const unsigned &name_offset : name_offsets) { - const unsigned string_length = - std::min(static_cast(temp_string.length()), 255u); - name_lengths.push_back(string_length); - total_length += string_length; + total_length += name_offset; } // builds and writes the index - RangeTable<> name_index_range(name_lengths); + RangeTable<> name_index_range(name_offsets); name_file_stream << name_index_range; name_file_stream.write((char *)&total_length, sizeof(unsigned)); + // write all chars consecutively - for (const std::string &temp_string : name_list) - { - const unsigned string_length = - std::min(static_cast(temp_string.length()), 255u); - name_file_stream.write(temp_string.c_str(), string_length); - } + name_file_stream.write((const char *)&name_char_data[0], name_char_data.size()); name_file_stream.close(); TIMER_STOP(write_name_index); @@ 
-196,7 +191,7 @@ void ExtractionContainers::PrepareNodes() node_iter++; ref_iter++; } - if (internal_id > std::numeric_limits::max()) + if (internal_id > std::numeric_limits::max()) { throw osrm::exception("There are too many nodes remaining after filtering, OSRM only supports 2^32 unique nodes"); } @@ -507,7 +502,7 @@ void ExtractionContainers::WriteEdges(std::ofstream& file_out_stream) const std::cout << "[extractor] setting number of edges ... " << std::flush; used_edges_counter_buffer = boost::numeric_cast(used_edges_counter); - + file_out_stream.seekp(start_position); file_out_stream.write((char *)&used_edges_counter_buffer, sizeof(unsigned)); std::cout << "ok" << std::endl; diff --git a/extractor/extraction_containers.hpp b/extractor/extraction_containers.hpp index ab09664a6..4c64c88bd 100644 --- a/extractor/extraction_containers.hpp +++ b/extractor/extraction_containers.hpp @@ -70,7 +70,8 @@ class ExtractionContainers STXXLNodeIDVector used_node_id_list; STXXLNodeVector all_nodes_list; STXXLEdgeVector all_edges_list; - std::vector name_list; + stxxl::vector name_char_data; + std::vector name_offsets; STXXLRestrictionsVector restrictions_list; STXXLWayIDStartEndVector way_start_end_id_list; std::unordered_map external_to_internal_node_id_map; diff --git a/extractor/extractor_callbacks.cpp b/extractor/extractor_callbacks.cpp index 4b0ac6b22..da1efc5a9 100644 --- a/extractor/extractor_callbacks.cpp +++ b/extractor/extractor_callbacks.cpp @@ -153,10 +153,21 @@ void ExtractorCallbacks::ProcessWay(const osmium::Way &input_way, const Extracti // Get the unique identifier for the street name const auto &string_map_iterator = string_map.find(parsed_way.name); - unsigned name_id = external_memory.name_list.size(); + unsigned name_id = external_memory.name_offsets.size(); if (string_map.end() == string_map_iterator) { - external_memory.name_list.push_back(parsed_way.name); + unsigned name_length = 0; + for (const char &c : parsed_way.name) + { + // Cap name length at 
255 characters + if (name_length == 255u) + { + break; + } + external_memory.name_char_data.push_back(c); + name_length++; + } + external_memory.name_offsets.push_back(name_length); string_map.insert(std::make_pair(parsed_way.name, name_id)); } else