From 0fc6903d7ebb650162fb05dc6f62ed7d2b54a278 Mon Sep 17 00:00:00 2001
From: Denis Koronchik <denis.koronchik@gmail.com>
Date: Fri, 20 Oct 2017 18:10:32 +0300
Subject: [PATCH] Fix issue #4585

---
 include/extractor/node_based_edge.hpp      | 10 +++
 src/extractor/node_based_graph_factory.cpp | 87 +++++++++++++++++-----
 2 files changed, 77 insertions(+), 20 deletions(-)
diff --git a/include/extractor/node_based_edge.hpp b/include/extractor/node_based_edge.hpp
index a43b906ad..baa34e98d 100644
--- a/include/extractor/node_based_edge.hpp
+++ b/include/extractor/node_based_edge.hpp
@@ -71,6 +71,16 @@ struct NodeBasedEdgeAnnotation
             std::tie(name_id, classes, travel_mode, is_left_hand_driving) ==
             std::tie(other.name_id, other.classes, other.travel_mode, other.is_left_hand_driving));
     }
+
+    bool operator<(const NodeBasedEdgeAnnotation &other) const // order for std::set/std::sort
+    {
+        return (std::tie(name_id, lane_description_id, classes, travel_mode, is_left_hand_driving) <
+                std::tie(other.name_id,
+                         other.lane_description_id,
+                         other.classes,
+                         other.travel_mode,
+                         other.is_left_hand_driving));
+    }
 };
 
 struct NodeBasedEdge
diff --git a/src/extractor/node_based_graph_factory.cpp b/src/extractor/node_based_graph_factory.cpp
index 657c0954f..8fc7b9d24 100644
--- a/src/extractor/node_based_graph_factory.cpp
+++ b/src/extractor/node_based_graph_factory.cpp
@@ -4,9 +4,12 @@
 #include "util/graph_loader.hpp"
 
 #include "util/log.hpp"
+#include "util/timing_util.hpp"
 
 #include <boost/assert.hpp>
 
+#include <set>
+
 namespace osrm
 {
 namespace extractor
@@ -139,9 +142,38 @@ void NodeBasedGraphFactory::CompressGeometry()
 
 void NodeBasedGraphFactory::CompressAnnotationData()
 {
-    const constexpr AnnotationID INVALID_ANNOTATIONID = -1;
-    // remap all entries to find which are used
-    std::vector<AnnotationID> annotation_mapping(annotation_data.size(), INVALID_ANNOTATIONID);
+    TIMER_START(compress_annotation);
+    /** Main idea: remove duplicated and unreferenced annotation data.
+     * To do that:
+     * 1. Create a set that contains the indices of the unique data items, using a
+     * comparator that compares entries of the annotation_data vector by index.
+     * 2. Create an unordered_map of cached ids, where the key is the id stored in the set
+     * and the value is the position of that id in the set, counted from begin(). The map is
+     * needed because std::distance(set.begin(), it) is O(N) and too slow. In other words, this
+     * step renumbers annotations in set order and applies the new ids to the edge data.
+     * 3. Remove unused annotation_data items.
+     * 4. Finally, sort the resulting annotation_data so it is in the same order as the set.
+     * That keeps the ids stored in the edge data valid.
+     */
+    // Orders annotation ids by the annotation_data entries they index (held by reference).
+    struct IndexComparator
+    {
+        explicit IndexComparator(const std::vector<NodeBasedEdgeAnnotation> &annotation_data_)
+            : annotation_data(annotation_data_)
+        {
+        }
+        // Both ids are used as indices into the referenced vector; no bounds check is done.
+        bool operator()(AnnotationID a, AnnotationID b) const
+        {
+            return annotation_data[a] < annotation_data[b];
+        }
+      private:
+        const std::vector<NodeBasedEdgeAnnotation> &annotation_data;
+    };
+
+    /** 1 */
+    IndexComparator comparator(annotation_data);
+    std::set<AnnotationID, IndexComparator> unique_annotations(comparator);
 
     // first we mark entries, by setting their mapping to 0
     for (const auto nbg_node_u : util::irange(0u, compressed_output_graph.GetNumberOfNodes()))
@@ -150,22 +182,17 @@ void NodeBasedGraphFactory::CompressAnnotationData()
         for (EdgeID nbg_edge_id : compressed_output_graph.GetAdjacentEdgeRange(nbg_node_u))
         {
             auto const &edge = compressed_output_graph.GetEdgeData(nbg_edge_id);
-            annotation_mapping[edge.annotation_data] = 0;
+            unique_annotations.insert(edge.annotation_data);
         }
     }
 
-    // now compute a prefix sum on all entries that are 0 to find the new mapping
-    AnnotationID prefix_sum = 0;
-    for (std::size_t i = 0; i < annotation_mapping.size(); ++i)
-    {
-        if (annotation_mapping[i] == 0)
-            annotation_mapping[i] = prefix_sum++;
-        else
-        {
-            // flag for removal
-            annotation_data[i].name_id = INVALID_NAMEID;
-        }
-    }
+    // build an additional map, because std::distance on std::set iterators is O(N),
+    // which is very slow
+    /** 2 */
+    AnnotationID new_id = 0;
+    std::unordered_map<AnnotationID, AnnotationID> cached_ids;
+    for (auto id : unique_annotations)
+        cached_ids[id] = new_id++;
 
     // apply the mapping
     for (const auto nbg_node_u : util::irange(0u, compressed_output_graph.GetNumberOfNodes()))
@@ -174,11 +201,24 @@ void NodeBasedGraphFactory::CompressAnnotationData()
         for (EdgeID nbg_edge_id : compressed_output_graph.GetAdjacentEdgeRange(nbg_node_u))
         {
             auto &edge = compressed_output_graph.GetEdgeData(nbg_edge_id);
-            edge.annotation_data = annotation_mapping[edge.annotation_data];
-            BOOST_ASSERT(edge.annotation_data != INVALID_ANNOTATIONID);
+            auto const it = unique_annotations.find(edge.annotation_data);
+            BOOST_ASSERT(it != unique_annotations.end());
+            auto const it2 = cached_ids.find(*it);
+            BOOST_ASSERT(it2 != cached_ids.end());
+
+            edge.annotation_data = it2->second;
         }
     }
 
+    /** 3 */
+    // mark unreferenced and duplicate entries for removal
+    for (AnnotationID id = 0; id < annotation_data.size(); ++id)
+    {
+        auto const it = unique_annotations.find(id);
+        if (it == unique_annotations.end() || *it != id)
+            annotation_data[id].name_id = INVALID_NAMEID;
+    }
+
     // remove unreferenced entries, shifting other entries to the front
     const auto new_end =
         std::remove_if(annotation_data.begin(), annotation_data.end(), [&](auto const &data) {
@@ -191,8 +231,15 @@ void NodeBasedGraphFactory::CompressAnnotationData()
     const auto old_size = annotation_data.size();
     // remove all remaining elements
     annotation_data.erase(new_end, annotation_data.end());
-    util::Log() << " graoh compression removed " << (old_size - annotation_data.size())
-                << " annotations of " << old_size;
+
+    // sort the remaining data into the same order as the set used to assign the new ids
+    /** 4 */
+    std::sort(annotation_data.begin(), annotation_data.end());
+
+    TIMER_STOP(compress_annotation);
+    util::Log() << " graph compression removed " << (old_size - annotation_data.size())
+                << " annotations of " << old_size << " in " << TIMER_SEC(compress_annotation)
+                << " seconds";
 }
 
 void NodeBasedGraphFactory::ReleaseOsmNodes()