From 29977c4b88d7c303940b0fa5194cfbb118e77e58 Mon Sep 17 00:00:00 2001 From: Dennis Luxen Date: Tue, 22 Mar 2011 17:38:18 +0000 Subject: [PATCH] Extractor reworked to take only about 2.5GB of RAM while extracting the planet --- DataStructures/ExtractorCallBacks.h | 12 +- DataStructures/ExtractorStructs.h | 105 ++++++------ DataStructures/PBFParser.h | 255 ++++++++++++++++------------ extractLargeNetwork.cpp | 214 +++++++++++++---------- 4 files changed, 336 insertions(+), 250 deletions(-) diff --git a/DataStructures/ExtractorCallBacks.h b/DataStructures/ExtractorCallBacks.h index ab2fa4c49..79c168f0f 100644 --- a/DataStructures/ExtractorCallBacks.h +++ b/DataStructures/ExtractorCallBacks.h @@ -22,6 +22,7 @@ or see http://www.gnu.org/licenses/agpl.txt. #define EXTRACTORCALLBACKS_H_ #include +#include #include "ExtractorStructs.h" typedef stxxl::vector STXXLNodeIDVector; @@ -33,8 +34,10 @@ typedef stxxl::vector STXXLStringVector; class ExtractorCallbacks{ private: + static const unsigned MAX_LOCAL_VECTOR_SIZE = 100; + STXXLNodeVector * allNodes; - STXXLNodeIDVector * usedNodes; + STXXLNodeIDVector * usedNodeIDs; STXXLEdgeVector * allEdges; STXXLStringVector * nameVector; STXXLAddressVector * addressVector; @@ -44,7 +47,7 @@ private: public: ExtractorCallbacks(STXXLNodeVector * aNodes, STXXLNodeIDVector * uNodes, STXXLEdgeVector * aEdges, STXXLStringVector * nVector, STXXLAddressVector * adrVector, Settings s, StringMap * strMap){ allNodes = aNodes; - usedNodes = uNodes; + usedNodeIDs = uNodes; allEdges = aEdges; nameVector = nVector; addressVector = adrVector; @@ -55,6 +58,7 @@ public: ~ExtractorCallbacks() { } + /** warning: caller needs to take care of synchronization! */ bool adressFunction(_Node n, HashTable &keyVals) { std::string housenumber(keyVals.Find("addr:housenumber")); std::string housename(keyVals.Find("addr:housename")); @@ -72,6 +76,7 @@ public: return true; } + /** warning: caller needs to take care of synchronization! */ bool nodeFunction(_Node &n) { allNodes->push_back(n); return true; @@ -82,6 +87,7 @@ public: return true; } + /** warning: caller needs to take care of synchronization! */ bool wayFunction(_Way &w) { std::string highway( w.keyVals.Find("highway") ); std::string name( w.keyVals.Find("name") ); @@ -181,7 +187,7 @@ public: w.nameID = strit->second; } for ( unsigned i = 0; i < w.path.size(); ++i ) { - usedNodes->push_back(w.path[i]); + usedNodeIDs->push_back(w.path[i]); } if ( w.direction == _Way::opposite ){ diff --git a/DataStructures/ExtractorStructs.h b/DataStructures/ExtractorStructs.h index 13f17ef69..1743045f1 100644 --- a/DataStructures/ExtractorStructs.h +++ b/DataStructures/ExtractorStructs.h @@ -50,8 +50,8 @@ std::string names[14] = { "motorway", "motorway_link", "trunk", "trunk_link", "p double speeds[14] = { 110, 90, 90, 70, 70, 60, 60, 50, 55, 25, 40 , 10, 30, 5}; struct _Node : NodeInfo{ - _Node(int _lat, int _lon, unsigned int _id) : NodeInfo(_lat, _lon, _id) {} - _Node() {} + _Node(int _lat, int _lon, unsigned int _id) : NodeInfo(_lat, _lon, _id), used(false) {} + _Node() : used(false) {} static _Node min_value() { return _Node(0,0,0); @@ -62,6 +62,8 @@ struct _Node : NodeInfo{ NodeID key() const { return id; } + + bool used; }; struct _Coordinate { @@ -121,18 +123,27 @@ struct _Relation { }; struct _Edge { - _Edge() {}; - _Edge(NodeID s, NodeID t) : start(s), target(t) { } - _Edge(NodeID s, NodeID t, short tp, short d, double sp): start(s), target(t), type(tp), direction(d), speed(sp) { } + _Edge() : used(false) {}; + _Edge(NodeID s, NodeID t) : start(s), target(t), used(false) { } + _Edge(NodeID s, NodeID t, short tp, short d, double sp): start(s), target(t), type(tp), direction(d), speed(sp), used(false) { } NodeID start; NodeID target; - short type; + short type:15; short direction; double speed; unsigned nameID; + bool used:1; _Coordinate startCoord; _Coordinate targetCoord; + + static _Edge min_value() { + return _Edge(0,0); + } + static _Edge max_value() { + return _Edge(numeric_limits::max(), numeric_limits::max()); + } + }; struct Settings { @@ -142,10 +153,8 @@ struct Settings { } speedProfile; // vector accessList; // int trafficLightPenalty; - int indexInAccessListOf( const string & key) - { - for(unsigned i = 0; i< speedProfile.names.size(); i++) - { + int indexInAccessListOf( const string & key) { + for(unsigned i = 0; i< speedProfile.names.size(); i++) { if(speedProfile.names[i] == key) return i; } @@ -153,41 +162,47 @@ struct Settings { } }; -struct Cmp : public std::binary_function -{ +struct Cmp : public std::binary_function { typedef NodeID value_type; - bool operator () (const NodeID & a, const NodeID & b) const - { + bool operator () (const NodeID & a, const NodeID & b) const { return a < b; } - value_type max_value() - { + value_type max_value() { return 0xffffffff; } - value_type min_value() - { + value_type min_value() { return 0x0; } }; -struct CompareEdgeByStart : public std::binary_function<_Edge, _Edge, bool> -{ - typedef _Edge value_type; - bool operator () (const _Edge & a, const _Edge & b) const - { - return a.start < b.start; +struct CmpNodeByID : public std::binary_function<_Node, _Node, bool> { + typedef _Node value_type; + bool operator () (const _Node & a, const _Node & b) const { + return a.id < b.id; } - value_type max_value() - { - return _Edge(UINT_MAX, UINT_MAX); + value_type max_value() { + return _Node::max_value(); } - value_type min_value() - { - return _Edge(0, 0); + value_type min_value() { + return _Node::min_value(); } }; -struct CompareEdgeByTarget : public std::binary_function<_Edge, _Edge, bool> +struct CmpEdgeByStartID : public std::binary_function<_Edge, _Edge, bool> +{ + typedef _Edge value_type; + bool operator () (const _Edge & a, const _Edge & b) const { + return a.start < b.start; + } + value_type max_value() { + return _Edge::max_value(); + } + value_type min_value() { + return _Edge::min_value(); + } +}; + +struct CmpEdgeByTargetID : public std::binary_function<_Edge, _Edge, bool> { typedef _Edge value_type; bool operator () (const _Edge & a, const _Edge & b) const @@ -196,32 +211,19 @@ struct CompareEdgeByTarget : public std::binary_function<_Edge, _Edge, bool> } value_type max_value() { - return _Edge(UINT_MAX, UINT_MAX); + return _Edge::max_value(); } value_type min_value() { - return _Edge(0, 0); - } -}; - -struct CmpNodeByID : public std::binary_function<_Node, _Node, bool> -{ - typedef _Node value_type; - bool operator () (const _Node & a, const _Node & b) const - { - return a.id < b.id; - } - value_type max_value() - { - return _Node::max_value(); - } - value_type min_value() - { - return _Node::min_value(); + return _Edge::min_value(); } }; double ApproximateDistance( const int lat1, const int lon1, const int lat2, const int lon2 ) { + assert(lat1 != INT_MIN); + assert(lon1 != INT_MIN); + assert(lat2 != INT_MIN); + assert(lon2 != INT_MIN); static const double DEG_TO_RAD = 0.017453292519943295769236907684886; ///Earth's quatratic mean radius for WGS-84 static const double EARTH_RADIUS_IN_METERS = 6372797.560856; @@ -237,8 +239,7 @@ double ApproximateDistance( const int lat1, const int lon1, const int lat2, cons } /* Get angle of line segment (A,C)->(C,B), atan2 magic, formerly cosine theorem*/ -double GetAngleBetweenTwoEdges(const _Coordinate& A, const _Coordinate& C, const _Coordinate& B) -{ +double GetAngleBetweenTwoEdges(const _Coordinate& A, const _Coordinate& C, const _Coordinate& B) { // double a = ApproximateDistance(A.lat, A.lon, C.lat, C.lon); //first edge segment // double b = ApproximateDistance(B.lat, B.lon, C.lat, C.lon); //second edge segment // double c = ApproximateDistance(A.lat, A.lon, B.lat, B.lon); //third edgefrom triangle diff --git a/DataStructures/PBFParser.h b/DataStructures/PBFParser.h index 8104b820e..56a415802 100644 --- a/DataStructures/PBFParser.h +++ b/DataStructures/PBFParser.h @@ -46,12 +46,24 @@ class PBFParser : public BaseParser<_Node, _Relation, _Way> { BigEndian = 2 }; - short entityTypeIndicator; + struct _ThreadData { + int currentGroupID; + int currentEntityID; + short entityTypeIndicator; + + OSMPBF::BlobHeader PBFBlobHeader; + OSMPBF::Blob PBFBlob; + + OSMPBF::HeaderBlock PBFHeaderBlock; + OSMPBF::PrimitiveBlock PBFprimitiveBlock; + + std::vector charBuffer; + }; public: PBFParser(const char * fileName) { GOOGLE_PROTOBUF_VERIFY_VERSION; - + // omp_set_num_threads(1); input.open(fileName, std::ios::in | std::ios::binary); if (!input) { @@ -74,6 +86,11 @@ public: if(input.is_open()) input.close(); + unsigned maxThreads = omp_get_max_threads(); + for ( unsigned threadNum = 0; threadNum < maxThreads; ++threadNum ) { + delete threadDataVector[threadNum]; + } + google::protobuf::ShutdownProtobufLibrary(); std::cout << "[info] blocks: " << blockCount << std::endl; @@ -81,22 +98,30 @@ public: } bool Init() { - if(!readPBFBlobHeader(input)) { + /** Init Vector with ThreadData Objects */ + unsigned maxThreads = omp_get_max_threads(); + for ( unsigned threadNum = 0; threadNum < maxThreads; ++threadNum ) { + threadDataVector.push_back( new _ThreadData( ) ); + } + + _ThreadData initData; + /** read Header */ + if(!readPBFBlobHeader(input, &initData)) { return false; } - if(readBlob(input)) { - char data[charBuffer.size()]; - for(unsigned i = 0; i < charBuffer.size(); i++ ){ - data[i] = charBuffer[i]; + if(readBlob(input, &initData)) { + char data[initData.charBuffer.size()]; + for(unsigned i = 0; i < initData.charBuffer.size(); i++ ){ + data[i] = initData.charBuffer[i]; } - if(!PBFHeaderBlock.ParseFromArray(data, charBuffer.size() ) ) { + if(!initData.PBFHeaderBlock.ParseFromArray(data, initData.charBuffer.size() ) ) { std::cerr << "[error] Header not parseable!" << std::endl; return false; } - for(int i = 0; i < PBFHeaderBlock.required_features_size(); i++) { - const std::string& feature = PBFHeaderBlock.required_features( i ); + for(int i = 0; i < initData.PBFHeaderBlock.required_features_size(); i++) { + const std::string& feature = initData.PBFHeaderBlock.required_features( i ); bool supported = false; if ( feature == "OsmSchema-V0.6" ) supported = true; @@ -115,31 +140,42 @@ public: } bool Parse() { - //parse through all Blocks - while(readNextBlock(input)) { +#pragma omp parallel + { + _ThreadData * threadData = threadDataVector[omp_get_thread_num()]; + //parse through all Blocks + bool keepRunning = true; + // while(readNextBlock(input)) { + do{ +#pragma omp critical + { + keepRunning = readNextBlock(input, threadData); + } + if(keepRunning) { + loadBlock(threadData); + for(int i = 0; i < threadData->PBFprimitiveBlock.primitivegroup_size(); i++) { + threadData->currentGroupID = i; + loadGroup(threadData); - loadBlock(); - for(int i = 0; i < PBFprimitiveBlock.primitivegroup_size(); i++) { - currentGroupID = i; - loadGroup(); - - if(entityTypeIndicator == TypeNode) - parseNode(); - if(entityTypeIndicator == TypeWay) - parseWay(); - if(entityTypeIndicator == TypeRelation) - parseRelation(); - if(entityTypeIndicator == TypeDenseNode) - parseDenseNode(); - } + if(threadData->entityTypeIndicator == TypeNode) + parseNode(threadData); + if(threadData->entityTypeIndicator == TypeWay) + parseWay(threadData); + if(threadData->entityTypeIndicator == TypeRelation) + parseRelation(threadData); + if(threadData->entityTypeIndicator == TypeDenseNode) + parseDenseNode(threadData); + } + } + }while(keepRunning); } return true; } private: - void parseDenseNode() { - const OSMPBF::DenseNodes& dense = PBFprimitiveBlock.primitivegroup( currentGroupID ).dense(); + void parseDenseNode(_ThreadData * threadData) { + const OSMPBF::DenseNodes& dense = threadData->PBFprimitiveBlock.primitivegroup( threadData->currentGroupID ).dense(); int denseTagIndex = 0; int m_lastDenseID = 0; int m_lastDenseLatitude = 0; @@ -152,8 +188,8 @@ private: m_lastDenseLongitude += dense.lon( i ); _Node n; n.id = m_lastDenseID; - n.lat = 100000*( ( double ) m_lastDenseLatitude * PBFprimitiveBlock.granularity() + PBFprimitiveBlock.lat_offset() ) / NANO; - n.lon = 100000*( ( double ) m_lastDenseLongitude * PBFprimitiveBlock.granularity() + PBFprimitiveBlock.lon_offset() ) / NANO; + n.lat = 100000*( ( double ) m_lastDenseLatitude * threadData->PBFprimitiveBlock.granularity() +threadData-> PBFprimitiveBlock.lat_offset() ) / NANO; + n.lon = 100000*( ( double ) m_lastDenseLongitude * threadData->PBFprimitiveBlock.granularity() + threadData->PBFprimitiveBlock.lon_offset() ) / NANO; while (denseTagIndex < dense.keys_vals_size()) { int tagValue = dense.keys_vals( denseTagIndex ); if(tagValue == 0) { @@ -161,39 +197,51 @@ private: break; } int keyValue = dense.keys_vals ( denseTagIndex+1 ); - std::string key = PBFprimitiveBlock.stringtable().s(tagValue).data(); - std::string value = PBFprimitiveBlock.stringtable().s(keyValue).data(); + std::string key = threadData->PBFprimitiveBlock.stringtable().s(tagValue).data(); + std::string value = threadData->PBFprimitiveBlock.stringtable().s(keyValue).data(); keyVals.Add(key, value); denseTagIndex += 2; } - if(!(*addressCallback)(n, keyVals)) - std::cerr << "[PBFParser] adress not parsed" << std::endl; +#pragma omp critical + { + if(!(*addressCallback)(n, keyVals)) + std::cerr << "[PBFParser] adress not parsed" << std::endl; + } - if(!(*nodeCallback)(n)) - std::cerr << "[PBFParser] dense node not parsed" << std::endl; +#pragma omp critical + { + if(!(*nodeCallback)(n)) + std::cerr << "[PBFParser] dense node not parsed" << std::endl; + } } } - void parseNode() { + void parseNode(_ThreadData * threadData) { _Node n; - if(!(*nodeCallback)(n)) - std::cerr << "[PBFParser] simple node not parsed" << std::endl; +#pragma omp critical + { + if(!(*nodeCallback)(n)) + std::cerr << "[PBFParser] simple node not parsed" << std::endl; + } } - void parseRelation() { - const OSMPBF::PrimitiveGroup& group = PBFprimitiveBlock.primitivegroup( currentGroupID ); + void parseRelation(_ThreadData * threadData) { + const OSMPBF::PrimitiveGroup& group = threadData->PBFprimitiveBlock.primitivegroup( threadData->currentGroupID ); for(int i = 0; i < group.relations_size(); i++ ) { _Relation r; r.type = _Relation::unknown; - if(!(*relationCallback)(r)) - std::cerr << "[PBFParser] relation not parsed" << std::endl; +#pragma omp critical + { + if(!(*relationCallback)(r)) + std::cerr << "[PBFParser] relation not parsed" << std::endl; + } } } - void parseWay() { - if( PBFprimitiveBlock.primitivegroup( currentGroupID ).ways_size() > 0) { - for(int i = 0; i < PBFprimitiveBlock.primitivegroup( currentGroupID ).ways_size(); i++) { - const OSMPBF::Way& inputWay = PBFprimitiveBlock.primitivegroup( currentGroupID ).ways( i ); + void parseWay(_ThreadData * threadData) { + if( threadData->PBFprimitiveBlock.primitivegroup( threadData->currentGroupID ).ways_size() > 0) { + for(int i = 0; i < threadData->PBFprimitiveBlock.primitivegroup( threadData->currentGroupID ).ways_size(); i++) { + const OSMPBF::Way& inputWay = threadData->PBFprimitiveBlock.primitivegroup( threadData->currentGroupID ).ways( i ); _Way w; w.id = inputWay.id(); unsigned pathNode(0); @@ -203,41 +251,47 @@ private: } assert(inputWay.keys_size() == inputWay.vals_size()); for(int i = 0; i < inputWay.keys_size(); i++) { - const std::string key = PBFprimitiveBlock.stringtable().s(inputWay.keys(i)); - const std::string val = PBFprimitiveBlock.stringtable().s(inputWay.vals(i)); + const std::string key = threadData->PBFprimitiveBlock.stringtable().s(inputWay.keys(i)); + const std::string val = threadData->PBFprimitiveBlock.stringtable().s(inputWay.vals(i)); w.keyVals.Add(key, val); } - if(!(*wayCallback)(w)) { - std::cerr << "[PBFParser] way not parsed" << std::endl; +#pragma omp critical + { + if(!(*wayCallback)(w)) { + std::cerr << "[PBFParser] way not parsed" << std::endl; + } } } } } - void loadGroup() { + void loadGroup(_ThreadData * threadData) { +#pragma omp atomic groupCount++; - const OSMPBF::PrimitiveGroup& group = PBFprimitiveBlock.primitivegroup( currentGroupID ); - entityTypeIndicator = 0; + + const OSMPBF::PrimitiveGroup& group = threadData->PBFprimitiveBlock.primitivegroup( threadData->currentGroupID ); + threadData->entityTypeIndicator = 0; if ( group.nodes_size() != 0 ) { - entityTypeIndicator = TypeNode; + threadData->entityTypeIndicator = TypeNode; } if ( group.ways_size() != 0 ) { - entityTypeIndicator = TypeWay; + threadData->entityTypeIndicator = TypeWay; } if ( group.relations_size() != 0 ) { - entityTypeIndicator = TypeRelation; + threadData->entityTypeIndicator = TypeRelation; } if ( group.has_dense() ) { - entityTypeIndicator = TypeDenseNode; + threadData->entityTypeIndicator = TypeDenseNode; assert( group.dense().id_size() != 0 ); } - assert( entityTypeIndicator != 0 ); + assert( threadData->entityTypeIndicator != 0 ); } - void loadBlock() { + void loadBlock(_ThreadData * threadData) { +#pragma omp critical blockCount++; - currentGroupID = 0; - currentEntityID = 0; + threadData->currentGroupID = 0; + threadData->currentEntityID = 0; } /* Reverses Network Byte Order into something usable */ @@ -247,7 +301,7 @@ private: return x; } - bool readPBFBlobHeader(std::fstream& stream) { + bool readPBFBlobHeader(std::fstream& stream, _ThreadData * threadData) { int size(0); stream.read((char *)&size, sizeof(int)); size = swapEndian(size); @@ -262,18 +316,18 @@ private: stream.read(&data[i], 1); } - if ( !PBFBlobHeader.ParseFromArray( data, size ) ) { + if ( !(threadData->PBFBlobHeader).ParseFromArray( data, size ) ){ return false; } return true; } - bool unpackZLIB(std::fstream & stream) { - unsigned rawSize = PBFBlob.raw_size(); + bool unpackZLIB(std::fstream & stream, _ThreadData * threadData) { + unsigned rawSize = threadData->PBFBlob.raw_size(); char unpackedDataArray[rawSize]; z_stream compressedDataStream; - compressedDataStream.next_in = ( unsigned char* ) PBFBlob.zlib_data().data(); - compressedDataStream.avail_in = PBFBlob.zlib_data().size(); + compressedDataStream.next_in = ( unsigned char* ) threadData->PBFBlob.zlib_data().data(); + compressedDataStream.avail_in = threadData->PBFBlob.zlib_data().size(); compressedDataStream.next_out = ( unsigned char* ) unpackedDataArray; compressedDataStream.avail_out = rawSize; compressedDataStream.zalloc = Z_NULL; @@ -298,22 +352,22 @@ private: return false; } - charBuffer.clear(); charBuffer.resize(rawSize); + threadData->charBuffer.clear(); threadData->charBuffer.resize(rawSize); for(unsigned i = 0; i < rawSize; i++) { - charBuffer[i] = unpackedDataArray[i]; + threadData->charBuffer[i] = unpackedDataArray[i]; } return true; } - bool unpackLZMA(std::fstream & stream) { + bool unpackLZMA(std::fstream & stream, _ThreadData * threadData) { return false; } - bool readBlob(std::fstream& stream) { + bool readBlob(std::fstream& stream, _ThreadData * threadData) { if(stream.eof()) return false; - int size = PBFBlobHeader.datasize(); + int size = threadData->PBFBlobHeader.datasize(); if ( size < 0 || size > MAX_BLOB_SIZE ) { std::cerr << "[error] invalid Blob size:" << size << std::endl; return false; @@ -324,25 +378,25 @@ private: stream.read(&data[i], 1); } - if ( !PBFBlob.ParseFromArray( data, size ) ) { + if ( !threadData->PBFBlob.ParseFromArray( data, size ) ) { std::cerr << "[error] failed to parse blob" << std::endl; return false; } - if ( PBFBlob.has_raw() ) { - const std::string& data = PBFBlob.raw(); - charBuffer.clear(); - charBuffer.resize( data.size() ); + if ( threadData->PBFBlob.has_raw() ) { + const std::string& data = threadData->PBFBlob.raw(); + threadData->charBuffer.clear(); + threadData->charBuffer.resize( data.size() ); for ( unsigned i = 0; i < data.size(); i++ ) { - charBuffer[i] = data[i]; + threadData->charBuffer[i] = data[i]; } - } else if ( PBFBlob.has_zlib_data() ) { - if ( !unpackZLIB(stream) ) { + } else if ( threadData->PBFBlob.has_zlib_data() ) { + if ( !unpackZLIB(stream, threadData) ) { std::cerr << "[error] zlib data encountered that could not be unpacked" << std::endl; return false; } - } else if ( PBFBlob.has_lzma_data() ) { - if ( !unpackLZMA(stream) ) + } else if ( threadData->PBFBlob.has_lzma_data() ) { + if ( !unpackLZMA(stream, threadData) ) std::cerr << "[error] lzma data encountered that could not be unpacked" << std::endl; return false; } else { @@ -352,34 +406,34 @@ private: return true; } - bool readNextBlock(std::fstream& stream) { + bool readNextBlock(std::fstream& stream, _ThreadData * threadData) { if(stream.eof()) { return false; } - if ( !readPBFBlobHeader(stream) ) + if ( !readPBFBlobHeader(stream, threadData) ) return false; - if ( PBFBlobHeader.type() != "OSMData" ) { - std::cerr << "[error] invalid block type, found" << PBFBlobHeader.type().data() << "instead of OSMData" << std::endl; + if ( threadData->PBFBlobHeader.type() != "OSMData" ) { + std::cerr << "[error] invalid block type, found" << threadData->PBFBlobHeader.type().data() << "instead of OSMData" << std::endl; return false; } - if ( !readBlob(stream) ) + if ( !readBlob(stream, threadData) ) return false; - char data[charBuffer.size()]; - for(unsigned i = 0; i < charBuffer.size(); i++ ){ - data[i] = charBuffer[i]; + char data[threadData->charBuffer.size()]; + for(unsigned i = 0; i < threadData->charBuffer.size(); i++ ){ + data[i] = threadData->charBuffer[i]; } - if ( !PBFprimitiveBlock.ParseFromArray( data, charBuffer.size() ) ) { + if ( !threadData->PBFprimitiveBlock.ParseFromArray( data, threadData-> charBuffer.size() ) ) { std::cerr << "[error] failed to parse PrimitiveBlock" << std::endl; return false; } return true; } - Endianness getMachineEndianness() { + static Endianness getMachineEndianness() { int i(1); char *p = (char *) &i; if (p[0] == 1) @@ -391,17 +445,6 @@ private: static const int MAX_BLOB_HEADER_SIZE = 64 * 1024; static const int MAX_BLOB_SIZE = 32 * 1024 * 1024; - OSMPBF::BlobHeader PBFBlobHeader; - OSMPBF::Blob PBFBlob; - - OSMPBF::HeaderBlock PBFHeaderBlock; - OSMPBF::PrimitiveBlock PBFprimitiveBlock; - - std::vector charBuffer; - - int currentGroupID; - int currentEntityID; - /* counting the number of read blocks and groups */ unsigned groupCount; unsigned blockCount; @@ -413,6 +456,10 @@ private: bool (*addressCallback)(_Node, HashTable); /* the input stream to parse */ std::fstream input; + + /* ThreadData Array */ + std::vector < _ThreadData* > threadDataVector; + }; #endif /* PBFPARSER_H_ */ diff --git a/extractLargeNetwork.cpp b/extractLargeNetwork.cpp index bffdb919e..e36385478 100644 --- a/extractLargeNetwork.cpp +++ b/extractLargeNetwork.cpp @@ -22,6 +22,7 @@ or see http://www.gnu.org/licenses/agpl.txt. #endif #define STXXL_VERBOSE_LEVEL -1000 +#include #include #include #include @@ -51,6 +52,9 @@ bool adressFunction(_Node n, HashTable keyVals); bool relationFunction(_Relation r); bool wayFunction(_Way w); +template +bool removeIfUnused(ClassT n) { return (false == n.used); } + int main (int argc, char *argv[]) { if(argc <= 1) { std::cerr << "usage: " << endl << argv[0] << " " << std::endl; @@ -79,15 +83,14 @@ int main (int argc, char *argv[]) { } std::string adressFileName(outputFileName); - STXXLNodeIDVector * usedNodes = new STXXLNodeIDVector(); - STXXLNodeVector * allNodes = new STXXLNodeVector(); - STXXLNodeVector * confirmedNodes = new STXXLNodeVector(); - STXXLEdgeVector * allEdges = new STXXLEdgeVector(); - STXXLEdgeVector * confirmedEdges = new STXXLEdgeVector(); - STXXLAddressVector* adressVector = new STXXLAddressVector(); - STXXLStringVector * nameVector = new STXXLStringVector(); + STXXLNodeIDVector usedNodeIDs; + STXXLNodeVector allNodes; + STXXLEdgeVector allEdges; + STXXLAddressVector adressVector; + STXXLStringVector nameVector; + unsigned usedNodeCounter = 0; + unsigned usedEdgeCounter = 0; - NodeMap * nodeMap = new NodeMap(); StringMap * stringMap = new StringMap(); Settings settings; settings.speedProfile.names.insert(settings.speedProfile.names.begin(), names, names+14); @@ -95,10 +98,9 @@ int main (int argc, char *argv[]) { double time = get_timestamp(); - nodeMap->set_empty_key(UINT_MAX); stringMap->set_empty_key(GetRandomString()); stringMap->insert(std::make_pair("", 0)); - extractCallBacks = new ExtractorCallbacks(allNodes, usedNodes, allEdges, nameVector, adressVector, settings, stringMap); + extractCallBacks = new ExtractorCallbacks(&allNodes, &usedNodeIDs, &allEdges, &nameVector, &adressVector, settings, stringMap); BaseParser<_Node, _Relation, _Way> * parser; if(isPBF) { @@ -113,33 +115,34 @@ int main (int argc, char *argv[]) { std::cerr << "[error] parser not initialized!" << std::endl; exit(-1); } + /* flush needs to be called to flush the remaining local vector elements */ + delete parser; try { - std::cout << "[info] raw no. of names: " << nameVector->size() << std::endl; - std::cout << "[info] raw no. of nodes: " << allNodes->size() << std::endl; - std::cout << "[info] no. of used nodes: " << usedNodes->size() << std::endl; - std::cout << "[info] raw no. of edges: " << allEdges->size() << std::endl; - std::cout << "[info] raw no. of relations: " << globalRelationCounter << std::endl; - std::cout << "[info] raw no. of addresses: " << adressVector->size() << std::endl; + // std::cout << "[info] raw no. of names: " << nameVector.size() << std::endl; + // std::cout << "[info] raw no. of nodes: " << allNodes.size() << std::endl; + // std::cout << "[info] no. of used nodes: " << usedNodeIDs.size() << std::endl; + // std::cout << "[info] raw no. of edges: " << allEdges.size() << std::endl; + // std::cout << "[info] raw no. of relations: " << globalRelationCounter << std::endl; + // std::cout << "[info] raw no. of addresses: " << adressVector.size() << std::endl; - std::cout << "[info] parsing through input file took " << get_timestamp() - time << "seconds" << std::endl; + std::cout << "[extractor] parsing finished after " << get_timestamp() - time << "seconds" << std::endl; time = get_timestamp(); unsigned memory_to_use = 1024 * 1024 * 1024; std::cout << "[extractor] Sorting used nodes ... " << std::flush; - stxxl::sort(usedNodes->begin(), usedNodes->end(), Cmp(), memory_to_use); + stxxl::sort(usedNodeIDs.begin(), usedNodeIDs.end(), Cmp(), memory_to_use); std::cout << "ok, after " << get_timestamp() - time << "s" << std::endl; - std::cout << "[debug] highest node id: " << usedNodes->back() << std::endl; time = get_timestamp(); - std::cout << "[extractor] Erasing duplicate entries ... " << std::flush; - stxxl::vector::iterator NewEnd = unique ( usedNodes->begin(),usedNodes->end() ) ; - usedNodes->resize ( NewEnd - usedNodes->begin() ); + std::cout << "[extractor] Erasing duplicate nodes ... " << std::flush; + stxxl::vector::iterator NewEnd = unique ( usedNodeIDs.begin(),usedNodeIDs.end() ) ; + usedNodeIDs.resize ( NewEnd - usedNodeIDs.begin() ); cout << "ok, after " << get_timestamp() - time << "s" << endl; time = get_timestamp(); std::cout << "[extractor] Sorting all nodes ... " << std::flush; - stxxl::sort(allNodes->begin(), allNodes->end(), CmpNodeByID(), memory_to_use); + stxxl::sort(allNodes.begin(), allNodes.end(), CmpNodeByID(), memory_to_use); std::cout << "ok, after " << get_timestamp() - time << "s" << std::endl; time = get_timestamp(); @@ -147,68 +150,108 @@ int main (int argc, char *argv[]) { fout.open(outputFileName.c_str()); cout << "[extractor] Confirming used nodes ... " << flush; - STXXLNodeVector::iterator nvit = allNodes->begin(); - STXXLNodeIDVector::iterator niit = usedNodes->begin(); - while(niit != usedNodes->end() && nvit != allNodes->end()) { - if(*niit < nvit->id){ - niit++; + STXXLNodeVector::iterator nodesIT = allNodes.begin(); + STXXLNodeIDVector::iterator usedNodeIDsIT = usedNodeIDs.begin(); + while(usedNodeIDsIT != usedNodeIDs.end() && nodesIT != allNodes.end()) { + if(*usedNodeIDsIT < nodesIT->id){ + usedNodeIDsIT++; continue; } - if(*niit > nvit->id) { - nvit++; + if(*usedNodeIDsIT > nodesIT->id) { + nodesIT++; continue; } - if(*niit == nvit->id) { - confirmedNodes->push_back(*nvit); - nodeMap->insert(std::make_pair(nvit->id, *nvit)); - niit++; - nvit++; + if(*usedNodeIDsIT == nodesIT->id) { + nodesIT->used = true; + usedNodeCounter++; + usedNodeIDsIT++; + nodesIT++; } } - cout << "ok, after " << get_timestamp() - time << "s" << endl; - std::cout << "[debug] no of entries in nodemap" << nodeMap->size() << std::endl; + cout << "ok, after " << get_timestamp() - time << "s" << std::endl; time = get_timestamp(); - cout << "[extractor] Writing used nodes ... " << flush; - fout << confirmedNodes->size() << endl; - for(STXXLNodeVector::iterator ut = confirmedNodes->begin(); ut != confirmedNodes->end(); ut++) { - fout << ut->id<< " " << ut->lon << " " << ut->lat << "\n"; +// std::cout << "[extractor] Erasing unused nodes ... " << std::flush; +// allNodes.resize(std::remove_if(allNodes.begin(), allNodes.end(), removeIfUnused<_Node>)-allNodes.begin()); +// +// cout << "ok, after " << get_timestamp() - time << "s" << std::endl; +// time = get_timestamp(); + + std::cout << "[extractor] Writing used nodes ... " << std::flush; + fout << usedNodeCounter << endl; + for(STXXLNodeVector::iterator ut = allNodes.begin(); ut != allNodes.end(); ut++) { + if(ut->used) + fout << ut->id<< " " << ut->lon << " " << ut->lat << "\n"; } cout << "ok, after " << get_timestamp() - time << "s" << endl; time = get_timestamp(); - cout << "[extractor] confirming used ways ... " << flush; - for(STXXLEdgeVector::iterator eit = allEdges->begin(); eit != allEdges->end(); eit++) { - assert(eit->type > -1 || eit->speed != -1); + // Sort edges by start. + std::cout << "[extractor] Sorting edges by start ... " << std::flush; + stxxl::sort(allEdges.begin(), allEdges.end(), CmpEdgeByStartID(), memory_to_use); + std::cout << "ok, after " << get_timestamp() - time << "s" << std::endl; + time = get_timestamp(); - NodeMap::iterator startit = nodeMap->find(eit->start); - if(startit == nodeMap->end()) { + std::cout << "[extractor] Setting start coords ... " << std::flush; + // Traverse list of edges and nodes in parallel and set start coord + nodesIT = allNodes.begin(); + STXXLEdgeVector::iterator edgeIT = allEdges.begin(); + while(edgeIT != allEdges.end() && nodesIT != allNodes.end()) { + if(edgeIT->start < nodesIT->id){ + edgeIT++; continue; } - NodeMap::iterator targetit = nodeMap->find(eit->target); - - if(targetit == nodeMap->end()) { + if(edgeIT->start > nodesIT->id) { + nodesIT++; continue; } - confirmedEdges->push_back(*eit); + if(edgeIT->start == nodesIT->id) { + edgeIT->startCoord.lat = nodesIT->lat; + edgeIT->startCoord.lon = nodesIT->lon; + edgeIT++; + } } - fout << confirmedEdges->size() << "\n"; + std::cout << "ok, after " << get_timestamp() - time << "s" << std::endl; + time = get_timestamp(); + + // Sort Edges by target + std::cout << "[extractor] Sorting edges by target ... " << std::flush; + stxxl::sort(allEdges.begin(), allEdges.end(), CmpEdgeByTargetID(), memory_to_use); + std::cout << "ok, after " << get_timestamp() - time << "s" << std::endl; + time = get_timestamp(); + + std::cout << "[extractor] Setting target coords ... " << std::flush; + // Traverse list of edges and nodes in parallel and set target coord + nodesIT = allNodes.begin(); + edgeIT = allEdges.begin(); + while(edgeIT != allEdges.end() && nodesIT != allNodes.end()) { + if(edgeIT->target < nodesIT->id){ + edgeIT++; + continue; + } + if(edgeIT->target > nodesIT->id) { + nodesIT++; + continue; + } + if(edgeIT->startCoord.lat != INT_MIN && edgeIT->target == nodesIT->id) { + edgeIT->targetCoord.lat = nodesIT->lat; + edgeIT->targetCoord.lon = nodesIT->lon; + edgeIT->used = true; + usedEdgeCounter++; + edgeIT++; + } + } + + fout << usedEdgeCounter << "\n"; cout << "ok, after " << get_timestamp() - time << "s" << endl; time = get_timestamp(); - cout << "[extractor] writing confirmed ways ... " << flush; - for(STXXLEdgeVector::iterator eit = confirmedEdges->begin(); eit != confirmedEdges->end(); eit++) { - NodeMap::iterator startit = nodeMap->find(eit->start); - if(startit == nodeMap->end()) { + cout << "[extractor] writing confirmed edges ... " << flush; + for(STXXLEdgeVector::iterator eit = allEdges.begin(); eit != allEdges.end(); eit++) { + if(eit->used == false) continue; - } - NodeMap::iterator targetit = nodeMap->find(eit->target); - - if(targetit == nodeMap->end()) { - continue; - } - double distance = ApproximateDistance(startit->second.lat, startit->second.lon, targetit->second.lat, targetit->second.lon); + double distance = ApproximateDistance(eit->startCoord.lat, eit->startCoord.lon, eit->targetCoord.lat, eit->targetCoord.lon); if(eit->speed == -1) eit->speed = settings.speedProfile.speed[eit->type]; double weight = ( distance * 10. ) / (eit->speed / 3.6); @@ -219,16 +262,16 @@ int main (int argc, char *argv[]) { switch(eit->direction) { case _Way::notSure: - fout << startit->first << " " << targetit->first << " " << intDist << " " << 0 << " " << intWeight << " " << eit->type << " " << eit->nameID << "\n"; + fout << eit->start << " " << eit->target << " " << intDist << " " << 0 << " " << intWeight << " " << eit->type << " " << eit->nameID << "\n"; break; case _Way::oneway: - fout << startit->first << " " << targetit->first << " " << intDist << " " << 1 << " " << intWeight << " " << eit->type << " " << eit->nameID << "\n"; + fout << eit->start << " " << eit->target << " " << intDist << " " << 1 << " " << intWeight << " " << eit->type << " " << eit->nameID << "\n"; break; case _Way::bidirectional: - fout << startit->first << " " << targetit->first << " " << intDist << " " << 0 << " " << intWeight << " " << eit->type << " " << eit->nameID << "\n"; + fout << eit->start << " " << eit->target << " " << intDist << " " << 0 << " " << intWeight << " " << eit->type << " " << eit->nameID << "\n"; break; case _Way::opposite: - fout << startit->first << " " << targetit->first << " " << intDist << " " << 1 << " " << intWeight << " " << eit->type << " " << eit->nameID << "\n"; + fout << eit->start << " " << eit->target << " " << intDist << " " << 1 << " " << intWeight << " " << eit->type << " " << eit->nameID << "\n"; break; default: std::cerr << "[error] edge with no direction: " << eit->direction << std::endl; @@ -240,19 +283,18 @@ int main (int argc, char *argv[]) { outputFileName.append(".names"); std::cout << "ok, after " << get_timestamp() - time << "s" << std::endl; -// std::cout << "[debug] written edges: " << testCounter << std::endl; time = get_timestamp(); std::cout << "[extractor] writing street name index ... " << std::flush; - std::vector * nameIndex = new std::vector(nameVector->size()+1, 0); + std::vector * nameIndex = new std::vector(nameVector.size()+1, 0); unsigned currentNameIndex = 0; unsigned elementCounter(0); - for(STXXLStringVector::iterator it = nameVector->begin(); it != nameVector->end(); it++) { + for(STXXLStringVector::iterator it = nameVector.begin(); it != nameVector.end(); it++) { nameIndex->at(elementCounter) = currentNameIndex; currentNameIndex += it->length(); elementCounter++; } - nameIndex->at(nameVector->size()) = currentNameIndex; + nameIndex->at(nameVector.size()) = currentNameIndex; ofstream nameOutFile(outputFileName.c_str(), ios::binary); unsigned sizeOfNameIndex = nameIndex->size(); nameOutFile.write((char *)&(sizeOfNameIndex), sizeof(unsigned)); @@ -260,7 +302,7 @@ int main (int argc, char *argv[]) { for(unsigned i = 0; i < nameIndex->size(); i++) { nameOutFile.write((char *)&(nameIndex->at(i)), sizeof(unsigned)); } - for(STXXLStringVector::iterator it = nameVector->begin(); it != nameVector->end(); it++) { + for(STXXLStringVector::iterator it = nameVector.begin(); it != nameVector.end(); it++) { nameOutFile << *it; } @@ -268,33 +310,23 @@ int main (int argc, char *argv[]) { delete nameIndex; std::cout << "ok, after " << get_timestamp() - time << "s" << std::endl; - time = get_timestamp(); - std::cout << "[extractor] writing address list ... " << std::flush; - - adressFileName.append(".address"); - std::ofstream addressOutFile(adressFileName.c_str()); - for(STXXLAddressVector::iterator it = adressVector->begin(); it != adressVector->end(); it++) { - addressOutFile << it->node.id << "|" << it->node.lat << "|" << it->node.lon << "|" << it->city << "|" << it->street << "|" << it->housenumber << "|" << it->state << "|" << it->country << "\n"; - } - addressOutFile.close(); - std::cout << "ok, after " << get_timestamp() - time << "s" << std::endl; + // time = get_timestamp(); + // std::cout << "[extractor] writing address list ... " << std::flush; + // + // adressFileName.append(".address"); + // std::ofstream addressOutFile(adressFileName.c_str()); + // for(STXXLAddressVector::iterator it = adressVector.begin(); it != adressVector.end(); it++) { + // addressOutFile << it->node.id << "|" << it->node.lat << "|" << it->node.lon << "|" << it->city << "|" << it->street << "|" << it->housenumber << "|" << it->state << "|" << it->country << "\n"; + // } + // addressOutFile.close(); + // std::cout << "ok, after " << get_timestamp() - time << "s" << std::endl; } catch ( const std::exception& e ) { std::cerr << "Caught Execption:" << e.what() << std::endl; return false; } - std::cout << "[info] Statistics:" << std::endl; - std::cout << "[info] -----------" << std::endl; - std::cout << "[info] Usable Nodes: " << confirmedNodes->size() << std::endl; - std::cout << "[info] Usable Edges: " << confirmedEdges->size() << std::endl; - delete extractCallBacks; - delete nodeMap; - delete confirmedNodes; - delete confirmedEdges; - delete adressVector; - delete parser; cout << "[extractor] finished." << endl; return 0; }