fix detection of suffix/prefix changes for name-changes

This commit is contained in:
Moritz Kobitzsch
2017-08-17 15:31:13 +02:00
parent 2ddd98ee6d
commit 884ce4025b
5 changed files with 130 additions and 56 deletions
+88 -50
View File
@@ -10,6 +10,7 @@
#include <algorithm>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
@@ -25,17 +26,81 @@ namespace guidance
// Name Change Logic
// Used both during Extraction as well as during Post-Processing
inline std::pair<std::string, std::string> getPrefixAndSuffix(const std::string &data)
inline std::string longest_common_substring(const std::string &lhs, const std::string &rhs)
{
const auto suffix_pos = data.find_last_of(' ');
if (suffix_pos == std::string::npos)
return {};
if (lhs.empty() || rhs.empty())
return "";
const auto prefix_pos = data.find_first_of(' ');
auto result = std::make_pair(data.substr(0, prefix_pos), data.substr(suffix_pos + 1));
boost::to_lower(result.first);
boost::to_lower(result.second);
return result;
// array for dynamic programming
std::vector<std::vector<std::uint32_t>> dp(lhs.size(),
std::vector<std::uint32_t>(rhs.size(), 0));
// to remember the best location
std::uint32_t best = 0;
std::uint32_t best_pos = 0;
for (std::uint32_t i = 0; i < lhs.size(); ++i)
{
for (std::uint32_t j = 0; j < rhs.size(); ++j)
{
if (lhs[i] == rhs[j])
{
dp[i][j] = (i == 0 || j == 0) ? 1 : (dp[i - 1][j - 1] + 1);
if (dp[i][j] > best)
{
best = dp[i][j];
best_pos = i + 1;
}
}
}
}
// the best position marks the end of the string
return lhs.substr(best_pos - best, best);
}
inline auto decompose(const std::string &lhs, const std::string &rhs)
{
auto const lcs = longest_common_substring(lhs, rhs);
// trim spaces, transform to lower
const auto trim = [](auto str) {
boost::to_lower(str);
auto front = str.find_first_not_of(" ");
if (front == std::string::npos)
return str;
auto back = str.find_last_not_of(" ");
return str.substr(front, back - front + 1);
};
if (lcs.empty())
{
std::string empty = "";
return std::make_tuple(trim(lhs), trim(rhs), empty, empty);
}
// find the common substring in both
auto lhs_pos = lhs.find(lcs);
auto rhs_pos = rhs.find(lcs);
BOOST_ASSERT(lhs_pos + lcs.size() <= lhs.size());
BOOST_ASSERT(rhs_pos + lcs.size() <= rhs.size());
// prefixes
std::string lhs_prefix = (lhs_pos > 0) ? lhs.substr(0, lhs_pos) : "";
std::string rhs_prefix = (rhs_pos > 0) ? rhs.substr(0, rhs_pos) : "";
// suffices
std::string lhs_suffix = lhs.substr(lhs_pos + lcs.size());
std::string rhs_suffix = rhs.substr(rhs_pos + lcs.size());
lhs_prefix = trim(std::move(lhs_prefix));
lhs_suffix = trim(std::move(lhs_suffix));
rhs_prefix = trim(std::move(rhs_prefix));
rhs_suffix = trim(std::move(rhs_suffix));
return std::make_tuple(lhs_prefix, lhs_suffix, rhs_prefix, rhs_suffix);
}
// Note: there is an overload without suffix checking below.
@@ -64,51 +129,24 @@ inline bool requiresNameAnnounced(const std::string &from_name,
const auto name_is_contained =
boost::starts_with(from_name, to_name) || boost::starts_with(to_name, from_name);
const auto checkForPrefixOrSuffixChange = [](
const std::string &first, const std::string &second, const SuffixTable &suffix_table) {
const auto checkForPrefixOrSuffixChange =
[](const std::string &first, const std::string &second, const SuffixTable &suffix_table) {
std::string first_prefix, first_suffix, second_prefix, second_suffix;
std::tie(first_prefix, first_suffix, second_prefix, second_suffix) =
decompose(first, second);
const auto first_prefix_and_suffixes = getPrefixAndSuffix(first);
const auto second_prefix_and_suffixes = getPrefixAndSuffix(second);
const auto checkTable = [&](const std::string &str) {
// workaround for cucumber tests:
if (str.length() == 1 && (first.length() == 2 || second.length() == 2))
return false;
// reverse strings, get suffices and reverse them to get prefixes
const auto checkTable = [&](const std::string &str) {
return str.empty() || suffix_table.isSuffix(str);
return str.empty() || suffix_table.isSuffix(str);
};
return checkTable(first_prefix) && checkTable(first_suffix) &&
checkTable(second_prefix) && checkTable(second_suffix);
};
const auto getOffset = [](const std::string &str) -> std::size_t {
if (str.empty())
return 0;
else
return str.length() + 1;
};
const bool is_prefix_change = [&]() -> bool {
if (!checkTable(first_prefix_and_suffixes.first))
return false;
if (!checkTable(second_prefix_and_suffixes.first))
return false;
return !first.compare(getOffset(first_prefix_and_suffixes.first),
std::string::npos,
second,
getOffset(second_prefix_and_suffixes.first),
std::string::npos);
}();
const bool is_suffix_change = [&]() -> bool {
if (!checkTable(first_prefix_and_suffixes.second))
return false;
if (!checkTable(second_prefix_and_suffixes.second))
return false;
return !first.compare(0,
first.length() - getOffset(first_prefix_and_suffixes.second),
second,
0,
second.length() - getOffset(second_prefix_and_suffixes.second));
}();
return is_prefix_change || is_suffix_change;
};
const auto is_suffix_change = checkForPrefixOrSuffixChange(from_name, to_name, suffix_table);
const auto names_are_equal = from_name == to_name || name_is_contained || is_suffix_change;
const auto name_is_removed = !from_name.empty() && to_name.empty();