Skip to content

Commit

Permalink
createObjectsFromName(): be more tolerant about N/S vs North/South, a…
Browse files Browse the repository at this point in the history
…bsence of zone or height

We want the following matches to be possible:

user entry            official name
------------------    ----------------
EGM96                 EGM96 height
WGS84 UTM31 north     WGS 84 / UTM zone 31N
  • Loading branch information
rouault committed Nov 10, 2024
1 parent 9f2289c commit 094005c
Show file tree
Hide file tree
Showing 7 changed files with 239 additions and 39 deletions.
6 changes: 5 additions & 1 deletion include/proj/metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -397,11 +397,15 @@ class PROJ_GCC_DLL Identifier : public util::BaseObject,

PROJ_DLL static bool isEquivalentName(const char *a,
const char *b) noexcept;
PROJ_DLL static bool
isEquivalentName(const char *a, const char *b,
bool biggerDifferencesAllowed) noexcept;

PROJ_PRIVATE :
//! @cond Doxygen_Suppress
PROJ_INTERNAL static std::string
canonicalizeName(const std::string &str);
canonicalizeName(const std::string &str,
bool biggerDifferencesAllowed = true);

PROJ_INTERNAL void _exportToWKT(io::WKTFormatter *formatter)
const override; // throw(io::FormattingException)
Expand Down
1 change: 1 addition & 0 deletions scripts/reference_exported_symbols.txt
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,7 @@ osgeo::proj::metadata::Identifier::description() const
osgeo::proj::metadata::Identifier::~Identifier()
osgeo::proj::metadata::Identifier::Identifier(osgeo::proj::metadata::Identifier const&)
osgeo::proj::metadata::Identifier::isEquivalentName(char const*, char const*)
osgeo::proj::metadata::Identifier::isEquivalentName(char const*, char const*, bool)
osgeo::proj::metadata::Identifier::uri() const
osgeo::proj::metadata::Identifier::version() const
osgeo::proj::metadata::PositionalAccuracy::create(std::string const&)
Expand Down
1 change: 1 addition & 0 deletions src/apps/projinfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ static BaseObjectNNPtr buildObject(
limitResultCount);
if (res.size() == 1) {
obj = res.front().as_nullable();
break;
} else {
for (const auto &l_obj : res) {
if (Identifier::isEquivalentName(
Expand Down
32 changes: 22 additions & 10 deletions src/iso19111/factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9171,7 +9171,8 @@ AuthorityFactory::createObjectsFromNameEx(
auto sqlRes = d->run(sql, params);
bool isFirst = true;
bool firstIsDeprecated = false;
bool foundExactMatch = false;
size_t countExactMatch = 0;
size_t countExactMatchOnAlias = 0;
std::size_t hashCodeFirstMatch = 0;
for (const auto &row : sqlRes) {
const auto &name = row[3];
Expand Down Expand Up @@ -9262,9 +9263,12 @@ AuthorityFactory::createObjectsFromNameEx(
throw std::runtime_error("Unsupported table_name");
};
const auto obj = getObject(table_name, code);
if (metadata::Identifier::canonicalizeName(obj->nameStr()) ==
canonicalizedSearchedName) {
foundExactMatch = true;
if (metadata::Identifier::isEquivalentName(
obj->nameStr().c_str(), searchedName.c_str(), false)) {
countExactMatch++;
} else if (metadata::Identifier::isEquivalentName(
name.c_str(), searchedName.c_str(), false)) {
countExactMatchOnAlias++;
}

const auto objPtr = obj.get();
Expand All @@ -9280,14 +9284,21 @@ AuthorityFactory::createObjectsFromNameEx(
}
}

// If we found a name that is an exact match, and all objects have the
// same type, and we are not in approximate mode, only keep the
// object(s) with the exact name match.
if (foundExactMatch && hashCodeFirstMatch != 0 && !approximateMatch) {
// If we found at least one object that is an exact match, and all objects
// have the same type, and we are not in approximate mode, only keep the
// objects with the exact name match.
if ((countExactMatch + countExactMatchOnAlias) >= 1 &&
hashCodeFirstMatch != 0 && !approximateMatch) {
std::list<PairObjectName> resTmp;
bool biggerDifferencesAllowed = (countExactMatch == 0);
for (const auto &pair : res) {
if (metadata::Identifier::canonicalizeName(
pair.first->nameStr()) == canonicalizedSearchedName) {
if (metadata::Identifier::isEquivalentName(
pair.first->nameStr().c_str(), searchedName.c_str(),
biggerDifferencesAllowed) ||
(countExactMatch == 0 &&
metadata::Identifier::isEquivalentName(
pair.second.c_str(), searchedName.c_str(),
biggerDifferencesAllowed))) {
resTmp.emplace_back(pair);
}
}
Expand All @@ -9298,6 +9309,7 @@ AuthorityFactory::createObjectsFromNameEx(
auto sortLambda = [](const PairObjectName &a, const PairObjectName &b) {
const auto &aName = a.first->nameStr();
const auto &bName = b.first->nameStr();

if (aName.size() < bName.size()) {
return true;
}
Expand Down
27 changes: 15 additions & 12 deletions src/iso19111/io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7966,20 +7966,23 @@ static BaseObjectNNPtr createFromUserInput(const std::string &text,

// If there's exactly only one object whose name is equivalent
// to the user input, return it.
IdentifiedObjectPtr identifiedObj;
for (const auto &obj : res) {
if (Identifier::isEquivalentName(obj->nameStr().c_str(),
objectName.c_str())) {
if (identifiedObj == nullptr) {
identifiedObj = obj.as_nullable();
} else {
identifiedObj = nullptr;
break;
for (int pass = 0; pass <= 1; ++pass) {
IdentifiedObjectPtr identifiedObj;
for (const auto &obj : res) {
if (Identifier::isEquivalentName(
obj->nameStr().c_str(), objectName.c_str(),
/* biggerDifferencesAllowed = */ pass == 1)) {
if (identifiedObj == nullptr) {
identifiedObj = obj.as_nullable();
} else {
identifiedObj = nullptr;
break;
}
}
}
}
if (identifiedObj) {
return identifiedObj;
if (identifiedObj) {
return identifiedObj;
}
}

std::string msg("several objects matching this name: ");
Expand Down
172 changes: 157 additions & 15 deletions src/iso19111/metadata.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1214,6 +1214,14 @@ static bool isIgnoredChar(char ch) {

// ---------------------------------------------------------------------------

//! @cond Doxygen_Suppress
/** ASCII-only lower-casing of a single character.
 *
 * Characters in the range 'A'..'Z' are mapped to 'a'..'z'; any other
 * character (including bytes above 127) is returned unchanged.
 */
static char lower(char ch) {
    if (ch < 'A' || ch > 'Z') {
        return ch;
    }
    return static_cast<char>(ch - 'A' + 'a');
}
//! @endcond

// ---------------------------------------------------------------------------

//! @cond Doxygen_Suppress
static const struct utf8_to_lower {
const char *utf8;
Expand Down Expand Up @@ -1249,21 +1257,87 @@ static const struct utf8_to_lower *get_ascii_replacement(const char *c_str) {
// ---------------------------------------------------------------------------

//! @cond Doxygen_Suppress
std::string Identifier::canonicalizeName(const std::string &str) {

/** Checks, in an ASCII case-insensitive way, whether c_str starts with needle.
 *
 * Both strings are walked in parallel from their first character; the
 * function returns true only if the whole of needle was consumed without
 * hitting a mismatch. An empty needle therefore always matches.
 *
 * e.g matchesLowerCase("JavaScript", "java") returns true,
 * whereas matchesLowerCase("JavaScript", "script") returns false.
 *
 * @param c_str  nul-terminated string to inspect
 * @param needle nul-terminated prefix to look for
 * @return true if needle is a case-insensitive prefix of c_str
 */
static bool matchesLowerCase(const char *c_str, const char *needle) {
    const char *hay = c_str;
    const char *pat = needle;
    while (*hay != '\0' && *pat != '\0') {
        if (lower(*hay) != lower(*pat)) {
            return false;
        }
        ++hay;
        ++pat;
    }
    // Success only if the needle was fully consumed (c_str may be longer).
    return *pat == '\0';
}
//! @endcond

// ---------------------------------------------------------------------------

//! @cond Doxygen_Suppress

/** Returns true if ch is one of the ASCII decimal digits '0'..'9'. */
static inline bool isdigit(char ch) {
    const bool outsideDigitRange = (ch < '0') || (ch > '9');
    return !outsideDigitRange;
}
//! @endcond

// ---------------------------------------------------------------------------

//! @cond Doxygen_Suppress
std::string Identifier::canonicalizeName(const std::string &str,
bool biggerDifferencesAllowed) {
std::string res;
const char *c_str = str.c_str();
for (size_t i = 0; c_str[i] != 0; ++i) {
const auto ch = c_str[i];
const auto ch = lower(c_str[i]);
if (ch == ' ' && c_str[i + 1] == '+' && c_str[i + 2] == ' ') {
i += 2;
continue;
}
if (ch == '1' && !res.empty() &&
!(res.back() >= '0' && res.back() <= '9') && c_str[i + 1] == '9' &&
c_str[i + 2] >= '0' && c_str[i + 2] <= '9') {

// Canonicalize "19dd" (where d is a digit) as "dd"
if (ch == '1' && !res.empty() && !isdigit(res.back()) &&
c_str[i + 1] == '9' && isdigit(c_str[i + 2]) &&
isdigit(c_str[i + 3])) {
++i;
continue;
}

if (biggerDifferencesAllowed) {
// Skip "zone" if preceding character is a space
if (ch == 'z' && i > 0 && isIgnoredChar(c_str[i - 1]) &&
matchesLowerCase(c_str + i, "zone")) {
i += strlen("zone") - 1;
continue;
}

// Skip "height" if preceding character is a space
else if (ch == 'h' && i > 0 && isIgnoredChar(c_str[i - 1]) &&
matchesLowerCase(c_str + i, "height")) {
i += strlen("height") - 1;
continue;
}

// Replace "north" by "n" if preceding character is a space or a
// digit
if (ch == 'n' && i > 0 &&
(isIgnoredChar(c_str[i - 1]) || isdigit(c_str[i - 1])) &&
matchesLowerCase(c_str + i, "north")) {
res.push_back(ch);
i += strlen("north") - 1;
continue;
}

// Replace "south" by "s" if preceding character is a space or a
// digit
else if (ch == 's' && i > 0 &&
(isIgnoredChar(c_str[i - 1]) || isdigit(c_str[i - 1])) &&
matchesLowerCase(c_str + i, "south")) {
res.push_back(ch);
i += strlen("south") - 1;
continue;
}
}

if (static_cast<unsigned char>(ch) > 127) {
const auto *replacement = get_ascii_replacement(c_str + i);
if (replacement) {
Expand All @@ -1273,7 +1347,7 @@ std::string Identifier::canonicalizeName(const std::string &str) {
}
}
if (!isIgnoredChar(ch)) {
res.push_back(static_cast<char>(::tolower(ch)));
res.push_back(ch);
}
}
return res;
Expand All @@ -1286,15 +1360,22 @@ std::string Identifier::canonicalizeName(const std::string &str) {
*
* Two names are equivalent by removing any space, underscore, dash, slash,
* { or } character from them, and comparing in a case insensitive way.
*
* @param a first string
* @param b second string
* @param biggerDifferencesAllowed if true, "height" and "zone" words are
* ignored, and "north" is shortened as "n" and "south" as "s".
* @since 9.6
*/
bool Identifier::isEquivalentName(const char *a, const char *b) noexcept {
bool Identifier::isEquivalentName(const char *a, const char *b,
bool biggerDifferencesAllowed) noexcept {
size_t i = 0;
size_t j = 0;
char lastValidA = 0;
char lastValidB = 0;
while (a[i] != 0 || b[j] != 0) {
char aCh = a[i];
char bCh = b[j];
char aCh = lower(a[i]);
char bCh = lower(b[j]);
if (aCh == ' ' && a[i + 1] == '+' && a[i + 2] == ' ' && a[i + 3] != 0) {
i += 3;
continue;
Expand All @@ -1311,18 +1392,69 @@ bool Identifier::isEquivalentName(const char *a, const char *b) noexcept {
++j;
continue;
}
if (aCh == '1' && !(lastValidA >= '0' && lastValidA <= '9') &&
a[i + 1] == '9' && a[i + 2] >= '0' && a[i + 2] <= '9') {

// Canonicalize "19dd" (where d is a digit) as "dd"
if (aCh == '1' && !isdigit(lastValidA) && a[i + 1] == '9' &&
isdigit(a[i + 2]) && isdigit(a[i + 3])) {
i += 2;
lastValidA = '9';
continue;
}
if (bCh == '1' && !(lastValidB >= '0' && lastValidB <= '9') &&
b[j + 1] == '9' && b[j + 2] >= '0' && b[j + 2] <= '9') {
if (bCh == '1' && !isdigit(lastValidB) && b[j + 1] == '9' &&
isdigit(b[j + 2]) && isdigit(b[j + 3])) {
j += 2;
lastValidB = '9';
continue;
}

if (biggerDifferencesAllowed) {
// Skip a substring if preceding character is a space
const auto skipSubString = [](char ch, const char *str, size_t &idx,
const char *substr) {
if (ch == substr[0] && idx > 0 && isIgnoredChar(str[idx - 1]) &&
matchesLowerCase(str + idx, substr)) {
idx += strlen(substr);
return true;
}
return false;
};

bool skip = false;
if (skipSubString(aCh, a, i, "zone"))
skip = true;
if (skipSubString(bCh, b, j, "zone"))
skip = true;
if (skip)
continue;

if (skipSubString(aCh, a, i, "height"))
skip = true;
if (skipSubString(bCh, b, j, "height"))
skip = true;
if (skip)
continue;

// Replace a substring by its first character if preceding character
// is a space or a digit
const auto replaceByFirstChar = [](char ch, const char *str,
size_t &idx,
const char *substr) {
if (ch == substr[0] && idx > 0 &&
(isIgnoredChar(str[idx - 1]) || isdigit(str[idx - 1])) &&
matchesLowerCase(str + idx, substr)) {
idx += strlen(substr) - 1;
return true;
}
return false;
};

if (!replaceByFirstChar(aCh, a, i, "north"))
replaceByFirstChar(aCh, a, i, "south");

if (!replaceByFirstChar(bCh, b, j, "north"))
replaceByFirstChar(bCh, b, j, "south");
}

if (static_cast<unsigned char>(aCh) > 127) {
const auto *replacement = get_ascii_replacement(a + i);
if (replacement) {
Expand All @@ -1337,8 +1469,7 @@ bool Identifier::isEquivalentName(const char *a, const char *b) noexcept {
j += strlen(replacement->utf8) - 1;
}
}
if ((aCh == 0 && bCh != 0) || (aCh != 0 && bCh == 0) ||
::tolower(aCh) != ::tolower(bCh)) {
if (aCh != bCh) {
return false;
}
lastValidA = aCh;
Expand All @@ -1353,6 +1484,17 @@ bool Identifier::isEquivalentName(const char *a, const char *b) noexcept {

// ---------------------------------------------------------------------------

/** \brief Returns whether two names are considered equivalent.
 *
 * Two names are equivalent by removing any space, underscore, dash, slash,
 * { or } character from them, and comparing in a case insensitive way.
 *
 * This two-argument overload simply forwards to the three-argument overload
 * with biggerDifferencesAllowed set to true.
 *
 * @param a first string
 * @param b second string
 * @return the result of isEquivalentName(a, b, true)
 */
bool Identifier::isEquivalentName(const char *a, const char *b) noexcept {
return isEquivalentName(a, b, /* biggerDifferencesAllowed = */ true);
}

// ---------------------------------------------------------------------------

//! @cond Doxygen_Suppress
struct PositionalAccuracy::Private {
std::string value_{};
Expand Down
Loading

0 comments on commit 094005c

Please sign in to comment.