Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cu 86c0p7y10 implement levenshtein distance comparison #336

Open
wants to merge 9 commits into
base: dev
Choose a base branch
from
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package org.jembi.jempi.linker.backend;

import com.fasterxml.jackson.core.JsonProcessingException;
import org.apache.commons.codec.language.Soundex;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.JaccardSimilarity;
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.apache.commons.text.similarity.SimilarityScore;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
Expand All @@ -27,6 +29,9 @@ public final class LinkerProbabilistic {
static final JaccardSimilarity JACCARD_SIMILARITY = new JaccardSimilarity();
static final JaroSimilarity JARO_SIMILARITY = new JaroSimilarity();
static final ExactSimilarity EXACT_SIMILARITY = new ExactSimilarity();
static final SoundexSimilarity SOUNDEX_SIMILARITY = new SoundexSimilarity();
static final LevenshteinSimilarity LEVENSHTEIN_SIMILARITY = new LevenshteinSimilarity();
static final LevenshteinSimilarityPercentage LEVENSHTEIN_SIMILARITY_PERCENTAGE = new LevenshteinSimilarityPercentage();
private static final int METRIC_MIN = 0;
private static final int METRIC_MAX = 1;
private static final int METRIC_SCORE = 2;
Expand All @@ -36,15 +41,15 @@ public final class LinkerProbabilistic {
private static final float MISSING_PENALTY = 0.925F;
static List<ProbabilisticField> currentProbabilisticLinkFields = LINKER_CONFIG.probabilisticLinkFields
.stream()
.map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u()))
.map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u()))
MatthewErispe marked this conversation as resolved.
Show resolved Hide resolved
.toList();
static List<ProbabilisticField> currentProbabilisticValidateFields = LINKER_CONFIG.probabilisticValidateFields
.stream()
.map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u()))
.map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u()))
.toList();
static List<ProbabilisticField> currentProbabilisticMatchFields = LINKER_CONFIG.probabilisticMatchNotificationFields
.stream()
.map(f -> new ProbabilisticField(getSimilarityFunction(f.similarityScore()), f.comparisonLevels(), f.m(), f.u()))
.map(f -> new ProbabilisticField(getSimilarityFunction(SimilarityFunctionName.valueOf(f.similarityScore())), f.comparisonLevels(), f.m(), f.u()))
.toList();

static List<ProbabilisticField> updatedProbabilisticLinkFields = null;
Expand All @@ -65,18 +70,39 @@ static List<ProbabilisticField> toLinkProbabilisticFieldList(
final var list = new ArrayList<ProbabilisticField>();
for (int i = 0; i < mu.size(); i++) {
list.add(new ProbabilisticField(
getSimilarityFunction(probabilisticMetaData.get(i).similarityScore()),
getSimilarityFunction(SimilarityFunctionName.valueOf(probabilisticMetaData.get(i).similarityScore())),
probabilisticMetaData.get(i).comparisonLevels(),
mu.get(i).m(), mu.get(i).u()));
}
return list;
}

static SimilarityScore<Double> getSimilarityFunction(final String func) {
if ("JARO_WINKLER_SIMILARITY".equals(func)) {
return JARO_WINKLER_SIMILARITY;
} else {
return JACCARD_SIMILARITY;
/**
 * Names of the similarity functions that {@code getSimilarityFunction} can resolve.
 *
 * <p>Configured similarity names are converted to these constants with
 * {@code SimilarityFunctionName.valueOf(...)}, so a configured name has to match a
 * constant exactly; {@code valueOf} throws {@link IllegalArgumentException} otherwise.
 */
public enum SimilarityFunctionName {
// Resolved to the shared JARO_WINKLER_SIMILARITY scorer.
JARO_WINKLER_SIMILARITY,
// Resolved to the shared JARO_SIMILARITY scorer.
JARO_SIMILARITY,
// Resolved to the shared JACCARD_SIMILARITY scorer.
JACCARD_SIMILARITY,
// Resolved to the shared SOUNDEX_SIMILARITY scorer.
SOUNDEX_SIMILARITY,
// Has no explicit case in getSimilarityFunction; it is served by the default branch,
// which returns the shared EXACT_SIMILARITY scorer.
EXACT_SIMILARITY,
// Resolved to the shared LEVENSHTEIN_SIMILARITY scorer.
LEVENSHTEIN_SIMILARITY,
// Resolved to the shared LEVENSHTEIN_SIMILARITY_PERCENTAGE scorer.
LEVENSHTEIN_SIMILARITY_PERCENTAGE
}

static SimilarityScore<Double> getSimilarityFunction(final SimilarityFunctionName func) {
switch (func) {
case JARO_WINKLER_SIMILARITY:
return JARO_WINKLER_SIMILARITY;
case JARO_SIMILARITY:
return JARO_SIMILARITY;
case JACCARD_SIMILARITY:
return JACCARD_SIMILARITY;
case SOUNDEX_SIMILARITY:
return SOUNDEX_SIMILARITY;
case LEVENSHTEIN_SIMILARITY:
return LEVENSHTEIN_SIMILARITY;
case LEVENSHTEIN_SIMILARITY_PERCENTAGE:
return LEVENSHTEIN_SIMILARITY_PERCENTAGE;
default:
return EXACT_SIMILARITY;
MatthewErispe marked this conversation as resolved.
Show resolved Hide resolved
}
}

Expand Down Expand Up @@ -268,14 +294,74 @@ public Double apply(
if (StringUtils.isEmpty(left) || StringUtils.isEmpty(right)) {
return 0.5;
}
// assert - we have 2 non-empty strings

return StringUtils.equals(left, right)
? 1.0
MatthewErispe marked this conversation as resolved.
Show resolved Hide resolved
: 0.0;
}

}

/**
 * Phonetic comparison: two values are considered a match when they share the same
 * Soundex code.
 *
 * <p>Soundex only maps the letters A–Z. Values for which no usable code exists
 * (they contain a letter outside that range, or no letters at all) are compared
 * exactly instead, so that such input neither throws nor matches unrelated values.
 */
static class SoundexSimilarity implements SimilarityScore<Double> {

   private final Soundex soundex = new Soundex();

   /**
    * Scores two values by their Soundex codes.
    *
    * @param left  the first value, may be {@code null} or empty
    * @param right the second value, may be {@code null} or empty
    * @return {@code 0.5} when either value is {@code null} or empty; otherwise
    *         {@code 1.0} when both values have the same Soundex code and {@code 0.0}
    *         when they do not. When either value has no usable Soundex code the
    *         result is {@code 1.0} only if the two values are exactly equal.
    */
   @Override
   public Double apply(
         final CharSequence left,
         final CharSequence right) {
      if (StringUtils.isEmpty(left) || StringUtils.isEmpty(right)) {
         return 0.5;
      }

      final String leftCode = encode(left);
      final String rightCode = encode(right);
      if (leftCode == null || rightCode == null) {
         // No phonetic code available for at least one side: compare the raw values.
         return StringUtils.equals(left, right)
               ? 1.0
               : 0.0;
      }

      return leftCode.equals(rightCode)
            ? 1.0
            : 0.0;
   }

   /** Returns the Soundex code of {@code value}, or {@code null} when it has no usable code. */
   private String encode(final CharSequence value) {
      try {
         final String code = soundex.soundex(value.toString());
         // An empty code means the value contained no letters; treat it as not encodable
         // so that e.g. two different digit-only values are not reported as a match.
         return StringUtils.isEmpty(code)
               ? null
               : code;
      } catch (IllegalArgumentException ignored) {
         // Soundex rejects letters it has no mapping for (e.g. accented characters).
         return null;
      }
   }

}
MatthewErispe marked this conversation as resolved.
Show resolved Hide resolved

static class LevenshteinSimilarityPercentage implements SimilarityScore<Double> {

private final LevenshteinDistance levenshteinDistance = new LevenshteinDistance();

@Override
public Double apply(
final CharSequence left,
final CharSequence right) {
if (StringUtils.isEmpty(left) || StringUtils.isEmpty(right)) {
return 0.5;
}

int maxLength = Math.max(left.length(), right.length());
double levenshteinDistanceValue = levenshteinDistance.apply(left, right);

// Invert the percentage value
double percentage = (levenshteinDistanceValue / maxLength) * 100;
double invertedPercentage = 100 - percentage;

return invertedPercentage;
MatthewErispe marked this conversation as resolved.
Show resolved Hide resolved
}

}

static class LevenshteinSimilarity implements SimilarityScore<Double> {

private final LevenshteinDistance levenshteinDistance = new LevenshteinDistance();

@Override
public Double apply(
final CharSequence left,
final CharSequence right) {
if (StringUtils.isEmpty(left) || StringUtils.isEmpty(right)) {
return 0.5;
}

return Double.valueOf(levenshteinDistance.apply(left, right));
MatthewErispe marked this conversation as resolved.
Show resolved Hide resolved
}

}

static class JaroSimilarity implements SimilarityScore<Double> {

@Override
Expand Down