diff --git a/.gitignore b/.gitignore
index d4d8e1436..352452212 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,8 @@ target
*.iws
*.ipynb
*.egg-info
+*~
+java.hprof.txt
.classpath
.project
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithmFactory.java b/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithmFactory.java
index b6555d756..6db3ab1a7 100644
--- a/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithmFactory.java
+++ b/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithmFactory.java
@@ -21,33 +21,107 @@
import eu.interedition.collatex.dekker.DekkerAlgorithm;
import eu.interedition.collatex.medite.MediteAlgorithm;
-import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm;
+import eu.interedition.collatex.needlemanwunsch.*;
+import eu.interedition.collatex.needlemanwunschgotoh.*;
+import eu.interedition.collatex.matching.*;
+import eu.interedition.collatex.simple.SimpleToken;
import eu.interedition.collatex.util.GreedyStringTilingAlgorithm;
import eu.interedition.collatex.util.VertexMatch;
import java.util.Comparator;
import java.util.SortedSet;
import java.util.function.Function;
+import java.util.logging.Level;
+import java.util.logging.Logger;
/**
* @author Gregor Middell
* @author Ronald Haentjens Dekker
*/
public class CollationAlgorithmFactory {
+ protected final static Logger LOG = Logger.getLogger("CollationAlgorithmFactory");
public static CollationAlgorithm dekker(Comparator comparator) {
return new DekkerAlgorithm(comparator);
}
+
public static CollationAlgorithm needlemanWunsch(Comparator comparator) {
- return new NeedlemanWunschAlgorithm(comparator);
+ return new eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm(comparator);
+ }
+
+
+ public static CollationAlgorithm needlemanWunschGotoh(StringMetricScorer scorer) {
+ return new eu.interedition.collatex.needlemanwunschgotoh.NeedlemanWunschGotohAlgorithm(scorer);
+ }
+
+ public static CollationAlgorithm needlemanWunschGotoh() {
+ return needlemanWunschGotoh(new TrigramRatioScorer());
+ }
+
+
+ public static CollationAlgorithm greedyStringTiling(Comparator comparator) {
+ return greedyStringTiling(comparator, 2);
}
- public static CollationAlgorithm greedyStringTiling(Comparator comparator, int minimumTileLength) {
+ public static CollationAlgorithm greedyStringTiling(Comparator comparator,
+ Integer minimumTileLength) {
return new GreedyStringTilingAlgorithm(comparator, minimumTileLength);
}
- public static CollationAlgorithm medite(Comparator comparator, Function, Integer> matchEvaluator) {
+
+ public static CollationAlgorithm medite(Comparator comparator) {
+ return medite(comparator, SimpleToken.TOKEN_MATCH_EVALUATOR);
+ }
+
+ public static CollationAlgorithm medite(Comparator comparator,
+ Function, Integer> matchEvaluator) {
return new MediteAlgorithm(comparator, matchEvaluator);
}
+
+
+ /**
+  * Creates a token comparator from its name.
+  *
+  * Recognized names are "equality", "levenshtein.distance" and
+  * "levenshtein.ratio". Any other name yields an equality comparator.
+  *
+  * @param name The comparator name
+  * @param args Optional arguments; args[0] is cast to Integer for
+  *             "levenshtein.distance" and to Double for "levenshtein.ratio"
+  * @return The comparator
+  */
+ public static Comparator createComparator(String name, Object... args) {
+     if (LOG.isLoggable(Level.CONFIG)) {
+         LOG.log(Level.CONFIG, "Comparator: {0}", name);
+     }
+     switch (name) {
+         case "equality":
+             // Must return here: without it control falls through into the
+             // "levenshtein.distance" case and the wrong comparator is used.
+             return new EqualityTokenComparator();
+         case "levenshtein.distance":
+             return args.length >= 1 ?
+                 new EditDistanceTokenComparator((Integer) args[0]) :
+                 new EditDistanceTokenComparator();
+         case "levenshtein.ratio":
+             return args.length >= 1 ?
+                 new EditDistanceRatioTokenComparator((Double) args[0]) :
+                 new EditDistanceRatioTokenComparator();
+     }
+     return new EqualityTokenComparator(); // default
+ }
+
+ public static CollationAlgorithm createAlgorithm(String name, Comparator comparator, // builds the algorithm registered under name
+ Object... args) { // args[0], when present, is an algorithm-specific option (see the cases below)
+ if (LOG.isLoggable(Level.CONFIG)) {
+ LOG.log(Level.CONFIG, "Algorithm: {0}", name);
+ }
+ switch (name) {
+ case "dekker":
+ return dekker(comparator);
+ case "gst": // greedy string tiling; optional args[0] is the minimum tile length
+ return args.length >= 1 ?
+ greedyStringTiling(comparator, (Integer) args[0]) :
+ greedyStringTiling(comparator);
+ case "medite": // optional args[0] is the match evaluator function
+ return args.length >= 1 ?
+ medite(comparator, (Function, Integer>) args[0]) :
+ medite(comparator);
+ case "needleman-wunsch":
+ return needlemanWunsch(comparator);
+ case "needleman-wunsch-gotoh": // optional args[0] is the string metric scorer; comparator is not used here
+ return args.length >= 1 ?
+ needlemanWunschGotoh((eu.interedition.collatex.matching.StringMetricScorer) args[0]) :
+ needlemanWunschGotoh();
+ }
+ return dekker(comparator); // default: any unrecognized name falls back to the Dekker algorithm
+ }
}
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/VariantGraph.java b/collatex-core/src/main/java/eu/interedition/collatex/VariantGraph.java
index 8e039bd64..b3a81d805 100644
--- a/collatex-core/src/main/java/eu/interedition/collatex/VariantGraph.java
+++ b/collatex-core/src/main/java/eu/interedition/collatex/VariantGraph.java
@@ -37,12 +37,16 @@
* @author Gregor Middell
*/
public class VariantGraph {
- final VariantGraph.Vertex start;
- final VariantGraph.Vertex end;
+ VariantGraph.Vertex start;
+ VariantGraph.Vertex end;
final Map>> transpositionIndex = new HashMap<>();
public VariantGraph() {
super();
+ init();
+ }
+
+ public void init() {
this.start = new VariantGraph.Vertex(this);
this.end = new VariantGraph.Vertex(this);
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistance.java b/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistance.java
index 674a50de2..560db5f26 100644
--- a/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistance.java
+++ b/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistance.java
@@ -23,6 +23,10 @@ public final class EditDistance {
private static final int MAX_DISTANCE_COMPARISON = 2500;
public static int compute(String str1, String str2) {
+ return compute(str1, str2, 1);
+ }
+
+ public static int compute(String str1, String str2, int subst_cost) {
if ((str1.length() * str2.length() > MAX_DISTANCE_COMPARISON)) {
return MAX_DISTANCE_COMPARISON;
}
@@ -53,7 +57,7 @@ public static int compute(String str1, String str2) {
final char str1Char = str1Chars[i - 1];
for (int j = 1; j <= str2Length; j++) {
final char str2Char = str2Chars[j - 1];
- final int cost = (str1Char == str2Char ? 0 : 1);
+ final int cost = (str1Char == str2Char ? 0 : subst_cost);
matrix[i][j] = min3(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + cost);
}
}
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistanceRatioTokenComparator.java b/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistanceRatioTokenComparator.java
new file mode 100644
index 000000000..b7435aab5
--- /dev/null
+++ b/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistanceRatioTokenComparator.java
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2015 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package eu.interedition.collatex.matching;
+
+import eu.interedition.collatex.Token;
+import eu.interedition.collatex.simple.SimpleToken;
+
+import java.util.Comparator;
+
+public class EditDistanceRatioTokenComparator implements Comparator { // compares tokens by the similarity score of their normalized text
+
+ private final double threshold; // minimum score at which two tokens compare as equal
+ private final LevenshteinRatioScorer scorer; // produces the score compared against the threshold
+
+ public EditDistanceRatioTokenComparator() {
+ this(0.6); // default threshold
+ }
+
+ public EditDistanceRatioTokenComparator(double threshold) {
+ this.threshold = threshold;
+ this.scorer = new LevenshteinRatioScorer();
+ }
+
+ @Override
+ public int compare(Token token_a, Token token_b) {
+ final String a = ((SimpleToken) token_a).getNormalized(); // both tokens must be SimpleToken, otherwise the cast throws ClassCastException
+ final String b = ((SimpleToken) token_b).getNormalized();
+ return (scorer.score(a, b) >= threshold) ? 0 : a.compareTo(b); // 0 when similar enough, else string order; NOTE(review): fuzzy equality is not transitive, so this is not a total order - confirm callers only test for 0
+ }
+}
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/matching/StrictEqualityTokenComparator.java b/collatex-core/src/main/java/eu/interedition/collatex/matching/StrictEqualityTokenComparator.java
old mode 100755
new mode 100644
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohAlgorithm.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohAlgorithm.java
new file mode 100644
index 000000000..437cdaf9c
--- /dev/null
+++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohAlgorithm.java
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2015 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package eu.interedition.collatex.needlemanwunschgotoh;
+
+import eu.interedition.collatex.CollationAlgorithm;
+import eu.interedition.collatex.Token;
+import eu.interedition.collatex.VariantGraph;
+import eu.interedition.collatex.Witness;
+import eu.interedition.collatex.matching.Pair;
+import eu.interedition.collatex.matching.StringMetricScorer;
+import eu.interedition.collatex.util.VariantGraphRanking;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+
+/**
+ * Implements the Needleman-Wunsch-Gotoh collation algorithm.
+ *
+ * @author Marcello Perathoner
+ */
+public class NeedlemanWunschGotohAlgorithm implements CollationAlgorithm {
+
+ private final StringMetricScorer scorer;
+ /** The minimum similarity score to merge two tokens into one vertex. */
+ private final double matchScore;
+ /** A human-readable matrix. Written to if set. */
+ private StringBuilder debugMatrix = null;
+
+ public NeedlemanWunschGotohAlgorithm(final StringMetricScorer scorer) {
+ this.scorer = scorer;
+ this.matchScore = 0.6;
+ }
+
+ @Override
+ public void collate(final VariantGraph graph, final Iterable witness) {
+ List> witnesses = new ArrayList<>();
+ witnesses.add(witness);
+ collate(graph, witnesses);
+ };
+
+ @Override
+ public void collate(final VariantGraph graph, final Iterable... witnesses) {
+ collate(graph, Arrays.asList(witnesses));
+ };
+
+ @Override
+ public void collate(final VariantGraph graph, final List extends Iterable> witnesses) {
+ List> rankingA = graphToRanking(graph);
+ Set witnessesA = getWitnesses(rankingA);
+
+ for (Iterable witness : witnesses) {
+ final List> rankingB = tokensToRanking(witness);
+ final Set witnessesB = getWitnesses(rankingB);
+
+ rankingA = doCollate(rankingA, witnessesA, rankingB, witnessesB);
+ witnessesA.addAll(witnessesB);
+ }
+ rankingToGraph(graph, rankingA);
+ };
+
+ public void setDebugMatrix(final StringBuilder debugMatrix) {
+ this.debugMatrix = debugMatrix;
+ }
+
+ /* Incipit private stuff */
+
+ private Set getWitnesses(final VariantGraph.Vertex vertex) {
+ return vertex.tokens().stream().map(t -> t.getWitness()).collect(Collectors.toSet());
+ }
+
+ private Set getWitnesses(final Collection> ranks) {
+ return ranks.stream()
+ .flatMap(s -> s.stream())
+ .flatMap(t -> t.tokens().stream())
+ .map(t -> t.getWitness())
+ .collect(Collectors.toSet());
+ }
+
+ private Set tokenToVertexSet(final Token t) {
+ Set vertexSet = new HashSet<>();
+ VariantGraph.Vertex vertex = new VariantGraph.Vertex(null);
+ vertex.tokens().add(t);
+ vertexSet.add(vertex);
+ return vertexSet;
+ }
+
+ /**
+ * Import a set of vertices into a new graph
+ *
+ * Makes a copy of the vertex and imports it into the new graph. Does not
+ * connect the vertices.
+ *
+ * @param graph The graph into which to import the vertices
+ * @param vertices The vertices to import
+ * @return A set of copied and imported vertices
+ */
+ private Set importVertexSet(
+ final VariantGraph graph,
+ final Set vertices) {
+ return vertices.stream()
+ .map(vertex -> {
+ VariantGraph.Vertex v = new VariantGraph.Vertex(graph);
+ v.tokens().addAll(vertex.tokens());
+ return v;
+ }
+ ).collect(Collectors.toSet());
+ }
+
+ /**
+ * Create a ranking from a graph.
+ *
+ * @param graph The graph
+ * @return The ranking
+ */
+ private List> graphToRanking(final VariantGraph graph) {
+ final List> ranking = new ArrayList<>();
+ Iterator> iter = VariantGraphRanking.of(graph).iterator();
+ while (iter.hasNext()) {
+ ranking.add(importVertexSet(null, iter.next()));
+ }
+ ranking.remove(0); // shave off graph start element
+ ranking.remove(ranking.size() - 1); // shave off graph end element
+ return ranking;
+ }
+
+ /**
+ * Create a list of {@code Set<VariantGraph.Vertex>} from a witness.
+ *
+ * The rationale behind this conversion is to make both inputs to the
+ * collator be of the same type. This because a symmetric problem is
+ * generally easier to solve.
+ *
+ * @param iter A witness as token stream
+ * @return A ranking with one token in each rank
+ */
+ private List> tokensToRanking(final Iterable iter) {
+ final List> ranking = new ArrayList<>();
+ for (Token t : iter) {
+ ranking.add(tokenToVertexSet(t));
+ }
+ return ranking;
+ }
+
+ /**
+ * Collate two rankings
+ *
+ * The aligner decides which ranks of rankingA and rankingB to align. If
+ * the alignment is good enough, two vertices, one in each ranking, are
+ * merged.
+ *
+ * @param rankingA A ranking
+ * @param witnessesA All witnesses in rankingA
+ * @param rankingB A ranking
+ * @param witnessesB All witnesses in rankingB
+ * @return The collated ranking
+ */
+
+ private List> doCollate(final Collection> rankingA,
+ final Set witnessesA,
+ final Collection> rankingB,
+ final Set witnessesB) {
+ // Run the aligner.
+
+ NeedlemanWunschGotohAligner aligner =
+ new NeedlemanWunschGotohAligner(new NeedlemanWunschProfileScorer(scorer, witnessesA.size()));
+ aligner.setDebugMatrix (debugMatrix);
+
+ NeedlemanWunschScorerSetVertexSetVertex matcher =
+ new NeedlemanWunschScorerSetVertexSetVertex(scorer);
+
+ List, Set>> alignmentList =
+ aligner.align(rankingA, rankingB);
+
+ // Build a new ranking by merging the aligner output into one ranking.
+ List> collated = new ArrayList();
+ for (Pair, Set> alignment : alignmentList) {
+ assert ((alignment.a != null) || (alignment.b != null));
+
+ Set verticesA = alignment.a;
+ Set verticesB = alignment.b;
+
+ // Merge the matching vertices of each sequence
+ NeedlemanWunschScorerSetVertexSetVertex.Match matching =
+ matcher.match(verticesA, verticesB, matchScore);
+ if (matching != null) {
+ matching.vertexA.tokens().addAll(matching.vertexB.tokens());
+ verticesB.remove(matching.vertexB);
+ }
+
+ Set vertices = new HashSet<>();
+ if (verticesA != null) {
+ vertices.addAll(verticesA);
+ }
+ if (verticesB != null) {
+ vertices.addAll(verticesB);
+ }
+ collated.add(vertices);
+ }
+ return collated;
+ }
+
+ /**
+  * Create a graph out of a ranking.
+  *
+  * This is the inverse of {@code VariantGraphRanking.of}.
+  *
+  * @param graph The graph; it is re-initialized before being rebuilt
+  * @param ranking The ranking, one set of vertices per rank
+  */
+ private void rankingToGraph(final VariantGraph graph,
+                             final List<Set<VariantGraph.Vertex>> ranking) {
+     graph.init();
+
+     // A map of witness -> last vertex with witness
+     Map<Witness, VariantGraph.Vertex> vertexMap =
+         getWitnesses(ranking).stream().collect(Collectors.toMap(w -> w, w -> graph.getStart()));
+
+     for (Set<VariantGraph.Vertex> vertices : ranking) {
+         // Normally, if we convert a graph to a table, and one path between
+         // two vertices contains more vertices than another path, the
+         // relation between vertices in the 'shorter' path and their ranks
+         // will become ambiguous. To avoid that, we insert placeholder
+         // vertices, so that all witnesses connect to some vertex at each
+         // rank and all paths will be of the same length.
+         VariantGraph.Vertex placeholder = new VariantGraph.Vertex(graph);
+         Set<Witness> unconnectedWitnesses = new HashSet<>(vertexMap.keySet());
+         unconnectedWitnesses.removeAll(getWitnesses(Collections.singleton(vertices)));
+         for (Witness w : unconnectedWitnesses) {
+             graph.connect(vertexMap.put(w, placeholder), placeholder, Collections.singleton(w));
+         }
+
+         // Connect the 'real' vertices in each rank.
+         for (VariantGraph.Vertex vertex : importVertexSet(graph, vertices)) {
+             for (Witness w : getWitnesses(vertex)) {
+                 graph.connect(vertexMap.put(w, vertex), vertex, Collections.singleton(w));
+             }
+         }
+     }
+
+     // Connect the end per witness: a placeholder has no tokens to derive witnesses from.
+     for (Map.Entry<Witness, VariantGraph.Vertex> last : vertexMap.entrySet()) {
+         graph.connect(last.getValue(), graph.getEnd(), Collections.singleton(last.getKey()));
+     }
+ }
+}
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohAligner.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohAligner.java
new file mode 100644
index 000000000..18194b10f
--- /dev/null
+++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohAligner.java
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2016 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package eu.interedition.collatex.needlemanwunschgotoh;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.List;
+
+import eu.interedition.collatex.matching.Pair;
+
+/**
+ * A generic Needleman-Wunsch-Gotoh sequence aligner.
+ *
+ * This aligner aligns two sequences of type A and type B respectively. It is
+ * totally agnostic of the type of objects it aligns.
+ *
+ * The aligner needs a {@code NeedlemanWunschScorer} that determines the score
+ * of a match between an object of type A and an object of type B.
+ *
+ * This implementation uses Gotoh's improvements to get $\mathcal{O}(mn)$
+ * running time and reduce memory requirements to essentially the backtracking
+ * matrix only. In Gotoh's technique the gap weight formula must be of the
+ * special form $w_k = uk + v$. $k$ is the gap size, $v$ is the gap opening
+ * score and $u$ the gap extension score.
+ *
+ * See: Gotoh, O. (1982). An Improved Algorithm for Matching Biological
+ * Sequences, J. Mol. Biol. 162, 705-708
+ *
+ * @param <A> Type of first sequence
+ * @param <B> Type of second sequence
+ *
+ * @author Marcello Perathoner
+ */
+public class NeedlemanWunschGotohAligner {
+ /** A Scorer. */
+ private final NeedlemanWunschScorer scorer;
+ /** A human-readable matrix as string. Written to only if set. */
+ private StringBuilder debugMatrix = null;
+ /** The gap opening score. */
+ private final double openScore;
+ /** The gap extension score. */
+ private final double extendScore;
+
+ public NeedlemanWunschGotohAligner(final NeedlemanWunschScorer scorer) {
+ this(scorer, -1.0, -0.5);
+ }
+
+ public NeedlemanWunschGotohAligner(final NeedlemanWunschScorer scorer,
+ final double gapOpenScore,
+ final double gapExtendScore) {
+ this.scorer = scorer;
+ this.openScore = gapOpenScore;
+ this.extendScore = gapExtendScore;
+ }
+
+ private class Data {
+ /** The current score. */
+ public double score;
+ /** $P_{m,n}$, $Q_{m,n}$ in Gotoh. */
+ public double p;
+ public double q;
+ /** The size of the gap. $k$ in Gotoh. */
+ public int pSize;
+ public int qSize;
+
+ Data(final double score) {
+ this.score = score;
+ this.p = 0.0;
+ this.q = 0.0;
+ this.pSize = 0;
+ this.qSize = 0;
+ }
+ }
+
+ public List> align(final Collection inputA, final Collection inputB) {
+ int i, j;
+
+ final int sizeA = inputA.size();
+ final int sizeB = inputB.size();
+
+ // 0 j . B
+ // i
+ // .
+ // A
+
+ /**
+ * The backtracking matrix. 0 stands for a match. Negative numbers
+ * represent a DEL TOP operation. The abs() of the number is the length
+ * of the gap. Positive numbers represent an INS LEFT operation. The
+ * value of the number is the length of the gap.
+ */
+ ArrayList> lenMatrix = new ArrayList<>(sizeA + 1);
+ /**
+ * The scoring matrix. We need only the last row of the scoring matrix
+ * for our calculations, so we allocate the scoring matrix only when
+ * debugging.
+ */
+ ArrayList> matrix = null;
+ /** The current row of the backtracking matrix. */
+ ArrayList thisLenRow = new ArrayList<>(sizeB + 1);
+ /** The current row of the scoring matrix. */
+ ArrayList thisRow = new ArrayList<>(sizeB + 1);
+
+ // Initialize lenMatrix and one row of the scoring matrix.
+
+ lenMatrix.add(thisLenRow);
+
+ thisRow.add(new Data(0.0));
+ thisLenRow.add(0);
+ for (j = 1; j <= sizeB; ++j) {
+ Data d = new Data(openScore + (j - 1) * extendScore);
+ d.p = d.score;
+ // d.pSize = j;
+ thisRow.add(d);
+ thisLenRow.add(j);
+ }
+
+ if (debugMatrix != null) {
+ matrix = new ArrayList<>(sizeA + 1);
+ matrix.add((ArrayList) thisRow.clone());
+ }
+
+ // Score the matrix
+ i = 0;
+ for (A a : inputA) {
+ i++;
+
+ // add new lenRow to matrix
+ thisLenRow = new ArrayList<>(sizeB + 1);
+ lenMatrix.add(thisLenRow);
+ thisLenRow.add(-i); // DEL TOP
+
+ Data diag = thisRow.get(0);
+ Data left = new Data(openScore + (i - 1) * extendScore);
+ left.q = left.score;
+ // left.qSize = i;
+ j = 0;
+ for (B b : inputB) {
+ j++;
+ Data top = thisRow.get(j);
+ Data curr = new Data(0.0);
+
+ curr.p = top.score + openScore;
+ curr.pSize = 1;
+ if (curr.p < top.p + extendScore) {
+ curr.p = top.p + extendScore;
+ curr.pSize = top.pSize + 1;
+ }
+ curr.q = left.score + openScore;
+ curr.qSize = 1;
+ if (curr.q < left.q + extendScore) {
+ curr.q = left.q + extendScore;
+ curr.qSize = left.qSize + 1;
+ }
+ final double d = diag.score + scorer.score(a, b);
+
+ // Decide which operation is optimal and perform it
+ if ((d > curr.p) && (d > curr.q)) {
+ curr.score = d;
+ thisLenRow.add(0);
+ } else if (curr.q > curr.p) {
+ curr.score = curr.q;
+ thisLenRow.add(curr.qSize); // INS LEFT
+ } else {
+ curr.score = curr.p;
+ thisLenRow.add(-curr.pSize); // DEL TOP
+ }
+
+ // Advance to next column
+ thisRow.set(j - 1, left);
+ thisRow.set(j, curr);
+ diag = top;
+ left = curr;
+ }
+
+ if (matrix != null) {
+ matrix.add((ArrayList) thisRow.clone());
+ }
+ }
+
+ // Walk back and output alignments. We need random access, so copy the
+ // input Collections to ArrayLists.
+ final LinkedList> alignments = new LinkedList<>();
+ final ArrayList arrayA = new ArrayList<>(inputA);
+ final ArrayList arrayB = new ArrayList<>(inputB);
+ i = sizeA;
+ j = sizeB;
+ while ((i > 0) || (j > 0)) {
+ int len = lenMatrix.get(i).get(j);
+ if (len == 0) {
+ alignments.addFirst(new Pair(arrayA.get(i - 1), arrayB.get(j - 1)));
+ --i;
+ --j;
+ } else {
+ if (len < 0) {
+ for (int k = 0; k < -len; ++k) {
+ alignments.addFirst(new Pair(arrayA.get(i - 1), null));
+ --i;
+ }
+ } else {
+ for (int k = 0; k < len; ++k) {
+ alignments.addFirst(new Pair(null, arrayB.get(j - 1)));
+ --j;
+ }
+ }
+ }
+ }
+
+ if (matrix != null) {
+ buildDebugMatrix(matrix, lenMatrix, arrayA, arrayB);
+ }
+
+ return alignments;
+ }
+
+ /**
+ * Set a debug matrix. The aligner will fill the debug matrix with a
+ * human-readable representation of the Needleman-Wunsch matrix if the debug
+ * matrix is set.
+ *
+ * @param debugMatrix A StringBuilder or null.
+ */
+ public void setDebugMatrix(final StringBuilder debugMatrix) {
+ this.debugMatrix = debugMatrix;
+ }
+
+ /**
+ * Build the debug matrix string. Builds a human-readable matrix in a
+ * string.
+ *
+ * @param matrix
+ * @param lenMatrix
+ * @param inputA
+ * @param inputB
+ */
+ private void buildDebugMatrix(final ArrayList> matrix,
+ final ArrayList> lenMatrix,
+ final ArrayList inputA,
+ final ArrayList inputB) {
+
+ debugMatrix.setLength(0);
+
+ debugMatrix.append(String.format("%29s | ", ""));
+ debugMatrix.append(String.format("%29s | ", ""));
+ for (B b : inputB) {
+ debugMatrix.append(String.format("%-29s | ", b));
+ }
+ debugMatrix.append("\n");
+
+ for (int i = 0; i < matrix.size(); ++i) {
+ debugAdd(matrix.get(i), lenMatrix.get(i), i > 0 ? inputA.get(i - 1).toString() : "");
+ }
+ debugMatrix.append("\n");
+ }
+
+ /**
+ * Helper function.
+ *
+ * @param dataRow
+ * @param lenRow
+ * @param a
+ */
+ private void debugAdd(final ArrayList dataRow,
+ final ArrayList lenRow,
+ final String a) {
+
+ debugMatrix.append(String.format("%29s | ", a));
+ for (int i = 0; i < dataRow.size(); ++i) {
+ Data data = dataRow.get(i);
+ int len = lenRow.get(i);
+ if (len == 0) {
+ debugMatrix.append("↖ ");
+ } else {
+ if (len < 0) {
+ debugMatrix.append("↑ ");
+ } else {
+ debugMatrix.append("← ");
+ }
+ }
+ debugMatrix.append(String.format("% 2.6f ", data.score));
+ debugMatrix.append(String.format("% 2.2f ", data.p));
+ debugMatrix.append(String.format("% 2d ", data.pSize));
+ debugMatrix.append(String.format("% 2.2f ", data.q));
+ debugMatrix.append(String.format("% 2d | ", data.qSize));
+ }
+ debugMatrix.append("\n");
+ }
+}
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschProfileScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschProfileScorer.java
new file mode 100644
index 000000000..618512df0
--- /dev/null
+++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschProfileScorer.java
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package eu.interedition.collatex.needlemanwunschgotoh;
+
+import java.util.Set;
+
+import eu.interedition.collatex.Token;
+import eu.interedition.collatex.VariantGraph;
+import eu.interedition.collatex.simple.SimpleToken;
+import eu.interedition.collatex.matching.StringMetricScorer;
+
+/**
+ * Score against a profile.
+ *
+ * "Definition: Given a multiple alignment of a set of strings, a
+ * profile for that multiple alignment specifies for each column the
+ * frequency that each character appears in the column." -- Gusfield
+ * 1997, Algorithms on Strings, Trees and Sequences, Cambridge University Press
+ *
+ * The score for the full match is a weighted sum of scores based on the
+ * frequency of the matched vertices.
+ *
+ * @author Marcello Perathoner
+ */
+
+public class NeedlemanWunschProfileScorer
+ implements NeedlemanWunschScorer, Set> {
+ private final StringMetricScorer matchScorer; // scores one pair of normalized token strings
+ private final int size; // normalizing divisor used by score(); the algorithm class in this package passes the witness count of ranking A
+
+ public NeedlemanWunschProfileScorer(final StringMetricScorer matchScorer, final int size) {
+ this.matchScorer = matchScorer;
+ this.size = size;
+ }
+
+ @Override
+ public double score(final Set verticesA,
+ final Set verticesB) {
+
+ if (verticesA == null || verticesB == null) { // a missing side gets the scorer's minimum score
+ return matchScorer.getMinScore();
+ }
+ if (verticesA.size() == 0 || verticesB.size() == 0) { // so does an empty side
+ return matchScorer.getMinScore();
+ }
+
+ double totalScore = 0.0; // sum of the scores of all token pairs
+ int totalMatched = 0; // number of token pairs scored
+
+ for (VariantGraph.Vertex vertexA : verticesA) { // score every token of A against every token of B
+ for (VariantGraph.Vertex vertexB : verticesB) {
+ for (Token tokenA : vertexA.tokens()) {
+ final String a = ((SimpleToken) tokenA).getNormalized(); // tokens must be SimpleToken, otherwise the cast throws
+ for (Token tokenB : vertexB.tokens()) {
+ final String b = ((SimpleToken) tokenB).getNormalized();
+ totalScore += matchScorer.score(a, b);
+ totalMatched++;
+ }
+ }
+ }
+ }
+ int totalUnmatched = size - totalMatched; // slots of the profile that had no token pair
+
+ return (totalScore + totalUnmatched * -1.0) / size; // each unmatched slot counts -1.0; NOTE(review): a size of 0 yields NaN or infinity rather than an exception
+ }
+};
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschScorer.java
new file mode 100644
index 000000000..f7eb18287
--- /dev/null
+++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschScorer.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package eu.interedition.collatex.needlemanwunschgotoh;
+
+/**
+ * A scorer for a {@code NeedlemanWunschGotohAligner}.
+ *
+ * Calculates the score of a match between two generic objects.
+ *
+ * @param <A> Type of the first object
+ * @param <B> Type of the second object
+ *
+ * @author Marcello Perathoner
+ */
+public interface NeedlemanWunschScorer {
+ /**
+ * Calculate the score given to a match between a and b.
+ *
+ * @param a An object
+ * @param b An object
+ *
+ * @return The score
+ */
+ double score(A a, B b);
+}
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschScorerSetVertexSetVertex.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschScorerSetVertexSetVertex.java
new file mode 100644
index 000000000..c66c86525
--- /dev/null
+++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschScorerSetVertexSetVertex.java
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2015 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package eu.interedition.collatex.needlemanwunschgotoh;
+
+import java.util.Set;
+
+import eu.interedition.collatex.Token;
+import eu.interedition.collatex.VariantGraph;
+import eu.interedition.collatex.simple.SimpleToken;
+import eu.interedition.collatex.matching.StringMetricScorer;
+
+/**
+ * Scores a set of vertices against another set of vertices.
+ *
+ * The score will be the highest score achieved while matching all tokens of all
+ * vertices in verticesA against all tokens of all vertices in verticesB.
+ *
+ * @author Marcello Perathoner
+ */
+public class NeedlemanWunschScorerSetVertexSetVertex
+    implements NeedlemanWunschScorer<Set<VariantGraph.Vertex>, Set<VariantGraph.Vertex>> {
+    private final StringMetricScorer matchScorer;
+
+    public NeedlemanWunschScorerSetVertexSetVertex(final StringMetricScorer matchScorer) {
+        this.matchScorer = matchScorer;
+    }
+
+    /** The best-scoring pair of vertices found by match(), with its score. */
+    public class Match {
+        public final VariantGraph.Vertex vertexA;
+        public final VariantGraph.Vertex vertexB;
+        public final double score;
+        public Match(final VariantGraph.Vertex vertexA, final VariantGraph.Vertex vertexB, final double score) {
+            this.vertexA = vertexA;
+            this.vertexB = vertexB;
+            this.score = score;
+        }
+    }
+
+    /**
+     * Find the pair of vertices whose tokens match best.
+     *
+     * @param verticesA The first set of vertices, may be null
+     * @param verticesB The second set of vertices, may be null
+     * @param minScore Only matches scoring strictly higher than this are considered
+     *
+     * @return The best match, or null if no pair of tokens scores above minScore
+     */
+    public Match match(final Set<VariantGraph.Vertex> verticesA,
+                       final Set<VariantGraph.Vertex> verticesB,
+                       final double minScore) {
+        if (verticesA == null || verticesB == null || verticesA.isEmpty() || verticesB.isEmpty()) {
+            return null;
+        }
+
+        Match best = null;
+        double bestScore = minScore;
+        for (VariantGraph.Vertex vertexA : verticesA) {
+            for (VariantGraph.Vertex vertexB : verticesB) {
+                for (Token tokenA : vertexA.tokens()) {
+                    // Tokens must be SimpleTokens; anything else fails the cast.
+                    final String a = ((SimpleToken) tokenA).getNormalized();
+                    for (Token tokenB : vertexB.tokens()) {
+                        final String b = ((SimpleToken) tokenB).getNormalized();
+                        final double score = matchScorer.score(a, b);
+                        if (score > bestScore) {
+                            bestScore = score;
+                            best = new Match(vertexA, vertexB, score);
+                        }
+                    }
+                }
+            }
+        }
+        return best;
+    }
+
+    @Override
+    public double score(final Set<VariantGraph.Vertex> verticesA,
+                        final Set<VariantGraph.Vertex> verticesB) {
+        final Match matching = match(verticesA, verticesB, matchScorer.getMinScore());
+        return (matching != null) ? matching.score : matchScorer.getMinScore();
+    }
+}
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/package-info.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/package-info.java
new file mode 100644
index 000000000..1dd01c484
--- /dev/null
+++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/package-info.java
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * A version of the Needleman-Wunsch algorithm with floating point scoring.
+ *
+ * This algorithm
+ * strives for global alignment of witnesses and bases the alignment on a
+ * configurable scoring of matches vs. differences/gaps. It does not try to
+ * detect transpositions.
+ *
+ * @author Marcello Perathoner
+ *
+ * @see eu.interedition.collatex.needlemanwunschgotoh.NeedlemanWunschGotohAligner
+ * @see eu.interedition.collatex.needlemanwunschgotoh.NeedlemanWunschGotohAlgorithm
+ * @see eu.interedition.collatex.needlemanwunschgotoh.NeedlemanWunschScorer
+ */
+package eu.interedition.collatex.needlemanwunschgotoh;
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/AbstractStringMetricScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/AbstractStringMetricScorer.java
new file mode 100644
index 000000000..78b7b0640
--- /dev/null
+++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/AbstractStringMetricScorer.java
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2016 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package eu.interedition.collatex.matching;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Calculate a score based on string metrics. Skeletal implementation.
+ *
+ * Note that the minimum score is the score for a mismatch. This score must be
+ * higher than two times the score of a gap. If a mismatch would score less
+ * than two gaps, the aligner would always select two gaps and never select a
+ * mismatch.
+ *
+ * This implementation caches the calculated score in a plain {@code HashMap},
+ * so an instance must not be shared between threads without synchronization.
+ *
+ * @author Marcello Perathoner
+ */
+public abstract class AbstractStringMetricScorer implements StringMetricScorer {
+
+    /** Memoization cache. Caches the score of already computed pairs. */
+    private final Map<Pair<String, String>, Double> cache = new HashMap<>();
+
+    /** The minimum score. */
+    protected final double minScore;
+
+    /** The maximum score. */
+    protected final double maxScore;
+
+    /** Constructor. Scores range from -1.0 (worst match) to 1.0 (best match). */
+    public AbstractStringMetricScorer() {
+        this(-1.0, 1.0);
+    }
+
+    /**
+     * Constructor.
+     *
+     * @param minScore The score for the worst match.
+     * @param maxScore The score for the best match.
+     */
+    public AbstractStringMetricScorer(final double minScore, final double maxScore) {
+        this.minScore = minScore;
+        this.maxScore = maxScore;
+    }
+
+    /**
+     * Return the minimum score this scorer will ever calculate.
+     * @return The score for the worst match.
+     */
+    @Override
+    public double getMinScore() {
+        return minScore;
+    }
+
+    /**
+     * Return the maximum score this scorer will ever calculate.
+     * @return The score for the best match.
+     */
+    @Override
+    public double getMaxScore() {
+        return maxScore;
+    }
+
+    /** Calculate the score. Override this for cached scorers. */
+    protected double _score(final Pair<String, String> p) {
+        return 0.0;
+    }
+
+    /** Calculate the score. Override this for uncached scorers. */
+    @Override
+    public double score(final String a, final String b) {
+        return cache.computeIfAbsent(new Pair<>(a, b), this::_score);
+    }
+}
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/EqualityScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/EqualityScorer.java
new file mode 100644
index 000000000..6767650cb
--- /dev/null
+++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/EqualityScorer.java
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2016 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package eu.interedition.collatex.matching;
+
+/**
+ * Score is max if both strings are equal, min otherwise; the score cache is bypassed.
+ *
+ * @author Marcello Perathoner
+ */
+public class EqualityScorer extends AbstractStringMetricScorer {
+    public EqualityScorer() {
+        super();
+    }
+
+    public EqualityScorer(final double minScore, final double maxScore) {
+        super(minScore, maxScore);
+    }
+
+    @Override
+    public double score(final String a, final String b) {
+        if (a.equals(b)) {
+            return maxScore;
+        }
+        return minScore;
+    }
+}
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/LevenshteinDistanceScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/LevenshteinDistanceScorer.java
new file mode 100644
index 000000000..6a6be33cd
--- /dev/null
+++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/LevenshteinDistanceScorer.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package eu.interedition.collatex.matching;
+
+/**
+ * Calculate a score based on the Levenshtein distance.
+ *
+ * Score is max if {@code distance <= threshold}, min otherwise.
+ *
+ * @author Marcello Perathoner
+ */
+public class LevenshteinDistanceScorer extends AbstractStringMetricScorer {
+
+    /** Largest edit distance that still counts as a match. */
+    private final int threshold;
+
+    public LevenshteinDistanceScorer(final int threshold) {
+        super();
+        this.threshold = threshold;
+    }
+
+    public LevenshteinDistanceScorer(final double minScore, final double maxScore, final int threshold) {
+        super(minScore, maxScore);
+        this.threshold = threshold;
+    }
+
+    @Override
+    protected double _score(final Pair<String, String> p) {
+        // Equal strings are the best match: use the configured maximum, not a hard-coded 1.0.
+        if (p.a.equals(p.b)) {
+            return maxScore;
+        }
+        return EditDistance.compute(p.a, p.b) <= threshold ? maxScore : minScore;
+    }
+}
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/LevenshteinRatioScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/LevenshteinRatioScorer.java
new file mode 100644
index 000000000..d8082627e
--- /dev/null
+++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/LevenshteinRatioScorer.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package eu.interedition.collatex.matching;
+
+/**
+ * Calculate a score based on Levenshtein Ratio.
+ *
+ * Levenshtein ratio as used in python-Levenshtein:
+ * distance = levenshtein (a, b)
+ * length = length (a) + length (b)
+ * ratio = (length - distance) / length
+ *
+ * @author Marcello Perathoner
+ */
+public class LevenshteinRatioScorer extends AbstractStringMetricScorer {
+
+    public LevenshteinRatioScorer() {
+        super();
+    }
+
+    public LevenshteinRatioScorer(final double minScore, final double maxScore) {
+        super(minScore, maxScore);
+    }
+
+    @Override
+    protected double _score(final Pair<String, String> p) {
+        if (p.a.equals(p.b)) {
+            return maxScore;
+        }
+        // NOTE(review): the 2 is presumably the substitution cost -- confirm in EditDistance.
+        final int distance = EditDistance.compute(p.a, p.b, 2);
+        final int length = p.a.length() + p.b.length();
+        final double ratio = ((double) (length - distance)) / length;
+        return minScore + (maxScore - minScore) * ratio;
+    }
+}
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/Pair.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/Pair.java
new file mode 100644
index 000000000..cec091b9d
--- /dev/null
+++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/Pair.java
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2015 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package eu.interedition.collatex.matching;
+
+/**
+ * A pair of two values that can be used as key in a Map; components may be null.
+ *
+ * @param <T> Type of the first component
+ * @param <U> Type of the second component
+ * @author Marcello Perathoner
+ */
+public class Pair<T, U> {
+    public final T a;
+    public final U b;
+
+    public Pair(T a, U b) {
+        this.a = a;
+        this.b = b;
+    }
+
+    @Override
+    public boolean equals(final Object obj) {
+        if (!(obj instanceof Pair)) {
+            return false;
+        }
+        final Pair<?, ?> other = (Pair<?, ?>) obj;
+        return java.util.Objects.equals(a, other.a) && java.util.Objects.equals(b, other.b);
+    }
+
+    @Override
+    public int hashCode() {
+        // Map.Entry uses operator ^ but + is a better choice because
+        // we also want to store Pairs of identical strings.
+        return java.util.Objects.hashCode(a) + java.util.Objects.hashCode(b);
+    }
+
+    @Override
+    public String toString() {
+        return "(" + a + ", " + b + ")";
+    }
+}
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/StringMetricScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/StringMetricScorer.java
new file mode 100644
index 000000000..2b1760789
--- /dev/null
+++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/StringMetricScorer.java
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package eu.interedition.collatex.matching;
+
+/**
+ * Calculate a score based on string metrics.
+ *
+ * @author Marcello Perathoner
+ */
+public interface StringMetricScorer {
+
+    /** Calculate the score for a match between a and b. */
+    double score(String a, String b);
+
+    /** Return the minimum score this scorer will ever calculate. */
+    double getMinScore();
+
+    /** Return the maximum score this scorer will ever calculate. */
+    double getMaxScore();
+}
diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/TrigramRatioScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/TrigramRatioScorer.java
new file mode 100644
index 000000000..ccbdcd185
--- /dev/null
+++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/TrigramRatioScorer.java
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package eu.interedition.collatex.matching;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+
+/**
+ * Calculate a score based on trigram ratio.
+ *
+ * Trigram ratio is defined as:
+ * ta = number of trigrams in a
+ * tb = number of trigrams in b
+ * tab = number of trigrams that are both in a and in b
+ * ratio = 2 * tab / (ta + tb)
+ *
+ * @author Marcello Perathoner
+ */
+public class TrigramRatioScorer extends AbstractStringMetricScorer {
+
+    /** Caches the set of trigrams of each string seen so far. */
+    final Map<String, Set<String>> trigrams = new HashMap<>();
+
+    public TrigramRatioScorer() {
+        super();
+    }
+
+    public TrigramRatioScorer(final double minScore, final double maxScore) {
+        super(minScore, maxScore);
+    }
+
+    private Set<String> trigramize(final String s) {
+        assert s.length() > 0;
+        // Pad with TWO blanks per side so the first and last letters also appear in three trigrams.
+        final String ss = "  " + s + "  ";
+        final Set<String> tri = new HashSet<>();
+        for (int i = 0; i < ss.length() - 2; i++) {
+            tri.add(ss.substring(i, i + 3));
+        }
+        return tri;
+    }
+
+    @Override
+    protected double _score(final Pair<String, String> p) {
+        if (p.a.equals(p.b)) {
+            return maxScore;
+        }
+
+        final Set<String> triA = trigrams.computeIfAbsent(p.a, this::trigramize);
+        final Set<String> triB = trigrams.computeIfAbsent(p.b, this::trigramize);
+
+        // Intersect on a copy: triA and triB are the cached sets.
+        final Set<String> triAB = new HashSet<>(triA);
+        triAB.retainAll(triB);
+
+        final double ratio = 2.0 * triAB.size() / (triA.size() + triB.size());
+        return minScore + (maxScore - minScore) * ratio;
+    }
+}
diff --git a/collatex-core/src/test/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohTest.java b/collatex-core/src/test/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohTest.java
new file mode 100644
index 000000000..71c80d06e
--- /dev/null
+++ b/collatex-core/src/test/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohTest.java
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 2015 The Interedition Development Group.
+ *
+ * This file is part of CollateX.
+ *
+ * CollateX is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * CollateX is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with CollateX. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package eu.interedition.collatex.needlemanwunschgotoh;
+
+import eu.interedition.collatex.AbstractTest;
+import eu.interedition.collatex.CollationAlgorithm;
+import eu.interedition.collatex.CollationAlgorithmFactory;
+import eu.interedition.collatex.Token;
+import eu.interedition.collatex.Witness;
+import eu.interedition.collatex.VariantGraph;
+import eu.interedition.collatex.needlemanwunschgotoh.*;
+import eu.interedition.collatex.matching.*;
+import eu.interedition.collatex.simple.SimpleWitness;
+import eu.interedition.collatex.util.VariantGraphTraversal;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.stream.*;
+
+import static org.junit.Assert.*;
+
+/**
+ * Tests the string metric scorers, the Needleman-Wunsch-Gotoh aligner and the collation algorithm.
+ * @author Gregor Middell
+ */
+public class NeedlemanWunschGotohTest extends AbstractTest {
+
+    final double delta = 0.000000001;
+    NeedlemanWunschGotohAligner aligner;
+
+    public class NeedlemanWunschScorerStringString implements NeedlemanWunschScorer<String, String> {
+        private final StringMetricScorer matchScorer;
+
+        public NeedlemanWunschScorerStringString(StringMetricScorer matchScorer) {
+            this.matchScorer = matchScorer;
+        }
+
+        @Override
+        public double score(String a, String b) {
+            return matchScorer.score(a, b);
+        }
+    }
+
+    /** Aligns a with b, where each is given as its expected alignment row and "-" marks a gap. */
+    private void testAlign(String a, String b) {
+        StringBuilder debugMatrix = new StringBuilder();
+        aligner.setDebugMatrix(debugMatrix);
+        List<Pair<String, String>> list, expectedList;
+
+        List<String> aa = Arrays.asList(a.split("\\s+"));
+        List<String> bb = Arrays.asList(b.split("\\s+"));
+        assertEquals(aa.size(), bb.size());
+
+        expectedList = new ArrayList<Pair<String, String>>();
+        for (int i = 0; i < aa.size(); i++) {
+            expectedList.add(new Pair<String, String>(aa.get(i), bb.get(i)));
+        }
+
+        try {
+            list = aligner.align(
+                aa.stream().filter(p -> !p.equals("-")).collect(Collectors.toList()),
+                bb.stream().filter(p -> !p.equals("-")).collect(Collectors.toList())
+            );
+        } catch (Exception e) {
+            System.out.println(e.getMessage());
+            System.out.println(debugMatrix.toString());
+            throw e;
+        }
+
+        list = list.stream()
+            .map(p -> new Pair<String, String>(p.a == null ? "-" : p.a, p.b == null ? "-" : p.b))
+            .collect(Collectors.toList());
+
+        String aaa = list.stream().map(Pair::toString).collect(Collectors.joining(" "));
+        String bbb = expectedList.stream().map(Pair::toString).collect(Collectors.joining(" "));
+
+        if (!aaa.equals(bbb)) {
+            System.out.println(debugMatrix.toString());
+        }
+
+        assertEquals(bbb, aaa);
+    }
+
+    /** Collates the witnesses, each given as its expected table row where "-" marks an empty cell. */
+    private void testCollate(String... witnesses) {
+        StringBuilder debugMatrix = new StringBuilder();
+        ((NeedlemanWunschGotohAlgorithm) collationAlgorithm).setDebugMatrix(debugMatrix);
+        List<String> strings = new ArrayList<>();
+        List<String> tables = new ArrayList<>();
+        for (String witness : witnesses) {
+            List<String> l = Arrays.asList(witness.split("\\s+"));
+            strings.add(l.stream().filter(p -> !p.equals("-")).collect(Collectors.joining(" ")));
+            tables.add(l.stream().map(p -> p.equals("-") ? " " : p).collect(Collectors.joining("|", "|", "|")));
+        }
+
+        final SimpleWitness[] w = createWitnesses(strings.toArray(new String[0]));
+
+        List<SortedMap<Witness, Set<Token>>> t;
+        try {
+            t = table(collate(w));
+        } catch (Exception e) {
+            System.out.println(e);
+            System.out.println(debugMatrix.toString());
+            throw e;
+        }
+
+        int i = 0;
+        for (String table : tables) {
+            String result = toString(t, w[i]);
+            if (!table.equals(result)) {
+                System.out.println(table);
+                System.out.println(result);
+                System.out.println(debugMatrix.toString());
+            }
+            assertEquals(table, result);
+            i++;
+        }
+    }
+
+    @Test
+    public void simple() {
+        setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh());
+        LOG.fine(toString(table(collate("a b a b a", "a b a"))));
+    }
+
+    @Test
+    public void equalityScorerTest() {
+        StringMetricScorer scorer = new EqualityScorer(0.0, 1.0);
+
+        assertEquals(1.0, scorer.score("foobar", "foobar"), delta);
+        assertEquals(0.0, scorer.score("foobar", "fobar"), delta);
+        assertEquals(0.0, scorer.score("foobar", "fooobar"), delta);
+        assertEquals(0.0, scorer.score("foobar", "foobaz"), delta);
+        assertEquals(0.0, scorer.score("foobar", "foobiz"), delta);
+    }
+
+    @Test
+    public void levenshteinDistanceScorerTest() {
+        StringMetricScorer scorer = new LevenshteinDistanceScorer(0.0, 1.0, 1);
+
+        assertEquals(1.0, scorer.score("foobar", "foobar"), delta);
+        assertEquals(1.0, scorer.score("foobar", "fobar"), delta);
+        assertEquals(1.0, scorer.score("foobar", "fooobar"), delta);
+        assertEquals(1.0, scorer.score("foobar", "foobaz"), delta);
+        assertEquals(0.0, scorer.score("foobar", "foobiz"), delta);
+    }
+
+    @Test
+    public void levenshteinRatioScorerTest() {
+        StringMetricScorer scorer = new LevenshteinRatioScorer(0.0, 1.0);
+
+        assertEquals(1.0, scorer.score("foobar", "foobar"), delta);
+        assertEquals(0.5, scorer.score("fooabc", "foodef"), delta);
+        assertEquals(0.0, scorer.score("abcdef", "ghijkl"), delta);
+
+        assertTrue(scorer.score("foobar", "fooobar") > scorer.score("fobar", "foobar"));
+    }
+
+    @Test
+    public void trigramRatioScorerTest() {
+        StringMetricScorer scorer = new TrigramRatioScorer(0.0, 1.0);
+
+        assertEquals(1.0, scorer.score("foobar", "foobar"), delta);
+        assertEquals(6.0 / 16, scorer.score("fooabc", "foodef"), delta);
+        assertEquals(6.0 / 16, scorer.score("abcfoo", "deffoo"), delta);
+        assertEquals(0.0, scorer.score("abcdef", "ghijkl"), delta);
+
+        assertEquals(2.0 / 12, scorer.score("heti", "ethi"), delta);
+
+        assertTrue(scorer.score("foobar", "fooobar") > scorer.score("fobar", "foobar"));
+    }
+
+    @Test
+    public void equalityAlignerTest() {
+        aligner = new NeedlemanWunschGotohAligner(
+            new NeedlemanWunschScorerStringString(new EqualityScorer())
+        );
+
+        testAlign("a b c",
+                  "a b c");
+
+        testAlign("a b c",
+                  "a b d");
+
+        testAlign("a b c",
+                  "a - c");
+
+        testAlign("- - -",
+                  "a b c");
+
+        testAlign("a b c",
+                  "- - -");
+
+        // The equality scorer will match words only if they are equal.
+
+        testAlign("foo foob foobar foob foo",
+                  "- - foobar - - ");
+
+        // It will not match a word that is similar.
+
+        testAlign("foo foob fooba foob foo",
+                  "foobar - - - -"); // expected wrong answer
+    }
+
+    @Test
+    public void levenshteinDistanceAlignerTest() {
+        aligner = new NeedlemanWunschGotohAligner(
+            new NeedlemanWunschScorerStringString(new LevenshteinDistanceScorer(1))
+        );
+
+        // The Levenshtein distance scorer will match a word that is equal.
+
+        testAlign("x foo foobar foo y",
+                  "- - foobar - -");
+
+        // But it will also match a word that is similar within Levenshtein
+        // distance.
+
+        testAlign("x foo fooba foo y",
+                  "- - foobar - -");
+
+        // But if many words are within Levenshtein distance they will all look
+        // the same to the scorer, so it picks the wrong one.
+
+        testAlign("x foob fooba foobar fooba foob y",
+                  "- - - - foobar - -"); // expected wrong answer
+    }
+
+    @Test
+    public void levenshteinRatioAlignerTest() {
+        aligner = new NeedlemanWunschGotohAligner(
+            new NeedlemanWunschScorerStringString(new LevenshteinRatioScorer())
+        );
+
+        // The Levenshtein ratio scorer will match a word that is equal.
+
+        testAlign("x foo foobar foo y",
+                  "- - foobar - -");
+
+        // But it will also match the most similar word.
+
+        testAlign("x foo fooba foo y",
+                  "- - foobar - -");
+
+        // But it will give the correct match where the Levenshtein distance
+        // scorer failed.
+
+        testAlign("x foob fooba foobar fooba foob y",
+                  "- - - foobar - - -");
+
+        // More tests
+
+        testAlign("- - Sanguis effusio in ecclesiis facta",
+                  "Si quis sanguinis effusionem in eclesia fecerit");
+
+        testAlign("Si quis sanguinis effusionem in eclesia fecerit",
+                  "- - Sanguis effusio in ecclesiis facta");
+
+        testAlign("uolumus ut ea dentur que",
+                  "uolumus ut detur ea que");
+
+        testAlign("periurium committitur . Quando maioris pretii",
+                  "perIurium committitur - quanto maIoris pretii");
+    }
+
+    @Test
+    public void trigramRatioAlignerTest() {
+        aligner = new NeedlemanWunschGotohAligner(
+            new NeedlemanWunschScorerStringString(new TrigramRatioScorer())
+        );
+
+        // The trigram ratio scorer will match a word that is equal.
+
+        testAlign("x foo foobar foo y",
+                  "- - foobar - -");
+
+        // But it will also match the most similar word.
+
+        testAlign("x foo fooba foo y",
+                  "- - foobar - -");
+
+        // But it will give the correct match where the Levenshtein distance
+        // scorer failed.
+
+        testAlign("x foob fooba foobar fooba foob y",
+                  "- - - foobar - - -");
+
+        // More tests
+
+        testAlign("- - Sanguis effusio in ecclesiis facta",
+                  "Si quis sanguinis effusionem in eclesia fecerit");
+
+        testAlign("Si quis sanguinis effusionem in eclesia fecerit",
+                  "- - Sanguis effusio in ecclesiis facta");
+
+        testAlign("uolumus ut - ea dentur que",
+                  "uolumus ut detur ea - que");
+
+        testAlign("periurium committitur . quando maioris pretii",
+                  "perIurium committitur - quanto maIoris pretii");
+    }
+
+    @Test
+    public void collatorSanityTest() {
+        setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh());
+
+        // Test if the collator works on an empty graph.
+
+        final SimpleWitness[] w = createWitnesses("a b c d");
+        final VariantGraph graph = collate(w);
+        final List<VariantGraph.Vertex> vertices =
+            StreamSupport.stream(VariantGraphTraversal.of(graph).spliterator(), false).collect(Collectors.toList());
+        assertEquals(6, vertices.size());
+        assertEquals(graph.getStart(), vertices.get(0));
+        assertEquals(1, vertices.get(1).tokens().size());
+        assertEquals(1, vertices.get(2).tokens().size());
+        assertEquals(1, vertices.get(3).tokens().size());
+        assertEquals(1, vertices.get(4).tokens().size());
+        assertVertexEquals("a", vertices.get(1));
+        assertVertexEquals("b", vertices.get(2));
+        assertVertexEquals("c", vertices.get(3));
+        assertVertexEquals("d", vertices.get(4));
+        assertEquals(graph.getEnd(), vertices.get(5));
+    }
+
+    @Test
+    public void collatorSanityTest1() {
+        setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh());
+
+        testCollate("a b c");
+    }
+
+    @Test
+    public void collatorSanityTest2() {
+        setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh());
+
+        testCollate("the cat is black",
+                    "the dog is black");
+    }
+
+    @Test
+    public void exactMatch() {
+        setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh());
+
+        testCollate("a b c",
+                    "a b c");
+
+        testCollate("a b c",
+                    "a b d");
+
+        testCollate("a b c",
+                    "a d c");
+
+        testCollate("a b c",
+                    "d b c");
+
+        testCollate("a b c",
+                    "a - c");
+
+        testCollate("a - c",
+                    "a b c");
+
+        testCollate("a b c",
+                    "- b -");
+
+        testCollate("- b -",
+                    "a b c");
+    }
+
+    @Test
+    public void exactMatch1() {
+        setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh());
+
+        testCollate("a foob foob foobar foob foob b",
+                    "a - - foobar - - b");
+
+        testCollate("a foob foob foobar foob foob b foob foob c",
+                    "a - - foobar - - b - - c");
+    }
+
+    @Test
+    public void multiMatch() {
+        setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh());
+
+        testCollate("a b c - - -",
+                    "- b c d - -",
+                    "- - c d e -",
+                    "- - - d e f");
+
+        testCollate("- - - d e f",
+                    "- - c d e -",
+                    "- b c d - -",
+                    "a b c - - -");
+    }
+
+    @Test
+    public void distanceMatch() {
+        setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh());
+
+        testCollate("x foo foo foobar foo foo y",
+                    "- - - fooba - - -");
+    }
+
+    @Test
+    public void ratioMatch() {
+        setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh());
+
+        testCollate("x foo foob foobar foo foo y",
+                    "- - - fooba - - -");
+
+        testCollate("hadebaldus - heti bernoinus",
+                    "hadebaldus bernuinus heti -",
+                    "adebaldus - ethi bernoinus");
+    }
+
+    @Test
+    public void preferOneLongGap() {
+        setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh());
+
+        // The aligner will prefer one long gap over many short ones.
+
+        testCollate("a b b b b b c",
+                    "a b b - - - c");
+
+        testCollate("a b b b b b c",
+                    "a b b - - - c");
+    }
+}
diff --git a/collatex-pythonport/collatex/core_classes.py b/collatex-pythonport/collatex/core_classes.py
index 6e8c3d177..1411a82dd 100644
--- a/collatex-pythonport/collatex/core_classes.py
+++ b/collatex-pythonport/collatex/core_classes.py
@@ -16,30 +16,30 @@
from collatex.exceptions import TokenError
class Row(object):
-
+
def __init__(self, header):
self.cells = []
self.header = header
-
+
def append(self, cell):
self.cells.append(cell)
-
+
def to_list(self):
return self.cells
-
+
class Column(object):
-
+
def __init__(self):
self.tokens_per_witness = {}
- self.variant = False
+ self.variant = False
def put(self, sigil, token):
self.tokens_per_witness[sigil]=token
class AlignmentTable(object):
-
+
def __init__(self, collation, graph=None, layout="horizontal"):
self.collation = collation
self.graph = graph
@@ -52,7 +52,7 @@ def __init__(self, collation, graph=None, layout="horizontal"):
def _construct_table(self):
ranking = VariantGraphRanking.of(self.graph)
vertices_per_rank = ranking.byRank
- # construct columns
+ # construct columns
for rank in vertices_per_rank:
column = None
vertices = vertices_per_rank[rank]
@@ -87,11 +87,11 @@ def _construct_table(self):
def __str__(self, *args, **kwargs):
return str(create_table_visualization(self))
-
+
# DISPLAY PART OF THE VARIANT GRAPH IN PLAIN/HTML AND VERTICAL OR HORIZONTAL!
def create_table_visualization(table):
# create visualization of alignment table
- if table.layout == "vertical":
+ if table.layout == "vertical":
prettytable = visualizeTableVertically(table)
elif table.layout == "horizontal":
prettytable = visualizeTableHorizontal(table)
@@ -120,11 +120,11 @@ def visualizeTableVertically(table):
x.add_column(row.header, [fill(cell, 20) if cell else "-" for cell in row.cells])
return x
-
+
# not used in the suffix implementation
# Tokenizer inside suffix array library is used
class Tokenizer(object):
-
+
# by default the tokenizer splits on space characters
def tokenize(self, contents):
return contents.split()
@@ -156,7 +156,7 @@ def __repr__(self):
return self.token_string
class Witness(object):
-
+
def __init__(self, witnessdata):
self.sigil = witnessdata['id']
self._tokens = []
@@ -172,22 +172,22 @@ def __init__(self, witnessdata):
self._tokens.append(Token(tk))
# content string is used for generation of the suffix and LCP arrays.
self.content = ' '.join([x.token_string for x in self._tokens])
-
+
def tokens(self):
return self._tokens
class VariantGraph(object):
-
+
def __init__(self):
self.graph = nx.DiGraph()
self.start = self.add_vertex(Token())
self.end = self.add_vertex(Token())
-
+
# def is_directed(self):
# return self.graph.is_directed()
-#
- # vertex creation uses a unique ID, since the token_content does not have to be unique
- # we store the token content in the label
+#
+ # vertex creation uses a unique ID, since the token_content does not have to be unique
+ # we store the token content in the label
def add_vertex(self, token, sigil=None):
'''
:type token: Token
@@ -196,13 +196,13 @@ def add_vertex(self, token, sigil=None):
# print("Adding node: "+node_id+":"+token_content)
tokens = {}
if sigil:
- tokens[sigil] = token
+ tokens[sigil] = [token]
self.graph.add_node(node_id, label=token.token_string, tokens=tokens)
return node_id
 def add_token_to_vertex(self, node, token, sigil):
+ # Each sigil maps to a list of tokens: repeated calls for the same sigil append instead of overwriting.
 attributes = self.vertex_attributes(node)
- attributes["tokens"][sigil] = token
+ attributes["tokens"].setdefault(sigil, []).append (token)
def connect(self, source, target, witnesses):
"""
@@ -212,44 +212,44 @@ def connect(self, source, target, witnesses):
# print("Adding Edge: "+source+":"+target)
if self.graph.has_edge(source, target):
self.graph[source][target]["label"] += ", "+str(witnesses)
- else:
+ else:
self.graph.add_edge(source, target, label=witnesses)
-
+
def remove_edge(self, source, target):
self.graph.remove_edge(source, target)
-
+
def remove_node(self, node):
- self.graph.remove_node(node)
-
+ self.graph.remove_node(node)
+
def vertices(self):
return self.graph.nodes()
-
+
def edges(self):
return self.graph.edges()
-
+
def edge_between(self, node, node2):
# return self.graph.get_edge_data(node, node2)
return self.graph.has_edge(node, node2)
-
+
def in_edges(self, node, data=False):
return self.graph.in_edges(nbunch=node, data=data)
-
+
def out_edges(self, node, data=False):
return self.graph.out_edges(nbunch=node, data=data)
-
+
def vertex_attributes(self, node):
return self.graph.node[node]
-
+
# Note: generator implementation
def vertexWith(self, content):
try:
vertex_to_find = (n for n in self.graph if self.graph.node[n]['label'] == content).next()
- return vertex_to_find
+ return vertex_to_find
except StopIteration:
raise Exception("Vertex with "+content+" not found!")
-
+
class CollationAlgorithm(object):
- def merge(self, graph, witness_sigil, witness_tokens, alignments = {}):
+ def merge(self, graph, witness_sigil, witness_tokens, alignments = {}):
"""
:type graph: VariantGraph
"""
@@ -275,7 +275,7 @@ def merge(self, graph, witness_sigil, witness_tokens, alignments = {}):
This function joins the variant graph in place.
This function is a straight port of the Java version of CollateX.
:type graph: VariantGraph
- TODO: add transposition support!
+ TODO: add transposition support!
'''
def join(graph):
processed = set()
@@ -301,7 +301,7 @@ def join(graph):
graph.remove_edge(join_candidate, neighbor)
graph.connect(vertex, neighbor, data["label"])
graph.remove_edge(vertex, join_candidate)
- graph.remove_node(join_candidate)
+ graph.remove_node(join_candidate)
queue.appendleft(vertex);
continue;
processed.add(vertex)
@@ -309,7 +309,7 @@ def join(graph):
# FIXME: Why do we run out of memory in some cases here, if this is not checked?
if neighbor not in processed:
queue.appendleft(neighbor)
-
+
# Port of VariantGraphRanking class from Java
# This is a minimal port; only bare bones
class VariantGraphRanking(object):
@@ -319,13 +319,13 @@ def __init__(self):
# however, a rank can be assigned to multiple vertices
self.byVertex = {}
self.byRank = {}
-
+
def apply(self, vertex):
return self.byVertex[vertex]
-
+
@classmethod
def of(cls, graph):
- variant_graph_ranking = VariantGraphRanking()
+ variant_graph_ranking = VariantGraphRanking()
topological_sorted_vertices = topological_sort(graph.graph)
for v in topological_sorted_vertices:
rank = -1
@@ -335,7 +335,3 @@ def of(cls, graph):
variant_graph_ranking.byVertex[v]=rank
variant_graph_ranking.byRank.setdefault(rank, []).append(v)
return variant_graph_ranking
-
-
-
-
diff --git a/collatex-servlet/src/main/java/eu/interedition/collatex/io/SimpleCollationJSONMessageBodyReader.java b/collatex-servlet/src/main/java/eu/interedition/collatex/io/SimpleCollationJSONMessageBodyReader.java
index 4ea198c36..d0b7f331f 100644
--- a/collatex-servlet/src/main/java/eu/interedition/collatex/io/SimpleCollationJSONMessageBodyReader.java
+++ b/collatex-servlet/src/main/java/eu/interedition/collatex/io/SimpleCollationJSONMessageBodyReader.java
@@ -4,6 +4,7 @@
import eu.interedition.collatex.CollationAlgorithmFactory;
import eu.interedition.collatex.Token;
import eu.interedition.collatex.dekker.InspectableCollationAlgorithm;
+import eu.interedition.collatex.matching.EditDistanceRatioTokenComparator;
import eu.interedition.collatex.matching.EditDistanceTokenComparator;
import eu.interedition.collatex.matching.EqualityTokenComparator;
import eu.interedition.collatex.simple.*;
@@ -21,6 +22,8 @@
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Map;
import java.util.List;
/**
@@ -133,38 +136,7 @@ public SimpleCollation readFrom(Class type, Type genericType, A
throw new IOException("No witnesses in collation");
}
- Comparator tokenComparator = null;
- final JsonValue tokenComparatorNode = collationObject.get("tokenComparator");
- if (tokenComparatorNode != null && tokenComparatorNode.getValueType() == JsonValue.ValueType.OBJECT) {
- final JsonObject tokenComparatorObject = (JsonObject) tokenComparatorNode;
- try {
- if ("levenshtein".equals(tokenComparatorObject.getString("type"))) {
- final int configuredDistance = tokenComparatorObject.getInt("distance", 0);
- tokenComparator = new EditDistanceTokenComparator(configuredDistance == 0 ? 1 : configuredDistance);
- }
- } catch (ClassCastException e) {
- // ignored
- }
- }
- if (tokenComparator == null) {
- tokenComparator = new EqualityTokenComparator();
- }
-
- CollationAlgorithm collationAlgorithm = null;
- final JsonValue collationAlgorithmNode = collationObject.get("algorithm");
- if (collationAlgorithmNode != null && collationAlgorithmNode.getValueType() == JsonValue.ValueType.STRING) {
- final String collationAlgorithmValue = ((JsonString) collationAlgorithmNode).getString();
- if ("needleman-wunsch".equalsIgnoreCase(collationAlgorithmValue)) {
- collationAlgorithm = CollationAlgorithmFactory.needlemanWunsch(tokenComparator);
- } else if ("gst".equalsIgnoreCase(collationAlgorithmValue)) {
- collationAlgorithm = CollationAlgorithmFactory.greedyStringTiling(tokenComparator, 2);
- } else if ("medite".equalsIgnoreCase(collationAlgorithmValue)) {
- collationAlgorithm = CollationAlgorithmFactory.medite(tokenComparator, SimpleToken.TOKEN_MATCH_EVALUATOR);
- }
- }
- if (collationAlgorithm == null) {
- collationAlgorithm = CollationAlgorithmFactory.dekker(tokenComparator);
- }
+ CollationAlgorithm collationAlgorithm = createFromJSON(collationObject);
boolean joined = true;
try {
@@ -185,4 +157,52 @@ public SimpleCollation readFrom(Class type, Type genericType, A
return new SimpleCollation(witnesses, collationAlgorithm, joined);
}
}
+
+    /**
+     * Creates the {@link CollationAlgorithm} described by a JSON collation request.
+     *
+     * Members of {@code collationObject} that are read:
+     *
+     * {@code "tokenComparator"}: an object. When its {@code "type"} is
+     * {@code "levenshtein"}, a {@code "ratio"} member selects the
+     * {@code "levenshtein.ratio"} comparator; otherwise the
+     * {@code "levenshtein.distance"} comparator is created from
+     * {@code "distance"} (default 1). In every other case, including a missing
+     * or non-string {@code "type"}, the {@code "equality"} comparator is used.
+     *
+     * {@code "algorithm"}: a string naming the algorithm, {@code "dekker"} when
+     * absent or not a string.
+     *
+     * This method is duplicated in {@code JsonProcessor}.
+     *
+     * FIXME: This method could be moved into {@code CollationAlgorithmFactory}
+     * but it would make collatex-core dependent on javax.json.
+     *
+     * @param collationObject The JSON snippet
+     * @return The CollationAlgorithm subclass
+     */
+    private static CollationAlgorithm createFromJSON(JsonObject collationObject) {
+        Comparator<Token> comparator = null;
+
+        final JsonValue tokenComparatorNode = collationObject.get("tokenComparator");
+        if (tokenComparatorNode != null && tokenComparatorNode.getValueType() == JsonValue.ValueType.OBJECT) {
+            final JsonObject tokenComparatorObject = (JsonObject) tokenComparatorNode;
+            try {
+                // The defaulting overload tolerates a missing or non-string "type";
+                // getString(name) alone throws NullPointerException when the key is absent.
+                if ("levenshtein".equals(tokenComparatorObject.getString("type", null))) {
+                    if (tokenComparatorObject.containsKey("ratio")) {
+                        comparator = CollationAlgorithmFactory.createComparator(
+                            "levenshtein.ratio",
+                            Double.valueOf(tokenComparatorObject.getJsonNumber("ratio").doubleValue()));
+                    } else {
+                        comparator = CollationAlgorithmFactory.createComparator(
+                            "levenshtein.distance",
+                            Integer.valueOf(tokenComparatorObject.getInt("distance", 1)));
+                    }
+                }
+            } catch (ClassCastException ignored) {
+                // "ratio" is present but not a JSON number: use the default comparator below.
+            }
+        }
+        if (comparator == null) {
+            comparator = CollationAlgorithmFactory.createComparator("equality");
+        }
+
+        String algorithm = "dekker";
+        final JsonValue collationAlgorithmNode = collationObject.get("algorithm");
+        if (collationAlgorithmNode != null
+            && collationAlgorithmNode.getValueType() == JsonValue.ValueType.STRING) {
+            algorithm = ((JsonString) collationAlgorithmNode).getString();
+        }
+
+        return CollationAlgorithmFactory.createAlgorithm(algorithm, comparator);
+    }
+
}
diff --git a/collatex-tools/src/main/java/eu/interedition/collatex/tools/CollationPipe.java b/collatex-tools/src/main/java/eu/interedition/collatex/tools/CollationPipe.java
index 0f0b23fbd..4caccadd4 100644
--- a/collatex-tools/src/main/java/eu/interedition/collatex/tools/CollationPipe.java
+++ b/collatex-tools/src/main/java/eu/interedition/collatex/tools/CollationPipe.java
@@ -46,7 +46,9 @@
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Comparator;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Stream;
@@ -98,19 +100,9 @@ public static void start(CommandLine commandLine) throws Exception {
throw new ParseException("Failed to read script '" + script + "' - " + e.getMessage());
}
- switch (commandLine.getOptionValue("a", "").toLowerCase()) {
- case "needleman-wunsch":
- collationAlgorithm = CollationAlgorithmFactory.needlemanWunsch(comparator);
- break;
- case "medite":
- collationAlgorithm = CollationAlgorithmFactory.medite(comparator, SimpleToken.TOKEN_MATCH_EVALUATOR);
- break;
- case "gst":
- collationAlgorithm = CollationAlgorithmFactory.greedyStringTiling(comparator, 2);
- break;
- default:
- collationAlgorithm = Optional.ofNullable(collationAlgorithm).orElse(CollationAlgorithmFactory.dekker(comparator));
- break;
+        // Lower-case with a fixed locale so that option values such as "MEDITE" are not
+        // altered by locale-specific case rules (e.g. the Turkish dotless i).
+        String algorithm = commandLine.getOptionValue("a", "").toLowerCase(java.util.Locale.ROOT);
+        // Without -a, whatever collationAlgorithm already holds is kept.
+        // NOTE(review): an absent -a used to fall back to dekker(comparator) when no
+        // algorithm had been set; confirm a default is still applied elsewhere.
+        if (!algorithm.isEmpty()) {
+            collationAlgorithm = CollationAlgorithmFactory.createAlgorithm(algorithm, comparator);
+        }
if (witnesses == null) {
diff --git a/collatex-tools/src/main/java/eu/interedition/collatex/tools/JsonProcessor.java b/collatex-tools/src/main/java/eu/interedition/collatex/tools/JsonProcessor.java
index 99a5fc583..aa088a233 100644
--- a/collatex-tools/src/main/java/eu/interedition/collatex/tools/JsonProcessor.java
+++ b/collatex-tools/src/main/java/eu/interedition/collatex/tools/JsonProcessor.java
@@ -24,6 +24,7 @@
import eu.interedition.collatex.VariantGraph;
import eu.interedition.collatex.Witness;
import eu.interedition.collatex.dekker.InspectableCollationAlgorithm;
+import eu.interedition.collatex.matching.EditDistanceRatioTokenComparator;
import eu.interedition.collatex.matching.EditDistanceTokenComparator;
import eu.interedition.collatex.matching.EqualityTokenComparator;
import eu.interedition.collatex.simple.SimpleCollation;
@@ -41,6 +42,7 @@
import javax.json.JsonString;
import javax.json.JsonStructure;
import javax.json.JsonValue;
+import javax.json.JsonObjectBuilder;
import javax.json.stream.JsonGenerator;
import java.io.IOException;
import java.io.InputStream;
@@ -48,7 +50,11 @@
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
import java.util.List;
+import java.util.Map;
+import java.util.Set;
import java.util.SortedMap;
import java.util.Spliterator;
import java.util.Spliterators;
@@ -155,38 +161,7 @@ public static SimpleCollation read(InputStream inputStream) throws IOException {
throw new IOException("No witnesses in collation");
}
- Comparator tokenComparator = null;
- final JsonValue tokenComparatorNode = collationObject.get("tokenComparator");
- if (tokenComparatorNode != null && tokenComparatorNode.getValueType() == JsonValue.ValueType.OBJECT) {
- final JsonObject tokenComparatorObject = (JsonObject) tokenComparatorNode;
- try {
- if ("levenshtein".equals(tokenComparatorObject.getString("type"))) {
- final int configuredDistance = tokenComparatorObject.getInt("distance", 0);
- tokenComparator = new EditDistanceTokenComparator(configuredDistance == 0 ? 1 : configuredDistance);
- }
- } catch (ClassCastException e) {
- // ignored
- }
- }
- if (tokenComparator == null) {
- tokenComparator = new EqualityTokenComparator();
- }
-
- CollationAlgorithm collationAlgorithm = null;
- final JsonValue collationAlgorithmNode = collationObject.get("algorithm");
- if (collationAlgorithmNode != null && collationAlgorithmNode.getValueType() == JsonValue.ValueType.STRING) {
- final String collationAlgorithmValue = ((JsonString) collationAlgorithmNode).getString();
- if ("needleman-wunsch".equalsIgnoreCase(collationAlgorithmValue)) {
- collationAlgorithm = CollationAlgorithmFactory.needlemanWunsch(tokenComparator);
- } else if ("gst".equalsIgnoreCase(collationAlgorithmValue)) {
- collationAlgorithm = CollationAlgorithmFactory.greedyStringTiling(tokenComparator, 2);
- } else if ("medite".equalsIgnoreCase(collationAlgorithmValue)) {
- collationAlgorithm = CollationAlgorithmFactory.medite(tokenComparator, SimpleToken.TOKEN_MATCH_EVALUATOR);
- }
- }
- if (collationAlgorithm == null) {
- collationAlgorithm = CollationAlgorithmFactory.dekker(tokenComparator);
- }
+ CollationAlgorithm collationAlgorithm = createFromJSON(collationObject);
boolean joined = true;
try {
@@ -221,6 +196,8 @@ public static void write(VariantGraph graph, PrintWriter writer) throws IOExcept
}
protected static void write(JsonGenerator jgen, VariantGraph graph) {
+ insertVertexIds(graph);
+
ParallelSegmentationApparatus.generate(VariantGraphRanking.of(graph), new ParallelSegmentationApparatus.GeneratorCallback() {
@Override
public void start() {
@@ -265,7 +242,7 @@ public void end() {
public static class Token extends SimpleToken {
- private final JsonObject jsonNode;
+ private JsonObject jsonNode;
public Token(SimpleWitness witness, String content, String normalized, JsonObject jsonNode) {
super(witness, content, normalized);
@@ -275,8 +252,83 @@ public Token(SimpleWitness witness, String content, String normalized, JsonObjec
public JsonObject getJsonNode() {
return jsonNode;
}
+
+        /**
+         * Replaces the JSON object attached to this token.
+         *
+         * @param jsonNode the JSON object to attach from now on
+         * @return the JSON object that was attached before the call
+         */
+        public JsonObject setJsonNode(JsonObject jsonNode) {
+            final JsonObject previous = this.jsonNode;
+            this.jsonNode = jsonNode;
+            return previous;
+        }
}
private JsonProcessor() {
}
+
+    /**
+     * Tags every JSON-backed token of the graph with a {@code "_VertexId"} member.
+     *
+     * The graph is walked rank by rank. Within each rank the vertices are numbered
+     * from 0 in iteration order, and every {@link Token} found on a vertex has its
+     * JSON node replaced by a copy that additionally carries that number.
+     * Tokens that are not {@link Token} instances are left untouched.
+     *
+     * @param graph the variant graph whose tokens are updated in place
+     */
+    private static void insertVertexIds(final VariantGraph graph) {
+        final Iterator<Set<VariantGraph.Vertex>> ranks = VariantGraphRanking.of(graph).iterator();
+        while (ranks.hasNext()) {
+            // Ids restart at 0 for every rank: they only tell apart vertices sharing a rank.
+            int id = 0;
+            for (VariantGraph.Vertex vertex : ranks.next()) {
+                for (eu.interedition.collatex.Token t : vertex.tokens()) {
+                    if (t instanceof Token) {
+                        final Token token = (Token) t;
+                        // A JsonObject cannot be modified, so rebuild it with the extra member.
+                        final JsonObjectBuilder job = Json.createObjectBuilder();
+                        final JsonObject node = token.getJsonNode();
+                        if (node != null) {
+                            for (Map.Entry<String, JsonValue> entry : node.entrySet()) {
+                                job.add(entry.getKey(), entry.getValue());
+                            }
+                        }
+                        job.add("_VertexId", id);
+                        token.setJsonNode(job.build());
+                    }
+                }
+                id++;
+            }
+        }
+    }
+
+    /**
+     * Creates the {@link CollationAlgorithm} described by a JSON collation request.
+     *
+     * Members of {@code collationObject} that are read:
+     *
+     * {@code "tokenComparator"}: an object. When its {@code "type"} is
+     * {@code "levenshtein"}, a {@code "ratio"} member selects the
+     * {@code "levenshtein.ratio"} comparator; otherwise the
+     * {@code "levenshtein.distance"} comparator is created from
+     * {@code "distance"} (default 1). In every other case, including a missing
+     * or non-string {@code "type"}, the {@code "equality"} comparator is used.
+     *
+     * {@code "algorithm"}: a string naming the algorithm, {@code "dekker"} when
+     * absent or not a string.
+     *
+     * This method is duplicated in
+     * {@code SimpleCollationJSONMessageBodyReader}.
+     *
+     * FIXME: This method could be moved into {@code CollationAlgorithmFactory}
+     * but it would make collatex-core dependent on javax.json.
+     *
+     * @param collationObject The JSON snippet
+     * @return The CollationAlgorithm subclass
+     */
+    private static CollationAlgorithm createFromJSON(JsonObject collationObject) {
+        // Fully qualified: the simple name Token refers to the nested JsonProcessor.Token here.
+        Comparator<eu.interedition.collatex.Token> comparator = null;
+
+        final JsonValue tokenComparatorNode = collationObject.get("tokenComparator");
+        if (tokenComparatorNode != null && tokenComparatorNode.getValueType() == JsonValue.ValueType.OBJECT) {
+            final JsonObject tokenComparatorObject = (JsonObject) tokenComparatorNode;
+            try {
+                // The defaulting overload tolerates a missing or non-string "type";
+                // getString(name) alone throws NullPointerException when the key is absent.
+                if ("levenshtein".equals(tokenComparatorObject.getString("type", null))) {
+                    if (tokenComparatorObject.containsKey("ratio")) {
+                        comparator = CollationAlgorithmFactory.createComparator(
+                            "levenshtein.ratio",
+                            Double.valueOf(tokenComparatorObject.getJsonNumber("ratio").doubleValue()));
+                    } else {
+                        comparator = CollationAlgorithmFactory.createComparator(
+                            "levenshtein.distance",
+                            Integer.valueOf(tokenComparatorObject.getInt("distance", 1)));
+                    }
+                }
+            } catch (ClassCastException ignored) {
+                // "ratio" is present but not a JSON number: use the default comparator below.
+            }
+        }
+        if (comparator == null) {
+            comparator = CollationAlgorithmFactory.createComparator("equality");
+        }
+
+        String algorithm = "dekker";
+        final JsonValue collationAlgorithmNode = collationObject.get("algorithm");
+        if (collationAlgorithmNode != null
+            && collationAlgorithmNode.getValueType() == JsonValue.ValueType.STRING) {
+            algorithm = ((JsonString) collationAlgorithmNode).getString();
+        }
+
+        return CollationAlgorithmFactory.createAlgorithm(algorithm, comparator);
+    }
}