diff --git a/.gitignore b/.gitignore index d4d8e1436..352452212 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ target *.iws *.ipynb *.egg-info +*~ +java.hprof.txt .classpath .project diff --git a/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithmFactory.java b/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithmFactory.java index b6555d756..6db3ab1a7 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithmFactory.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/CollationAlgorithmFactory.java @@ -21,33 +21,107 @@ import eu.interedition.collatex.dekker.DekkerAlgorithm; import eu.interedition.collatex.medite.MediteAlgorithm; -import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm; +import eu.interedition.collatex.needlemanwunsch.*; +import eu.interedition.collatex.needlemanwunschgotoh.*; +import eu.interedition.collatex.matching.*; +import eu.interedition.collatex.simple.SimpleToken; import eu.interedition.collatex.util.GreedyStringTilingAlgorithm; import eu.interedition.collatex.util.VertexMatch; import java.util.Comparator; import java.util.SortedSet; import java.util.function.Function; +import java.util.logging.Level; +import java.util.logging.Logger; /** * @author Gregor Middell * @author Ronald Haentjens Dekker */ public class CollationAlgorithmFactory { + protected final static Logger LOG = Logger.getLogger("CollationAlgorithmFactory"); public static CollationAlgorithm dekker(Comparator comparator) { return new DekkerAlgorithm(comparator); } + public static CollationAlgorithm needlemanWunsch(Comparator comparator) { - return new NeedlemanWunschAlgorithm(comparator); + return new eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm(comparator); + } + + + public static CollationAlgorithm needlemanWunschGotoh(StringMetricScorer scorer) { + return new eu.interedition.collatex.needlemanwunschgotoh.NeedlemanWunschGotohAlgorithm(scorer); + } + + 
public static CollationAlgorithm needlemanWunschGotoh() { + return needlemanWunschGotoh(new TrigramRatioScorer()); + } + + + public static CollationAlgorithm greedyStringTiling(Comparator comparator) { + return greedyStringTiling(comparator, 2); } - public static CollationAlgorithm greedyStringTiling(Comparator comparator, int minimumTileLength) { + public static CollationAlgorithm greedyStringTiling(Comparator comparator, + Integer minimumTileLength) { return new GreedyStringTilingAlgorithm(comparator, minimumTileLength); } - public static CollationAlgorithm medite(Comparator comparator, Function, Integer> matchEvaluator) { + + public static CollationAlgorithm medite(Comparator comparator) { + return medite(comparator, SimpleToken.TOKEN_MATCH_EVALUATOR); + } + + public static CollationAlgorithm medite(Comparator comparator, + Function, Integer> matchEvaluator) { return new MediteAlgorithm(comparator, matchEvaluator); } + + + public static Comparator createComparator(String name, Object... args) { + if (LOG.isLoggable(Level.CONFIG)) { + LOG.log(Level.CONFIG, "Comparator: {0}", name); + } + switch (name) { + case "equality": + return new EqualityTokenComparator(); // must return here; otherwise control falls through to "levenshtein.distance" + case "levenshtein.distance": + return args.length >= 1 ? + new EditDistanceTokenComparator((Integer) args[0]) : + new EditDistanceTokenComparator(); + case "levenshtein.ratio": + return args.length >= 1 ? + new EditDistanceRatioTokenComparator((Double) args[0]) : + new EditDistanceRatioTokenComparator(); + } + return new EqualityTokenComparator(); // default + } + + public static CollationAlgorithm createAlgorithm(String name, Comparator comparator, + Object... args) { + if (LOG.isLoggable(Level.CONFIG)) { + LOG.log(Level.CONFIG, "Algorithm: {0}", name); + } + switch (name) { + case "dekker": + return dekker(comparator); + case "gst": + return args.length >= 1 ? + greedyStringTiling(comparator, (Integer) args[0]) : + greedyStringTiling(comparator); + case "medite": + return args.length >= 1 ? 
+ medite(comparator, (Function, Integer>) args[0]) : + medite(comparator); + case "needleman-wunsch": + return needlemanWunsch(comparator); + case "needleman-wunsch-gotoh": + return args.length >= 1 ? + needlemanWunschGotoh((eu.interedition.collatex.matching.StringMetricScorer) args[0]) : + needlemanWunschGotoh(); + } + return dekker(comparator); // default + } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/VariantGraph.java b/collatex-core/src/main/java/eu/interedition/collatex/VariantGraph.java index 8e039bd64..b3a81d805 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/VariantGraph.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/VariantGraph.java @@ -37,12 +37,16 @@ * @author Gregor Middell */ public class VariantGraph { - final VariantGraph.Vertex start; - final VariantGraph.Vertex end; + VariantGraph.Vertex start; + VariantGraph.Vertex end; final Map>> transpositionIndex = new HashMap<>(); public VariantGraph() { super(); + init(); + } + + public void init() { this.start = new VariantGraph.Vertex(this); this.end = new VariantGraph.Vertex(this); diff --git a/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistance.java b/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistance.java index 674a50de2..560db5f26 100644 --- a/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistance.java +++ b/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistance.java @@ -23,6 +23,10 @@ public final class EditDistance { private static final int MAX_DISTANCE_COMPARISON = 2500; public static int compute(String str1, String str2) { + return compute(str1, str2, 1); + } + + public static int compute(String str1, String str2, int subst_cost) { if ((str1.length() * str2.length() > MAX_DISTANCE_COMPARISON)) { return MAX_DISTANCE_COMPARISON; } @@ -53,7 +57,7 @@ public static int compute(String str1, String str2) { final char str1Char = str1Chars[i - 1]; for (int j = 
1; j <= str2Length; j++) { final char str2Char = str2Chars[j - 1]; - final int cost = (str1Char == str2Char ? 0 : 1); + final int cost = (str1Char == str2Char ? 0 : subst_cost); matrix[i][j] = min3(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + cost); } } diff --git a/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistanceRatioTokenComparator.java b/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistanceRatioTokenComparator.java new file mode 100644 index 000000000..b7435aab5 --- /dev/null +++ b/collatex-core/src/main/java/eu/interedition/collatex/matching/EditDistanceRatioTokenComparator.java @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2015 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . 
+ */ + +package eu.interedition.collatex.matching; + +import eu.interedition.collatex.Token; +import eu.interedition.collatex.simple.SimpleToken; + +import java.util.Comparator; + +public class EditDistanceRatioTokenComparator implements Comparator { + + private final double threshold; + private final LevenshteinRatioScorer scorer; + + public EditDistanceRatioTokenComparator() { + this(0.6); + } + + public EditDistanceRatioTokenComparator(double threshold) { + this.threshold = threshold; + this.scorer = new LevenshteinRatioScorer(); + } + + @Override + public int compare(Token token_a, Token token_b) { + final String a = ((SimpleToken) token_a).getNormalized(); + final String b = ((SimpleToken) token_b).getNormalized(); + return (scorer.score(a, b) >= threshold) ? 0 : a.compareTo(b); + } +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/matching/StrictEqualityTokenComparator.java b/collatex-core/src/main/java/eu/interedition/collatex/matching/StrictEqualityTokenComparator.java old mode 100755 new mode 100644 diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohAlgorithm.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohAlgorithm.java new file mode 100644 index 000000000..437cdaf9c --- /dev/null +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohAlgorithm.java @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2015 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex.needlemanwunschgotoh; + +import eu.interedition.collatex.CollationAlgorithm; +import eu.interedition.collatex.Token; +import eu.interedition.collatex.VariantGraph; +import eu.interedition.collatex.Witness; +import eu.interedition.collatex.matching.Pair; +import eu.interedition.collatex.matching.StringMetricScorer; +import eu.interedition.collatex.util.VariantGraphRanking; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + + +/** + * Implements the Needleman-Wunsch-Gotoh collation algorithm. + * + * @author Marcello Perathoner + */ +public class NeedlemanWunschGotohAlgorithm implements CollationAlgorithm { + + private final StringMetricScorer scorer; + /** The minimum similarity score to merge two tokens into one vertex. */ + private final double matchScore; + /** A human-readable matrix. Written to if set. */ + private StringBuilder debugMatrix = null; + + public NeedlemanWunschGotohAlgorithm(final StringMetricScorer scorer) { + this.scorer = scorer; + this.matchScore = 0.6; + } + + @Override + public void collate(final VariantGraph graph, final Iterable witness) { + List> witnesses = new ArrayList<>(); + witnesses.add(witness); + collate(graph, witnesses); + }; + + @Override + public void collate(final VariantGraph graph, final Iterable... 
witnesses) { + collate(graph, Arrays.asList(witnesses)); + }; + + @Override + public void collate(final VariantGraph graph, final List> witnesses) { + List> rankingA = graphToRanking(graph); + Set witnessesA = getWitnesses(rankingA); + + for (Iterable witness : witnesses) { + final List> rankingB = tokensToRanking(witness); + final Set witnessesB = getWitnesses(rankingB); + + rankingA = doCollate(rankingA, witnessesA, rankingB, witnessesB); + witnessesA.addAll(witnessesB); + } + rankingToGraph(graph, rankingA); + }; + + public void setDebugMatrix(final StringBuilder debugMatrix) { + this.debugMatrix = debugMatrix; + } + + /* Incipit private stuff */ + + private Set getWitnesses(final VariantGraph.Vertex vertex) { + return vertex.tokens().stream().map(t -> t.getWitness()).collect(Collectors.toSet()); + } + + private Set getWitnesses(final Collection> ranks) { + return ranks.stream() + .flatMap(s -> s.stream()) + .flatMap(t -> t.tokens().stream()) + .map(t -> t.getWitness()) + .collect(Collectors.toSet()); + } + + private Set tokenToVertexSet(final Token t) { + Set vertexSet = new HashSet<>(); + VariantGraph.Vertex vertex = new VariantGraph.Vertex(null); + vertex.tokens().add(t); + vertexSet.add(vertex); + return vertexSet; + } + + /** + * Import a set of vertices into a new graph + * + * Makes a copy of the vertex and imports it into the new graph. Does not + * connect the vertices. + * + * @param graph The graph into which to import the vertices + * @param vertices The vertices to import + * @return A set of copied and imported vertices + */ + private Set importVertexSet( + final VariantGraph graph, + final Set vertices) { + return vertices.stream() + .map(vertex -> { + VariantGraph.Vertex v = new VariantGraph.Vertex(graph); + v.tokens().addAll(vertex.tokens()); + return v; + } + ).collect(Collectors.toSet()); + } + + /** + * Create a ranking from a graph. 
+ * + * @param graph The graph + * @return The ranking + */ + private List> graphToRanking(final VariantGraph graph) { + final List> ranking = new ArrayList<>(); + Iterator> iter = VariantGraphRanking.of(graph).iterator(); + while (iter.hasNext()) { + ranking.add(importVertexSet(null, iter.next())); + } + ranking.remove(0); // shave off graph start element + ranking.remove(ranking.size() - 1); // shave off graph end element + return ranking; + } + + /** + * Create a list of {@code Set} from a witness. + * + * The rationale behind this conversion is to make both inputs to the + * collator be of the same type. This because a symmetric problem is + * generally easier to solve. + * + * @param iter A witness as token stream + * @return A ranking with one token in each rank + */ + private List> tokensToRanking(final Iterable iter) { + final List> ranking = new ArrayList<>(); + for (Token t : iter) { + ranking.add(tokenToVertexSet(t)); + } + return ranking; + } + + /** + * Collate two rankings + * + * The aligner decides which ranks of rankingA and rankingB to align. If + * the alignment is good enough, two vertices, one in each ranking, are + * merged. + * + * @param rankingA A ranking + * @param witnessesA All witnesses in rankingA + * @param rankingB A ranking + * @param witnessesB All witnesses in rankingB + * @return The collated ranking + */ + + private List> doCollate(final Collection> rankingA, + final Set witnessesA, + final Collection> rankingB, + final Set witnessesB) { + // Run the aligner. + + NeedlemanWunschGotohAligner aligner = + new NeedlemanWunschGotohAligner(new NeedlemanWunschProfileScorer(scorer, witnessesA.size())); + aligner.setDebugMatrix (debugMatrix); + + NeedlemanWunschScorerSetVertexSetVertex matcher = + new NeedlemanWunschScorerSetVertexSetVertex(scorer); + + List, Set>> alignmentList = + aligner.align(rankingA, rankingB); + + // Build a new ranking by merging the aligner output into one ranking. 
+ List> collated = new ArrayList(); + for (Pair, Set> alignment : alignmentList) { + assert ((alignment.a != null) || (alignment.b != null)); + + Set verticesA = alignment.a; + Set verticesB = alignment.b; + + // Merge the matching vertices of each sequence + NeedlemanWunschScorerSetVertexSetVertex.Match matching = + matcher.match(verticesA, verticesB, matchScore); + if (matching != null) { + matching.vertexA.tokens().addAll(matching.vertexB.tokens()); + verticesB.remove(matching.vertexB); + } + + Set vertices = new HashSet<>(); + if (verticesA != null) { + vertices.addAll(verticesA); + } + if (verticesB != null) { + vertices.addAll(verticesB); + } + collated.add(vertices); + } + return collated; + } + + /** + * Create a graph out of a ranking. + * + * This is the inverse of {@code VariantGraphRanking.of}. + * + * @param graph The graph + * @param ranking The ranking + */ + private void rankingToGraph(final VariantGraph graph, + final List> ranking) { + graph.init(); + + // A map of witness -> last vertex with witness + Map vertexMap = + getWitnesses(ranking).stream().collect(Collectors.toMap(w -> w, w -> graph.getStart())); + + for (Set vertices : ranking) { + // Normally, if we convert a graph to a table, and one path between + // two vertices contains more vertices than another path, the + // relation between vertices in the 'shorter' path and their ranks + // will become ambiguous. To avoid that, we insert placeholder + // vertices, so that all witnesses connect to some vertex at each + // rank and all paths will be of the same length. + VariantGraph.Vertex placeholder = new VariantGraph.Vertex(graph); + Set unconnectedWitnesses = new HashSet<>(vertexMap.keySet()); + unconnectedWitnesses.removeAll(getWitnesses(Collections.singleton(vertices))); + for (Witness w : unconnectedWitnesses) { + graph.connect(vertexMap.put(w, placeholder), placeholder, Collections.singleton(w)); + } + + // Connect the 'real' vertices in each rank. 
+ for (VariantGraph.Vertex vertex : importVertexSet(graph, vertices)) { + for (Witness w : getWitnesses(vertex)) { + graph.connect(vertexMap.put(w, vertex), vertex, Collections.singleton(w)); + } + } + } + + // Connect the end. + for (VariantGraph.Vertex vertex : vertexMap.values()) { + graph.connect(vertex, graph.getEnd(), getWitnesses(vertex)); + } + } +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohAligner.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohAligner.java new file mode 100644 index 000000000..18194b10f --- /dev/null +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohAligner.java @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2016 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex.needlemanwunschgotoh; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedList; +import java.util.List; + +import eu.interedition.collatex.matching.Pair; + +/** + * A generic Needleman-Wunsch-Gotoh sequence aligner. + * + * This aligner aligns two sequences of type A and type B respectively. It is + * totally agnostic of the type of objects it aligns. 
+ * + * The aligner needs a {@code NeedlemanWunschScorer} that determines the score + * of a match between an object of type A and an object of type B. + * + * This implementation uses Gotoh's improvements to get $\mathcal{O}(mn)$ + * running time and reduce memory requirements to essentially the backtracking + * matrix only. In Gotoh's technique the gap weight formula must be of the + * special form $w_k = uk + v$. $k$ is the gap size, $v$ is the gap opening + * score and $u$ the gap extension score. + * + * See: Gotoh, O. (1982). An Improved Algorithm for Matching Biological + * Sequences, J. Mol. Biol. 162, 705-708 + * + * @param Type of first sequence + * @param Type of second sequence + * + * @author Marcello Perathoner + */ +public class NeedlemanWunschGotohAligner { + /** A Scorer. */ + private final NeedlemanWunschScorer scorer; + /** A human-readable matrix as string. Written to only if set. */ + private StringBuilder debugMatrix = null; + /** The gap opening score. */ + private final double openScore; + /** The gap extension score. */ + private final double extendScore; + + public NeedlemanWunschGotohAligner(final NeedlemanWunschScorer scorer) { + this(scorer, -1.0, -0.5); + } + + public NeedlemanWunschGotohAligner(final NeedlemanWunschScorer scorer, + final double gapOpenScore, + final double gapExtendScore) { + this.scorer = scorer; + this.openScore = gapOpenScore; + this.extendScore = gapExtendScore; + } + + private class Data { + /** The current score. */ + public double score; + /** $P_{m,n}, Q_{m,n} in Gotoh. */ + public double p; + public double q; + /** The size of the gap. $k$ in Gotoh. */ + public int pSize; + public int qSize; + + Data(final double score) { + this.score = score; + this.p = 0.0; + this.q = 0.0; + this.pSize = 0; + this.qSize = 0; + } + } + + public List> align(final Collection inputA, final Collection inputB) { + int i, j; + + final int sizeA = inputA.size(); + final int sizeB = inputB.size(); + + // 0 j . B + // i + // . 
+ // A + + /** + * The backtracking matrix. 0 stands for a match. Negative numbers + * represent a DEL TOP operation. The abs() of the number is the length + * of the gap. Positive numbers represent an INS LEFT operation. The + * value of the number is the length of the gap. + */ + ArrayList> lenMatrix = new ArrayList<>(sizeA + 1); + /** + * The scoring matrix. We need only the last row of the scoring matrix + * for our calculations, so we allocate the scoring matrix only when + * debugging. + */ + ArrayList> matrix = null; + /** The current row of the backtracking matrix. */ + ArrayList thisLenRow = new ArrayList<>(sizeB + 1); + /** The current row of the scoring matrix. */ + ArrayList thisRow = new ArrayList<>(sizeB + 1); + + // Initialize lenMatrix and one row of the scoring matrix. + + lenMatrix.add(thisLenRow); + + thisRow.add(new Data(0.0)); + thisLenRow.add(0); + for (j = 1; j <= sizeB; ++j) { + Data d = new Data(openScore + (j - 1) * extendScore); + d.p = d.score; + // d.pSize = j; + thisRow.add(d); + thisLenRow.add(j); + } + + if (debugMatrix != null) { + matrix = new ArrayList<>(sizeA + 1); + matrix.add((ArrayList) thisRow.clone()); + } + + // Score the matrix + i = 0; + for (A a : inputA) { + i++; + + // add new lenRow to matrix + thisLenRow = new ArrayList<>(sizeB + 1); + lenMatrix.add(thisLenRow); + thisLenRow.add(-i); // DEL TOP + + Data diag = thisRow.get(0); + Data left = new Data(openScore + (i - 1) * extendScore); + left.q = left.score; + // left.qSize = i; + j = 0; + for (B b : inputB) { + j++; + Data top = thisRow.get(j); + Data curr = new Data(0.0); + + curr.p = top.score + openScore; + curr.pSize = 1; + if (curr.p < top.p + extendScore) { + curr.p = top.p + extendScore; + curr.pSize = top.pSize + 1; + } + curr.q = left.score + openScore; + curr.qSize = 1; + if (curr.q < left.q + extendScore) { + curr.q = left.q + extendScore; + curr.qSize = left.qSize + 1; + } + final double d = diag.score + scorer.score(a, b); + + // Decide which operation is 
optimal and perform it + if ((d > curr.p) && (d > curr.q)) { + curr.score = d; + thisLenRow.add(0); + } else if (curr.q > curr.p) { + curr.score = curr.q; + thisLenRow.add(curr.qSize); // INS LEFT + } else { + curr.score = curr.p; + thisLenRow.add(-curr.pSize); // DEL TOP + } + + // Advance to next column + thisRow.set(j - 1, left); + thisRow.set(j, curr); + diag = top; + left = curr; + } + + if (matrix != null) { + matrix.add((ArrayList) thisRow.clone()); + } + } + + // Walk back and output alignments. We need random access, so copy the + // input Collections to ArrayLists. + final LinkedList> alignments = new LinkedList<>(); + final ArrayList arrayA = new ArrayList<>(inputA); + final ArrayList arrayB = new ArrayList<>(inputB); + i = sizeA; + j = sizeB; + while ((i > 0) || (j > 0)) { + int len = lenMatrix.get(i).get(j); + if (len == 0) { + alignments.addFirst(new Pair(arrayA.get(i - 1), arrayB.get(j - 1))); + --i; + --j; + } else { + if (len < 0) { + for (int k = 0; k < -len; ++k) { + alignments.addFirst(new Pair(arrayA.get(i - 1), null)); + --i; + } + } else { + for (int k = 0; k < len; ++k) { + alignments.addFirst(new Pair(null, arrayB.get(j - 1))); + --j; + } + } + } + } + + if (matrix != null) { + buildDebugMatrix(matrix, lenMatrix, arrayA, arrayB); + } + + return alignments; + } + + /** + * Set a debug matrix. The aligner will fill the debug matrix with a + * human-readable representation of the Needleman-Wunsch matrix if the debug + * matrix is set. + * + * @param debugMatrix A StringBuilder or null. + */ + public void setDebugMatrix(final StringBuilder debugMatrix) { + this.debugMatrix = debugMatrix; + } + + /** + * Build the debug matrix string. Builds a human-readable matrix in a + * string. 
+ * + * @param matrix + * @param lenMatrix + * @param inputA + * @param inputB + */ + private void buildDebugMatrix(final ArrayList> matrix, + final ArrayList> lenMatrix, + final ArrayList inputA, + final ArrayList inputB) { + + debugMatrix.setLength(0); + + debugMatrix.append(String.format("%29s | ", "")); + debugMatrix.append(String.format("%29s | ", "")); + for (B b : inputB) { + debugMatrix.append(String.format("%-29s | ", b)); + } + debugMatrix.append("\n"); + + for (int i = 0; i < matrix.size(); ++i) { + debugAdd(matrix.get(i), lenMatrix.get(i), i > 0 ? inputA.get(i - 1).toString() : ""); + } + debugMatrix.append("\n"); + } + + /** + * Helper function. + * + * @param dataRow + * @param lenRow + * @param a + */ + private void debugAdd(final ArrayList dataRow, + final ArrayList lenRow, + final String a) { + + debugMatrix.append(String.format("%29s | ", a)); + for (int i = 0; i < dataRow.size(); ++i) { + Data data = dataRow.get(i); + int len = lenRow.get(i); + if (len == 0) { + debugMatrix.append("↖ "); + } else { + if (len < 0) { + debugMatrix.append("↑ "); + } else { + debugMatrix.append("← "); + } + } + debugMatrix.append(String.format("% 2.6f ", data.score)); + debugMatrix.append(String.format("% 2.2f ", data.p)); + debugMatrix.append(String.format("% 2d ", data.pSize)); + debugMatrix.append(String.format("% 2.2f ", data.q)); + debugMatrix.append(String.format("% 2d | ", data.qSize)); + } + debugMatrix.append("\n"); + } +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschProfileScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschProfileScorer.java new file mode 100644 index 000000000..618512df0 --- /dev/null +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschProfileScorer.java @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016 The Interedition Development Group. + * + * This file is part of CollateX. 
+ * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see <http://www.gnu.org/licenses/>. + */ + +package eu.interedition.collatex.needlemanwunschgotoh; + +import java.util.Set; + +import eu.interedition.collatex.Token; +import eu.interedition.collatex.VariantGraph; +import eu.interedition.collatex.simple.SimpleToken; +import eu.interedition.collatex.matching.StringMetricScorer; + +/** + * Score against a profile. + * + * "Definition: Given a multiple alignment of a set of strings, a + * profile for that multiple alignment specifies for each column the + * frequency that each character appears in the column." -- Gusfield + * 1997, Algorithms on Strings, Trees and Sequences, Cambridge University Press + * + * The score for the full match is a weighted sum of scores based on the + * frequency of the matched vertices. 
+ * + * @author Marcello Perathoner + */ + +public class NeedlemanWunschProfileScorer + implements NeedlemanWunschScorer, Set> { + private final StringMetricScorer matchScorer; + private final int size; + + public NeedlemanWunschProfileScorer(final StringMetricScorer matchScorer, final int size) { + this.matchScorer = matchScorer; + this.size = size; + } + + @Override + public double score(final Set verticesA, + final Set verticesB) { + + if (verticesA == null || verticesB == null) { + return matchScorer.getMinScore(); + } + if (verticesA.size() == 0 || verticesB.size() == 0) { + return matchScorer.getMinScore(); + } + + double totalScore = 0.0; + int totalMatched = 0; + + for (VariantGraph.Vertex vertexA : verticesA) { + for (VariantGraph.Vertex vertexB : verticesB) { + for (Token tokenA : vertexA.tokens()) { + final String a = ((SimpleToken) tokenA).getNormalized(); + for (Token tokenB : vertexB.tokens()) { + final String b = ((SimpleToken) tokenB).getNormalized(); + totalScore += matchScorer.score(a, b); + totalMatched++; + } + } + } + } + int totalUnmatched = size - totalMatched; + + return (totalScore + totalUnmatched * -1.0) / size; + } +}; diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschScorer.java new file mode 100644 index 000000000..f7eb18287 --- /dev/null +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschScorer.java @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2016 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex.needlemanwunschgotoh; + +/** + * A scorer for a {@code NeedlemanWunschGotohAligner}. + * + * Calculates the score of a match between two generic objects. + * + * @param Type of the first object + * @param Type of the second object + * + * @author Marcello Perathoner + */ +public interface NeedlemanWunschScorer { + /** + * Calculate the score given to a match between a and b. + * + * @param a An object + * @param b An object + * + * @return The score + */ + double score(A a, B b); +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschScorerSetVertexSetVertex.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschScorerSetVertexSetVertex.java new file mode 100644 index 000000000..c66c86525 --- /dev/null +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschScorerSetVertexSetVertex.java @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2015 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex.needlemanwunschgotoh; + +import java.util.Set; + +import eu.interedition.collatex.Token; +import eu.interedition.collatex.VariantGraph; +import eu.interedition.collatex.simple.SimpleToken; +import eu.interedition.collatex.matching.StringMetricScorer; + +/** + * Score a set of vertices against another set of vertices + * + * The score will be the highest score achieved while matching all tokens of all + * vertices in verticesA against all tokens of all vertices in verticesB. + * + * @author Marcello Perathoner + */ + +public class NeedlemanWunschScorerSetVertexSetVertex + implements NeedlemanWunschScorer, Set> { + private final StringMetricScorer matchScorer; + + public NeedlemanWunschScorerSetVertexSetVertex(final StringMetricScorer matchScorer) { + this.matchScorer = matchScorer; + } + + public class Match { + public final VariantGraph.Vertex vertexA; + public final VariantGraph.Vertex vertexB; + public final double score; + public Match(final VariantGraph.Vertex vertexA, + final VariantGraph.Vertex vertexB, + final double score) { + this.vertexA = vertexA; + this.vertexB = vertexB; + this.score = score; + } + } + + public Match match(final Set verticesA, + final Set verticesB, + final double minScore) { + + if (verticesA == null || verticesB == null) { + return null; + } + if (verticesA.size() == 0 || verticesB.size() == 0) { + return null; + } + + Match matching = null; + double minScoreMatched = minScore; + + for (VariantGraph.Vertex vertexA : verticesA) { + for (VariantGraph.Vertex vertexB : verticesB) { + + for (Token tokenA : vertexA.tokens()) { + final String a = ((SimpleToken) tokenA).getNormalized(); + + for (Token tokenB : vertexB.tokens()) { + final String b = ((SimpleToken) tokenB).getNormalized(); + + double score = matchScorer.score(a, b); + if (score > minScoreMatched) { + minScoreMatched = score; 
+ matching = new Match(vertexA, vertexB, score); + } + } + } + } + } + return matching; + } + + @Override + public double score(final Set verticesA, + final Set verticesB) { + Match matching = match(verticesA, verticesB, matchScorer.getMinScore()); + return (matching != null) ? matching.score : matchScorer.getMinScore(); + } +}; diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/package-info.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/package-info.java new file mode 100644 index 000000000..1dd01c484 --- /dev/null +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/package-info.java @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2016 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +/** + * A version of the Needleman-Wunsch algorithm with floating point scoring. + * + * This algorithm + * strives for global alignment of witnesses and bases the alignment on a + * configurable scoring of matches vs. differences/gaps. It does not try to + * detect transpositions. 
+ * + * @author Marcello Perathoner + * + * @see eu.interedition.collatex.needlemanwunschgotoh.NeedlemanWunschGotohAligner + * @see eu.interedition.collatex.needlemanwunschgotoh.NeedlemanWunschGotohAlgorithm + * @see eu.interedition.collatex.needlemanwunschgotoh.NeedlemanWunschScorer + */ +package eu.interedition.collatex.needlemanwunschgotoh; diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/AbstractStringMetricScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/AbstractStringMetricScorer.java new file mode 100644 index 000000000..78b7b0640 --- /dev/null +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/AbstractStringMetricScorer.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2016 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex.matching; + +import java.util.HashMap; +import java.util.Map; + +/** + * Calculate a score based on string metrics. Skeletal implementation. + * + * Note that the minimum score is the score for a mismatch. This score must be + * higher than two times the score of a gap. If a mismatch would score less + * than two gaps, the aligner would always select two gaps and never select a + * mismatch. + * + * This implementation caches the calculated score. 
+ * + * @author Marcello Perathoner + */ +public abstract class AbstractStringMetricScorer implements StringMetricScorer { + + /** Memoization cache. Caches the score of already computed pairs. */ + private final Map, Double> cache = new HashMap<>(); + + /** The minimum score. */ + protected final double minScore; + + /** The maximum score. */ + protected final double maxScore; + + /** Constructor. Uses the default score range of -1.0 (worst match) to 1.0 (best match). */ + public AbstractStringMetricScorer() { + this(-1.0, 1.0); + } + + /** + * Constructor. + * + * @param minScore The score for the worst match. + * @param maxScore The score for the best match. + */ + public AbstractStringMetricScorer(final double minScore, final double maxScore) { + this.minScore = minScore; + this.maxScore = maxScore; + } + + /** + * Return the minimum score this scorer will ever calculate. + * + * @return The score for the worst match. + */ + public double getMinScore() { + return minScore; + }; + + /** + * Return the maximum score this scorer will ever calculate. + * + * @return The score for the best match. + */ + public double getMaxScore() { + return maxScore; + }; + + /** Calculate the score of a pair that is not in the cache yet. This default returns 0.0. Override this for cached scorers. */ + protected double _score(final Pair p) { + return 0.0; + } + + /** Calculate the score, memoizing the result of _score() per (a, b) pair. Override this for uncached scorers. */ + public double score(final String a, final String b) { + final Pair pair = new Pair<>(a, b); + + return cache.computeIfAbsent(pair, p -> _score(p)); + } +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/EqualityScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/EqualityScorer.java new file mode 100644 index 000000000..6767650cb --- /dev/null +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/EqualityScorer.java @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2016 The Interedition Development Group. + * + * This file is part of CollateX.
+ * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex.matching; + +/** + * Calculate a score based on string equality. + * + * Score is maxScore if the strings are equal, minScore otherwise. Overrides score() directly, so results bypass the memoization cache of the base class. + * + * @author Marcello Perathoner + */ +public class EqualityScorer extends AbstractStringMetricScorer { + + public EqualityScorer() { + super(); + } + + public EqualityScorer(final double minScore, final double maxScore) { + super(minScore, maxScore); + } + + @Override + public double score(final String a, final String b) { + return a.equals(b) ? maxScore : minScore; + } +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/LevenshteinDistanceScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/LevenshteinDistanceScorer.java new file mode 100644 index 000000000..6a6be33cd --- /dev/null +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/LevenshteinDistanceScorer.java @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex.matching; + +/** + * Calculate a score based on the Levenshtein distance. + * + * Score is maxScore if distance <= threshold, minScore otherwise (1 and -1 with the default range). Equal strings always score maxScore. + * + * @author Marcello Perathoner + */ +public class LevenshteinDistanceScorer extends AbstractStringMetricScorer { + + final private int threshold; + + public LevenshteinDistanceScorer(final int threshold) { + super(); + this.threshold = threshold; + } + + public LevenshteinDistanceScorer(final double minScore, + final double maxScore, + final int threshold) { + super(minScore, maxScore); + this.threshold = threshold; + } + + @Override + protected double _score(final Pair p) { + if (p.a.equals(p.b)) { + return maxScore; + } + return EditDistance.compute(p.a, p.b) <= threshold ? maxScore : minScore; + } +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/LevenshteinRatioScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/LevenshteinRatioScorer.java new file mode 100644 index 000000000..d8082627e --- /dev/null +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/LevenshteinRatioScorer.java @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex.matching; + +/** + * Calculate a score based on Levenshtein Ratio. The ratio is scaled linearly into the range from minScore to maxScore; equal strings score maxScore. + * + * Levenshtein ratio as used in python-Levenshtein + * distance = levenshtein (a, b) + * length = length (a) + length (b) + * ratio = (length - distance) / length + * + * @author Marcello Perathoner + */ +public class LevenshteinRatioScorer extends AbstractStringMetricScorer { + + public LevenshteinRatioScorer() { + super(); + } + + public LevenshteinRatioScorer(final double minScore, final double maxScore) { + super(minScore, maxScore); + } + + @Override + protected double _score(final Pair p) { + if (p.a.equals(p.b)) { + return maxScore; + } + final int distance = EditDistance.compute(p.a, p.b, 2); + final int length = p.a.length() + p.b.length(); + final double ratio = ((double) (length - distance)) / length; + + return minScore + (maxScore - minScore) * ratio; + } +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/Pair.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/Pair.java new file mode 100644 index 000000000..cec091b9d --- /dev/null +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/Pair.java @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2015 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex.matching; + +/** + * A Pair class that can be used as key in Map. Either member may be null; equals, hashCode and toString are null-safe. + * + * @author Marcello Perathoner + */ +public class Pair<T, U> { + public final T a; + public final U b; + + public Pair(T a, U b) { + this.a = a; + this.b = b; + } + + @Override + public boolean equals(final Object obj) { + if (obj == this) { + return true; + } + if (obj instanceof Pair) { + Pair<?, ?> other = (Pair<?, ?>) obj; + return java.util.Objects.equals(a, other.a) && java.util.Objects.equals(b, other.b); + } + return false; + } + + @Override + public int hashCode() { + // Map.Entry uses operator ^ but + is a better choice because + // we also want to store Pairs of identical strings (with ^ those would all hash to 0). + return java.util.Objects.hashCode(a) + java.util.Objects.hashCode(b); + } + + public String toString() { + return "(" + a + ", " + b + ")"; + } +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/StringMetricScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/StringMetricScorer.java new file mode 100644 index 000000000..2b1760789 --- /dev/null +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/StringMetricScorer.java @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex.matching; + +/** + * Calculate a score based on string metrics. + * + * @author Marcello Perathoner + */ +public interface StringMetricScorer { + + /** Calculate the score for a match between a and b. */ + double score(String a, String b); + + /** Return the minimum score this scorer will ever calculate. */ + double getMinScore(); + + /** Return the maximum score this scorer will ever calculate. */ + double getMaxScore(); +} diff --git a/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/TrigramRatioScorer.java b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/TrigramRatioScorer.java new file mode 100644 index 000000000..ccbdcd185 --- /dev/null +++ b/collatex-core/src/main/java/eu/interedition/collatex/needlemanwunschgotoh/scorer/TrigramRatioScorer.java @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016 The Interedition Development Group. + * + * This file is part of CollateX. + * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see .
+ */ + +package eu.interedition.collatex.matching; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + + +/** + * Calculate a score based on trigram ratio. Each string is padded with two blanks on both sides before its trigrams are extracted. + * + * Trigram ratio is defined as: + * ta = number of trigrams in a + * tb = number of trigrams in b + * tab = number of trigrams that are both in a and in b + * ratio = 2 * tab / (ta + tb) + * + * @author Marcello Perathoner + */ +public class TrigramRatioScorer extends AbstractStringMetricScorer { + + final Map> trigrams = new HashMap<>(); + + public TrigramRatioScorer() { + super(); + } + + public TrigramRatioScorer(final double minScore, final double maxScore) { + super(minScore, maxScore); + } + + private Set trigramize(final String s) { + assert s.length() > 0; + + Set tri = new HashSet<>(); + String ss = "  " + s + "  "; + + for (int i = 0; i < ss.length() - 2; i++) { + tri.add(ss.substring(i, i + 3)); + } + return tri; + } + + @Override + protected double _score(final Pair p) { + if (p.a.equals(p.b)) { + return maxScore; + } + + Set triA = trigrams.computeIfAbsent(p.a, this::trigramize); + Set triB = trigrams.computeIfAbsent(p.b, this::trigramize); + + Set triAB = new HashSet<>(triA); + triAB.retainAll(triB); + + final double ratio = 2.0 * triAB.size() / (triA.size() + triB.size()); + + return minScore + (maxScore - minScore) * ratio; + } +} diff --git a/collatex-core/src/test/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohTest.java b/collatex-core/src/test/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohTest.java new file mode 100644 index 000000000..71c80d06e --- /dev/null +++ b/collatex-core/src/test/java/eu/interedition/collatex/needlemanwunschgotoh/NeedlemanWunschGotohTest.java @@ -0,0 +1,449 @@ +/* + * Copyright (c) 2015 The Interedition Development Group. + * + * This file is part of CollateX.
+ * + * CollateX is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * CollateX is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with CollateX. If not, see . + */ + +package eu.interedition.collatex.needlemanwunschgotoh; + +import eu.interedition.collatex.AbstractTest; +import eu.interedition.collatex.CollationAlgorithm; +import eu.interedition.collatex.CollationAlgorithmFactory; +import eu.interedition.collatex.Token; +import eu.interedition.collatex.Witness; +import eu.interedition.collatex.VariantGraph; +import eu.interedition.collatex.needlemanwunschgotoh.*; +import eu.interedition.collatex.matching.*; +import eu.interedition.collatex.simple.SimpleWitness; +import eu.interedition.collatex.util.VariantGraphTraversal; +import org.junit.Test; + +import java.util.Arrays; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.SortedMap; +import java.util.stream.*; + +import static org.junit.Assert.*; + +/** + * @author Gregor Middell + */ +public class NeedlemanWunschGotohTest extends AbstractTest { + + final double delta = 0.000000001; + NeedlemanWunschGotohAligner aligner; + + public class NeedlemanWunschScorerStringString implements NeedlemanWunschScorer { + private final StringMetricScorer matchScorer; + + public NeedlemanWunschScorerStringString(StringMetricScorer matchScorer) { + this.matchScorer = matchScorer; + } + + @Override + public double score(String a, String b) { + return matchScorer.score(a, b); + } + }; + + private void testAlign(String a, String b) { + 
StringBuilder debugMatrix = new StringBuilder(); + aligner.setDebugMatrix(debugMatrix); + + List> list, expectedList; + + List aa = Arrays.asList(a.split("\\s+")); + List bb = Arrays.asList(b.split("\\s+")); + assertEquals(aa.size(), bb.size()); + + expectedList = new ArrayList>(); + for (int i = 0; i < aa.size(); i++) { + expectedList.add(new Pair(aa.get(i), bb.get(i))); + } + + try { + list = aligner.align( + aa.stream().filter(p -> !p.equals("-")).collect(Collectors.toList()), + bb.stream().filter(p -> !p.equals("-")).collect(Collectors.toList()) + ); + } catch (Exception e) { + System.out.println(e.getMessage()); + System.out.println(debugMatrix.toString()); + throw e; + } + + list = list.stream() + .map(p -> new Pair(p.a == null ? "-" : p.a, p.b == null ? "-" : p.b)) + .collect(Collectors.toList()); + + String aaa = list .stream().map(Pair::toString).collect(Collectors.joining(" ")); + String bbb = expectedList.stream().map(Pair::toString).collect(Collectors.joining(" ")); + + if (!aaa.equals(bbb)) { + System.out.println(debugMatrix.toString()); + } + + assertEquals(bbb, aaa); + } + + private void testCollate(String... witnesses) { + StringBuilder debugMatrix = new StringBuilder(); + ((NeedlemanWunschGotohAlgorithm) collationAlgorithm).setDebugMatrix(debugMatrix); + + List strings = new ArrayList<>(); + List tables = new ArrayList<>(); + for (String witness : witnesses) { + List l = Arrays.asList(witness.split("\\s+")); + strings.add(l.stream().filter(p -> !p.equals("-")).collect(Collectors.joining(" "))); + tables.add(l.stream().map(p -> p.equals("-") ? 
" " : p).collect(Collectors.joining("|", "|", "|"))); + } + + final SimpleWitness[] w = createWitnesses(strings.toArray(new String[0])); + + List>> t; + try { + t = table(collate(w)); + } catch (Exception e) { + System.out.println(e); + System.out.println(debugMatrix.toString()); + throw e; + } + + int i = 0; + for (String table : tables) { + String result = toString(t, w[i]); + if (!table.equals(result)) { + System.out.println(table); + System.out.println(result); + System.out.println(debugMatrix.toString()); + } + assertEquals(table, result); + i++; + } + } + + @Test + public void simple() { + setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh()); + LOG.fine(toString(table(collate("a b a b a", "a b a")))); + } + + @Test + public void equalityScorerTest() { + StringMetricScorer scorer = new EqualityScorer(0.0, 1.0); + + assertEquals(1.0, scorer.score("foobar", "foobar"), delta); + assertEquals(0.0, scorer.score("foobar", "fobar"), delta); + assertEquals(0.0, scorer.score("foobar", "fooobar"), delta); + assertEquals(0.0, scorer.score("foobar", "foobaz"), delta); + assertEquals(0.0, scorer.score("foobar", "foobiz"), delta); + } + + @Test + public void levenshteinDistanceScorerTest() { + StringMetricScorer scorer = new LevenshteinDistanceScorer(0.0, 1.0, 1); + + assertEquals(1.0, scorer.score("foobar", "foobar"), delta); + assertEquals(1.0, scorer.score("foobar", "fobar"), delta); + assertEquals(1.0, scorer.score("foobar", "fooobar"), delta); + assertEquals(1.0, scorer.score("foobar", "foobaz"), delta); + assertEquals(0.0, scorer.score("foobar", "foobiz"), delta); + } + + @Test + public void levenshteinRatioScorerTest() { + StringMetricScorer scorer = new LevenshteinRatioScorer(0.0, 1.0); + + assertEquals(1.0, scorer.score("foobar", "foobar"), delta); + assertEquals(0.5, scorer.score("fooabc", "foodef"), delta); + assertEquals(0.0, scorer.score("abcdef", "ghijkl"), delta); + + assertTrue(scorer.score("foobar", "fooobar") > scorer.score("fobar", 
"foobar")); + } + + @Test + public void trigramRatioScorerTest() { + StringMetricScorer scorer = new TrigramRatioScorer(0.0, 1.0); + + assertEquals(1.0, scorer.score("foobar", "foobar"), delta); + assertEquals(6.0 / 16, scorer.score("fooabc", "foodef"), delta); + assertEquals(6.0 / 16, scorer.score("abcfoo", "deffoo"), delta); + assertEquals(0.0, scorer.score("abcdef", "ghijkl"), delta); + + assertEquals(2.0 / 12, scorer.score("heti", "ethi"), delta); + + assertTrue(scorer.score("foobar", "fooobar") > scorer.score("fobar", "foobar")); + } + + @Test + public void equalityAlignerTest() { + aligner = new NeedlemanWunschGotohAligner( + new NeedlemanWunschScorerStringString(new EqualityScorer()) + ); + + testAlign("a b c", + "a b c"); + + testAlign("a b c", + "a b d"); + + testAlign("a b c", + "a - c"); + + testAlign("- - -", + "a b c"); + + testAlign("a b c", + "- - -"); + + // The equality scorer will match words only if they are equal. + + testAlign("foo foob foobar foob foo", + "- - foobar - - "); + + // It will not match a word that is similar. + + testAlign("foo foob fooba foob foo", + "foobar - - - -"); // expected wrong answer + } + + @Test + public void levenshteinDistanceAlignerTest() { + aligner = new NeedlemanWunschGotohAligner( + new NeedlemanWunschScorerStringString(new LevenshteinDistanceScorer(1)) + ); + + // The Levenshtein distance scorer will match a word that is equal. + + testAlign("x foo foobar foo y", + "- - foobar - -"); + + // But it will also match a word that is similar within Levenshtein + // distance. + + testAlign("x foo fooba foo y", + "- - foobar - -"); + + // But if many words are within Levenshtein distance they will all look + // the same to the scorer, so it picks the wrong one. 
+ + testAlign("x foob fooba foobar fooba foob y", + "- - - - foobar - -"); // expected wrong answer + } + + @Test + public void levenshteinRatioAlignerTest() { + aligner = new NeedlemanWunschGotohAligner( + new NeedlemanWunschScorerStringString(new LevenshteinRatioScorer()) + ); + + // The Levenshtein ratio scorer will match a word that is equal. + + testAlign("x foo foobar foo y", + "- - foobar - -"); + + // But it will also match the most similar word. + + testAlign("x foo fooba foo y", + "- - foobar - -"); + + // But it will give the correct match where the Levenshtein distance + // scorer failed. + + testAlign("x foob fooba foobar fooba foob y", + "- - - foobar - - -"); + + // More tests + + testAlign("- - Sanguis effusio in ecclesiis facta", + "Si quis sanguinis effusionem in eclesia fecerit"); + + testAlign("Si quis sanguinis effusionem in eclesia fecerit", + "- - Sanguis effusio in ecclesiis facta"); + + testAlign("uolumus ut ea dentur que", + "uolumus ut detur ea que"); + + + testAlign("periurium committitur . Quando maioris pretii", + "perIurium committitur - quanto maIoris pretii"); + } + + @Test + public void trigramRatioAlignerTest() { + aligner = new NeedlemanWunschGotohAligner( + new NeedlemanWunschScorerStringString(new TrigramRatioScorer()) + ); + + // The trigram ratio scorer will match a word that is equal. + + testAlign("x foo foobar foo y", + "- - foobar - -"); + + // But it will also match the most similar word. + + testAlign("x foo fooba foo y", + "- - foobar - -"); + + // But it will give the correct match where the Levenshtein distance + // scorer failed. 
+ + testAlign("x foob fooba foobar fooba foob y", + "- - - foobar - - -"); + + // More tests + + testAlign("- - Sanguis effusio in ecclesiis facta", + "Si quis sanguinis effusionem in eclesia fecerit"); + + testAlign("Si quis sanguinis effusionem in eclesia fecerit", + "- - Sanguis effusio in ecclesiis facta"); + + testAlign("uolumus ut - ea dentur que", + "uolumus ut detur ea - que"); + + testAlign("periurium committitur . quando maioris pretii", + "perIurium committitur - quanto maIoris pretii"); + } + + @Test + public void collatorSanityTest() { + setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh()); + + // Test if the collator works on an empty graph. + + final SimpleWitness[] w = createWitnesses("a b c d"); + final VariantGraph graph = collate(w); + final List vertices = + StreamSupport.stream(VariantGraphTraversal.of(graph).spliterator(), false).collect(Collectors.toList()); + assertEquals(6, vertices.size()); + assertEquals(graph.getStart(), vertices.get(0)); + assertEquals(1, vertices.get(1).tokens().size()); + assertEquals(1, vertices.get(2).tokens().size()); + assertEquals(1, vertices.get(3).tokens().size()); + assertEquals(1, vertices.get(4).tokens().size()); + assertVertexEquals("a", vertices.get(1)); + assertVertexEquals("b", vertices.get(2)); + assertVertexEquals("c", vertices.get(3)); + assertVertexEquals("d", vertices.get(4)); + assertEquals(graph.getEnd(), vertices.get(5)); + } + + @Test + public void collatorSanityTest1() { + setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh()); + + testCollate("a b c"); + } + + @Test + public void collatorSanityTest2() { + setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh()); + + testCollate("the cat is black", + "the dog is black"); + } + + @Test + public void exactMatch() { + setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh()); + + testCollate("a b c", + "a b c"); + + testCollate("a b c", + "a b d"); + + testCollate("a b c", + "a d 
c"); + + testCollate("a b c", + "d b c"); + + testCollate("a b c", + "a - c"); + + testCollate("a - c", + "a b c"); + + testCollate("a b c", + "- b -"); + + testCollate("- b -", + "a b c"); + } + + @Test + public void exactMatch1() { + setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh()); + + testCollate("a foob foob foobar foob foob b", + "a - - foobar - - b"); + + testCollate("a foob foob foobar foob foob b foob foob c", + "a - - foobar - - b - - c"); + } + + @Test + public void multiMatch() { + setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh()); + + testCollate("a b c - - -", + "- b c d - -", + "- - c d e -", + "- - - d e f"); + + testCollate("- - - d e f", + "- - c d e -", + "- b c d - -", + "a b c - - -"); + } + + @Test + public void distanceMatch() { + setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh()); + + testCollate("x foo foo foobar foo foo y", + "- - - fooba - - -"); + } + + @Test + public void ratioMatch() { + setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh()); + + testCollate("x foo foob foobar foo foo y", + "- - - fooba - - -"); + + testCollate("hadebaldus - heti bernoinus", + "hadebaldus bernuinus heti -", + "adebaldus - ethi bernoinus"); + } + + @Test + public void preferOneLongGap() { + setCollationAlgorithm(CollationAlgorithmFactory.needlemanWunschGotoh()); + + // The aligner will prefer one long gap over many short ones. 
+ + testCollate("a b b b b b c", + "a b b - - - c"); + + testCollate("a b b b b b c", + "a b b - - - c"); + } +} diff --git a/collatex-pythonport/collatex/core_classes.py b/collatex-pythonport/collatex/core_classes.py index 6e8c3d177..1411a82dd 100644 --- a/collatex-pythonport/collatex/core_classes.py +++ b/collatex-pythonport/collatex/core_classes.py @@ -16,30 +16,30 @@ from collatex.exceptions import TokenError class Row(object): - + def __init__(self, header): self.cells = [] self.header = header - + def append(self, cell): self.cells.append(cell) - + def to_list(self): return self.cells - + class Column(object): - + def __init__(self): self.tokens_per_witness = {} - self.variant = False + self.variant = False def put(self, sigil, token): self.tokens_per_witness[sigil]=token class AlignmentTable(object): - + def __init__(self, collation, graph=None, layout="horizontal"): self.collation = collation self.graph = graph @@ -52,7 +52,7 @@ def __init__(self, collation, graph=None, layout="horizontal"): def _construct_table(self): ranking = VariantGraphRanking.of(self.graph) vertices_per_rank = ranking.byRank - # construct columns + # construct columns for rank in vertices_per_rank: column = None vertices = vertices_per_rank[rank] @@ -87,11 +87,11 @@ def _construct_table(self): def __str__(self, *args, **kwargs): return str(create_table_visualization(self)) - + # DISPLAY PART OF THE VARIANT GRAPH IN PLAIN/HTML AND VERTICAL OR HORIZONTAL! 
def create_table_visualization(table): # create visualization of alignment table - if table.layout == "vertical": + if table.layout == "vertical": prettytable = visualizeTableVertically(table) elif table.layout == "horizontal": prettytable = visualizeTableHorizontal(table) @@ -120,11 +120,11 @@ def visualizeTableVertically(table): x.add_column(row.header, [fill(cell, 20) if cell else "-" for cell in row.cells]) return x - + # not used in the suffix implementation # Tokenizer inside suffix array library is used class Tokenizer(object): - + # by default the tokenizer splits on space characters def tokenize(self, contents): return contents.split() @@ -156,7 +156,7 @@ def __repr__(self): return self.token_string class Witness(object): - + def __init__(self, witnessdata): self.sigil = witnessdata['id'] self._tokens = [] @@ -172,22 +172,22 @@ def __init__(self, witnessdata): self._tokens.append(Token(tk)) # content string is used for generation of the suffix and LCP arrays. self.content = ' '.join([x.token_string for x in self._tokens]) - + def tokens(self): return self._tokens class VariantGraph(object): - + def __init__(self): self.graph = nx.DiGraph() self.start = self.add_vertex(Token()) self.end = self.add_vertex(Token()) - + # def is_directed(self): # return self.graph.is_directed() -# - # vertex creation uses a unique ID, since the token_content does not have to be unique - # we store the token content in the label +# + # vertex creation uses a unique ID, since the token_content does not have to be unique + # we store the token content in the label def add_vertex(self, token, sigil=None): ''' :type token: Token @@ -196,13 +196,13 @@ def add_vertex(self, token, sigil=None): # print("Adding node: "+node_id+":"+token_content) tokens = {} if sigil: - tokens[sigil] = token + tokens[sigil] = [token] self.graph.add_node(node_id, label=token.token_string, tokens=tokens) return node_id def add_token_to_vertex(self, node, token, sigil): attributes = 
self.vertex_attributes(node) - attributes["tokens"][sigil] = token + attributes["tokens"].setdefault(sigil, []).append (token) def connect(self, source, target, witnesses): """ @@ -212,44 +212,44 @@ def connect(self, source, target, witnesses): # print("Adding Edge: "+source+":"+target) if self.graph.has_edge(source, target): self.graph[source][target]["label"] += ", "+str(witnesses) - else: + else: self.graph.add_edge(source, target, label=witnesses) - + def remove_edge(self, source, target): self.graph.remove_edge(source, target) - + def remove_node(self, node): - self.graph.remove_node(node) - + self.graph.remove_node(node) + def vertices(self): return self.graph.nodes() - + def edges(self): return self.graph.edges() - + def edge_between(self, node, node2): # return self.graph.get_edge_data(node, node2) return self.graph.has_edge(node, node2) - + def in_edges(self, node, data=False): return self.graph.in_edges(nbunch=node, data=data) - + def out_edges(self, node, data=False): return self.graph.out_edges(nbunch=node, data=data) - + def vertex_attributes(self, node): return self.graph.node[node] - + # Note: generator implementation def vertexWith(self, content): try: vertex_to_find = (n for n in self.graph if self.graph.node[n]['label'] == content).next() - return vertex_to_find + return vertex_to_find except StopIteration: raise Exception("Vertex with "+content+" not found!") - + class CollationAlgorithm(object): - def merge(self, graph, witness_sigil, witness_tokens, alignments = {}): + def merge(self, graph, witness_sigil, witness_tokens, alignments = {}): """ :type graph: VariantGraph """ @@ -275,7 +275,7 @@ def merge(self, graph, witness_sigil, witness_tokens, alignments = {}): This function joins the variant graph in place. This function is a straight port of the Java version of CollateX. :type graph: VariantGraph - TODO: add transposition support! + TODO: add transposition support! 
''' def join(graph): processed = set() @@ -301,7 +301,7 @@ def join(graph): graph.remove_edge(join_candidate, neighbor) graph.connect(vertex, neighbor, data["label"]) graph.remove_edge(vertex, join_candidate) - graph.remove_node(join_candidate) + graph.remove_node(join_candidate) queue.appendleft(vertex); continue; processed.add(vertex) @@ -309,7 +309,7 @@ def join(graph): # FIXME: Why do we run out of memory in some cases here, if this is not checked? if neighbor not in processed: queue.appendleft(neighbor) - + # Port of VariantGraphRanking class from Java # This is a minimal port; only bare bones class VariantGraphRanking(object): @@ -319,13 +319,13 @@ def __init__(self): # however, a rank can be assigned to multiple vertices self.byVertex = {} self.byRank = {} - + def apply(self, vertex): return self.byVertex[vertex] - + @classmethod def of(cls, graph): - variant_graph_ranking = VariantGraphRanking() + variant_graph_ranking = VariantGraphRanking() topological_sorted_vertices = topological_sort(graph.graph) for v in topological_sorted_vertices: rank = -1 @@ -335,7 +335,3 @@ def of(cls, graph): variant_graph_ranking.byVertex[v]=rank variant_graph_ranking.byRank.setdefault(rank, []).append(v) return variant_graph_ranking - - - - diff --git a/collatex-servlet/src/main/java/eu/interedition/collatex/io/SimpleCollationJSONMessageBodyReader.java b/collatex-servlet/src/main/java/eu/interedition/collatex/io/SimpleCollationJSONMessageBodyReader.java index 4ea198c36..d0b7f331f 100644 --- a/collatex-servlet/src/main/java/eu/interedition/collatex/io/SimpleCollationJSONMessageBodyReader.java +++ b/collatex-servlet/src/main/java/eu/interedition/collatex/io/SimpleCollationJSONMessageBodyReader.java @@ -4,6 +4,7 @@ import eu.interedition.collatex.CollationAlgorithmFactory; import eu.interedition.collatex.Token; import eu.interedition.collatex.dekker.InspectableCollationAlgorithm; +import eu.interedition.collatex.matching.EditDistanceRatioTokenComparator; import 
eu.interedition.collatex.matching.EditDistanceTokenComparator; import eu.interedition.collatex.matching.EqualityTokenComparator; import eu.interedition.collatex.simple.*; @@ -21,6 +22,8 @@ import java.lang.reflect.Type; import java.util.ArrayList; import java.util.Comparator; +import java.util.HashMap; +import java.util.Map; import java.util.List; /** @@ -133,38 +136,7 @@ public SimpleCollation readFrom(Class type, Type genericType, A throw new IOException("No witnesses in collation"); } - Comparator tokenComparator = null; - final JsonValue tokenComparatorNode = collationObject.get("tokenComparator"); - if (tokenComparatorNode != null && tokenComparatorNode.getValueType() == JsonValue.ValueType.OBJECT) { - final JsonObject tokenComparatorObject = (JsonObject) tokenComparatorNode; - try { - if ("levenshtein".equals(tokenComparatorObject.getString("type"))) { - final int configuredDistance = tokenComparatorObject.getInt("distance", 0); - tokenComparator = new EditDistanceTokenComparator(configuredDistance == 0 ? 
1 : configuredDistance); - } - } catch (ClassCastException e) { - // ignored - } - } - if (tokenComparator == null) { - tokenComparator = new EqualityTokenComparator(); - } - - CollationAlgorithm collationAlgorithm = null; - final JsonValue collationAlgorithmNode = collationObject.get("algorithm"); - if (collationAlgorithmNode != null && collationAlgorithmNode.getValueType() == JsonValue.ValueType.STRING) { - final String collationAlgorithmValue = ((JsonString) collationAlgorithmNode).getString(); - if ("needleman-wunsch".equalsIgnoreCase(collationAlgorithmValue)) { - collationAlgorithm = CollationAlgorithmFactory.needlemanWunsch(tokenComparator); - } else if ("gst".equalsIgnoreCase(collationAlgorithmValue)) { - collationAlgorithm = CollationAlgorithmFactory.greedyStringTiling(tokenComparator, 2); - } else if ("medite".equalsIgnoreCase(collationAlgorithmValue)) { - collationAlgorithm = CollationAlgorithmFactory.medite(tokenComparator, SimpleToken.TOKEN_MATCH_EVALUATOR); - } - } - if (collationAlgorithm == null) { - collationAlgorithm = CollationAlgorithmFactory.dekker(tokenComparator); - } + CollationAlgorithm collationAlgorithm = createFromJSON(collationObject); boolean joined = true; try { @@ -185,4 +157,52 @@ public SimpleCollation readFrom(Class type, Type genericType, A return new SimpleCollation(witnesses, collationAlgorithm, joined); } } + + /** + * Create CollationAlgorithm from a JSON snippet + * + * This method is duplicated in {@code JsonProcessor}. + * + * FIXME: This method could be moved into {@code CollationAlgorithmFactory} + * but it would make collatex-core dependent on javax.json. 
+ * + * @param collationObject The JSON snippet + * @return The CollationAlgorithm subclass + */ + private static CollationAlgorithm createFromJSON(JsonObject collationObject) { + Comparator comparator = null; + + final JsonValue tokenComparatorNode = collationObject.get("tokenComparator"); + if (tokenComparatorNode != null && tokenComparatorNode.getValueType() == JsonValue.ValueType.OBJECT) { + final JsonObject tokenComparatorObject = (JsonObject) tokenComparatorNode; + try { + if ("levenshtein".equals(tokenComparatorObject.getString("type"))) { + if (tokenComparatorObject.containsKey("ratio")) { + comparator = CollationAlgorithmFactory.createComparator ( + "levenshtein.ratio", + new Double (tokenComparatorObject.getJsonNumber("ratio").doubleValue())); + } else { + comparator = CollationAlgorithmFactory.createComparator ( + "levenshtein.distance", + new Integer (tokenComparatorObject.getInt("distance", 1))); + } + } + } catch (ClassCastException e) { + // ignored + } + } + if (comparator == null) { + comparator = CollationAlgorithmFactory.createComparator ("equality"); + } + + String algorithm = "dekker"; + final JsonValue collationAlgorithmNode = collationObject.get("algorithm"); + if (collationAlgorithmNode != null && + collationAlgorithmNode.getValueType() == JsonValue.ValueType.STRING) { + algorithm = ((JsonString) collationAlgorithmNode).getString(); + } + + return CollationAlgorithmFactory.createAlgorithm(algorithm, comparator); + } + } diff --git a/collatex-tools/src/main/java/eu/interedition/collatex/tools/CollationPipe.java b/collatex-tools/src/main/java/eu/interedition/collatex/tools/CollationPipe.java index 0f0b23fbd..4caccadd4 100644 --- a/collatex-tools/src/main/java/eu/interedition/collatex/tools/CollationPipe.java +++ b/collatex-tools/src/main/java/eu/interedition/collatex/tools/CollationPipe.java @@ -46,7 +46,9 @@ import java.nio.file.Files; import java.util.ArrayList; import java.util.Comparator; +import java.util.HashMap; import java.util.List; 
+import java.util.Map; import java.util.Optional; import java.util.function.Function; import java.util.stream.Stream; @@ -98,19 +100,9 @@ public static void start(CommandLine commandLine) throws Exception { throw new ParseException("Failed to read script '" + script + "' - " + e.getMessage()); } - switch (commandLine.getOptionValue("a", "").toLowerCase()) { - case "needleman-wunsch": - collationAlgorithm = CollationAlgorithmFactory.needlemanWunsch(comparator); - break; - case "medite": - collationAlgorithm = CollationAlgorithmFactory.medite(comparator, SimpleToken.TOKEN_MATCH_EVALUATOR); - break; - case "gst": - collationAlgorithm = CollationAlgorithmFactory.greedyStringTiling(comparator, 2); - break; - default: - collationAlgorithm = Optional.ofNullable(collationAlgorithm).orElse(CollationAlgorithmFactory.dekker(comparator)); - break; + String algorithm = commandLine.getOptionValue("a", "").toLowerCase(); + if (!algorithm.equals ("")) { + collationAlgorithm = CollationAlgorithmFactory.createAlgorithm (algorithm, comparator); } if (witnesses == null) { diff --git a/collatex-tools/src/main/java/eu/interedition/collatex/tools/JsonProcessor.java b/collatex-tools/src/main/java/eu/interedition/collatex/tools/JsonProcessor.java index 99a5fc583..aa088a233 100644 --- a/collatex-tools/src/main/java/eu/interedition/collatex/tools/JsonProcessor.java +++ b/collatex-tools/src/main/java/eu/interedition/collatex/tools/JsonProcessor.java @@ -24,6 +24,7 @@ import eu.interedition.collatex.VariantGraph; import eu.interedition.collatex.Witness; import eu.interedition.collatex.dekker.InspectableCollationAlgorithm; +import eu.interedition.collatex.matching.EditDistanceRatioTokenComparator; import eu.interedition.collatex.matching.EditDistanceTokenComparator; import eu.interedition.collatex.matching.EqualityTokenComparator; import eu.interedition.collatex.simple.SimpleCollation; @@ -41,6 +42,7 @@ import javax.json.JsonString; import javax.json.JsonStructure; import javax.json.JsonValue; 
+import javax.json.JsonObjectBuilder; import javax.json.stream.JsonGenerator; import java.io.IOException; import java.io.InputStream; @@ -48,7 +50,11 @@ import java.io.PrintWriter; import java.util.ArrayList; import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.SortedMap; import java.util.Spliterator; import java.util.Spliterators; @@ -155,38 +161,7 @@ public static SimpleCollation read(InputStream inputStream) throws IOException { throw new IOException("No witnesses in collation"); } - Comparator tokenComparator = null; - final JsonValue tokenComparatorNode = collationObject.get("tokenComparator"); - if (tokenComparatorNode != null && tokenComparatorNode.getValueType() == JsonValue.ValueType.OBJECT) { - final JsonObject tokenComparatorObject = (JsonObject) tokenComparatorNode; - try { - if ("levenshtein".equals(tokenComparatorObject.getString("type"))) { - final int configuredDistance = tokenComparatorObject.getInt("distance", 0); - tokenComparator = new EditDistanceTokenComparator(configuredDistance == 0 ? 
1 : configuredDistance); - } - } catch (ClassCastException e) { - // ignored - } - } - if (tokenComparator == null) { - tokenComparator = new EqualityTokenComparator(); - } - - CollationAlgorithm collationAlgorithm = null; - final JsonValue collationAlgorithmNode = collationObject.get("algorithm"); - if (collationAlgorithmNode != null && collationAlgorithmNode.getValueType() == JsonValue.ValueType.STRING) { - final String collationAlgorithmValue = ((JsonString) collationAlgorithmNode).getString(); - if ("needleman-wunsch".equalsIgnoreCase(collationAlgorithmValue)) { - collationAlgorithm = CollationAlgorithmFactory.needlemanWunsch(tokenComparator); - } else if ("gst".equalsIgnoreCase(collationAlgorithmValue)) { - collationAlgorithm = CollationAlgorithmFactory.greedyStringTiling(tokenComparator, 2); - } else if ("medite".equalsIgnoreCase(collationAlgorithmValue)) { - collationAlgorithm = CollationAlgorithmFactory.medite(tokenComparator, SimpleToken.TOKEN_MATCH_EVALUATOR); - } - } - if (collationAlgorithm == null) { - collationAlgorithm = CollationAlgorithmFactory.dekker(tokenComparator); - } + CollationAlgorithm collationAlgorithm = createFromJSON(collationObject); boolean joined = true; try { @@ -221,6 +196,8 @@ public static void write(VariantGraph graph, PrintWriter writer) throws IOExcept } protected static void write(JsonGenerator jgen, VariantGraph graph) { + insertVertexIds(graph); + ParallelSegmentationApparatus.generate(VariantGraphRanking.of(graph), new ParallelSegmentationApparatus.GeneratorCallback() { @Override public void start() { @@ -265,7 +242,7 @@ public void end() { public static class Token extends SimpleToken { - private final JsonObject jsonNode; + private JsonObject jsonNode; public Token(SimpleWitness witness, String content, String normalized, JsonObject jsonNode) { super(witness, content, normalized); @@ -275,8 +252,83 @@ public Token(SimpleWitness witness, String content, String normalized, JsonObjec public JsonObject getJsonNode() { return 
jsonNode; } + + public JsonObject setJsonNode(JsonObject jsonNode) { + JsonObject oldJsonNode = this.jsonNode; + this.jsonNode = jsonNode; + return oldJsonNode; + } } private JsonProcessor() { } + + private static void insertVertexIds(final VariantGraph graph) { + final List> ranking = new ArrayList<>(); + Iterator> iter = VariantGraphRanking.of(graph).iterator(); + while (iter.hasNext()) { + int id = 0; + for (VariantGraph.Vertex vertex : iter.next()) { + for (eu.interedition.collatex.Token t : vertex.tokens()) { + if (t instanceof Token) { + JsonObjectBuilder job = Json.createObjectBuilder(); + for (Map.Entry entry : ((Token) t).getJsonNode().entrySet()) { + job.add(entry.getKey(), entry.getValue()); + } + job.add("_VertexId", id); + ((Token) t).setJsonNode(job.build()); + } + } + id++; + } + } + } + + /** + * Create CollationAlgorithm from a JSON snippet + * + * This method is duplicated in + * {@code SimpleCollationJSONMessageBodyReader}. + * + * FIXME: This method could be moved into {@code CollationAlgorithmFactory} + * but it would make collatex-core dependent on javax.json. 
+ * + * @param collationObject The JSON snippet + * @return The CollationAlgorithm subclass + */ + private static CollationAlgorithm createFromJSON(JsonObject collationObject) { + Comparator comparator = null; + + final JsonValue tokenComparatorNode = collationObject.get("tokenComparator"); + if (tokenComparatorNode != null && tokenComparatorNode.getValueType() == JsonValue.ValueType.OBJECT) { + final JsonObject tokenComparatorObject = (JsonObject) tokenComparatorNode; + try { + if ("levenshtein".equals(tokenComparatorObject.getString("type"))) { + if (tokenComparatorObject.containsKey("ratio")) { + comparator = CollationAlgorithmFactory.createComparator ( + "levenshtein.ratio", + new Double (tokenComparatorObject.getJsonNumber("ratio").doubleValue())); + } else { + comparator = CollationAlgorithmFactory.createComparator ( + "levenshtein.distance", + new Integer (tokenComparatorObject.getInt("distance", 1))); + } + } + } catch (ClassCastException e) { + // ignored + } + } + if (comparator == null) { + comparator = CollationAlgorithmFactory.createComparator ("equality"); + } + + String algorithm = "dekker"; + final JsonValue collationAlgorithmNode = collationObject.get("algorithm"); + if (collationAlgorithmNode != null && + collationAlgorithmNode.getValueType() == JsonValue.ValueType.STRING) { + algorithm = ((JsonString) collationAlgorithmNode).getString(); + } + + return CollationAlgorithmFactory.createAlgorithm(algorithm, comparator); + } }