diff --git a/public/java/src/uk/ac/ox/well/cortexjdk/commands/discover/call/Call.java b/public/java/src/uk/ac/ox/well/cortexjdk/commands/discover/call/Call.java deleted file mode 100644 index 9374e590..00000000 --- a/public/java/src/uk/ac/ox/well/cortexjdk/commands/discover/call/Call.java +++ /dev/null @@ -1,571 +0,0 @@ -package uk.ac.ox.well.cortexjdk.commands.discover.call; - -import com.google.common.base.Joiner; -import htsjdk.samtools.util.Interval; -import htsjdk.samtools.util.StringUtil; -import org.apache.commons.lang3.tuple.Triple; -import org.apache.commons.math3.util.Pair; -import org.jetbrains.annotations.NotNull; -import org.jgrapht.GraphPath; -import org.jgrapht.graph.DirectedWeightedPseudograph; -import org.mapdb.DB; -import org.mapdb.DBMaker; -import org.mapdb.HTreeMap; -import org.mapdb.Serializer; -import org.mapdb.serializer.SerializerArray; -import uk.ac.ox.well.cortexjdk.commands.Module; -import uk.ac.ox.well.cortexjdk.utils.alignment.mosaic.MosaicAligner; -import uk.ac.ox.well.cortexjdk.utils.alignment.reference.IndexedReference; -import uk.ac.ox.well.cortexjdk.utils.arguments.Argument; -import uk.ac.ox.well.cortexjdk.utils.arguments.Output; -import uk.ac.ox.well.cortexjdk.utils.io.graph.cortex.CortexGraph; -import uk.ac.ox.well.cortexjdk.utils.io.graph.cortex.CortexRecord; -import uk.ac.ox.well.cortexjdk.utils.io.graph.links.CortexLinks; -import uk.ac.ox.well.cortexjdk.utils.kmer.CanonicalKmer; -import uk.ac.ox.well.cortexjdk.utils.kmer.CortexByteKmer; -import uk.ac.ox.well.cortexjdk.utils.stoppingrules.ContigStopper; -import uk.ac.ox.well.cortexjdk.utils.stoppingrules.DestinationStopper; -import uk.ac.ox.well.cortexjdk.utils.stoppingrules.ExplorationStopper; -import uk.ac.ox.well.cortexjdk.utils.traversal.*; - -import java.io.File; -import java.io.PrintStream; -import java.util.*; - -import static uk.ac.ox.well.cortexjdk.utils.traversal.TraversalEngineConfiguration.GraphCombinationOperator.OR; -import static uk.ac.ox.well.cortexjdk.utils.traversal.TraversalEngineConfiguration.TraversalDirection.BOTH; -import static uk.ac.ox.well.cortexjdk.utils.traversal.TraversalEngineConfiguration.TraversalDirection.FORWARD; -import static uk.ac.ox.well.cortexjdk.utils.traversal.TraversalEngineConfiguration.TraversalDirection.REVERSE; -import static uk.ac.ox.well.cortexjdk.utils.traversal.TraversalUtils.getAllNextKmers; - -public class Call extends Module { - @Argument(fullName = "graph", shortName = "g", doc = "Graph") - public CortexGraph GRAPH; - - @Argument(fullName = "links", shortName = "l", doc = "Links", required=false) - public ArrayList LINKS; - - @Argument(fullName = "rois", shortName = "r", doc = "Rois") - public CortexGraph ROIS; - - @Argument(fullName="db", shortName="d", doc="Database") - public File DB_FILE; - - @Argument(fullName="mother", shortName="m", doc="Mother's sample name") - public LinkedHashSet MOTHER; - - @Argument(fullName="father", shortName="f", doc="Father's sample name") - public LinkedHashSet FATHER; - - @Argument(fullName="background", shortName="b", doc="Background", required=false) - public HashMap BACKGROUNDS; - - @Argument(fullName="reference", shortName="R", doc="Reference", required=false) - public HashMap REFERENCE; - - //@Argument(fullName="reads", shortName="r", doc="Read sequences") - //public ArrayList READ_FILES; - - @Output - public PrintStream out; - - @Output(fullName="accOut", shortName="ao", doc="Accounting out") - public PrintStream aout; - - @Override - public void execute() { - log.info("Loading contigs from database..."); - DB dbIn = DBMaker.fileDB(DB_FILE).readOnly().fileMmapEnable().make(); - - HTreeMap contigsMap = initializeDbContigsMap(dbIn); - - Set rois = loadRois(ROIS); - Map> usedRois = new HashMap<>(); - rois.forEach(c -> usedRois.put(c, new TreeSet<>())); - - int sampleColor = GRAPH.getColorForSampleName(ROIS.getSampleName(0)); - - Set contigIndices = new TreeSet<>(contigsMap.getKeys()); - int variantIndex = 0; - for (Integer contigIndex : contigIndices) { - List w = getWalk(contigsMap, contigIndex); - List as = subsetContig(rois, w, 100); - - List> pieces = new ArrayList<>(); - if (as.size() > 1500) { - for (int i = 0; i < as.size(); i++) { - if (rois.contains(as.get(i).getCanonicalKmer())) { - int left = 0, right = 0; - - int windowLeft = i - 100; - for (left = i - 1; left > 0 && left >= windowLeft; left--) { - if (rois.contains(as.get(left).getCanonicalKmer())) { - windowLeft = left - 100; - } - } - - int windowRight = i + 100; - for (right = i + 1; right < as.size() - 2 && right <= windowRight; right++) { - if (rois.contains(as.get(right).getCanonicalKmer())) { - windowRight = right + 100; - } - } - - pieces.add(as.subList(left, right)); - - i = right; - } - } - } else { - pieces.add(as); - } - - for (List ws : pieces) { - String contig = TraversalUtils.toContig(ws); - - log.info("contig {}/{}: {} ({})", contigIndex, contigsMap.size(), w.size(), ws.size()); - - /* - List> regions = getRegions(rois, ws); - Map asmTracks = new HashMap<>(); - for (Set css : Arrays.asList(MOTHER, FATHER)) { - List lv = getParentalContig(sampleColor, ws, regions, css); - - String lc = TraversalUtils.toContig(lv); - - if (!lc.equals(contig)) { - asmTracks.put(css.iterator().next(), TraversalUtils.toContig(lv)); - } - } - List> aAlignments = ma.align(contig, asmTracks); - log.info("\n{}", ma); - */ - - MosaicAligner ma = new MosaicAligner(); - - List> parentIntervals = IntervalCombiner.getIntervals(ws, BACKGROUNDS, 100, 5); - - for (int f = 0; f < parentIntervals.size(); f++) { - log.info("{} {} {}", f, parentIntervals.get(f).getFirst(), parentIntervals.get(f).getSecond()); - } - - Map parentTracks = loadSequences(parentIntervals, BACKGROUNDS); - List, String>> pAlignments = ma.align(contig, parentTracks); - //log.info("\n{}", ma); - - StringBuilder sb = new StringBuilder(StringUtil.repeatCharNTimes(' ', pAlignments.get(0).getRight().length())); - int numKmersMarked = 0; - for (int i = 0; i <= pAlignments.get(0).getRight().length() - GRAPH.getKmerSize(); i++) { - int gapsize = GRAPH.getKmerSize() - pAlignments.get(0).getRight().substring(i, i + GRAPH.getKmerSize()).replaceAll("-", "").length(); - - if (i + GRAPH.getKmerSize() + gapsize <= pAlignments.get(0).getRight().length() - GRAPH.getKmerSize()) { - CanonicalKmer ck = new CanonicalKmer(pAlignments.get(0).getRight().substring(i, i + GRAPH.getKmerSize() + gapsize).replaceAll("-", "")); - - if (rois.contains(ck)) { - numKmersMarked++; - for (int j = i; j < i + GRAPH.getKmerSize() + gapsize; j++) { - sb.setCharAt(j, 'v'); - } - } - } - } - - if (numKmersMarked == 0) { - sb = new StringBuilder(StringUtil.repeatCharNTimes('v', pAlignments.get(0).getRight().length())); - } - - log.info("{} {}", sb.toString(), numKmersMarked); - for (Triple, String> p : pAlignments) { - log.info("{} {}", p.getRight(), p.getLeft()); - } - - List> variants = new ArrayList<>(); - List> allNovelKmers = new ArrayList<>(); - - int variantStart = -1; - StringBuilder childAllele = new StringBuilder(); - StringBuilder parentAllele = new StringBuilder(); - Set novelKmers = new HashSet<>(); - - String childContig = pAlignments.get(0).getRight(); - for (int j = 1; j < pAlignments.size(); j++) { - String parentContig = pAlignments.get(j).getRight(); - - for (int i = 0; i < parentContig.length(); i++) { - if (sb.charAt(i) == 'v') { - if (childContig.charAt(i) != parentContig.charAt(i) && parentContig.charAt(i) != ' ') { - if (variantStart == -1) { - variantStart = i; - - if (i > 0 && (childContig.charAt(i) == '-' || parentContig.charAt(i) == '-')) { - variantStart--; - childAllele.append(childContig.charAt(i - 1)); - parentAllele.append(parentContig.charAt(i - 1)); - } - } - - if (childContig.charAt(i) != '-') { - childAllele.append(childContig.charAt(i)); - } - if (parentContig.charAt(i) != '-') { - parentAllele.append(parentContig.charAt(i)); - } - } else { - if (variantStart >= 0) { - variants.add(Triple.of(variantStart, childAllele.toString(), parentAllele.toString())); - - for (int k = variantStart - GRAPH.getKmerSize(); k < variantStart + childAllele.length() + GRAPH.getKmerSize(); k++) { - if (k >= 0 && k <= childContig.length() - GRAPH.getKmerSize()) { - //log.info("debug: {} {}", k, childContig.length()); - String qrs = childContig.substring(k, childContig.length() - 1); - qrs = qrs.replaceAll("-", ""); - if (qrs.length() > GRAPH.getKmerSize()) { - CanonicalKmer ck = new CanonicalKmer(qrs.substring(0, GRAPH.getKmerSize())); - - if (rois.contains(ck)) { - novelKmers.add(ck); - } - } - } - - } - - allNovelKmers.add(novelKmers); - - variantStart = -1; - childAllele = new StringBuilder(); - parentAllele = new StringBuilder(); - novelKmers = new HashSet<>(); - } - } - } - } - } - - int q = 0; - for (int j = 1; j < pAlignments.size() - 1; j++) { - q += pAlignments.get(j).getRight().length(); - variants.add(Triple.of(q, pAlignments.get(j).getLeft(), pAlignments.get(j + 1).getLeft())); - - for (int k = q - GRAPH.getKmerSize(); k < q + childAllele.length() + GRAPH.getKmerSize(); k++) { - if (k >= 0 && k <= childContig.length() - GRAPH.getKmerSize()) { - String qrs = childContig.substring(k, contig.length()).replaceAll("-", ""); - if (qrs.length() > GRAPH.getKmerSize()) { - CanonicalKmer ck = new CanonicalKmer(qrs.substring(0, GRAPH.getKmerSize())); - - if (rois.contains(ck)) { - novelKmers.add(ck); - } - } - } - } - - allNovelKmers.add(novelKmers); - } - - if (variantStart >= 0) { - variants.add(Triple.of(variantStart, childAllele.toString(), parentAllele.toString())); - - for (int k = variantStart - GRAPH.getKmerSize(); k < variantStart + childAllele.length() + GRAPH.getKmerSize(); k++) { - if (k >= 0 && k <= childContig.length() - GRAPH.getKmerSize()) { - String qrs = childContig.substring(k, contig.length()).replaceAll("-", ""); - if (qrs.length() > GRAPH.getKmerSize()) { - CanonicalKmer ck = new CanonicalKmer(qrs.substring(0, GRAPH.getKmerSize())); - - if (rois.contains(ck)) { - novelKmers.add(ck); - } - } - } - } - - allNovelKmers.add(novelKmers); - } - - for (int i = 0; i < variants.size(); i++) { - Triple variant = variants.get(i); - log.info(" {} {}", variant, allNovelKmers.get(i).size()); - - for (CanonicalKmer ck : allNovelKmers.get(i)) { - usedRois.get(ck).add(String.valueOf(variantIndex)); - } - - out.println(Joiner.on("\t").join(variantIndex, contigIndex, i, variant.getLeft(), variant.getMiddle(), variant.getRight(), Joiner.on(",").join(allNovelKmers.get(i)))); - - variantIndex++; - } - - for (CanonicalKmer ck : usedRois.keySet()) { - if (usedRois.get(ck).size() == 0) { - aout.println(ck + "\tNA"); - } else { - aout.println(ck + "\t" + Joiner.on(",").join(usedRois.get(ck))); - } - } - - /* - for (String refid : REFERENCE.keySet()) { - Map ref = new HashMap<>(); - ref.put(refid, REFERENCE.get(refid)); - - List> refIntervals = IntervalCombiner.getIntervals(ws, ref, 100, 10); - Map refTracks = loadSequences(refIntervals, ref); - List> rAlignments = ma.align(contig, refTracks); - log.info("\n{}", ma); - } - */ - } - } - } - - private List getParentalContig(int sampleColor, List ws, List> regions, Set css) { - List>> replacements = new ArrayList<>(); - - for (Pair r : regions) { - Set left = getLeft(sampleColor, ws, r, css); - Set right = getRight(sampleColor, ws, r, css); - - TraversalEngine ef = new TraversalEngineFactory() - .traversalColors(GRAPH.getColorsForSampleNames(css)) - .combinationOperator(OR) - .traversalDirection(FORWARD) - .graph(GRAPH) - .links(LINKS) - .stoppingRule(DestinationStopper.class) - .make(); - - TraversalEngine er = new TraversalEngineFactory() - .traversalColors(GRAPH.getColorsForSampleNames(css)) - .combinationOperator(OR) - .traversalDirection(REVERSE) - .graph(GRAPH) - .links(LINKS) - .stoppingRule(DestinationStopper.class) - .make(); - - DirectedWeightedPseudograph gf = ef.dfs(left, right); - DirectedWeightedPseudograph gr = er.dfs(right, left); - - for (DirectedWeightedPseudograph g : Arrays.asList(gf, gr)) { - if (g != null && g.vertexSet().size() > 0) { - Triple> replacement = getReplacement(ws, left, right, g); - - if (replacement != null) { - replacements.add(replacement); - break; - } - } - } - } - - List qss = new ArrayList<>(); - - Iterator>> rit = replacements.iterator(); - Triple> currentReplacement = rit.hasNext() ? rit.next() : null; - for (int i = 0; i < ws.size(); i++) { - if (currentReplacement == null || i < currentReplacement.getLeft()) { - qss.add(ws.get(i)); - } else if (i == currentReplacement.getLeft()) { - qss.addAll(currentReplacement.getRight()); - i = currentReplacement.getMiddle(); - - currentReplacement = rit.hasNext() ? rit.next() : null; - } - } - - return qss; - } - - private Triple> getReplacement(List ws, Set left, Set right, DirectedWeightedPseudograph g) { - PathFinder pf = new PathFinder(g, g.edgeSet().iterator().next().getColor()); - - for (String before : left) { - for (String after : right) { - List> gps = pf.getPaths(TraversalUtils.findVertex(g, before), TraversalUtils.findVertex(g, after)); - - for (GraphPath gp : gps) { - int start = 0, end = ws.size() - 1; - for (int i = 0; i < ws.size(); i++) { - if (ws.get(i).getKmerAsString().equals(before)) { start = i; } - if (ws.get(i).getKmerAsString().equals(after)) { end = i; } - } - - return Triple.of(start, end, gp.getVertexList()); - } - } - } - - return null; - } - - private Set getRight(int sampleColor, List ws, Pair r, Set css) { - Set rightKmers = new LinkedHashSet<>(); - - int right = 0; - for (right = r.getSecond() + 1; right < ws.size() - 2 && right <= r.getSecond() + 50; right++) { - Map> cbks = TraversalUtils.getAllPrevKmers(ws.get(right).getCortexRecord(), !ws.get(right).getKmerAsString().equals(ws.get(right).getCanonicalKmer().getKmerAsString())); - - for (int c : GRAPH.getColorsForSampleNames(css)) { - if (cbks.get(c).size() > 0 && !cbks.get(c).equals(cbks.get(sampleColor))) { - rightKmers.add(ws.get(right).getKmerAsString()); - } - } - } - - return rightKmers; - } - - private Set getLeft(int sampleColor, List ws, Pair r, Set css) { - Set leftKmers = new LinkedHashSet<>(); - - int left = 0; - for (left = r.getFirst() - 1; left > 0 && left >= r.getFirst() - 50; left--) { - Map> cbks = TraversalUtils.getAllNextKmers(ws.get(left).getCortexRecord(), !ws.get(left).getKmerAsString().equals(ws.get(left).getCanonicalKmer().getKmerAsString())); - - for (int c : GRAPH.getColorsForSampleNames(css)) { - if (cbks.get(c).size() > 0 && !cbks.get(c).equals(cbks.get(sampleColor))) { - leftKmers.add(ws.get(left).getKmerAsString()); - } - } - } - - return leftKmers; - } - - @NotNull - private Map loadSequences(List> bfParents, Map refs) { - Map targets = new LinkedHashMap<>(); - for (Pair l : bfParents) { - Interval it = l.getSecond(); - - String seq = refs.get(l.getFirst()).find(it); - String targetName = l.getFirst() + ":" + it.getContig() + ":" + it.getStart() + "-" + it.getEnd() + ":" + (it.isPositiveStrand() ? "+" : "-"); - targets.put(targetName, seq); - } - return targets; - } - - private List> call(String[] al) { - List> calls = new ArrayList<>(); - - StringBuilder cAllele = new StringBuilder(); - StringBuilder pAllele = new StringBuilder(); - - for (int i = 0; i < al[0].length(); i++) { - if (al[0].charAt(i) == al[1].charAt(i)) { - if (cAllele.length() > 0 || pAllele.length() > 0) { - String cStr = cAllele.toString().replaceAll("-", ""); - String pStr = pAllele.toString().replaceAll("-", ""); - - calls.add(Triple.of(i - cAllele.length(), cStr, pStr)); - - cAllele = new StringBuilder(); - pAllele = new StringBuilder(); - } - } else { - cAllele.append(al[0].charAt(i)); - pAllele.append(al[1].charAt(i)); - } - } - - if (cAllele.length() > 0 || pAllele.length() > 0) { - String cStr = cAllele.toString().replaceAll("-", ""); - String pStr = pAllele.toString().replaceAll("-", ""); - - calls.add(Triple.of(al[0].length() - cAllele.length(), cStr, pStr)); - } - - return calls; - } - - private List subsetContig(Set rois, List w, int window) { - List> regions = getRegions(rois, w); - - int subcontigStart = regions.get(0).getFirst() - window; - if (subcontigStart < 0) { subcontigStart = 0; } - - int subcontigStop = regions.get(regions.size() - 1).getSecond() + window; - if (subcontigStop >= w.size()) { subcontigStop = w.size() - 1; } - - List ws = new ArrayList<>(); - for (int i = subcontigStart; i <= subcontigStop; i++) { - ws.add(w.get(i)); - } - - return ws; - } - - @NotNull - private List> getRegions(Set rois, List cvs) { - List> regions = new ArrayList<>(); - - int regionStart = -1; - int regionStop = 0; - for (int i = 0; i < cvs.size(); i++) { - CanonicalKmer currentKmer = cvs.get(i).getCanonicalKmer(); - - if (rois.contains(currentKmer)) { - if (regionStart == -1) { regionStart = i; } - regionStop = i; - } else { - if (regionStart > -1) { - regions.add(new Pair<>(regionStart, regionStop)); - - regionStart = -1; - regionStop = 0; - } - } - } - - if (regionStart > -1) { - regions.add(new Pair<>(regionStart, regionStop)); - } - - return regions; - } - - private List getWalk(HTreeMap contigsMap, int contigIndex) { - List cvs = new ArrayList<>(); - - Object[] oa = contigsMap.get(contigIndex); - - for (Object o : oa) { - cvs.add((CortexVertex) o); - } - - return cvs; - } - - /* - private HTreeMap initializeDbReadsMap(DB dbIn, boolean firstEnd) { - HTreeMap dbReads = dbIn - .hashMap("readsEnd" + (firstEnd ? "1" : "2")) - .keySerializer(Serializer.INTEGER) - .valueSerializer(new SerializerArray(Serializer.JAVA)) - .counterEnable() - .open(); - - return dbReads; - } - */ - - @NotNull - private HTreeMap initializeDbContigsMap(DB dbIn) { - return (HTreeMap) dbIn - .hashMap("contigs") - .keySerializer(Serializer.INTEGER) - .valueSerializer(new SerializerArray(Serializer.JAVA)) - .counterEnable() - .open(); - } - - private Set loadRois(CortexGraph rois) { - Set roiSet = new HashSet<>(); - - for (CortexRecord rr : rois) { - roiSet.add(rr.getCanonicalKmer()); - } - - return roiSet; - } -} diff --git a/public/java/src/uk/ac/ox/well/cortexjdk/commands/discover/call/FindPaths.java b/public/java/src/uk/ac/ox/well/cortexjdk/commands/discover/call/FindPaths.java index 5a545862..f7d47ade 100644 --- a/public/java/src/uk/ac/ox/well/cortexjdk/commands/discover/call/FindPaths.java +++ b/public/java/src/uk/ac/ox/well/cortexjdk/commands/discover/call/FindPaths.java @@ -80,171 +80,193 @@ public void execute() { List ws = section.getRight(); String query = TraversalUtils.toContig(ws); - Map targets = new HashMap<>(); List> regions = getRegions(rois, ws); + Map targets = new HashMap<>(); for (Set parentName : Arrays.asList(MOTHER, FATHER)) { - DirectedWeightedPseudograph g = new DirectedWeightedPseudograph<>(CortexEdge.class); - List> walks = new ArrayList<>(); - List contigs = new ArrayList<>(); + DirectedWeightedPseudograph g = assembleSections(rois, ws, regions, parentName); - for (int i = 0; i < regions.size(); i++) { - int lastEnd = i == 0 ? 0 : regions.get(i-1).getSecond(); - int nextStart = i == regions.size() - 1 ? ws.size() - 1 : regions.get(i+1).getFirst(); + addToTargetList(targets, parentName, g); + } - DirectedWeightedPseudograph gl = assembleLeft(rois, parentName, ws, regions.get(i), lastEnd); - DirectedWeightedPseudograph gr = assembleRight(rois, parentName, ws, regions.get(i), nextStart); + if (targets.size() > 0) { + MosaicAligner ma = new MosaicAligner(); + List, String>> lps = ma.align(query, targets); - Graphs.addGraph(g, gl); - Graphs.addGraph(g, gr); + String noveltyTrack = getNoveltyTrack(rois, query, lps); - List wl = gl.vertexSet().size() == 0 ? new ArrayList<>() : TraversalUtils.toWalk(gl, gl.vertexSet().iterator().next().getKmerAsString(), gl.edgeSet().iterator().next().getColor()); - List wr = gr.vertexSet().size() == 0 ? new ArrayList<>() : TraversalUtils.toWalk(gr, gr.vertexSet().iterator().next().getKmerAsString(), gr.edgeSet().iterator().next().getColor()); + log.info("\n{}\n{}", noveltyTrack, ma); + } + } + } + } - if (wl.size() > 0 && !contigs.contains(TraversalUtils.toContig(wl))) { - walks.add(wl); - contigs.add(TraversalUtils.toContig(wl)); - } + private void addToTargetList(Map targets, Set parentName, DirectedWeightedPseudograph g) { + ConnectivityInspector ci = new ConnectivityInspector<>(g); + int index = 0; - if (wr.size() > 0 && !contigs.contains(TraversalUtils.toContig(wr))) { - walks.add(wr); - contigs.add(TraversalUtils.toContig(wr)); - } - } + for (Set s : ci.connectedSets()) { + String name = String.format("%s:contig%d", parentName.iterator().next(), index); + index++; - for (int i = 0; i < walks.size() - 1; i++) { - DirectedWeightedPseudograph gm = new DirectedWeightedPseudograph<>(CortexEdge.class); - for (int j = i + 1; j < walks.size() && gm.vertexSet().size() == 0; j++) { - gm = assembleMiddle(parentName, walks.get(i), walks.get(j)); - } + List l = extractContig(parentName, g, s); + + if (l.size() > 0) { + labelTargets(targets, parentName, name, l); + } + } + } - Graphs.addGraph(g, gm); + private String getNoveltyTrack(Set rois, String query, List, String>> lps) { + int maxLength = 0; + for (Triple, String> lp : lps) { + String name = String.format("%s (%d-%d)", lp.getLeft(), lp.getMiddle().getFirst(), lp.getMiddle().getSecond()); + maxLength = Math.max(maxLength, name.length()); + } - TraversalEngine e = new TraversalEngineFactory() - .traversalColors(GRAPH.getColorsForSampleNames(parentName)) - .traversalDirection(BOTH) - .combinationOperator(OR) - .stoppingRule(ContigStopper.class) - .maxBranchLength(2*GRAPH.getKmerSize()) - .graph(GRAPH) - .links(LINKS) - .make(); + StringBuilder sb = new StringBuilder(StringUtil.repeatCharNTimes(' ', query.length())); + for (int i = 0; i <= query.length() - GRAPH.getKmerSize(); i++) { + CanonicalKmer ck = new CanonicalKmer(query.substring(i, i + GRAPH.getKmerSize())); - DirectedWeightedPseudograph gb = e.dfs(walks.get(i).get(0).getKmerAsString()); - if (gb != null) { Graphs.addGraph(g, gb); } + if (rois.contains(ck)) { + for (int j = i; j <= i + GRAPH.getKmerSize(); j++) { + sb.setCharAt(j, '*'); + } + } + } - DirectedWeightedPseudograph gf = e.dfs(walks.get(i).get(walks.get(i).size() - 1).getKmerAsString()); - if (gf != null) { Graphs.addGraph(g, gf); } - } + for (int i = 0; i < lps.get(0).getRight().length(); i++) { + if (lps.get(0).getRight().charAt(i) == '-') { + sb.insert(i, sb.charAt(i) == '*' ? '*' : ' '); + } + } + + return String.format("%" + maxLength + "s %s", "novel", sb.toString()); + } + + @NotNull + private List extractContig(Set parentName, DirectedWeightedPseudograph g, Set s) { + List l = TraversalUtils.toWalk(g, s.iterator().next().getKmerAsString(), g.edgeSet().iterator().next().getColor()); + + if (l.size() == 0 && s.size() > 0) { + TraversalEngine e = new TraversalEngineFactory() + .traversalColors(GRAPH.getColorsForSampleNames(parentName)) + .combinationOperator(OR) + .traversalDirection(BOTH) + .stoppingRule(ContigStopper.class) + .maxBranchLength(2*s.size()) + .graph(GRAPH) + .links(LINKS) + .make(); + + for (CortexVertex v : s) { + List ll = e.walk(v.getKmerAsString()); + + if (ll.size() > l.size()) { + l = ll; + } + } + } + return l; + } + + private void labelTargets(Map targets, Set parentName, String name, List l) { + String cl = TraversalUtils.toContig(l); - ConnectivityInspector ci = new ConnectivityInspector<>(g); - int index = 0; - - for (Set s : ci.connectedSets()) { - String name = String.format("%s:contig%d", parentName.iterator().next(), index); - List l = TraversalUtils.toWalk(g, s.iterator().next().getKmerAsString(), g.edgeSet().iterator().next().getColor()); - - if (l.size() == 0 && s.size() > 0) { - TraversalEngine e = new TraversalEngineFactory() - .traversalColors(GRAPH.getColorsForSampleNames(parentName)) - .combinationOperator(OR) - .traversalDirection(BOTH) - .stoppingRule(ContigStopper.class) - .maxBranchLength(2*s.size()) - .graph(GRAPH) - .links(LINKS) - .make(); - - for (CortexVertex v : s) { - List ll = e.walk(v.getKmerAsString()); - - if (ll.size() > l.size()) { - l = ll; - } + if (BACKGROUNDS != null) { + for (String pn : parentName) { + if (BACKGROUNDS.containsKey(pn)) { + List srs = BACKGROUNDS.get(pn).align(cl); + + if (srs.size() > 0) { + srs.sort((s1, s2) -> { + int s1length = s1.getAlignmentEnd() - s1.getAlignmentStart(); + int nm1 = s1.getIntegerAttribute("NM"); + + int s2length = s2.getAlignmentEnd() - s2.getAlignmentStart(); + int nm2 = s2.getIntegerAttribute("NM"); + + if (s1length != s2length) { + return s1length > s2length ? -1 : 1; } - } - if (l.size() > 0) { - String cl = TraversalUtils.toContig(l); - - if (BACKGROUNDS != null) { - for (String pn : parentName) { - if (BACKGROUNDS.containsKey(pn)) { - List srs = BACKGROUNDS.get(pn).align(cl); - - if (srs.size() > 0) { - srs.sort((s1, s2) -> { - int s1length = s1.getAlignmentEnd() - s1.getAlignmentStart(); - int nm1 = s1.getIntegerAttribute("NM"); - - int s2length = s2.getAlignmentEnd() - s2.getAlignmentStart(); - int nm2 = s2.getIntegerAttribute("NM"); - - if (s1length != s2length) { - return s1length > s2length ? -1 : 1; - } - - if (nm1 != nm2) { - return nm1 < nm2 ? -1 : 1; - } - - return 0; - }); - - SAMRecord sr = srs.get(0); - Interval it = new Interval(sr.getReferenceName(), sr.getAlignmentStart(), sr.getAlignmentEnd(), sr.getReadNegativeStrandFlag(), sr.getCigarString()); - - String seq = BACKGROUNDS.get(pn).find(it); - targets.put(String.format("%s:%s:%d-%d:%s:%s", pn, it.getContig(), it.getStart(), it.getEnd(), it.isPositiveStrand() ? "+" : "-", it.getName()), seq); - } else { - targets.put(name, cl); - } - } - } - } else { - targets.put(name, cl); + if (nm1 != nm2) { + return nm1 < nm2 ? -1 : 1; } - index++; - } - } + return 0; + }); - if (index == 0) { + SAMRecord sr = srs.get(0); + Interval it = new Interval(sr.getReferenceName(), sr.getAlignmentStart(), sr.getAlignmentEnd(), sr.getReadNegativeStrandFlag(), sr.getCigarString()); + String seq = BACKGROUNDS.get(pn).find(it); + targets.put(String.format("%s:%s:%d-%d:%s:%s", pn, it.getContig(), it.getStart(), it.getEnd(), it.isPositiveStrand() ? "+" : "-", it.getName()), seq); + } else { + targets.put(name, cl); } } + } + } else { + targets.put(name, cl); + } + } - if (targets.size() > 0) { - MosaicAligner ma = new MosaicAligner(); - List, String>> lps = ma.align(query, targets); + @NotNull + private DirectedWeightedPseudograph assembleSections(Set rois, List ws, List> regions, Set parentName) { + DirectedWeightedPseudograph g = new DirectedWeightedPseudograph<>(CortexEdge.class); + List> walks = new ArrayList<>(); + List contigs = new ArrayList<>(); - int maxLength = 0; - for (Triple, String> lp : lps) { - String name = String.format("%s (%d-%d)", lp.getLeft(), lp.getMiddle().getFirst(), lp.getMiddle().getSecond()); - maxLength = Math.max(maxLength, name.length()); - } + for (int i = 0; i < regions.size(); i++) { + int lastEnd = i == 0 ? 0 : regions.get(i-1).getSecond(); + int nextStart = i == regions.size() - 1 ? ws.size() - 1 : regions.get(i+1).getFirst(); - StringBuilder sb = new StringBuilder(StringUtil.repeatCharNTimes(' ', query.length())); - for (int i = 0; i <= query.length() - GRAPH.getKmerSize(); i++) { - CanonicalKmer ck = new CanonicalKmer(query.substring(i, i + GRAPH.getKmerSize())); + DirectedWeightedPseudograph gl = assembleLeft(rois, parentName, ws, regions.get(i), lastEnd); + DirectedWeightedPseudograph gr = assembleRight(rois, parentName, ws, regions.get(i), nextStart); - if (rois.contains(ck)) { - for (int j = i; j <= i + GRAPH.getKmerSize(); j++) { - sb.setCharAt(j, '*'); - } - } - } + Graphs.addGraph(g, gl); + Graphs.addGraph(g, gr); - for (int i = 0; i < lps.get(0).getRight().length(); i++) { - if (lps.get(0).getRight().charAt(i) == '-') { - sb.insert(i, sb.charAt(i) == '*' ? '*' : ' '); - } - } + List wl = gl.vertexSet().size() == 0 ? new ArrayList<>() : TraversalUtils.toWalk(gl, gl.vertexSet().iterator().next().getKmerAsString(), gl.edgeSet().iterator().next().getColor()); + List wr = gr.vertexSet().size() == 0 ? new ArrayList<>() : TraversalUtils.toWalk(gr, gr.vertexSet().iterator().next().getKmerAsString(), gr.edgeSet().iterator().next().getColor()); - log.info("\n{} {}\n{}", String.format("%" + maxLength + "s", "novel"), sb.toString(), ma); - } + if (wl.size() > 0 && !contigs.contains(TraversalUtils.toContig(wl))) { + walks.add(wl); + contigs.add(TraversalUtils.toContig(wl)); } + + if (wr.size() > 0 && !contigs.contains(TraversalUtils.toContig(wr))) { + walks.add(wr); + contigs.add(TraversalUtils.toContig(wr)); + } + } + + for (int i = 0; i < walks.size() - 1; i++) { + DirectedWeightedPseudograph gm = new DirectedWeightedPseudograph<>(CortexEdge.class); + for (int j = i + 1; j < walks.size() && gm.vertexSet().size() == 0; j++) { + gm = assembleMiddle(parentName, walks.get(i), walks.get(j)); + } + + Graphs.addGraph(g, gm); + + TraversalEngine e = new TraversalEngineFactory() + .traversalColors(GRAPH.getColorsForSampleNames(parentName)) + .traversalDirection(BOTH) + .combinationOperator(OR) + .stoppingRule(ContigStopper.class) + .maxBranchLength(2*GRAPH.getKmerSize()) + .graph(GRAPH) + .links(LINKS) + .make(); + + DirectedWeightedPseudograph gb = e.dfs(walks.get(i).get(0).getKmerAsString()); + if (gb != null) { Graphs.addGraph(g, gb); } + + DirectedWeightedPseudograph gf = e.dfs(walks.get(i).get(walks.get(i).size() - 1).getKmerAsString()); + if (gf != null) { Graphs.addGraph(g, gf); } } + return g; } private DirectedWeightedPseudograph assembleMiddle(Set parentName, List wLeft, List wRight) { @@ -273,7 +295,7 @@ private DirectedWeightedPseudograph assembleMiddle(Set .traversalDirection(FORWARD) .combinationOperator(OR) .stoppingRule(DestinationStopper.class) - .maxBranchLength(1000) + .maxBranchLength(10000) .graph(GRAPH) .links(LINKS) .make(); @@ -283,7 +305,7 @@ private DirectedWeightedPseudograph assembleMiddle(Set .traversalDirection(REVERSE) .combinationOperator(OR) .stoppingRule(DestinationStopper.class) - .maxBranchLength(1000) + .maxBranchLength(10000) .graph(GRAPH) .links(LINKS) .make(); diff --git a/public/java/src/uk/ac/ox/well/cortexjdk/commands/utils/ToGfa1.java b/public/java/src/uk/ac/ox/well/cortexjdk/commands/utils/ToGfa1.java new file mode 100644 index 00000000..ede308ce --- /dev/null +++ b/public/java/src/uk/ac/ox/well/cortexjdk/commands/utils/ToGfa1.java @@ -0,0 +1,147 @@ +package uk.ac.ox.well.cortexjdk.commands.utils; + +import com.google.common.base.Joiner; +import htsjdk.samtools.reference.FastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequence; +import org.jgrapht.DirectedGraph; +import org.jgrapht.graph.DefaultDirectedGraph; +import org.jgrapht.graph.DefaultEdge; +import uk.ac.ox.well.cortexjdk.commands.Module; +import uk.ac.ox.well.cortexjdk.utils.arguments.Argument; +import uk.ac.ox.well.cortexjdk.utils.arguments.Output; +import uk.ac.ox.well.cortexjdk.utils.io.graph.cortex.CortexGraph; +import uk.ac.ox.well.cortexjdk.utils.io.graph.cortex.CortexRecord; +import uk.ac.ox.well.cortexjdk.utils.kmer.CanonicalKmer; +import uk.ac.ox.well.cortexjdk.utils.kmer.CortexByteKmer; +import uk.ac.ox.well.cortexjdk.utils.sequence.SequenceUtils; +import uk.ac.ox.well.cortexjdk.utils.traversal.TraversalUtils; + +import java.io.PrintStream; +import java.util.*; + +public class ToGfa1 extends Module { + @Argument(fullName="graph", shortName="g", doc="Cortex graph") + public CortexGraph GRAPH; + + @Argument(fullName="fasta", shortName="f", doc="Fasta (unitigs, or kmers, etc.)") + public FastaSequenceFile FASTA; + + @Argument(fullName="sampleName", shortName="s", doc="Sample name", required=false) + public String SAMPLE_NAME; + + @Output + public PrintStream out; + + @Override + public void execute() { + Map crs = new HashMap<>(); + for (CortexRecord cr : GRAPH) { + crs.put(cr.getCanonicalKmer(), cr); + } + + DirectedGraph g = new DefaultDirectedGraph<>(DefaultEdge.class); + Map beginningKmers = new HashMap<>(); + Map endingKmers = new HashMap<>(); + + Map seqNames = new HashMap<>(); + Map positiveStrand = new HashMap<>(); + + ReferenceSequence rseq; + int index = 0; + while ((rseq = FASTA.nextSequence()) != null) { + for (String v : Arrays.asList(rseq.getBaseString(), SequenceUtils.reverseComplement(rseq.getBaseString()))) { + g.addVertex(v); + + String beginningSk = v.substring(0, GRAPH.getKmerSize()); + String endingSk = v.substring(v.length() - GRAPH.getKmerSize(), v.length()); + + beginningKmers.put(beginningSk, v); + endingKmers.put(endingSk, v); + + seqNames.put(v, index); + } + + positiveStrand.put(rseq.getBaseString(), true); + positiveStrand.put(SequenceUtils.reverseComplement(rseq.getBaseString()), false); + + index++; + } + + int sampleColor = SAMPLE_NAME == null ? 0 : GRAPH.getColorForSampleName(SAMPLE_NAME); + + Map averageCoverages = new HashMap<>(); + + for (String v : g.vertexSet()) { + int cov = 0; + + for (int i = 0; i <= v.length() - GRAPH.getKmerSize(); i++) { + String sk = v.substring(i, i + GRAPH.getKmerSize()); + CanonicalKmer ck = new CanonicalKmer(sk); + + if (crs.containsKey(ck)) { + CortexRecord cr = crs.get(ck); + cov += cr.getCoverage(sampleColor); + } + } + + int avCov = (int) ((float) cov / (float) (v.length() - GRAPH.getKmerSize() + 1)); + + averageCoverages.put(v, avCov); + + String beginningSk = v.substring(0, GRAPH.getKmerSize()); + CortexRecord beginningCr = crs.getOrDefault(new CanonicalKmer(beginningSk), null); + + Set ins = TraversalUtils.getAllPrevKmers(beginningCr, !beginningCr.getCanonicalKmer().getKmerAsString().equals(beginningSk)).get(sampleColor); + for (CortexByteKmer cbk : ins) { + String sk = new String(cbk.getKmer()); + + if (endingKmers.containsKey(sk)) { + String v0 = endingKmers.get(sk); + + g.addEdge(v0, v); + } + } + + String endingSk = v.substring(v.length() - GRAPH.getKmerSize(), v.length()); + CortexRecord endingCr = crs.getOrDefault(new CanonicalKmer(endingSk), null); + + Set outs = TraversalUtils.getAllNextKmers(endingCr, !endingCr.getCanonicalKmer().getKmerAsString().equals(endingSk)).get(sampleColor); + for (CortexByteKmer cbk : outs) { + String sk = new String(cbk.getKmer()); + + if (beginningKmers.containsKey(sk)) { + String v1 = beginningKmers.get(sk); + + g.addEdge(v, v1); + } + } + } + + out.println(Joiner.on("\t").join("H", "VN:Z:1.0")); + + Set seen = new HashSet<>(); + for (String v : g.vertexSet()) { + int vid = seqNames.get(v); + if (!seen.contains(vid)) { + int avCov = averageCoverages.get(v); + int totCov = avCov * v.length(); + + out.println(Joiner.on("\t").join("S", vid, v, String.format("RC:i:%d", totCov), String.format("AC:i:%d", avCov))); + + seen.add(vid); + } + } + + for (DefaultEdge e : g.edgeSet()) { + String vs = g.getEdgeSource(e); + int vsid = seqNames.get(vs); + String vsStrand = positiveStrand.get(vs) ? ":" : "-"; + + String vt = g.getEdgeTarget(e); + int vtid = seqNames.get(vt); + String vtStrand = positiveStrand.get(vt) ? ":" : "-"; + + out.println(Joiner.on("\t").join("L", vsid, vsStrand, vtid, vtStrand, GRAPH.getKmerSize() + "M")); + } + } +} diff --git a/public/java/src/uk/ac/ox/well/cortexjdk/commands/discover/call/AnnotateReads.java b/public/java/src/uk/ac/ox/well/cortexjdk/playground/assemblies/AnnotateReads.java similarity index 99% rename from public/java/src/uk/ac/ox/well/cortexjdk/commands/discover/call/AnnotateReads.java rename to public/java/src/uk/ac/ox/well/cortexjdk/playground/assemblies/AnnotateReads.java index eb07754f..f918fe7d 100644 --- a/public/java/src/uk/ac/ox/well/cortexjdk/commands/discover/call/AnnotateReads.java +++ b/public/java/src/uk/ac/ox/well/cortexjdk/playground/assemblies/AnnotateReads.java @@ -1,4 +1,4 @@ -package uk.ac.ox.well.cortexjdk.commands.discover.call; +package uk.ac.ox.well.cortexjdk.playground.assemblies; import com.google.common.base.Joiner; import htsjdk.samtools.fastq.FastqRecord; diff --git a/public/java/src/uk/ac/ox/well/cortexjdk/utils/alignment/graph/PairedEndAlignmentInfo.java b/public/java/src/uk/ac/ox/well/cortexjdk/utils/alignment/graph/PairedEndAlignmentInfo.java deleted file mode 100644 index dca7f439..00000000 --- a/public/java/src/uk/ac/ox/well/cortexjdk/utils/alignment/graph/PairedEndAlignmentInfo.java +++ /dev/null @@ -1,22 +0,0 @@ -package uk.ac.ox.well.cortexjdk.utils.alignment.graph; - -import java.util.Map; -import java.util.Set; - -public class PairedEndAlignmentInfo { - private SingleEndAlignmentInfo sai1; - private SingleEndAlignmentInfo sai2; - private Map> dists; - - public PairedEndAlignmentInfo(SingleEndAlignmentInfo sai1, SingleEndAlignmentInfo sai2, Map> dists) { - this.sai1 = sai1; - this.sai2 = sai2; - this.dists = dists; - } - - public SingleEndAlignmentInfo getFirstEndAlignment() { return sai1; } - - public SingleEndAlignmentInfo getSecondEndAlignment() { return sai2; } - - public Map> getAlignmentDistanceMap() { return dists; } -} diff --git a/public/java/src/uk/ac/ox/well/cortexjdk/utils/alignment/graph/ReadToGraphAligner.java b/public/java/src/uk/ac/ox/well/cortexjdk/utils/alignment/graph/ReadToGraphAligner.java deleted file mode 100644 index c572c193..00000000 --- a/public/java/src/uk/ac/ox/well/cortexjdk/utils/alignment/graph/ReadToGraphAligner.java +++ /dev/null @@ -1,295 +0,0 @@ -package uk.ac.ox.well.cortexjdk.utils.alignment.graph; - -import htsjdk.samtools.fastq.FastqRecord; -import org.apache.commons.math3.util.Pair; -import org.jgrapht.GraphPath; -import org.jgrapht.Graphs; -import org.jgrapht.graph.DirectedWeightedPseudograph; -import uk.ac.ox.well.cortexjdk.utils.io.graph.cortex.CortexGraph; -import uk.ac.ox.well.cortexjdk.utils.io.graph.cortex.CortexRecord; -import uk.ac.ox.well.cortexjdk.utils.io.graph.links.CortexLinks; -import uk.ac.ox.well.cortexjdk.utils.kmer.CanonicalKmer; -import uk.ac.ox.well.cortexjdk.utils.sequence.SequenceUtils; -import uk.ac.ox.well.cortexjdk.utils.stoppingrules.PairedReadClosingStopper; -import uk.ac.ox.well.cortexjdk.utils.traversal.*; - -import java.util.*; - -import static uk.ac.ox.well.cortexjdk.utils.traversal.TraversalEngineConfiguration.GraphCombinationOperator.OR; -import static uk.ac.ox.well.cortexjdk.utils.traversal.TraversalEngineConfiguration.TraversalDirection.BOTH; - -public class ReadToGraphAligner { - private CortexGraph graph; - private List links; - private TraversalEngine[] engines; - private List colors = new ArrayList<>(); - private Map, Set>> cachedDists = new HashMap<>(); - - private DirectedWeightedPseudograph sg = new DirectedWeightedPseudograph<>(CortexEdge.class); - - public ReadToGraphAligner(CortexGraph g, ArrayList l, int... colors) { - this.graph = g; - this.links = l; - - this.engines = new TraversalEngine[g.getNumColors()]; - for (int c : colors) { - this.colors.add(c); - - engines[c] = new TraversalEngineFactory() - .traversalColors(c) - .traversalDirection(BOTH) - .combinationOperator(OR) - .maxBranchLength(5000) - //.connectAllNeighbors(true) - .secondaryColors(colors) - .stoppingRule(PairedReadClosingStopper.class) - .graph(graph) - .links(links) - .make(); - - cachedDists.put(c, new HashMap<>()); - } - } - - public ReadToGraphAligner(CortexGraph g, ArrayList l, Collection colors) { - this.graph = g; - this.links = l; - - this.engines = new TraversalEngine[g.getNumColors()]; - for (int c : colors) { - this.colors.add(c); - - engines[c] = new TraversalEngineFactory() - .traversalColors(c) - .traversalDirection(BOTH) - .combinationOperator(OR) - .maxBranchLength(5000) - //.connectAllNeighbors(true) - .secondaryColors(colors) - .stoppingRule(PairedReadClosingStopper.class) - .graph(graph) - .links(links) - .make(); - - cachedDists.put(c, new HashMap<>()); - } - } - - public SingleEndAlignmentInfo align(FastqRecord fq) { - String[] sks = new String[fq.length() - graph.getKmerSize() + 1]; - CortexRecord[] crs = new CortexRecord[fq.length() - graph.getKmerSize() + 1]; - - for (int i = 0; i <= fq.length() - graph.getKmerSize(); i++) { - byte[] bk = fq.getReadString().substring(i, i + graph.getKmerSize()).getBytes(); - CortexRecord cr = graph.findRecord(bk); - - sks[i] = new String(bk); - crs[i] = cr; - } - - return new SingleEndAlignmentInfo(fq, sks, crs); - } - - public PairedEndAlignmentInfo align2(FastqRecord fq1, FastqRecord fq2) { - Set cks1 = new HashSet<>(); - Set cks2 = new HashSet<>(); - - for (int i = 0; i <= fq1.length() - graph.getKmerSize(); i++) { - cks1.add(fq1.getReadString().substring(i, i + graph.getKmerSize())); - } - - for (int i = 0; i <= fq2.length() - graph.getKmerSize(); i++) { - cks2.add(fq2.getReadString().substring(i, i + graph.getKmerSize())); - } - - for (int c : colors) { - } - - return null; - } - - public PairedEndAlignmentInfo align(FastqRecord fq1, FastqRecord fq2) { - SingleEndAlignmentInfo sai1 = align(fq1); - SingleEndAlignmentInfo sai2 = align(fq2); - - Map distFrom5pEnd = new HashMap<>(); - for (int i = 0; i < sai1.getRecords().length; i++) { - if (sai1.getRecords()[i] != null) { - distFrom5pEnd.put(sai1.getRecords()[i].getCanonicalKmer(), i); - } - } - - for (int i = sai2.getRecords().length - 1; i >= 0; i--) { - if (sai2.getRecords()[i] != null) { - distFrom5pEnd.put(sai2.getRecords()[i].getCanonicalKmer(), i); - } - } - - Map> allDists = new HashMap<>(); - - Map, List>>> cachedPaths = new HashMap<>(); - for (int q = 0; q < colors.size(); q++) { - cachedPaths.put(colors.get(q), new HashMap<>()); - } - - DirectedWeightedPseudograph g = null; - - for (int q = 0; q < colors.size(); q++) { - int c = colors.get(q); - allDists.put(c, new HashSet<>()); - - List k1 = sai1.getAvailableKmers(c); - List k2 = reverseComplement(sai2.getAvailableKmers(c)); - - if (k1.size() > 0 && k2.size() > 0) { - String sk1 = k1.get(k1.size() - 1); - String sk2 = k2.get(0); - - CanonicalKmer ck1 = new CanonicalKmer(sk1); - CanonicalKmer ck2 = new CanonicalKmer(sk2); - - Pair key = ck1.compareTo(ck2) <= 0 ? new Pair<>(ck1, ck2) : new Pair<>(ck2, ck1); - - if (!cachedDists.get(c).containsKey(key)) { - if (g == null) { - TraversalEngine ef = new TraversalEngineFactory().configuration(engines[c].getConfiguration()).make(); - g = ef.dfs(sk1); - - if (g == null) { - TraversalEngine eb = new TraversalEngineFactory().configuration(engines[c].getConfiguration()).make(); - g = eb.dfs(sk2); - } - } else { - Set newOuts = new HashSet<>(); - Set newIns = new HashSet<>(); - - for (CortexVertex v : g.vertexSet()) { - boolean isNotFlipped = v.getKmerAsString().equals(v.getCanonicalKmer().getKmerAsString()); - - Collection remainingInEdges = isNotFlipped ? v.getCortexRecord().getInEdgesAsStrings(c, false) : v.getCortexRecord().getOutEdgesAsStrings(c, true); - Collection remainingOutEdges = isNotFlipped ? v.getCortexRecord().getOutEdgesAsStrings(c, false) : v.getCortexRecord().getInEdgesAsStrings(c, true); - for (int r = 0; r < q; r++) { - Collection prevInEdges = isNotFlipped ? v.getCortexRecord().getInEdgesAsStrings(colors.get(r), false) : v.getCortexRecord().getOutEdgesAsStrings(colors.get(r), true); - Collection prevOutEdges = isNotFlipped ? v.getCortexRecord().getOutEdgesAsStrings(colors.get(r), false) : v.getCortexRecord().getInEdgesAsStrings(colors.get(r), true); - - remainingInEdges.removeAll(prevInEdges); - remainingOutEdges.removeAll(prevOutEdges); - } - - for (String roe : remainingOutEdges) { - newOuts.add(v.getKmerAsString().substring(1, v.getKmerAsString().length()) + roe); - } - - for (String rie : remainingInEdges) { - newIns.add(rie + v.getKmerAsString().substring(0, v.getKmerAsString().length() - 1)); - } - } - - for (String newOut : newOuts) { - TraversalEngine ef = new TraversalEngineFactory().configuration(engines[c].getConfiguration()).make(); - DirectedWeightedPseudograph g1 = ef.dfs(newOut); - - if (g1 != null) { - Graphs.addGraph(g, g1); - } - } - - for (String newIn : newIns) { - TraversalEngine ef = new TraversalEngineFactory().configuration(engines[c].getConfiguration()).make(); - DirectedWeightedPseudograph g1 = ef.dfs(newIn); - - if (g1 != null) { - Graphs.addGraph(g, g1); - } - } - } - - if (g != null) { - CortexVertex vSource = TraversalUtils.findVertex(g, ck1); - CortexVertex vSink = TraversalUtils.findVertex(g, ck2); - - if (vSource != null && vSink != null) { - List> ps = cachedPaths.get(c).containsKey(key) ? cachedPaths.get(c).get(key) : new PathFinder(g, c).getPaths(vSource, vSink); - - for (int qc : colors) { - boolean isIdentical = true; - if (qc != c) { - for (CortexVertex v : g.vertexSet()) { - if (v.getCortexRecord().getEdges()[qc] != v.getCortexRecord().getEdges()[c]) { - isIdentical = false; - break; - } - } - } - - if (isIdentical) { - cachedPaths.get(c).put(key, ps); - } - } - - Set d = new TreeSet<>(); - for (GraphPath p : ps) { - d.add(p.getLength()); - } - - cachedDists.get(c).put(key, d); - - for (int i = 0; i < sai1.getRecords().length; i++) { - for (int j = sai2.getRecords().length - 1; j >= 0; j--) { - if (sai1.getRecords()[i] != null && sai2.getRecords()[j] != null) { - CanonicalKmer tck1 = sai1.getRecords()[i].getCanonicalKmer(); - CanonicalKmer tck2 = sai2.getRecords()[j].getCanonicalKmer(); - - if (!(ck1.equals(tck1) && ck2.equals(tck2))) { - Pair newkey = tck1.compareTo(tck2) <= 0 ? new Pair<>(tck1, tck2) : new Pair<>(tck2, tck1); - - if (!cachedDists.get(c).containsKey(newkey)) { - cachedDists.get(c).put(newkey, new TreeSet<>()); - } - - for (int di : d) { - cachedDists.get(c).get(newkey).add(di + distFrom5pEnd.get(key.getFirst()) + distFrom5pEnd.get(key.getSecond()) - distFrom5pEnd.get(newkey.getFirst()) - distFrom5pEnd.get(newkey.getSecond())); - } - } - } - } - } - } - } - } - - if (cachedDists.get(c).containsKey(key)) { - Set dists = cachedDists.get(c).get(key); - - for (Integer dist : dists) { - allDists.get(c).add(dist + distFrom5pEnd.get(key.getFirst()) + distFrom5pEnd.get(key.getSecond()) + fq2.length() - 1); - } - } - } - } - - return new PairedEndAlignmentInfo(sai1, sai2, allDists); - } - - private Set filterEdges(Set edges, int color) { - Set filteredEdges = new HashSet<>(); - for (CortexEdge e : edges) { - if (e.getColor() == color) { - filteredEdges.add(e); - } - } - - return filteredEdges; - } - - private List reverseComplement(List kmers) { - List revComps = new ArrayList<>(); - - for (int i = kmers.size() - 1; i >= 0; i--) { - String kmer = kmers.get(i); - revComps.add(SequenceUtils.reverseComplement(kmer)); - } - - return revComps; - } -} diff --git a/public/java/src/uk/ac/ox/well/cortexjdk/utils/alignment/graph/SingleEndAlignmentInfo.java b/public/java/src/uk/ac/ox/well/cortexjdk/utils/alignment/graph/SingleEndAlignmentInfo.java deleted file mode 100644 index 58e36f47..00000000 --- a/public/java/src/uk/ac/ox/well/cortexjdk/utils/alignment/graph/SingleEndAlignmentInfo.java +++ /dev/null @@ -1,164 +0,0 @@ -package uk.ac.ox.well.cortexjdk.utils.alignment.graph; - -import htsjdk.samtools.fastq.FastqRecord; -import org.apache.commons.math3.util.Pair; -import uk.ac.ox.well.cortexjdk.utils.io.graph.cortex.CortexRecord; - -import java.util.*; - -public class SingleEndAlignmentInfo { - private FastqRecord fq; - private String[] sks; - private CortexRecord[] crs; - private int kmerSize; - - public SingleEndAlignmentInfo(FastqRecord fq, String[] sks, CortexRecord[] crs) { - this.fq = fq; - this.sks = sks; - this.crs = crs; - - for (CortexRecord cr : crs) { - if (cr != null) { - kmerSize = cr.getKmerSize(); - break; - } - } - } - - @Override - public String toString() { - return "SingleEndAlignmentInfo{" + - "fq=" + fq + - ", crs=" + Arrays.toString(crs) + - '}'; - } - - public CortexRecord[] getRecords() { return crs; } - - public FastqRecord getFastqRecord() { - return fq; - } - - public boolean isSplitRead(int baselineColor, int comparisonColor) { - for (int i = 0; i < crs.length; i++) { - if (crs[i] != null && crs[i].getCoverage(baselineColor) > 0 && crs[i].getCoverage(comparisonColor) == 0) { - int numMissingKmers = 1; - - int edgeConflictLeftIndex = i; - boolean edgeConflictLeft = false; - for (int j = i - 1; j >= 0; j--) { - if (crs[j] != null) { - if (crs[j].getCoverage(baselineColor) > 0 && crs[j].getCoverage(comparisonColor) == 0) { - numMissingKmers++; - } else if (crs[j].getEdges()[baselineColor] != crs[j].getEdges()[comparisonColor]) { - edgeConflictLeft = true; - edgeConflictLeftIndex = j; - break; - } - } - } - - int edgeConflictRightIndex = i; - boolean edgeConflictRight = false; - for (int j = i + 1; j < crs.length; j++) { - if (crs[j] != null) { - if (crs[j].getCoverage(baselineColor) > 0 && crs[j].getCoverage(comparisonColor) == 0) { - numMissingKmers++; - } else if (crs[j].getEdges()[baselineColor] != crs[j].getEdges()[comparisonColor]) { - edgeConflictRight = true; - edgeConflictRightIndex = j; - break; - } - } - } - - float missingPct = 100.0f * ((float) numMissingKmers / ((float) (edgeConflictRightIndex - edgeConflictLeftIndex - 1))); - - if (missingPct >= 80 && edgeConflictLeft && edgeConflictRight) { - return true; - } - } - } - - return false; - } - - public Pair splitRead(int baselineColor, int comparisonColor) { - for (int i = 0; i < crs.length; i++) { - if (crs[i] != null && crs[i].getCoverage(baselineColor) > 0 && crs[i].getCoverage(comparisonColor) == 0) { - int numMissingKmers = 1; - - int edgeConflictLeftIndex = i; - boolean edgeConflictLeft = false; - for (int j = i - 1; j >= 0; j--) { - if (crs[j] != null) { - if (crs[j].getCoverage(baselineColor) > 0 && crs[j].getCoverage(comparisonColor) == 0) { - numMissingKmers++; - } else if (crs[j].getEdges()[baselineColor] != crs[j].getEdges()[comparisonColor]) { - edgeConflictLeft = true; - edgeConflictLeftIndex = j; - break; - } - } - } - - int edgeConflictRightIndex = i; - boolean edgeConflictRight = false; - for (int j = i + 1; j < crs.length; j++) { - if (crs[j] != null) { - if (crs[j].getCoverage(baselineColor) > 0 && crs[j].getCoverage(comparisonColor) == 0) { - numMissingKmers++; - } else if (crs[j].getEdges()[baselineColor] != crs[j].getEdges()[comparisonColor]) { - edgeConflictRight = true; - edgeConflictRightIndex = j; - break; - } - } - } - - float missingPct = 100.0f * ((float) numMissingKmers / ((float) (edgeConflictRightIndex - edgeConflictLeftIndex - 1))); - - if (missingPct >= 80 && edgeConflictLeft && edgeConflictRight) { - StringBuilder end1 = new StringBuilder(); - - for (i = 0; i < edgeConflictLeftIndex; i++) { - String sk = fq.getReadString().substring(i, i + kmerSize); - - if (end1.length() == 0) { - end1.append(sk); - } else { - end1.append(sk.substring(sk.length() - 1, sk.length())); - } - } - - StringBuilder end2 = new StringBuilder(); - for (i = edgeConflictRightIndex; i < crs.length; i++) { - String sk = fq.getReadString().substring(i, i + kmerSize); - - if (end2.length() == 0) { - end2.append(sk); - } else { - end2.append(sk.substring(sk.length() - 1, sk.length())); - } - } - - return new Pair<>(end1.toString(), end2.toString()); - } - } - } - - return null; - } - - public List getAvailableKmers(int color) { - List kmers = new ArrayList<>(); - - for (int i = 0; i < sks.length; i++) { - if (crs[i] != null && crs[i].getCoverage(color) > 0 && crs[i].getInDegree(color) == 1 && crs[i].getOutDegree(color) == 1) { - kmers.add(sks[i]); - } - } - - return kmers; - } -} diff --git a/public/java/src/uk/ac/ox/well/cortexjdk/commands/discover/call/IntervalCombiner.java b/public/java/src/uk/ac/ox/well/cortexjdk/utils/alignment/intervalcombiner/IntervalCombiner.java similarity index 98% rename from public/java/src/uk/ac/ox/well/cortexjdk/commands/discover/call/IntervalCombiner.java rename to public/java/src/uk/ac/ox/well/cortexjdk/utils/alignment/intervalcombiner/IntervalCombiner.java index 46c19388..ad6dc5b0 100644 --- a/public/java/src/uk/ac/ox/well/cortexjdk/commands/discover/call/IntervalCombiner.java +++ b/public/java/src/uk/ac/ox/well/cortexjdk/utils/alignment/intervalcombiner/IntervalCombiner.java @@ -1,4 +1,4 @@ -package uk.ac.ox.well.cortexjdk.commands.discover.call; +package uk.ac.ox.well.cortexjdk.utils.alignment.intervalcombiner; import htsjdk.samtools.util.Interval; import htsjdk.samtools.util.IntervalTreeMap; diff --git a/public/java/src/uk/ac/ox/well/cortexjdk/utils/stoppingrules/DestinationStopper.java b/public/java/src/uk/ac/ox/well/cortexjdk/utils/stoppingrules/DestinationStopper.java index a912dd26..e858450c 100644 --- a/public/java/src/uk/ac/ox/well/cortexjdk/utils/stoppingrules/DestinationStopper.java +++ b/public/java/src/uk/ac/ox/well/cortexjdk/utils/stoppingrules/DestinationStopper.java @@ -12,6 +12,6 @@ public boolean hasTraversalSucceeded(TraversalState s) { @Override public boolean hasTraversalFailed(TraversalState s) { - return s.getCurrentJunctionDepth() > 5 || s.reachedMaxBranchLength(); + return s.getCurrentJunctionDepth() > 2 || s.reachedMaxBranchLength(); } } diff --git a/public/java/tests/uk/ac/ox/well/cortexjdk/utils/alignment/graph/TestReadToGraphAligner.java b/public/java/tests/uk/ac/ox/well/cortexjdk/utils/alignment/graph/TestReadToGraphAligner.java deleted file mode 100644 index dbf57080..00000000 --- a/public/java/tests/uk/ac/ox/well/cortexjdk/utils/alignment/graph/TestReadToGraphAligner.java +++ /dev/null @@ -1,83 +0,0 @@ -package uk.ac.ox.well.cortexjdk.utils.alignment.graph; - -import com.google.common.base.Joiner; -import htsjdk.samtools.fastq.FastqRecord; -import htsjdk.samtools.util.StringUtil; -import org.apache.commons.math3.util.Pair; -import org.testng.Assert; -import org.testng.annotations.Test; -import uk.ac.ox.well.cortexjdk.Main; -import uk.ac.ox.well.cortexjdk.utils.assembler.TempGraphAssembler; -import uk.ac.ox.well.cortexjdk.utils.io.graph.cortex.CortexGraph; -import uk.ac.ox.well.cortexjdk.utils.sequence.SequenceUtils; - -import java.util.*; - -public class TestReadToGraphAligner { - @Test - public void testSingleEndAlignment() throws Exception { - Map> haplotypes = new LinkedHashMap<>(); - haplotypes.put("mom", Arrays.asList("AGTTCGAATCTGGTCATGGTATTAGCGTATCGCCATCGATCAGGGCTATATGCT")); - haplotypes.put("kid", Arrays.asList("AGTTCGAATCTGGTCATGGTCGATCAGGGCTATATGCT", "AGTTCGAATCTGGTCATGGTATTAGCGTATCGCCATCGATCAGGGCTATATGCT")); - - CortexGraph g = TempGraphAssembler.buildGraph(haplotypes, 7); - ReadToGraphAligner ga = new ReadToGraphAligner(g, null); - - Map reads = new LinkedHashMap<>(); - reads.put("AGTTCGAATCTGGTCATGGTCGATCAGGGCTATATGCT", true); - reads.put("CGATCAGGGCTATATGCT", false); - reads.put("AGTTCGAATCTG", false); - reads.put("TCATGGTCGATCA", true); - reads.put("ATGGTATTA", false); - - for (String readSequence : reads.keySet()) { - FastqRecord fq = new FastqRecord("test", readSequence, null, StringUtil.repeatCharNTimes('I', readSequence.length())); - - SingleEndAlignmentInfo sai = ga.align(fq); - boolean isSplitRead = sai.isSplitRead(1, 0); - - Assert.assertEquals(isSplitRead, (boolean) reads.get(readSequence)); - } - } - - @Test - public void testPairedEndAlignment() throws Exception { - Map> haplotypes = new LinkedHashMap<>(); - haplotypes.put("mom", Arrays.asList( "AGTTCGAATCTGGTCATGGTATTAGCGTATCGCCATCGATCAGGGCTATATGCT")); - haplotypes.put("kid", Arrays.asList("AGTTCGAATCTGGTCATGGTCGATCAGGGCTATATGCT", "AGTTCGAATCTGGTCATGGTATTAGCGTATCGCCATCGATCAGGGCTATATGCT")); - - CortexGraph g = TempGraphAssembler.buildGraph(haplotypes, 7); - ReadToGraphAligner ga = new ReadToGraphAligner(g, null, 0, 1); - - Set> reads = new LinkedHashSet<>(); - reads.add(new Pair<>("AGTTCGAATCTGG", SequenceUtils.reverseComplement("GGGCTATATGCT"))); - reads.add(new Pair<>("AGCATATAGCCCT", SequenceUtils.reverseComplement("CAGATTCGAACT"))); - - for (Pair read : reads) { - FastqRecord fq1 = new FastqRecord("test", read.getFirst(), null, StringUtil.repeatCharNTimes('I', read.getFirst().length())); - FastqRecord fq2 = new FastqRecord("test", read.getSecond(), null, StringUtil.repeatCharNTimes('I', read.getSecond().length())); - - PairedEndAlignmentInfo pai = ga.align(fq1, fq2); - } - } - - @Test - public void testPairedEndAlignmentWithSV() throws Exception { - Map> haplotypes = new LinkedHashMap<>(); - haplotypes.put("mom", Arrays.asList("AGTTCGAATCTGGTCATGGTATTAGCGTATCGCCATCGATCAGGGCTATATGCT")); - haplotypes.put("kid", Arrays.asList("AGTTCGAATCTGGTCATGGTATTAGGGCTAAATAGTTCATGATTGCATGTAGTCATGTGCGCATGACTAGTGCTCCCATAGCGTATCGCCATCGATCAGGGCTATATGCT")); - - CortexGraph g = TempGraphAssembler.buildGraph(haplotypes, 11); - ReadToGraphAligner ga = new ReadToGraphAligner(g, null, 0, 1); - - Set> reads = new LinkedHashSet<>(); - reads.add(new Pair<>("AGTTCGAATCTGG", SequenceUtils.reverseComplement("GGGCTATATGCT"))); - - for (Pair read : reads) { - FastqRecord fq1 = new FastqRecord("test", read.getFirst(), null, StringUtil.repeatCharNTimes('I', read.getFirst().length())); - FastqRecord fq2 = new FastqRecord("test", read.getSecond(), null, StringUtil.repeatCharNTimes('I', read.getSecond().length())); - - PairedEndAlignmentInfo pai = ga.align(fq1, fq2); - } - } -}