From 4411b1d0b53429d14a01108288e04a8adf249516 Mon Sep 17 00:00:00 2001 From: Luke Slater Date: Thu, 9 Dec 2021 00:53:25 +0000 Subject: [PATCH] egl, ecl, min power, many other fixes --- klarigi/src/main/groovy/klarigi/App.groovy | 7 +- .../src/main/groovy/klarigi/Classifier.groovy | 27 +++++-- .../groovy/klarigi/InformationContent.groovy | 5 +- .../src/main/groovy/klarigi/Klarigi.groovy | 78 +++++++++++++------ .../groovy/klarigi/PacketConverter.groovy | 10 ++- klarigi/src/main/groovy/klarigi/Scorer.groovy | 5 +- .../src/main/groovy/klarigi/StepDown.groovy | 13 +++- 7 files changed, 105 insertions(+), 40 deletions(-) diff --git a/klarigi/src/main/groovy/klarigi/App.groovy b/klarigi/src/main/groovy/klarigi/App.groovy index 049360c..8641c5a 100644 --- a/klarigi/src/main/groovy/klarigi/App.groovy +++ b/klarigi/src/main/groovy/klarigi/App.groovy @@ -29,6 +29,7 @@ class App { _ longOpt: 'save-ic', 'Save the IC values to the given file', args:1 g longOpt: 'group', 'The group to explain.', args: 1 + egl longOpt: 'exclusive-group-load', 'If set to true, only the group given in -g will be loaded into the corpus', type: Boolean gf longOpt: 'group-file', 'You can pass a file with a list of groups to: one per line. If you do this, the --group argument will be ignored.', args: 1 _ longOpt: 'max-ic', 'Max IC to use in stepdown algorithm. Default: 0.8', args: 1 @@ -38,6 +39,7 @@ class App { _ longOpt: 'max-exclusion', 'Max exclusion to use in stepdown algorithm. Default: 0.95', args: 1 _ longOpt: 'min-exclusion', 'Min exclusion to use in stepdown algorithm. Default: 0.3', args: 1 _ longOpt: 'max-total-inclusion', 'Max total inclusion to use in stepdown algorithm. Default: 0.95 (probably don\'t want to edit this one)', args: 1 + _ longOpt: 'min-power', 'Min acceptable value of power.', args: 1 _ longOpt: 'step', 'Step by which to reduce coefficients in stepdown algorithm. Default: 0.05', args: 1 _ longOpt: 'debug', 'Print some debug output', type: Boolean @@ -45,6 +47,7 @@ class App { _ longOpt: 'reclassify', 'Attempt to reclassify the input using the derived explanations. This will help give some scores about how well the explanations fit the data', type: Boolean _ longOpt: 'classify', 'Pass a new file of unseen examples to classify using the explanations derived (test classify)', args: 1 + ecm longOpt: 'explainers-classify-mode', 'Only use the smaller set of explanatory variables for classification.', type: Boolean p longOpt: 'perms', 'Do permutation testing to provide p values for power, inclusion, and exclusion.', args: 1 _ longOpt: 'output-scores', 'Output the results of the scorer. This can be useful for debugging, or identifying coefficient settings.', type: Boolean @@ -115,10 +118,10 @@ class App { } if(o['reclassify']) { - k.reclassify(allExplanations, o['output-classification-scores']) + k.reclassify(allExplanations, o['output-classification-scores'], o['ecm']) } if(o['classify']) { - k.classify(o['classify'], allExplanations, o['output-classification-scores']) + k.classify(o['classify'], allExplanations, o['output-classification-scores'], o['ecm']) if(o['output-exp-dataframe']) { k.writeDataframe('test', allExplanations) diff --git a/klarigi/src/main/groovy/klarigi/Classifier.groovy b/klarigi/src/main/groovy/klarigi/Classifier.groovy index 2315c24..b232bbc 100644 --- a/klarigi/src/main/groovy/klarigi/Classifier.groovy +++ b/klarigi/src/main/groovy/klarigi/Classifier.groovy @@ -4,7 +4,7 @@ import org.semanticweb.owlapi.model.IRI import be.cylab.java.roc.* public class Classifier { - static def classify(allExplanations, data, ontoHelper) { + static def classify(allExplanations, data, ontoHelper, ecm) { def subclassCache = [:] def metrics = [:] @@ -29,7 +29,12 @@ public class Classifier { scores[exps.cluster] = 1 // Iterate all scored candidates (results[3]) - def rs = exps.results[2].collect { e -> + def sterms = exps.results[2] + if(ecm) { + sterms = exps.results[0] + } + + def rs = sterms.collect { e -> // Get subclasses + equivalent of this explanatory class if(!subclassCache.containsKey(e.iri)) { def ce = ontoHelper.dataFactory.getOWLClass(IRI.create(e.iri)) @@ -90,14 +95,18 @@ public class Classifier { // trues m1.scores.each { s -> - scores << s[cid].toDouble() - truths << 1 + if(s.containsKey(cid)) { + scores << s[cid].toDouble() + truths << 1 + } } metrics.findAll { c2, m2 -> c2 != cid }.each { c2, m2 -> m2.scores.each { s -> - scores << s[cid].toDouble() - truths << 0 + if(s.containsKey(cid)) { + scores << s[cid].toDouble() + truths << 0 + } } } @@ -109,9 +118,11 @@ public class Classifier { double[] scar = scores.toArray() double[] trar = truths.toArray() def roc = new Roc(scar, trar) + def auc = roc.computeAUC() - //def roc = new Roc(scores, truths) - println "$cid AUC: ${roc.computeAUC()}" + if(!auc.isNaN()) { + println "$cid AUC: ${auc}" + } } } diff --git a/klarigi/src/main/groovy/klarigi/InformationContent.groovy b/klarigi/src/main/groovy/klarigi/InformationContent.groovy index bf818f4..3ca8c9e 100644 --- a/klarigi/src/main/groovy/klarigi/InformationContent.groovy +++ b/klarigi/src/main/groovy/klarigi/InformationContent.groovy @@ -55,7 +55,7 @@ public class InformationContent { this(ontologyPath, false, false, false) } - InformationContent(ontologyPath, dataPath, annotIC, turtle) { + InformationContent(ontologyPath, dataPath, annotIC, turtle, pp) { factory = URIFactoryMemory.getSingleton() /*def graphURI = factory.getURI('http://purl.obolibrary.org/obo/HP_') @@ -79,6 +79,9 @@ public class InformationContent { def icMeasure = DEFAULT_IC if(annotIC) { + if(pp) { + dataPath = "pp_conv.tsv" + } gConf.addGDataConf(new GDataConf(GFormat.TSV_ANNOT, dataPath)); } //gConf.addGAction(actionRerootConf) diff --git a/klarigi/src/main/groovy/klarigi/Klarigi.groovy b/klarigi/src/main/groovy/klarigi/Klarigi.groovy index 724c8b8..fef43fd 100644 --- a/klarigi/src/main/groovy/klarigi/Klarigi.groovy +++ b/klarigi/src/main/groovy/klarigi/Klarigi.groovy @@ -22,7 +22,14 @@ import java.math.MathContext public class Klarigi { def data - def allAssociations // lazy + + // A unique list containing every class directly annotated to any entity in the + // corpus. Generated after data corpus is loaded at the end of loadData. It's needed + // for permutation, primarily, but there are probably other reasons that it's useful + // to keep around too... + // TODO: consider merging with above data, or in the class that eventually replaces that. + def allAssociations + def ontoHelper = [ reasoner: null, dataFactory: null, @@ -35,9 +42,9 @@ public class Klarigi { Klarigi(o) { verbose = o['verbose'] - loadData(o['data'], o['pp']) + loadData(o['data'], o['pp'], o['group'], o['egl']) loadOntology(o['ontology']) - loadIc(o['ic'], o['ontology'], o['data'], o['resnik-ic'], o['save-ic'], o['turtle']) + loadIc(o['ic'], o['ontology'], o['data'], o['resnik-ic'], o['save-ic'], o['turtle'], o['pp']) coefficients = Coefficients.Generate(o) if(o['output']) { // blank the output file, since we will subsequently append to it. all the output stuff could probs be better abstracted. @@ -45,7 +52,7 @@ public class Klarigi { } } - def loadData(dataFile, pp) { + def loadData(dataFile, pp, interestGroup, egl) { data = [ groupings: [:], associations: [:], @@ -57,7 +64,11 @@ public class Klarigi { def input if(pp) { // Phenopackets mode - def toProcess + if(verbose) { + println "Phenopackets input mode engaged" + } + + def toProcess = [] if(inputFile.isDirectory()) { inputFile.eachFile { f -> if(f.getName() =~ /json$/) { @@ -65,18 +76,40 @@ public class Klarigi { } } } else { - toProcess = [inputFile] + toProcess << inputFile } // Convert each of the phenopackets to input triples input = toProcess.collect { PacketConverter.Convert(PacketConverter.Load(it)) } + + if(verbose) { + def outName = "pp_conv.tsv" + println "Phenopackets loaded. Also saving a converted copy to $outName" + PacketConverter.Save(input, outName) + } } else { input = new File(dataFile).collect { it.split('\t') } } - + input.each { def (entity, terms, group) = it + def gs = group.tokenize(';') + if(egl) { + gs = gs.findAll { g -> g == interestGroup } + // Here we exit if there are no interestGroup associations for this entity. We don't do this in the regular mode, because even ungrouped entities provide useful background... + if(gs.size() == 0) { + return; + } + } + + gs.each { g -> + if(!data.groupings.containsKey(g)) { + data.groupings[g] = [] + } + data.groupings[g] << entity + } + terms = terms.tokenize(';') if(terms.size() > 0 && terms[0] =~ /:/ && terms[0].indexOf('http') == -1) { // stupid terms = terms.collect { @@ -89,22 +122,15 @@ public class Klarigi { terms.each { data.associations[entity][it] = true } - - group.tokenize(';').each { g -> - if(!data.groupings.containsKey(g)) { - data.groupings[g] = [] - } - data.groupings[g] << entity - } } } catch(e) { HandleError(e, verbose, "Error loading data file ($dataFile)") } + // kind of stupid but ok allAssociations = data.associations.collect { entity, terms -> terms.keySet().toList() }.flatten().unique(false) - println allAssociations if(verbose) { println "Done loading dataset" @@ -123,7 +149,7 @@ public class Klarigi { return newSample } - def loadIc(icFile, ontologyFile, annotFile, resnikIc, saveIc, turtle) { + def loadIc(icFile, ontologyFile, annotFile, resnikIc, saveIc, turtle, pp) { if(icFile) { try { new File(icFile).splitEachLine('\t') { @@ -134,7 +160,7 @@ public class Klarigi { } } else { try { - icFactory = new InformationContent(ontologyFile, annotFile, resnikIc, turtle) + icFactory = new InformationContent(ontologyFile, annotFile, resnikIc, turtle, pp) def allClasses = ontoHelper.reasoner.getSubClasses(ontoHelper.dataFactory.getOWLThing(), false).collect { it.getRepresentativeElement().getIRI().toString() }.unique(false) allClasses = allClasses.findAll { it != 'http://www.w3.org/2002/07/owl#Nothing' } // heh data.ic = icFactory.getInformationContent(allClasses) @@ -292,8 +318,11 @@ public class Klarigi { } } - def reclassify(allExplanations, outClassScores) { - def m = Classifier.classify(allExplanations, data, ontoHelper) + def reclassify(allExplanations, outClassScores, ecm) { + def m = Classifier.classify(allExplanations, data, ontoHelper, ecm) + if(!m) { + RaiseError("Failed to build reclassifier. There may have been too few examples.") + } println 'Reclassification:' Classifier.Print(m) @@ -304,14 +333,19 @@ public class Klarigi { } } - def classify(path, allExplanations, outClassScores) { + def classify(path, allExplanations, outClassScores, ecm) { loadData(path) // TODO I know, i know, this is awful state management and design. i'll fix it later + def m = Classifier.classify(allExplanations, data, ontoHelper, ecm) + if(!m) { + RaiseError("Failed to build classifier. There may have been too few examples.") + } + println 'Classification:' - def m = Classifier.classify(allExplanations, data, ontoHelper) Classifier.Print(m) println '' + if(outClassScores) { Classifier.WriteScores(m, "classify") } @@ -330,7 +364,7 @@ public class Klarigi { def cSize = data.groupings[cid].size() if(outType) { if(outType == 'latex') { - StepDown.PrintLaTeX(cid, results, ontoHelper.labels, cSize, toFile) + StepDown.PrintLaTeX(cid, results, pVals, ontoHelper.labels, cSize, toFile) } else if(outType == 'tsv') { StepDown.PrintTSV(cid, results, ontoHelper.labels, cSize, toFile) } diff --git a/klarigi/src/main/groovy/klarigi/PacketConverter.groovy b/klarigi/src/main/groovy/klarigi/PacketConverter.groovy index a70adb9..2b53d37 100644 --- a/klarigi/src/main/groovy/klarigi/PacketConverter.groovy +++ b/klarigi/src/main/groovy/klarigi/PacketConverter.groovy @@ -10,8 +10,14 @@ public class PacketConverter { public static def Convert(pDict) { [ pDict['id'], - pDict['phenotypicFeatures'].collect { it.type.id }, - pDict['diseases'].collect { term.id } + pDict['phenotypicFeatures'].collect { it.type.id }.join(';'), + pDict['diseases'].collect { it.term.id }.join(';') ] } + + public static def Save(triples, String fName) { + new File(fName).text = triples.collect { + it.join('\t') + }.join('\n') + } } diff --git a/klarigi/src/main/groovy/klarigi/Scorer.groovy b/klarigi/src/main/groovy/klarigi/Scorer.groovy index 127a9c4..b14db72 100644 --- a/klarigi/src/main/groovy/klarigi/Scorer.groovy +++ b/klarigi/src/main/groovy/klarigi/Scorer.groovy @@ -49,7 +49,10 @@ public class Scorer { // how about we make it the proportion of total mentions that are in this group vs the other group? //v.nExclusion = 1 - (v.exclusion / data.groupings.findAll { kk, vv -> kk != cid }.collect { kk, vv -> vv.size() }.sum()) - v.nExclusion = 1 - (v.exclusion / (v.inclusion + v.exclusion)) + v.nExclusion = 0 + if((v.inclusion + v.exclusion) > 0) { + v.nExclusion = v.inclusion / (v.inclusion + v.exclusion) + } } v.nPower = v.nInclusion - (1-v.nExclusion) diff --git a/klarigi/src/main/groovy/klarigi/StepDown.groovy b/klarigi/src/main/groovy/klarigi/StepDown.groovy index 2210fe9..526a7a6 100644 --- a/klarigi/src/main/groovy/klarigi/StepDown.groovy +++ b/klarigi/src/main/groovy/klarigi/StepDown.groovy @@ -135,10 +135,10 @@ public class StepDown { } - static def PrintLaTeX(cid, res, labels, s, toFile) { + static def PrintLaTeX(cid, res, pVals, labels, s, toFile) { def out = [] out << "\\begin{tabular}{p{10cm}|c|c|c|c}" - out << "{\\bf Group: $cid ($s members)} & {\\bf Power} & {\\bf Exclusion} & {\\bf Inclusion} & {\\bf IC} \\\\" + out << "{\\bf Group: $cid ($s members)} & {\\bf Power} & {\\bf Inclusion} & {\\bf Exclusion} & {\\bf IC} \\\\" res[0].sort { -it.nIc }.each { def pIri = it.iri if(pIri =~ 'obolibrary.org') { @@ -149,9 +149,14 @@ public class StepDown { pIri = it.iri.replaceAll('_', '\\\\_') } - out << "${labels[it.iri]} (${pIri}) & ${it.nPower.toDouble().round(2)} & ${it.nExclusion.toDouble().round(2)} & ${it.nInclusion.toDouble().round(2)} & ${it.ic.toDouble().round(2)} \\\\" + if(pVals) { + def ps = pVals[it.iri] + out << "${labels[it.iri]} (${pIri}) & ${it.nPower.toDouble().round(2)} (p\$<\$=${ps.powP}) & ${it.nInclusion.toDouble().round(2)} (p\$<\$=${ps.incP}) & ${it.nExclusion.toDouble().round(2)} (p\$<\$=${ps.excP}) & ${it.ic.toDouble().round(2)} \\\\" + } else { + out << "${labels[it.iri]} (${pIri}) & ${it.nPower.toDouble().round(2)} & ${it.nInclusion.toDouble().round(2)} & ${it.nExclusion.toDouble().round(2)} & ${it.ic.toDouble().round(2)} \\\\" + } } - out << "{\\em Overall} & - & ${res[2].toDouble().round(2)} & - & - \\\\ " + out << "{\\em Overall} & - & ${res[1].toDouble().round(2)} & - & - \\\\ " out << "\\hline" out << "\\end{tabular}" out = out.join('\n')