Skip to content

Commit

Permalink
egl, ecl, min power, many other fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
reality committed Dec 9, 2021
1 parent 97ecc13 commit 4411b1d
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 40 deletions.
7 changes: 5 additions & 2 deletions klarigi/src/main/groovy/klarigi/App.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class App {
_ longOpt: 'save-ic', 'Save the IC values to the given file', args:1

g longOpt: 'group', 'The group to explain.', args: 1
egl longOpt: 'exclusive-group-load', 'If set to true, only the group given in -g will be loaded into the corpus', type: Boolean
gf longOpt: 'group-file', 'You can pass a file with a list of groups to: one per line. If you do this, the --group argument will be ignored.', args: 1

_ longOpt: 'max-ic', 'Max IC to use in stepdown algorithm. Default: 0.8', args: 1
Expand All @@ -38,13 +39,15 @@ class App {
_ longOpt: 'max-exclusion', 'Max exclusion to use in stepdown algorithm. Default: 0.95', args: 1
_ longOpt: 'min-exclusion', 'Min exclusion to use in stepdown algorithm. Default: 0.3', args: 1
_ longOpt: 'max-total-inclusion', 'Max total inclusion to use in stepdown algorithm. Default: 0.95 (probably don\'t want to edit this one)', args: 1
_ longOpt: 'min-power', 'Min acceptable value of power.', args: 1
_ longOpt: 'step', 'Step by which to reduce coefficients in stepdown algorithm. Default: 0.05', args: 1
_ longOpt: 'debug', 'Print some debug output', type: Boolean

_ longOpt: 'power', 'Use modification of algorithm which uses normalised power instead of inc/exc', type: Boolean

_ longOpt: 'reclassify', 'Attempt to reclassify the input using the derived explanations. This will help give some scores about how well the explanations fit the data', type: Boolean
_ longOpt: 'classify', 'Pass a new file of unseen examples to classify using the explanations derived (test classify)', args: 1
ecm longOpt: 'explainers-classify-mode', 'Only use the smaller set of explanatory variables for classification.', type: Boolean
p longOpt: 'perms', 'Do permutation testing to provide p values for power, inclusion, and exclusion.', args: 1

_ longOpt: 'output-scores', 'Output the results of the scorer. This can be useful for debugging, or identifying coefficient settings.', type: Boolean
Expand Down Expand Up @@ -115,10 +118,10 @@ class App {
}

if(o['reclassify']) {
k.reclassify(allExplanations, o['output-classification-scores'])
k.reclassify(allExplanations, o['output-classification-scores'], o['ecm'])
}
if(o['classify']) {
k.classify(o['classify'], allExplanations, o['output-classification-scores'])
k.classify(o['classify'], allExplanations, o['output-classification-scores'], o['ecm'])

if(o['output-exp-dataframe']) {
k.writeDataframe('test', allExplanations)
Expand Down
27 changes: 19 additions & 8 deletions klarigi/src/main/groovy/klarigi/Classifier.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import org.semanticweb.owlapi.model.IRI
import be.cylab.java.roc.*

public class Classifier {
static def classify(allExplanations, data, ontoHelper) {
static def classify(allExplanations, data, ontoHelper, ecm) {
def subclassCache = [:]

def metrics = [:]
Expand All @@ -29,7 +29,12 @@ public class Classifier {
scores[exps.cluster] = 1

// Iterate all scored candidates (results[3])
def rs = exps.results[2].collect { e ->
def sterms = exps.results[2]
if(ecm) {
sterms = exps.results[0]
}

def rs = sterms.collect { e ->
// Get subclasses + equivalent of this explanatory class
if(!subclassCache.containsKey(e.iri)) {
def ce = ontoHelper.dataFactory.getOWLClass(IRI.create(e.iri))
Expand Down Expand Up @@ -90,14 +95,18 @@ public class Classifier {

// trues
m1.scores.each { s ->
scores << s[cid].toDouble()
truths << 1
if(s.containsKey(cid)) {
scores << s[cid].toDouble()
truths << 1
}
}

metrics.findAll { c2, m2 -> c2 != cid }.each { c2, m2 ->
m2.scores.each { s ->
scores << s[cid].toDouble()
truths << 0
if(s.containsKey(cid)) {
scores << s[cid].toDouble()
truths << 0
}
}
}

Expand All @@ -109,9 +118,11 @@ public class Classifier {
double[] scar = scores.toArray()
double[] trar = truths.toArray()
def roc = new Roc(scar, trar)
def auc = roc.computeAUC()

//def roc = new Roc(scores, truths)
println "$cid AUC: ${roc.computeAUC()}"
if(!auc.isNaN()) {
println "$cid AUC: ${auc}"
}
}
}

Expand Down
5 changes: 4 additions & 1 deletion klarigi/src/main/groovy/klarigi/InformationContent.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public class InformationContent {
this(ontologyPath, false, false, false)
}

InformationContent(ontologyPath, dataPath, annotIC, turtle) {
InformationContent(ontologyPath, dataPath, annotIC, turtle, pp) {
factory = URIFactoryMemory.getSingleton()

/*def graphURI = factory.getURI('http://purl.obolibrary.org/obo/HP_')
Expand All @@ -79,6 +79,9 @@ public class InformationContent {

def icMeasure = DEFAULT_IC
if(annotIC) {
if(pp) {
dataPath = "pp_conv.tsv"
}
gConf.addGDataConf(new GDataConf(GFormat.TSV_ANNOT, dataPath));
}
//gConf.addGAction(actionRerootConf)
Expand Down
78 changes: 56 additions & 22 deletions klarigi/src/main/groovy/klarigi/Klarigi.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,14 @@ import java.math.MathContext

public class Klarigi {
def data
def allAssociations // lazy

// A unique list containing every class directly annotated to any entity in the
// corpus. Generated after data corpus is loaded at the end of loadData. It's needed
// for permutation, primarily, but there are probably other reasons that it's useful
// to keep around too...
// TODO: consider merging with above data, or in the class that eventually replaces that.
def allAssociations

def ontoHelper = [
reasoner: null,
dataFactory: null,
Expand All @@ -35,17 +42,17 @@ public class Klarigi {
Klarigi(o) {
verbose = o['verbose']

loadData(o['data'], o['pp'])
loadData(o['data'], o['pp'], o['group'], o['egl'])
loadOntology(o['ontology'])
loadIc(o['ic'], o['ontology'], o['data'], o['resnik-ic'], o['save-ic'], o['turtle'])
loadIc(o['ic'], o['ontology'], o['data'], o['resnik-ic'], o['save-ic'], o['turtle'], o['pp'])
coefficients = Coefficients.Generate(o)

if(o['output']) { // blank the output file, since we will subsequently append to it. all the output stuff could probs be better abstracted.
new File(o['output']).text = ''
}
}

def loadData(dataFile, pp) {
def loadData(dataFile, pp, interestGroup, egl) {
data = [
groupings: [:],
associations: [:],
Expand All @@ -57,26 +64,52 @@ public class Klarigi {

def input
if(pp) { // Phenopackets mode
def toProcess
if(verbose) {
println "Phenopackets input mode engaged"
}

def toProcess = []
if(inputFile.isDirectory()) {
inputFile.eachFile { f ->
if(f.getName() =~ /json$/) {
toProcess << f
}
}
} else {
toProcess = [inputFile]
toProcess << inputFile
}

// Convert each of the phenopackets to input triples
input = toProcess.collect { PacketConverter.Convert(PacketConverter.Load(it)) }

if(verbose) {
def outName = "pp_conv.tsv"
println "Phenopackets loaded. Also saving a converted copy to $outName"
PacketConverter.Save(input, outName)
}
} else {
input = new File(dataFile).collect { it.split('\t') }
}

input.each {
def (entity, terms, group) = it

def gs = group.tokenize(';')
if(egl) {
gs = gs.findAll { g -> g == interestGroup }
// Here we exit if there are no interestGroup associations for this entity. We don't do this in the regular mode, because even ungrouped entities provide useful background...
if(gs.size() == 0) {
return;
}
}

gs.each { g ->
if(!data.groupings.containsKey(g)) {
data.groupings[g] = []
}
data.groupings[g] << entity
}

terms = terms.tokenize(';')
if(terms.size() > 0 && terms[0] =~ /:/ && terms[0].indexOf('http') == -1) { // stupid
terms = terms.collect {
Expand All @@ -89,22 +122,15 @@ public class Klarigi {
terms.each {
data.associations[entity][it] = true
}

group.tokenize(';').each { g ->
if(!data.groupings.containsKey(g)) {
data.groupings[g] = []
}
data.groupings[g] << entity
}
}
} catch(e) {
HandleError(e, verbose, "Error loading data file ($dataFile)")
}

// kind of stupid but ok
allAssociations = data.associations.collect { entity, terms ->
terms.keySet().toList()
}.flatten().unique(false)
println allAssociations

if(verbose) {
println "Done loading dataset"
Expand All @@ -123,7 +149,7 @@ public class Klarigi {
return newSample
}

def loadIc(icFile, ontologyFile, annotFile, resnikIc, saveIc, turtle) {
def loadIc(icFile, ontologyFile, annotFile, resnikIc, saveIc, turtle, pp) {
if(icFile) {
try {
new File(icFile).splitEachLine('\t') {
Expand All @@ -134,7 +160,7 @@ public class Klarigi {
}
} else {
try {
icFactory = new InformationContent(ontologyFile, annotFile, resnikIc, turtle)
icFactory = new InformationContent(ontologyFile, annotFile, resnikIc, turtle, pp)
def allClasses = ontoHelper.reasoner.getSubClasses(ontoHelper.dataFactory.getOWLThing(), false).collect { it.getRepresentativeElement().getIRI().toString() }.unique(false)
allClasses = allClasses.findAll { it != 'http://www.w3.org/2002/07/owl#Nothing' } // heh
data.ic = icFactory.getInformationContent(allClasses)
Expand Down Expand Up @@ -292,8 +318,11 @@ public class Klarigi {
}
}

def reclassify(allExplanations, outClassScores) {
def m = Classifier.classify(allExplanations, data, ontoHelper)
def reclassify(allExplanations, outClassScores, ecm) {
def m = Classifier.classify(allExplanations, data, ontoHelper, ecm)
if(!m) {
RaiseError("Failed to build reclassifier. There may have been too few examples.")
}

println 'Reclassification:'
Classifier.Print(m)
Expand All @@ -304,14 +333,19 @@ public class Klarigi {
}
}

def classify(path, allExplanations, outClassScores) {
def classify(path, allExplanations, outClassScores, ecm) {
loadData(path) // TODO I know, i know, this is awful state management and design. i'll fix it later

def m = Classifier.classify(allExplanations, data, ontoHelper, ecm)
if(!m) {
RaiseError("Failed to build classifier. There may have been too few examples.")
}

println 'Classification:'
def m = Classifier.classify(allExplanations, data, ontoHelper)
Classifier.Print(m)
println ''


if(outClassScores) {
Classifier.WriteScores(m, "classify")
}
Expand All @@ -330,7 +364,7 @@ public class Klarigi {
def cSize = data.groupings[cid].size()
if(outType) {
if(outType == 'latex') {
StepDown.PrintLaTeX(cid, results, ontoHelper.labels, cSize, toFile)
StepDown.PrintLaTeX(cid, results, pVals, ontoHelper.labels, cSize, toFile)
} else if(outType == 'tsv') {
StepDown.PrintTSV(cid, results, ontoHelper.labels, cSize, toFile)
}
Expand Down
10 changes: 8 additions & 2 deletions klarigi/src/main/groovy/klarigi/PacketConverter.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,14 @@ public class PacketConverter {
public static def Convert(pDict) {
[
pDict['id'],
pDict['phenotypicFeatures'].collect { it.type.id },
pDict['diseases'].collect { term.id }
pDict['phenotypicFeatures'].collect { it.type.id }.join(';'),
pDict['diseases'].collect { it.term.id }.join(';')
]
}

/**
 * Writes the given rows to a file as tab-separated text.
 *
 * Each element of {@code triples} is joined with tab characters to form one
 * line; the lines are joined with newline characters (no trailing newline)
 * and the result replaces the full contents of the target file.
 *
 * @param triples collection of rows, each of which supports {@code join}
 * @param fName   path of the file to (over)write
 * @return the text that was written
 */
public static def Save(triples, String fName) {
    def rows = triples.collect { row -> row.join('\t') }
    def content = rows.join('\n')
    // Property assignment is kept as the final expression so the method's
    // implicit return value is the written text.
    new File(fName).text = content
}
}
5 changes: 4 additions & 1 deletion klarigi/src/main/groovy/klarigi/Scorer.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ public class Scorer {
// how about we make it the proportion of total mentions that are in this group vs the other group?

//v.nExclusion = 1 - (v.exclusion / data.groupings.findAll { kk, vv -> kk != cid }.collect { kk, vv -> vv.size() }.sum())
v.nExclusion = 1 - (v.exclusion / (v.inclusion + v.exclusion))
v.nExclusion = 0
if((v.inclusion + v.exclusion) > 0) {
v.nExclusion = v.inclusion / (v.inclusion + v.exclusion)
}
}

v.nPower = v.nInclusion - (1-v.nExclusion)
Expand Down
13 changes: 9 additions & 4 deletions klarigi/src/main/groovy/klarigi/StepDown.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,10 @@ public class StepDown {

}

static def PrintLaTeX(cid, res, labels, s, toFile) {
static def PrintLaTeX(cid, res, pVals, labels, s, toFile) {
def out = []
out << "\\begin{tabular}{p{10cm}|c|c|c|c}"
out << "{\\bf Group: $cid ($s members)} & {\\bf Power} & {\\bf Exclusion} & {\\bf Inclusion} & {\\bf IC} \\\\"
out << "{\\bf Group: $cid ($s members)} & {\\bf Power} & {\\bf Inclusion} & {\\bf Exclusion} & {\\bf IC} \\\\"
res[0].sort { -it.nIc }.each {
def pIri = it.iri
if(pIri =~ 'obolibrary.org') {
Expand All @@ -149,9 +149,14 @@ public class StepDown {
pIri = it.iri.replaceAll('_', '\\\\_')
}

out << "${labels[it.iri]} (${pIri}) & ${it.nPower.toDouble().round(2)} & ${it.nExclusion.toDouble().round(2)} & ${it.nInclusion.toDouble().round(2)} & ${it.ic.toDouble().round(2)} \\\\"
if(pVals) {
def ps = pVals[it.iri]
out << "${labels[it.iri]} (${pIri}) & ${it.nPower.toDouble().round(2)} (p\$<\$=${ps.powP}) & ${it.nInclusion.toDouble().round(2)} (p\$<\$=${ps.incP}) & ${it.nExclusion.toDouble().round(2)} (p\$<\$=${ps.excP}) & ${it.ic.toDouble().round(2)} \\\\"
} else {
out << "${labels[it.iri]} (${pIri}) & ${it.nPower.toDouble().round(2)} & ${it.nInclusion.toDouble().round(2)} & ${it.nExclusion.toDouble().round(2)} & ${it.ic.toDouble().round(2)} \\\\"
}
}
out << "{\\em Overall} & - & ${res[2].toDouble().round(2)} & - & - \\\\ "
out << "{\\em Overall} & - & ${res[1].toDouble().round(2)} & - & - \\\\ "
out << "\\hline"
out << "\\end{tabular}"
out = out.join('\n')
Expand Down

0 comments on commit 4411b1d

Please sign in to comment.