From 4411b1d0b53429d14a01108288e04a8adf249516 Mon Sep 17 00:00:00 2001
From: Luke Slater <tinmachin3@gmail.com>
Date: Thu, 9 Dec 2021 00:53:25 +0000
Subject: [PATCH] egl, ecl, min power, many other fixes

---
 klarigi/src/main/groovy/klarigi/App.groovy    |  7 +-
 .../src/main/groovy/klarigi/Classifier.groovy | 27 +++++--
 .../groovy/klarigi/InformationContent.groovy  |  5 +-
 .../src/main/groovy/klarigi/Klarigi.groovy    | 78 +++++++++++++------
 .../groovy/klarigi/PacketConverter.groovy     | 10 ++-
 klarigi/src/main/groovy/klarigi/Scorer.groovy |  5 +-
 .../src/main/groovy/klarigi/StepDown.groovy   | 13 +++-
 7 files changed, 105 insertions(+), 40 deletions(-)

diff --git a/klarigi/src/main/groovy/klarigi/App.groovy b/klarigi/src/main/groovy/klarigi/App.groovy
index 049360c..8641c5a 100644
--- a/klarigi/src/main/groovy/klarigi/App.groovy
+++ b/klarigi/src/main/groovy/klarigi/App.groovy
@@ -29,6 +29,7 @@ class App {
       _ longOpt: 'save-ic', 'Save the IC values to the given file', args:1
 
       g longOpt: 'group', 'The group to explain.', args: 1
+      egl longOpt: 'exclusive-group-load', 'If set to true, only the group given in -g will be loaded into the corpus', type: Boolean
       gf longOpt: 'group-file', 'You can pass a file with a list of groups to: one per line. If you do this, the --group argument will be ignored.', args: 1
 
       _ longOpt: 'max-ic', 'Max IC to use in stepdown algorithm. Default: 0.8', args: 1
@@ -38,6 +39,7 @@ class App {
       _ longOpt: 'max-exclusion', 'Max exclusion to use in stepdown algorithm. Default: 0.95', args: 1
       _ longOpt: 'min-exclusion', 'Min exclusion to use in stepdown algorithm. Default: 0.3', args: 1
       _ longOpt: 'max-total-inclusion', 'Max total inclusion to use in stepdown algorithm. Default: 0.95 (probably don\'t want to edit this one)', args: 1
+      _ longOpt: 'min-power', 'Min acceptable value of power.', args: 1
       _ longOpt: 'step', 'Step by which to reduce coefficients in stepdown algorithm. Default: 0.05', args: 1
       _ longOpt: 'debug', 'Print some debug output', type: Boolean
 
@@ -45,6 +47,7 @@ class App {
 
       _ longOpt: 'reclassify', 'Attempt to reclassify the input using the derived explanations. This will help give some scores about how well the explanations fit the data', type: Boolean
       _ longOpt: 'classify', 'Pass a new file of unseen examples to classify using the explanations derived (test classify)', args: 1
+      ecm longOpt: 'explainers-classify-mode', 'Only use the smaller set of explanatory variables for classification.', type: Boolean
       p longOpt: 'perms', 'Do permutation testing to provide p values for power, inclusion, and exclusion.', args: 1
 
       _ longOpt: 'output-scores', 'Output the results of the scorer. This can be useful for debugging, or identifying coefficient settings.', type: Boolean
@@ -115,10 +118,10 @@ class App {
       }
       
       if(o['reclassify']) {
-        k.reclassify(allExplanations, o['output-classification-scores'])
+        k.reclassify(allExplanations, o['output-classification-scores'], o['ecm'])
       }
       if(o['classify']) {
-        k.classify(o['classify'], allExplanations, o['output-classification-scores'])
+        k.classify(o['classify'], allExplanations, o['output-classification-scores'], o['ecm'])
 
         if(o['output-exp-dataframe']) {
           k.writeDataframe('test', allExplanations)
diff --git a/klarigi/src/main/groovy/klarigi/Classifier.groovy b/klarigi/src/main/groovy/klarigi/Classifier.groovy
index 2315c24..b232bbc 100644
--- a/klarigi/src/main/groovy/klarigi/Classifier.groovy
+++ b/klarigi/src/main/groovy/klarigi/Classifier.groovy
@@ -4,7 +4,7 @@ import org.semanticweb.owlapi.model.IRI
 import be.cylab.java.roc.*
 
 public class Classifier {
-  static def classify(allExplanations, data, ontoHelper) {
+  static def classify(allExplanations, data, ontoHelper, ecm) {
     def subclassCache = [:]
 
     def metrics = [:]
@@ -29,7 +29,12 @@ public class Classifier {
           scores[exps.cluster] = 1
 
           // Iterate all scored candidates (results[3])
-          def rs = exps.results[2].collect { e ->
+          def sterms = exps.results[2]
+          if(ecm) {
+            sterms = exps.results[0]
+          }
+
+          def rs = sterms.collect { e ->
             // Get subclasses + equivalent of this explanatory class
             if(!subclassCache.containsKey(e.iri)) {
               def ce = ontoHelper.dataFactory.getOWLClass(IRI.create(e.iri))
@@ -90,14 +95,18 @@ public class Classifier {
 
       // trues
       m1.scores.each { s ->
-        scores << s[cid].toDouble()
-        truths << 1
+        if(s.containsKey(cid)) {
+          scores << s[cid].toDouble()
+          truths << 1
+        }
       }
 
       metrics.findAll { c2, m2 -> c2 != cid }.each { c2, m2 ->
         m2.scores.each { s ->
-          scores << s[cid].toDouble()
-          truths << 0
+          if(s.containsKey(cid)) {
+            scores << s[cid].toDouble()
+            truths << 0
+          }
         }
       }
 
@@ -109,9 +118,11 @@ public class Classifier {
       double[] scar = scores.toArray()
       double[] trar = truths.toArray()
       def roc = new Roc(scar, trar)
+      def auc = roc.computeAUC()
 
-      //def roc = new Roc(scores, truths)
-      println "$cid AUC: ${roc.computeAUC()}"
+      if(!auc.isNaN()) {
+        println "$cid AUC: ${auc}"
+      }
     }
   }
 
diff --git a/klarigi/src/main/groovy/klarigi/InformationContent.groovy b/klarigi/src/main/groovy/klarigi/InformationContent.groovy
index bf818f4..3ca8c9e 100644
--- a/klarigi/src/main/groovy/klarigi/InformationContent.groovy
+++ b/klarigi/src/main/groovy/klarigi/InformationContent.groovy
@@ -55,7 +55,7 @@ public class InformationContent {
     this(ontologyPath, false, false, false)
   }
 
-  InformationContent(ontologyPath, dataPath, annotIC, turtle) {
+  InformationContent(ontologyPath, dataPath, annotIC, turtle, pp) {
     factory = URIFactoryMemory.getSingleton()
 
     /*def graphURI = factory.getURI('http://purl.obolibrary.org/obo/HP_')
@@ -79,6 +79,9 @@ public class InformationContent {
 
     def icMeasure = DEFAULT_IC
     if(annotIC) {
+      if(pp) {
+        dataPath = "pp_conv.tsv"
+      } 
       gConf.addGDataConf(new GDataConf(GFormat.TSV_ANNOT, dataPath));
     }
     //gConf.addGAction(actionRerootConf)
diff --git a/klarigi/src/main/groovy/klarigi/Klarigi.groovy b/klarigi/src/main/groovy/klarigi/Klarigi.groovy
index 724c8b8..fef43fd 100644
--- a/klarigi/src/main/groovy/klarigi/Klarigi.groovy
+++ b/klarigi/src/main/groovy/klarigi/Klarigi.groovy
@@ -22,7 +22,14 @@ import java.math.MathContext
 
 public class Klarigi {
   def data
-  def allAssociations // lazy
+
+  // A unique list containing every class directly annotated to any entity in the 
+  //  corpus. Generated after data corpus is loaded at the end of loadData. It's needed
+  //  for permutation, primarily, but there are probably other reasons that it's useful
+  //  to keep around too...
+  //  TODO: consider merging with above data, or in the class that eventually replaces that.
+  def allAssociations 
+
   def ontoHelper = [
     reasoner: null, 
     dataFactory: null,
@@ -35,9 +42,9 @@ public class Klarigi {
   Klarigi(o) {
     verbose = o['verbose']
 
-    loadData(o['data'], o['pp'])
+    loadData(o['data'], o['pp'], o['group'], o['egl'])
     loadOntology(o['ontology'])
-    loadIc(o['ic'], o['ontology'], o['data'], o['resnik-ic'], o['save-ic'], o['turtle'])
+    loadIc(o['ic'], o['ontology'], o['data'], o['resnik-ic'], o['save-ic'], o['turtle'], o['pp'])
     coefficients = Coefficients.Generate(o)
 
     if(o['output']) { // blank the output file, since we will subsequently append to it. all the output stuff could probs be better abstracted.
@@ -45,7 +52,7 @@ public class Klarigi {
     }
   }
 
-  def loadData(dataFile, pp) {
+  def loadData(dataFile, pp, interestGroup, egl) {
     data = [
       groupings: [:],
       associations: [:],
@@ -57,7 +64,11 @@ public class Klarigi {
 
       def input
       if(pp) { // Phenopackets mode
-        def toProcess
+        if(verbose) {
+          println "Phenopackets input mode engaged"
+        }
+
+        def toProcess = []
         if(inputFile.isDirectory()) {
           inputFile.eachFile { f ->
             if(f.getName() =~ /json$/) {
@@ -65,18 +76,40 @@ public class Klarigi {
             }
           } 
         } else {
-          toProcess = [inputFile]
+          toProcess << inputFile
         } 
 
         // Convert each of the phenopackets to input triples
         input = toProcess.collect { PacketConverter.Convert(PacketConverter.Load(it)) }
+
+        if(verbose) {
+          def outName = "pp_conv.tsv"
+          println "Phenopackets loaded. Also saving a converted copy to $outName"
+          PacketConverter.Save(input, outName) 
+        }
       } else {
         input = new File(dataFile).collect { it.split('\t') }
       }
-      
+
       input.each {
         def (entity, terms, group) = it
 
+        def gs = group.tokenize(';')
+        if(egl) {
+          gs = gs.findAll { g -> g == interestGroup }
+          // Here we exit if there are no interestGroup associations for this entity. We don't do this in the regular mode, because even ungrouped entities provide useful background...
+          if(gs.size() == 0) {
+            return;
+          }
+        }
+
+        gs.each { g ->
+          if(!data.groupings.containsKey(g)) {
+            data.groupings[g] = []
+          }
+          data.groupings[g] << entity
+        }
+
         terms = terms.tokenize(';')
         if(terms.size() > 0 && terms[0] =~ /:/ && terms[0].indexOf('http') == -1) { // stupid
           terms = terms.collect { 
@@ -89,22 +122,15 @@ public class Klarigi {
         terms.each {
           data.associations[entity][it] = true
         }
-
-        group.tokenize(';').each { g ->
-          if(!data.groupings.containsKey(g)) {
-            data.groupings[g] = []
-          }
-          data.groupings[g] << entity
-        }
       }
     } catch(e) {
       HandleError(e, verbose, "Error loading data file ($dataFile)")
     }
 
+    // kind of stupid but ok 
     allAssociations = data.associations.collect { entity, terms ->
       terms.keySet().toList()
     }.flatten().unique(false)
-    println allAssociations
 
     if(verbose) {
       println "Done loading dataset"
@@ -123,7 +149,7 @@ public class Klarigi {
     return newSample
   }
 
-  def loadIc(icFile, ontologyFile, annotFile, resnikIc, saveIc, turtle) {
+  def loadIc(icFile, ontologyFile, annotFile, resnikIc, saveIc, turtle, pp) {
     if(icFile) {
       try {
       new File(icFile).splitEachLine('\t') {
@@ -134,7 +160,7 @@ public class Klarigi {
       }
     } else {
       try {
-        icFactory = new InformationContent(ontologyFile, annotFile, resnikIc, turtle)
+        icFactory = new InformationContent(ontologyFile, annotFile, resnikIc, turtle, pp)
         def allClasses = ontoHelper.reasoner.getSubClasses(ontoHelper.dataFactory.getOWLThing(), false).collect { it.getRepresentativeElement().getIRI().toString() }.unique(false)
         allClasses = allClasses.findAll { it != 'http://www.w3.org/2002/07/owl#Nothing' } // heh
         data.ic = icFactory.getInformationContent(allClasses)
@@ -292,8 +318,11 @@ public class Klarigi {
     }
   }
 
-  def reclassify(allExplanations, outClassScores) {
-    def m = Classifier.classify(allExplanations, data, ontoHelper)
+  def reclassify(allExplanations, outClassScores, ecm) {
+    def m = Classifier.classify(allExplanations, data, ontoHelper, ecm)
+    if(!m) {
+      RaiseError("Failed to build reclassifier. There may have been too few examples.")
+    }
 
     println 'Reclassification:'
     Classifier.Print(m)
@@ -304,14 +333,19 @@ public class Klarigi {
     }
   }
 
-  def classify(path, allExplanations, outClassScores) {
+  def classify(path, allExplanations, outClassScores, ecm) {
     loadData(path) // TODO I know, i know, this is awful state management and design. i'll fix it later
 
+    def m = Classifier.classify(allExplanations, data, ontoHelper, ecm)
+    if(!m) {
+      RaiseError("Failed to build classifier. There may have been too few examples.")
+    }
+
     println 'Classification:'
-    def m = Classifier.classify(allExplanations, data, ontoHelper)
     Classifier.Print(m)
     println ''
 
+
     if(outClassScores) {
       Classifier.WriteScores(m, "classify")
     }
@@ -330,7 +364,7 @@ public class Klarigi {
     def cSize = data.groupings[cid].size()
     if(outType) {
       if(outType == 'latex') {
-        StepDown.PrintLaTeX(cid, results, ontoHelper.labels, cSize, toFile)
+        StepDown.PrintLaTeX(cid, results, pVals, ontoHelper.labels, cSize, toFile)
       } else if(outType == 'tsv') {
         StepDown.PrintTSV(cid, results, ontoHelper.labels, cSize, toFile)
       }
diff --git a/klarigi/src/main/groovy/klarigi/PacketConverter.groovy b/klarigi/src/main/groovy/klarigi/PacketConverter.groovy
index a70adb9..2b53d37 100644
--- a/klarigi/src/main/groovy/klarigi/PacketConverter.groovy
+++ b/klarigi/src/main/groovy/klarigi/PacketConverter.groovy
@@ -10,8 +10,14 @@ public class PacketConverter {
   public static def Convert(pDict) {
     [
       pDict['id'],
-      pDict['phenotypicFeatures'].collect { it.type.id },
-      pDict['diseases'].collect { term.id }
+      pDict['phenotypicFeatures'].collect { it.type.id }.join(';'),
+      pDict['diseases'].collect { it.term.id }.join(';')
     ] 
   }
+
+  public static def Save(triples, String fName) {
+    new File(fName).text = triples.collect {
+      it.join('\t')
+    }.join('\n')
+  }
 }
diff --git a/klarigi/src/main/groovy/klarigi/Scorer.groovy b/klarigi/src/main/groovy/klarigi/Scorer.groovy
index 127a9c4..b14db72 100644
--- a/klarigi/src/main/groovy/klarigi/Scorer.groovy
+++ b/klarigi/src/main/groovy/klarigi/Scorer.groovy
@@ -49,7 +49,10 @@ public class Scorer {
           // how about we make it the proportion of total mentions that are in this group vs the other group?
 
           //v.nExclusion = 1 - (v.exclusion / data.groupings.findAll { kk, vv -> kk != cid }.collect { kk, vv -> vv.size() }.sum())
-          v.nExclusion = 1 - (v.exclusion / (v.inclusion + v.exclusion))
+          v.nExclusion = 0
+          if((v.inclusion + v.exclusion) > 0) {
+            v.nExclusion = v.inclusion / (v.inclusion + v.exclusion)
+          }
         }
 
         v.nPower = v.nInclusion - (1-v.nExclusion)
diff --git a/klarigi/src/main/groovy/klarigi/StepDown.groovy b/klarigi/src/main/groovy/klarigi/StepDown.groovy
index 2210fe9..526a7a6 100644
--- a/klarigi/src/main/groovy/klarigi/StepDown.groovy
+++ b/klarigi/src/main/groovy/klarigi/StepDown.groovy
@@ -135,10 +135,10 @@ public class StepDown {
 
   }
 
-  static def PrintLaTeX(cid, res, labels, s, toFile) {
+  static def PrintLaTeX(cid, res, pVals, labels, s, toFile) {
     def out = []
     out << "\\begin{tabular}{p{10cm}|c|c|c|c}"
-    out << "{\\bf Group: $cid ($s members)} & {\\bf Power} & {\\bf Exclusion} & {\\bf Inclusion} & {\\bf IC} \\\\"
+    out << "{\\bf Group: $cid ($s members)} & {\\bf Power} & {\\bf Inclusion} & {\\bf Exclusion} & {\\bf IC} \\\\"
     res[0].sort { -it.nIc }.each {
       def pIri = it.iri
       if(pIri =~ 'obolibrary.org') {
@@ -149,9 +149,14 @@ public class StepDown {
         pIri = it.iri.replaceAll('_', '\\\\_')
       }
 
-      out << "${labels[it.iri]} (${pIri}) & ${it.nPower.toDouble().round(2)} & ${it.nExclusion.toDouble().round(2)} & ${it.nInclusion.toDouble().round(2)} & ${it.ic.toDouble().round(2)} \\\\"
+      if(pVals) {
+        def ps = pVals[it.iri]
+        out << "${labels[it.iri]} (${pIri}) & ${it.nPower.toDouble().round(2)} (p\$<\$=${ps.powP}) & ${it.nInclusion.toDouble().round(2)} (p\$<\$=${ps.incP}) & ${it.nExclusion.toDouble().round(2)} (p\$<\$=${ps.excP}) & ${it.ic.toDouble().round(2)} \\\\"
+      } else {
+        out << "${labels[it.iri]} (${pIri}) & ${it.nPower.toDouble().round(2)} & ${it.nInclusion.toDouble().round(2)} & ${it.nExclusion.toDouble().round(2)} & ${it.ic.toDouble().round(2)} \\\\"
+      }
     }
-    out << "{\\em Overall} & - & ${res[2].toDouble().round(2)} & - & - \\\\ "
+    out << "{\\em Overall} & - & ${res[1].toDouble().round(2)} & - & - \\\\ "
     out << "\\hline"
     out << "\\end{tabular}"
     out = out.join('\n')