From 251989979ba2cb39e217a91aa7fefc85c554ebe1 Mon Sep 17 00:00:00 2001
From: Pall Melsted <pmelsted@gmail.com>
Date: Thu, 20 Jul 2017 21:13:50 +0000
Subject: [PATCH] new release

---
 README.md                      |  6 +++++
 common.h                       |  2 +-
 scripts/flatten_json.py        | 49 ++++++++++++++++++++++++++++++++++
 scripts/get_fragment_length.py | 31 +++++++++++++++++++++
 4 files changed, 87 insertions(+), 1 deletion(-)
 create mode 100644 scripts/flatten_json.py
 create mode 100644 scripts/get_fragment_length.py
diff --git a/README.md b/README.md
index 4ccc102..5f4bbd4 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,12 @@ A more sophisticated example is in the `test` directory which contains a `snakem
 
 The `--output test` parameter is used as a prefix and two files are created `test.fusions.fasta` and `test.json`, this contains the filtered fusion calls. For unfiltered fusion calls, use the corresponding `.unfiltered` files.
 
+### Scripts
+
+The `scripts` subfolder contains useful Python scripts
+
+- `get_fragment_length.py` examines an `abundance.h5` produced by `kallisto` and finds the 95th percentile of the fragment length distribution
+- `flatten_json.py` reads the `.json` output and converts to a simple gene table
 
 ### Annotations
 
diff --git a/common.h b/common.h
index 9b9ffb3..0dbf774 100644
--- a/common.h
+++ b/common.h
@@ -3,7 +3,7 @@
 
 #include <string>
 
-#define PIZZLY_VERSION "0.37.2"
+#define PIZZLY_VERSION "0.37.3"
 
 struct ProgramOptions {
   std::string gtf;
diff --git a/scripts/flatten_json.py b/scripts/flatten_json.py
new file mode 100644
index 0000000..3679011
--- /dev/null
+++ b/scripts/flatten_json.py
@@ -0,0 +1,49 @@
+import sys
+import json
+from collections import OrderedDict
+
+###
+### gene1_name gene1_id, gene2_name, gene2_id, type, pair, split, txlist
+
+def loadJSON(fn):
+    with open(fn) as f:
+        JJ = json.load(f,object_pairs_hook=OrderedDict)
+    return JJ['genes']
+
+def outputGeneTable(fusions, outf, filters = None):
+    outf.write('\t'.join("geneA.name geneA.id geneB.name geneB.id paircount splitcount transcripts.list".split()))
+    outf.write('\n')
+    for gf in fusions:
+        gAname = gf['geneA']['name']
+        gAid   = gf['geneA']['id']
+        gBname = gf['geneB']['name']
+        gBid   = gf['geneB']['id']
+        pairs  = str(gf['paircount'])
+        split  = str(gf['splitcount'])
+        txp = [tp['fasta_record'] for tp in gf['transcripts']]
+
+        outf.write('\t'.join([gAname, gAid, gBname, gBid, pairs, split, ';'.join(txp)]))
+        outf.write('\n')
+
+def usage():
+    print("Usage: python flatten_json.py fusion.out.json [genetable.txt]")
+    print("")
+    print("       outputs a flat table listing all gene fusions, if the output file is not")
+    print("       specified it prints to standard output")
+
+
+if __name__ == "__main__":
+    nargs = len(sys.argv)
+    if nargs <= 1:
+        usage()
+    else:
+        infn = sys.argv[1]
+        fusions = loadJSON(infn)
+        outf = sys.stdout
+        if nargs == 3:
+            outf = open(sys.argv[2],'w')
+
+        outputGeneTable(fusions,outf)
+
+        if outf != sys.stdout:
+            outf.close()
diff --git a/scripts/get_fragment_length.py b/scripts/get_fragment_length.py
new file mode 100644
index 0000000..5b48543
--- /dev/null
+++ b/scripts/get_fragment_length.py
@@ -0,0 +1,31 @@
+import h5py
+import numpy as np
+import sys
+
+
+def get_cumulative_dist(fn):
+    f = h5py.File(fn)
+    x = np.asarray(f['aux']['fld'], dtype='float64')
+    y = np.cumsum(x)/np.sum(x)
+    f.close()
+    return y
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python get_fragment_length.py H5FILE [cutoff]")
+        print("")
+        print("Prints 95 percentile fragment length")
+        print("")
+        print("       H5FILE:      abundance.h5 file produced by kallisto")
+        print("       cutoff:      percentage cutoff to use (default .95)")
+    else:
+        fn = sys.argv[1]
+        if len(sys.argv) >=3:
+            cutoff = float(sys.argv[2])
+            if cutoff <= 0 or cutoff >= 1.0:
+                cutoff = 0.95
+        else:
+            cutoff = 0.95
+        y = get_cumulative_dist(fn)
+        fraglen = np.argmax(y > .95)
+        print(fraglen)