From 251989979ba2cb39e217a91aa7fefc85c554ebe1 Mon Sep 17 00:00:00 2001 From: Pall Melsted Date: Thu, 20 Jul 2017 21:13:50 +0000 Subject: [PATCH] new release --- README.md | 6 +++++ common.h | 2 +- scripts/flatten_json.py | 49 ++++++++++++++++++++++++++++++++++ scripts/get_fragment_length.py | 31 +++++++++++++++++++++ 4 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 scripts/flatten_json.py create mode 100644 scripts/get_fragment_length.py diff --git a/README.md b/README.md index 4ccc102..5f4bbd4 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,12 @@ A more sophisticated example is in the `test` directory which contains a `snakem The `--output test` parameter is used as a prefix and two files are created `test.fusions.fasta` and `test.json`, this contains the filtered fusion calls. For unfiltered fusion calls, use the corresponding `.unfiltered` files. +### Scripts + +The `scripts` subfolder contains useful Python scripts + +- `get_fragment_length.py` examines an `abundance.h5` produced by `kallisto` and finds the 95th percentile of the fragment length distribution +- `flatten_json.py` reads the `.json` output and converts to a simple gene table ### Annotations diff --git a/common.h b/common.h index 9b9ffb3..0dbf774 100644 --- a/common.h +++ b/common.h @@ -3,7 +3,7 @@ #include -#define PIZZLY_VERSION "0.37.2" +#define PIZZLY_VERSION "0.37.3" struct ProgramOptions { std::string gtf; diff --git a/scripts/flatten_json.py b/scripts/flatten_json.py new file mode 100644 index 0000000..3679011 --- /dev/null +++ b/scripts/flatten_json.py @@ -0,0 +1,49 @@ +import sys +import json +from collections import OrderedDict + +### +### gene1_name gene1_id, gene2_name, gene2_id, type, pair, split, txlist + +def loadJSON(fn): + with open(fn) as f: + JJ = json.load(f,object_pairs_hook=OrderedDict) + return JJ['genes'] + +def outputGeneTable(fusions, outf, filters = None): + outf.write('\t'.join("geneA.name geneA.id geneB.name geneB.id paircount splitcount transcripts.list".split())) + outf.write('\n') + for gf in fusions: + gAname = gf['geneA']['name'] + gAid = gf['geneA']['id'] + gBname = gf['geneB']['name'] + gBid = gf['geneB']['id'] + pairs = str(gf['paircount']) + split = str(gf['splitcount']) + txp = [tp['fasta_record'] for tp in gf['transcripts']] + + outf.write('\t'.join([gAname, gAid, gBname, gBid, pairs, split, ';'.join(txp)])) + outf.write('\n') + +def usage(): + print("Usage: python flatten_json.py fusion.out.json [genetable.txt]") + print("") + print(" outputs a flat table listing all gene fusions, if the output file is not") + print(" specified it prints to standard output") + + +if __name__ == "__main__": + nargs = len(sys.argv) + if nargs <= 1: + usage() + else: + infn = sys.argv[1] + fusions = loadJSON(infn) + outf = sys.stdout + if nargs == 3: + outf = open(sys.argv[2],'w') + + outputGeneTable(fusions,outf) + + if outf != sys.stdout: + outf.close() diff --git a/scripts/get_fragment_length.py b/scripts/get_fragment_length.py new file mode 100644 index 0000000..5b48543 --- /dev/null +++ b/scripts/get_fragment_length.py @@ -0,0 +1,31 @@ +import h5py +import numpy as np +import sys + + +def get_cumulative_dist(fn): + f = h5py.File(fn) + x = np.asarray(f['aux']['fld'], dtype='float64') + y = np.cumsum(x)/np.sum(x) + f.close() + return y + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python get_fragment_length.py H5FILE [cutoff]") + print("") + print("Prints 95 percentile fragment length") + print("") + print(" H5FILE: abundance.h5 file produced by kallisto") + print(" cutoff: percentage cutoff to use (default .95)") + else: + fn = sys.argv[1] + if len(sys.argv) >=3: + cutoff = float(sys.argv[2]) + if cutoff <= 0 or cutoff >= 1.0: + cutoff = 0.95 + else: + cutoff = 0.95 + y = get_cumulative_dist(fn) + fraglen = np.argmax(y > .95) + print(fraglen)