Skip to content

Commit

Permalink
new release
Browse files Browse the repository at this point in the history
  • Loading branch information
pmelsted committed Jul 20, 2017
1 parent 55b2de5 commit 2519899
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 1 deletion.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,12 @@ A more sophisticated example is in the `test` directory which contains a `snakem

The `--output test` parameter is used as a prefix and two files are created `test.fusions.fasta` and `test.json`, this contains the filtered fusion calls. For unfiltered fusion calls, use the corresponding `.unfiltered` files.

### Scripts

The `scripts` subfolder contains useful Python scripts

- `get_fragment_length.py` examines an `abundance.h5` produced by `kallisto` and finds the 95th percentile of the fragment length distribution
- `flatten_json.py` reads the `.json` output and converts to a simple gene table

### Annotations

Expand Down
2 changes: 1 addition & 1 deletion common.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#include <string>

#define PIZZLY_VERSION "0.37.2"
#define PIZZLY_VERSION "0.37.3"

struct ProgramOptions {
std::string gtf;
Expand Down
49 changes: 49 additions & 0 deletions scripts/flatten_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import sys
import json
from collections import OrderedDict

###
### gene1_name gene1_id, gene2_name, gene2_id, type, pair, split, txlist

def loadJSON(fn):
with open(fn) as f:
JJ = json.load(f,object_pairs_hook=OrderedDict)
return JJ['genes']

def outputGeneTable(fusions, outf, filters = None):
outf.write('\t'.join("geneA.name geneA.id geneB.name geneB.id paircount splitcount transcripts.list".split()))
outf.write('\n')
for gf in fusions:
gAname = gf['geneA']['name']
gAid = gf['geneA']['id']
gBname = gf['geneB']['name']
gBid = gf['geneB']['id']
pairs = str(gf['paircount'])
split = str(gf['splitcount'])
txp = [tp['fasta_record'] for tp in gf['transcripts']]

outf.write('\t'.join([gAname, gAid, gBname, gBid, pairs, split, ';'.join(txp)]))
outf.write('\n')

def usage():
print("Usage: python flatten_json.py fusion.out.json [genetable.txt]")
print("")
print(" outputs a flat table listing all gene fusions, if the output file is not")
print(" specified it prints to standard output")


if __name__ == "__main__":
nargs = len(sys.argv)
if nargs <= 1:
usage()
else:
infn = sys.argv[1]
fusions = loadJSON(infn)
outf = sys.stdout
if nargs == 3:
outf = open(sys.argv[2],'w')

outputGeneTable(fusions,outf)

if outf != sys.stdout:
outf.close()
31 changes: 31 additions & 0 deletions scripts/get_fragment_length.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import h5py
import numpy as np
import sys


def get_cumulative_dist(fn):
f = h5py.File(fn)
x = np.asarray(f['aux']['fld'], dtype='float64')
y = np.cumsum(x)/np.sum(x)
f.close()
return y

if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python get_fragment_length.py H5FILE [cutoff]")
print("")
print("Prints 95 percentile fragment length")
print("")
print(" H5FILE: abundance.h5 file produced by kallisto")
print(" cutoff: percentage cutoff to use (default .95)")
else:
fn = sys.argv[1]
if len(sys.argv) >=3:
cutoff = float(sys.argv[2])
if cutoff <= 0 or cutoff >= 1.0:
cutoff = 0.95
else:
cutoff = 0.95
y = get_cumulative_dist(fn)
fraglen = np.argmax(y > .95)
print(fraglen)

0 comments on commit 2519899

Please sign in to comment.