From 7e7b665b0ed231e21c44477bbbc9203a4691c682 Mon Sep 17 00:00:00 2001 From: jrderuiter Date: Tue, 7 Jul 2015 13:00:13 +0200 Subject: [PATCH 001/100] Import readline to avoid rpy2 bug with conda. --- pyim/cis/cimpl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyim/cis/cimpl.py b/pyim/cis/cimpl.py index 1b36a57..57086c9 100644 --- a/pyim/cis/cimpl.py +++ b/pyim/cis/cimpl.py @@ -1,6 +1,8 @@ __author__ = 'Julian' import pandas as pd + +import readline from rpy2 import robjects from tkgeno.util.rpy2 import importr, pandas_to_dataframe, dataframe_to_pandas From 2aca6e7451d41c77f64c7fea95cdfe37504abed2 Mon Sep 17 00:00:00 2001 From: jrderuiter Date: Fri, 10 Jul 2015 15:52:14 +0200 Subject: [PATCH 002/100] Add toolz dependency, fix check for python version. --- pyim/annotation/rbm.py | 1 + pyim/annotation/window.py | 1 + setup.py | 6 +++--- 3 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 pyim/annotation/rbm.py create mode 100644 pyim/annotation/window.py diff --git a/pyim/annotation/rbm.py b/pyim/annotation/rbm.py new file mode 100644 index 0000000..29d13c6 --- /dev/null +++ b/pyim/annotation/rbm.py @@ -0,0 +1 @@ +__author__ = 'Julian' diff --git a/pyim/annotation/window.py b/pyim/annotation/window.py new file mode 100644 index 0000000..29d13c6 --- /dev/null +++ b/pyim/annotation/window.py @@ -0,0 +1 @@ +__author__ = 'Julian' diff --git a/setup.py b/setup.py index f1c9fbf..b26aa54 100644 --- a/setup.py +++ b/setup.py @@ -5,10 +5,10 @@ from setuptools import setup, find_packages install_requires = ['future', 'numpy', 'scipy', 'pandas', 'pysam', - 'natsort', 'rpy2', 'scikit-bio', 'tkgeno'] + 'natsort', 'rpy2', 'scikit-bio', 'tkgeno', 'toolz'] -if sys.version_info[0] == 2: - install_requires += ['pathlib', 'enum'] +if not sys.version_info >= (3, ): + install_requires += ['pathlib'] setup( name='pyim', From 6943dd6b325949153d11997244e9828face71d87 Mon Sep 17 00:00:00 2001 From: jrderuiter Date: Fri, 10 Jul 2015 15:52:37 +0200 Subject: [PATCH 003/100] Add rbm and window annotators. 
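The new annotators are built around the window helpers added below in pyim/annotation/window.py. As a rough standalone sketch of that logic (the Window tuple and both functions mirror this patch; the coordinates are made up for illustration):

    from collections import namedtuple

    # Mirrors the Window tuple and helpers introduced in pyim/annotation/window.py.
    Window = namedtuple('Window', ['seqname', 'start', 'end', 'strand',
                                   'incl_left', 'incl_right'])

    def apply_window(seqname, location, strand, window):
        # Project a relative window onto an insertion site, flipping the
        # interval (and its inclusive bounds) for reverse-strand insertions.
        start = location + (window.start * strand)
        end = location + (window.end * strand)
        if strand == -1:
            start, end = end, start
            incl_left, incl_right = window.incl_right, window.incl_left
        else:
            incl_left, incl_right = window.incl_left, window.incl_right
        new_strand = strand * window.strand if window.strand is not None else None
        return Window(seqname, start, end, new_strand, incl_left, incl_right)

    def feature_distance(start, end, location):
        # Zero if the insertion falls inside the feature, otherwise the
        # signed offset from the nearest feature boundary.
        if start <= location <= end:
            return 0
        elif location > end:
            return location - end
        return location - start

    # 20 kb window around a hypothetical reverse-strand insertion on chr1.
    window = Window(None, -20000, 20000, None, True, True)
    print(apply_window('1', 1000000, -1, window))
    # Window(seqname='1', start=980000, end=1020000, strand=None, ...)
    print(feature_distance(1005000, 1008000, 1000000))
    # -5000: insertion lies 5 kb before the feature start
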
--- pyim/annotation/__init__.py | 2 + pyim/annotation/base.py | 13 +++- pyim/annotation/kcrbm.py | 31 ++++++--- pyim/annotation/rbm.py | 132 +++++++++++++++++++++++++++++++++++- pyim/annotation/window.py | 123 ++++++++++++++++++++++++++++++++- pyim/main/annotate.py | 6 +- 6 files changed, 292 insertions(+), 15 deletions(-) diff --git a/pyim/annotation/__init__.py b/pyim/annotation/__init__.py index 58a112f..5dc17da 100644 --- a/pyim/annotation/__init__.py +++ b/pyim/annotation/__init__.py @@ -1 +1,3 @@ from .kcrbm import KcRbmAnnotator +from .rbm import RbmAnnotator +from .window import WindowAnnotator diff --git a/pyim/annotation/base.py b/pyim/annotation/base.py index 3cd3a73..7a790a5 100644 --- a/pyim/annotation/base.py +++ b/pyim/annotation/base.py @@ -7,7 +7,7 @@ class Annotator(object): - def __init__(self, **kwargs): + def __init__(self): super().__init__() @classmethod @@ -16,7 +16,16 @@ def configure_argparser(cls, subparsers, name='name'): @classmethod def from_args(cls, args): - raise NotImplementedError() + return cls(**args) def annotate(self, frame): raise NotImplementedError() + + +def closest_genes(frame, id_col='insertion_id', distance_col='distance'): + select_closest = lambda x: x.ix[ + x[distance_col] == x[distance_col].abs().min()] + + return (frame.groupby(id_col) + .apply(select_closest) + .reset_index(drop=True)) diff --git a/pyim/annotation/kcrbm.py b/pyim/annotation/kcrbm.py index 0357b7a..514e07e 100644 --- a/pyim/annotation/kcrbm.py +++ b/pyim/annotation/kcrbm.py @@ -23,7 +23,7 @@ class KcRbmAnnotator(Annotator): - def __init__(self, reference, system): + def __init__(self, reference, system, closest=False): super().__init__() if system not in {'SB'}: @@ -34,6 +34,7 @@ def __init__(self, reference, system): self._reference = reference self._system = system + self._closest = closest @classmethod def configure_argparser(cls, subparsers, name='kcrbm'): @@ -44,18 +45,24 @@ def configure_argparser(cls, subparsers, name='kcrbm'): parser.add_argument('--reference', default='mm10') parser.add_argument('--system', default='SB') + parser.add_argument('--closest', default=False, action='store_true') return parser - @classmethod - def from_args(cls, args): - return cls(reference=args['reference'], system=args['system']) - def annotate(self, frame, type_='gene'): kcrbm_ins = self._convert_to_kcrbm_frame(frame) kcrbm_result = self._run_kcrbm(kcrbm_ins, method='genes') gene_mapping = self._parse_gene_result(kcrbm_result) + + if self._closest: + closest = lambda x: x.ix[ + x.gene_distance == x.gene_distance.abs().min()] + + gene_mapping = (gene_mapping.groupby('insertion_id') + .apply(closest) + .reset_index(drop=True)) + return pd.merge(frame, gene_mapping, on='insertion_id') @staticmethod @@ -97,10 +104,16 @@ def _run_kcrbm(self, kcrbm_frame, method): @staticmethod def _parse_gene_result(result): result = result.ix[result['ensid'].astype(str) != 'NA'] - return pd.DataFrame({'insertion_id': result['ins_id'], - 'gene_id': result['ensid'], - 'mechanism': result['mechanism']}, - columns=['insertion_id', 'gene_id', 'mechanism']) + + gene_distance = result[['d2gss', 'd2gts']].abs().min(axis=1).astype(int) + gene_distance.ix[result.mechanism.str.startswith('u')] *= -1 + + return pd.DataFrame({ + 'insertion_id': result['ins_id'], + 'gene_id': result['ensid'], + 'distance': gene_distance, + 'mechanism': result['mechanism']}, + columns=['insertion_id', 'gene_id', 'distance', 'mechanism']) @staticmethod def _load_genome(genome): diff --git a/pyim/annotation/rbm.py 
b/pyim/annotation/rbm.py index 29d13c6..44f7392 100644 --- a/pyim/annotation/rbm.py +++ b/pyim/annotation/rbm.py @@ -1 +1,131 @@ -__author__ = 'Julian' +from __future__ import (absolute_import, division, + print_function, unicode_literals) +from builtins import (ascii, bytes, chr, dict, filter, hex, input, + int, map, next, oct, open, pow, range, round, + str, super, zip) + +from pathlib import Path + +import pandas as pd + +from toolz import curry, pipe, merge_with, keymap +from toolz.curried import get, filter, map, valfilter, valmap + +from tkgeno.io import GtfFile +from tkgeno.util.pandas import reorder_columns + +from .base import Annotator, closest_genes +from .window import Window, apply_window, fetch_features, annotate_features + + +# Window format: (us, ua, ds, da) +WINDOW_SIZE_PRESETS = { + 'SB': (20000, 10000, 25000, 5000), + 'MULV': (20000, 120000, 40000, 5000), + 'MMTV': (20000, 120000, 40000, 5000) +} + + +class RbmAnnotator(Annotator): + + def __init__(self, gtf, window_sizes=None, preset=None, + feature_type='gene', closest=False, id_column='insertion_id'): + super().__init__() + + if window_sizes is None: + if preset is None: + raise ValueError('Either windows or preset must be given') + window_sizes = WINDOW_SIZE_PRESETS[preset] + + self._gtf = GtfFile(gtf) + self._feature_type = feature_type + + self._closest = closest + self._id_column = id_column + + self._windows = { + 'is': Window(None, 0, 1, 1, True, True), + 'ia': Window(None, 0, 1, -1, True, True), + 'us': Window(None, -window_sizes[0], 0, 1, True, False), + 'ua': Window(None, -window_sizes[1], 0, -1, True, False), + 'ds': Window(None, 1, window_sizes[2], 1, False, True), + 'da': Window(None, 1, window_sizes[3], -1, False, True) + } + + @classmethod + def configure_argparser(cls, subparsers, name='rbm'): + parser = subparsers.add_parser(name, help=name + ' help') + + parser.add_argument('input', type=Path) + parser.add_argument('output', type=Path) + parser.add_argument('gtf') + + parser.add_argument('--feature_type', default='gene', + choices={'gene', 'transcript'}) + parser.add_argument('--id_column', default='insertion_id') + parser.add_argument('--closest', default=False, action='store_true') + + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('--preset') + group.add_argument('--window_sizes', nargs=4, type=int) + + return parser + + def annotate(self, frame, type_='gene'): + results = [self._annotate_row(row, self._windows, + self._gtf, self._feature_type) + for _, row in frame.iterrows()] + + results = pd.concat(filter(lambda x: x is not None, results), + ignore_index=True) + + return results if not self._closest \ + else closest_genes(results, id_col=self._id_column) + + @staticmethod + def _annotate_row(row, windows, gtf, feature_type='gene'): + strand = row.strand if hasattr(row, 'strand') else None + + # Fetch features for orientation, or for the forward orientation. + apply_func = curry(apply_window, row.seqname, row.location, strand or 1) + windows_fwd = valmap(apply_func, windows) + + features = fetch_features_windows( + gtf, windows_fwd, feature_type=feature_type) + + if strand is None: + # Try again with reverse window orientation. + apply_func = curry(apply_window, row.seqname, row.location, -1) + windows_rev = valmap(apply_func, windows) + + features_rev = fetch_features_windows( + gtf, windows_rev, feature_type=feature_type) + + # Reflect sense/antisense to match fwd windows. 
+ features_rev = keymap( + curry(str_translate, table=str.maketrans('sa', 'as')), + features_rev) + + features = merge_with( + lambda frames: pd.merge(frames[0], frames[1], how='inner') + if len(frames) == 2 else frames[0], + features, features_rev) + + if len(features) > 0: + annotated = {mech: annotate_features(row, features, mechanism=mech) + for mech, features in features.items()} + + frame = pd.concat(annotated.values(), ignore_index=True) + return reorder_columns(frame, order=row.index) + + return None + + +def fetch_features_windows(gtf, windows, feature_type): + return pipe(windows, + valmap(lambda w: fetch_features(gtf, w, feature_type)), + valfilter(lambda x: x is not None)) + + +def str_translate(s, table): + return str.translate(s, table) diff --git a/pyim/annotation/window.py b/pyim/annotation/window.py index 29d13c6..318f4d2 100644 --- a/pyim/annotation/window.py +++ b/pyim/annotation/window.py @@ -1 +1,122 @@ -__author__ = 'Julian' +from __future__ import (absolute_import, division, + print_function, unicode_literals) +from builtins import (ascii, bytes, chr, dict, filter, hex, input, + int, map, next, oct, open, pow, range, round, + str, super, zip) + +from collections import namedtuple +from functools import lru_cache +from pathlib import Path + +from toolz import pipe +from toolz.curried import get, filter, map, valmap, valfilter + +import pandas as pd + +from tkgeno.io import GtfFile +from tkgeno.util.pandas import reorder_columns + +from .base import Annotator, closest_genes + + +Window = namedtuple('Window', ['seqname', 'start', 'end', 'strand', + 'incl_left', 'incl_right']) + + +def apply_window(seqname, location, strand, window): + start = location + (window.start * strand) + end = location + (window.end * strand) + + if strand == -1: + start, end = end, start + incl_left, incl_right = window.incl_right, window.incl_left + else: + incl_left, incl_right = window.incl_left, window.incl_right + + new_strand = strand * window.strand if window.strand is not None else None + + return Window(seqname, start, end, new_strand, incl_left, incl_right) + + +class WindowAnnotator(Annotator): + + def __init__(self, gtf_path, window_size, feature_type='gene', + id_column='insertion_id', closest=False): + super().__init__() + + self._gtf = GtfFile(gtf_path) + self._window = Window( + seqname=None, start=-1 * window_size, end=window_size, + strand=None, incl_left=True, incl_right=True) + + self._feature_type = feature_type + self._closest = closest + self._id_column = id_column + + @classmethod + def configure_argparser(cls, subparsers, name='window'): + parser = subparsers.add_parser(name, help=name + ' help') + + parser.add_argument('input', type=Path) + parser.add_argument('output', type=Path) + parser.add_argument('gtf') + + parser.add_argument('--feature_type', default='gene') + parser.add_argument('--window_size', default=20000, type=int) + + parser.add_argument('--id_column', default='insertion_id') + parser.add_argument('--closest', default=False, action='store_true') + + return parser + + def annotate(self, frame, type_='gene'): + results = [self._annotate_row(row, self._window, + self._gtf, self._feature_type) + for _, row in frame.iterrows()] + + results = pd.concat(filter(lambda x: x is not None, results), + ignore_index=True) + + return results if self._closest is not None \ + else closest_genes(frame, id_col=self._id_column) + + @staticmethod + def _annotate_row(row, window, gtf, feature_type='gene'): + # Apply window for row. 
+ window = apply_window(row.seqname, row.location, + row.seqname, window) + + # Fetch features for row. + features = fetch_features(gtf, window, feature_type=feature_type) + + # Annotate row with features, if any were found. + if len(features) > 0: + frame = annotate_features(row, features) + return reorder_columns(frame, order=row.index) + + return None + + +@lru_cache(maxsize=64) +def fetch_features(gtf, window, feature_type): + return gtf.get_region(feature=feature_type, **window._asdict()) + + +def annotate_features(row, features, **kwargs): + data = dict(row) + data.update(dict( + gene_id=features.gene_id, + distance=[feature_distance(s, e, row.location) + for s, e in zip(features.start, features.end)])) + data.update(**kwargs) + + return reorder_columns(pd.DataFrame(data), order=row.index) + + +def feature_distance(start, end, location): + if start <= location <= end: + return 0 + elif location > end: + return location - end + else: + return location - start diff --git a/pyim/main/annotate.py b/pyim/main/annotate.py index eb38294..7f8549a 100644 --- a/pyim/main/annotate.py +++ b/pyim/main/annotate.py @@ -9,10 +9,12 @@ import pandas as pd -from pyim.annotation import KcRbmAnnotator +from pyim.annotation import KcRbmAnnotator, RbmAnnotator, WindowAnnotator ANNOTATORS = { - 'kcrbm': KcRbmAnnotator + 'kcrbm': KcRbmAnnotator, + 'rbm': RbmAnnotator, + 'window': WindowAnnotator } From 575b924aa82628afdcd2f7461151f76f0a18d22c Mon Sep 17 00:00:00 2001 From: jrderuiter Date: Fri, 10 Jul 2015 15:52:49 +0200 Subject: [PATCH 004/100] Add strand homogeneity to cis identification. --- pyim/main/cis.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/pyim/main/cis.py b/pyim/main/cis.py index e04de16..380247d 100644 --- a/pyim/main/cis.py +++ b/pyim/main/cis.py @@ -8,7 +8,9 @@ from argparse import ArgumentParser from pathlib import Path +import numpy as np import pandas as pd +from toolz import curry from pyim.cis.cimpl import cimpl, get_cis, get_cis_mapping @@ -27,6 +29,8 @@ def setup_parser(): parser.add_argument('--chromosomes', nargs='+', default=None) parser.add_argument('--scales', nargs='+', type=int, default=30000) + parser.add_argument('--strand_homogeneity', type=float, default=0.75) + parser.add_argument('--alpha', type=float, default=0.05) parser.add_argument('--iterations', type=int, default=1000) parser.add_argument('--threads', type=int, default=1) @@ -52,13 +56,46 @@ def main(): cis = get_cis(cimpl_obj, alpha=args.alpha, mul_test=True) cis_mapping = get_cis_mapping(cimpl_obj, cis_frame=cis) + # Annotate insertions with cis mapping. ins_annotated = pd.merge(ins_frame, cis_mapping, on='insertion_id') + # Determine strand of cis sites. + strand_func = curry(_strandedness, min_homogeneity=args.strand_homogeneity) + cis_strand = ins_annotated.groupby('cis_id').apply(strand_func) + + # Merge strand information with cis sites. + cis = pd.merge(cis, cis_strand.reset_index(), on='cis_id') + + # Rename and reshuffle cis columns. + cis = cis.rename(columns={'peak_location': 'location', + 'peak_height': 'height'}) + cis = cis[['cis_id', 'seqname', 'location', 'strand', 'scale', + 'n_insertions', 'p_value', 'start', 'end', 'height', 'width', + 'strand_mean', 'strand_homogeneity']] + # Write out outputs. 
cis.to_csv(str(args.output.with_suffix('.sites.txt')), sep=native_str('\t'), index=False) ins_annotated.to_csv(str(args.output), sep=native_str('\t'), index=False) +def _strandedness(insertions, min_homogeneity): + strand_mean = insertions.strand.mean() + strand = int(np.sign(strand_mean)) + + if strand != 0: + homogeneity = (insertions.strand == strand).sum() / len(insertions) + else: + homogeneity = 0.5 + + if homogeneity < min_homogeneity: + strand = 0 + + return pd.Series(dict(strand=strand, + strand_mean=strand_mean, + strand_homogeneity=homogeneity)) + + + if __name__ == '__main__': main() From 9879da8bf6289e3c7d64f1abe94c0b2b1ab7f0f9 Mon Sep 17 00:00:00 2001 From: jrderuiter Date: Wed, 15 Jul 2015 16:26:26 +0200 Subject: [PATCH 005/100] Updates for new skbio version (master). --- data/sb.barcodes.fa | 2 +- pyim/alignment/vector.py | 21 ++++---- pyim/annotation/kcrbm.py | 9 ++-- pyim/pipelines/_base.py | 90 +++++++++++++++++----------------- pyim/pipelines/lam_pcr.py | 11 +++-- pyim/pipelines/shear_splink.py | 30 ++++++------ 6 files changed, 83 insertions(+), 80 deletions(-) diff --git a/data/sb.barcodes.fa b/data/sb.barcodes.fa index a20b8c8..501fb59 100644 --- a/data/sb.barcodes.fa +++ b/data/sb.barcodes.fa @@ -135,7 +135,7 @@ CGATCTGTCGGTGTATGTAA >SB068 CGCAGTACGAGTGTATGTAA >SB069 -cGCGTATACAGTGTATGTAA +CGCGTATACAGTGTATGTAA >SB070 CGCGTGCTAGGTGTATGTAA >SB071 diff --git a/pyim/alignment/vector.py b/pyim/alignment/vector.py index 60f21a9..9ad6d40 100644 --- a/pyim/alignment/vector.py +++ b/pyim/alignment/vector.py @@ -66,7 +66,7 @@ def align_multiple(self, queries, target, how='unique'): else: if how == 'unique': raise ValueError('Multiple matching queries for target {}' - .format(target.id)) + .format(target.metadata['id'])) elif how == 'any': return alignments[0] else: @@ -94,17 +94,17 @@ def _align_exact(query, target, query_ori): # Note that this alignment returns the first occurrence it finds, # later occurrences will not be found and are not checked for. try: - index = target.sequence.index(query.sequence) + index = str(target).index(str(query)) except ValueError: return None else: q_len = len(query) return VectorAlignment( - query_id=query.id, query_start=0, query_end=q_len, - query_len=q_len, target_id=target.id, target_start=index, - target_end=index + q_len, target_strand=query_ori, - target_len=len(target), type='exact', + query_id=query.metadata['id'], query_start=0, query_end=q_len, + query_len=q_len, target_id=target.metadata['id'], + target_start=index, target_end=index + q_len, + target_strand=query_ori, target_len=len(target), type='exact', identity=1.0, coverage=1.0) @@ -140,7 +140,7 @@ def align(self, query, target): return alignment def _align_ssw(self, query, target, query_ori): - ssw_aln = local_pairwise_align_ssw(target.sequence, query.sequence) + ssw_aln = local_pairwise_align_ssw(target, query) # Extract positions. pos = ssw_aln.start_end_positions() @@ -154,11 +154,12 @@ def _align_ssw(self, query, target, query_ori): # Calculate basic metrics. 
coverage = (q_end - q_start) / float(len(query)) - identity = ssw_aln[0].fraction_same(ssw_aln[1]) + identity = 1.0 - ssw_aln[0].distance(ssw_aln[1]) aln = VectorAlignment( - query_id=query.id, query_start=q_start, query_end=q_end, - query_len=len(query), target_id=target.id, target_start=t_start, + query_id=query.metadata['id'], query_start=q_start, + query_end=q_end, query_len=len(query), + target_id=target.metadata['id'], target_start=t_start, target_end=t_end, target_strand=query_ori, target_len=len(target), type='ssw', identity=identity, coverage=coverage) diff --git a/pyim/annotation/kcrbm.py b/pyim/annotation/kcrbm.py index 514e07e..d3e2a08 100644 --- a/pyim/annotation/kcrbm.py +++ b/pyim/annotation/kcrbm.py @@ -56,14 +56,14 @@ def annotate(self, frame, type_='gene'): gene_mapping = self._parse_gene_result(kcrbm_result) if self._closest: - closest = lambda x: x.ix[ - x.gene_distance == x.gene_distance.abs().min()] + def closest(x): + return x.ix[x['distance'] == x['distance'].abs().min()] gene_mapping = (gene_mapping.groupby('insertion_id') .apply(closest) .reset_index(drop=True)) - return pd.merge(frame, gene_mapping, on='insertion_id') + return pd.merge(frame, gene_mapping, on='insertion_id', how='left') @staticmethod def _convert_to_kcrbm_frame(frame): @@ -105,7 +105,8 @@ def _run_kcrbm(self, kcrbm_frame, method): def _parse_gene_result(result): result = result.ix[result['ensid'].astype(str) != 'NA'] - gene_distance = result[['d2gss', 'd2gts']].abs().min(axis=1).astype(int) + gene_distance = result[['d2gss', 'd2gts']]\ + .abs().min(axis=1).astype(int) gene_distance.ix[result.mechanism.str.startswith('u')] *= -1 return pd.DataFrame({ diff --git a/pyim/pipelines/_base.py b/pyim/pipelines/_base.py index 11e9b94..8b84739 100644 --- a/pyim/pipelines/_base.py +++ b/pyim/pipelines/_base.py @@ -15,10 +15,10 @@ import pandas as pd import numpy as np from scipy.spatial.distance import pdist + +from skbio import DNA from skbio import io as skbio_io -# noinspection PyUnresolvedReferences -from pyim.io import fastq from pyim.util import PrioritySet logging.basicConfig( @@ -58,46 +58,44 @@ def run(self, input_path, output_dir): if not output_dir.exists(): output_dir.mkdir() - # if input_path.suffix not in {'.bam', '.sam'}: - # genomic_path = output_dir / ('genomic' + - # ''.join(input_path.suffixes)) - # barcode_path = output_dir / 'genomic.barcodes.txt' - # - # # Extract genomic reads from input. - # logger.info('Extracting genomic sequences from reads') - # - # _, barcodes = self._extractor.extract_file( - # input_path=input_path, output_path=genomic_path) - # - # # Log statistics. - # total_reads = sum(self._extractor.stats.values()) - # - # logger.info('- Processed {} reads'.format(total_reads)) - # logger.info('- Read statistics') - # for status in self._extractor.STATUS: - # count = self._extractor.stats[status] - # logger.info('\t- {}: {} ({:3.2f}%)' - # .format(status.name, count, - # (count / total_reads) * 100)) - # - # # Write out barcodes as frame. - # barcode_frame = pd.DataFrame.from_records( - # iter(barcodes.items()), columns=['read_id', 'barcode']) - # barcode_frame.to_csv( - # str(barcode_path), sep=native_str('\t'), index=False) - # - # # Align to reference genome. 
- # logger.info('Aligning genomic sequences to reference') - # logger.info('- Using {} aligner (v{})'.format( - # self._aligner.__class__.__name__.replace('Aligner', ''), - # self._aligner.get_version())) - # - # aln_path = self._aligner.align_file( - # file=genomic_path, output_dir=output_dir) - # else: - # aln_path, barcodes = input_path, None - - aln_path = output_dir / 'alignment.bam' + if input_path.suffix not in {'.bam', '.sam'}: + genomic_path = output_dir / ('genomic' + + ''.join(input_path.suffixes)) + barcode_path = output_dir / 'genomic.barcodes.txt' + + # Extract genomic reads from input. + logger.info('Extracting genomic sequences from reads') + + _, barcodes = self._extractor.extract_file( + input_path=input_path, output_path=genomic_path) + + # Log statistics. + total_reads = sum(self._extractor.stats.values()) + + logger.info('- Processed {} reads'.format(total_reads)) + logger.info('- Read statistics') + for status in self._extractor.STATUS: + count = self._extractor.stats[status] + logger.info('\t- {}: {} ({:3.2f}%)' + .format(status.name, count, + (count / total_reads) * 100)) + + # Write out barcodes as frame. + barcode_frame = pd.DataFrame.from_records( + iter(barcodes.items()), columns=['read_id', 'barcode']) + barcode_frame.to_csv( + str(barcode_path), sep=native_str('\t'), index=False) + + # Align to reference genome. + logger.info('Aligning genomic sequences to reference') + logger.info('- Using {} aligner (v{})'.format( + self._aligner.__class__.__name__.replace('Aligner', ''), + self._aligner.get_version())) + + aln_path = self._aligner.align_file( + file=genomic_path, output_dir=output_dir) + else: + aln_path, barcodes = input_path, None barcode_map = pd.read_csv( str(output_dir / 'genomic.barcodes.txt'), sep='\t') @@ -148,7 +146,8 @@ def extract_read(self, read): def extract_from_file(self, file_path, format=None): format = self.DEFAULT_IN_FORMAT if format is None else format - reads = skbio_io.read(str(file_path), format=format) + reads = skbio_io.read( + str(file_path), format=format, constructor=DNA) for genomic, barcode in self.extract(reads): yield genomic, barcode @@ -158,7 +157,7 @@ def extract_to_file(self, reads, file_path, format=None): barcodes = {} with open(str(file_path), 'w') as file_: for genomic, barcode in self.extract(reads): - barcodes[genomic.id] = barcode + barcodes[genomic.metadata['id']] = barcode skbio_io.write(obj=genomic, format=format, into=file_) return file_path, barcodes @@ -169,7 +168,8 @@ def extract_file(self, input_path, output_path, format_out = self.DEFAULT_OUT_FORMAT \ if format_out is None else format_out - reads = skbio_io.read(str(input_path), format=format_in) + reads = skbio_io.read( + str(input_path), format=format_in, constructor=DNA) return self.extract_to_file(reads, output_path, format=format_out) diff --git a/pyim/pipelines/lam_pcr.py b/pyim/pipelines/lam_pcr.py index 750f972..0376fba 100644 --- a/pyim/pipelines/lam_pcr.py +++ b/pyim/pipelines/lam_pcr.py @@ -10,7 +10,7 @@ import numpy as np import pandas as pd -from skbio import DNASequence, SequenceCollection +from skbio import DNA, SequenceCollection from pyim.alignment.genome import Bowtie2Aligner from pyim.alignment.vector import ExactAligner @@ -47,17 +47,17 @@ def configure_argparser(cls, subparsers, name='lampcr'): def from_args(cls, args): # Read transposon sequence. - transposon_seq = DNASequence.read(str(args['transposon'])) \ + transposon_seq = DNA.read(str(args['transposon'])) \ if args['transposon'] is not None else None # Read contaminant sequences. 
contaminant_seqs = SequenceCollection.read( - str(args['contaminants']), constructor=DNASequence) \ + str(args['contaminants']), constructor=DNA) \ if args['contaminants'] is not None else None # Read barcode sequences if supplied. barcode_seqs = SequenceCollection.read( - str(args['barcodes']), constructor=DNASequence) \ + str(args['barcodes']), constructor=DNA) \ if args['barcodes'] is not None else None # Read barcode map if supplied. @@ -216,7 +216,8 @@ def identify(self, alignment_path, barcode_map=None): t=self._merge_distance) # Filter by min_depth. - insertions = insertions.ix[insertions['depth_unique'] > self._min_depth] + insertions = insertions.ix[ + insertions['depth_unique'] > self._min_depth] # Sort by coordinate and add identifiers. insertions = insertions.sort(['seqname', 'location']) diff --git a/pyim/pipelines/shear_splink.py b/pyim/pipelines/shear_splink.py index 3c448bb..0a5ef4f 100644 --- a/pyim/pipelines/shear_splink.py +++ b/pyim/pipelines/shear_splink.py @@ -11,7 +11,7 @@ import numpy as np import pandas as pd -from skbio import DNASequence, SequenceCollection +from skbio import DNA, SequenceCollection from pyim.alignment.genome import Bowtie2Aligner from pyim.alignment.vector import (ExactAligner, SswAligner, ChainedAligner, @@ -48,16 +48,16 @@ def configure_argparser(cls, subparsers, name='shear_splink'): @classmethod def from_args(cls, args): # Read transposon, barcode and linker sequences. - transposon_seq = DNASequence.read(str(args['transposon'])) + transposon_seq = DNA.read(str(args['transposon'])) - linker_seq = DNASequence.read(str(args['linker'])) + linker_seq = DNA.read(str(args['linker'])) barcode_seqs = SequenceCollection.read( - str(args['barcodes']), constructor=DNASequence) + str(args['barcodes']), constructor=DNA) # Read contaminants if supplied. contaminant_seqs = SequenceCollection.read( - str(args['contaminants']), constructor=DNASequence) \ + str(args['contaminants']), constructor=DNA) \ if args['contaminants'] is not None else None # Read barcode map if supplied. @@ -279,27 +279,27 @@ def _merge_insertions(cls, frame): return frame.iloc[0] else: # Check if merging is sane. - assert len(set(frame.seqname)) == 1 - assert len(set(frame.strand)) == 1 - assert len(set(frame.sample.astype(str))) == 1 + assert len(set(frame['seqname'])) == 1 + assert len(set(frame['strand'])) == 1 + assert len(set(frame['sample'].astype(str))) == 1 # Pick first row as reference for shared fields. ref = frame.iloc[0] # Calculate new location as mean, biased towards # insertions with more weight (a higher ULP). - weighted_loc = np.average( - frame.location, weights=frame.depth_unique) + weighted_loc = np.average(frame.location, + weights=frame['depth_unique']) weighted_loc = int(round(weighted_loc)) return pd.Series( {'insertion_id': np.nan, - 'seqname': ref.seqname, + 'seqname': ref['seqname'], 'location': weighted_loc, - 'strand': ref.strand, - 'sample': ref.sample, - 'depth': frame.depth.sum(), - 'depth_unique': frame.depth_unique.sum()}, + 'strand': ref['strand'], + 'sample': ref['sample'], + 'depth': frame['depth'].sum(), + 'depth_unique': frame['depth_unique'].sum()}, index=ref.index) @staticmethod From 843e82576ceeee24f558a9b8289732930f2339a0 Mon Sep 17 00:00:00 2001 From: jrderuiter Date: Wed, 15 Jul 2015 16:26:38 +0200 Subject: [PATCH 006/100] Added sample selection to merge command. 
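The effect of the new --samples/--complement options on the merged frame can be sketched as follows (toy data; the sample names are made up for illustration):

    import pandas as pd

    # Toy merged insertion frame standing in for the concatenated inputs.
    merged = pd.DataFrame({
        'insertion_id': ['a.INS_1', 'a.INS_2', 'b.INS_1'],
        'sample': ['S1', 'S2', 'S1'],
    })

    samples, complement = ['S1', 'S3'], False

    # Warn about requested samples that are absent from the merged frame.
    for sample in set(samples) - set(merged['sample']):
        print('WARNING: unknown sample {}'.format(sample))

    # Keep the selected samples, or drop them when --complement is given.
    mask = merged['sample'].isin(set(samples))
    subset = merged.loc[mask] if not complement else merged.loc[~mask]
    print(subset)  # rows for S1 only; S3 triggers a warning
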
--- pyim/main/merge.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/pyim/main/merge.py b/pyim/main/merge.py index 71a39e2..174fc26 100644 --- a/pyim/main/merge.py +++ b/pyim/main/merge.py @@ -17,7 +17,9 @@ def setup_parser(): parser.add_argument('insertions', nargs='+', type=Path) parser.add_argument('output', type=Path) - parser.add_argument('--names', nargs='+', required=False, default=None) + parser.add_argument('--names', nargs='+', default=None) + parser.add_argument('--samples', nargs='+', default=None) + parser.add_argument('--complement', default=False, action='store_true') return parser @@ -53,8 +55,24 @@ def main(): ins_frames.append(frame) - # Merge and write output. + # Merge frames. merged = pd.concat(ins_frames, ignore_index=True) + + # Filter samples if needed. + if args.samples is not None: + merged_samples = set(merged['sample']) + for sample in args.samples: + if sample not in merged_samples: + print('WARNING: unknown sample {}'.format(sample)) + + mask = merged['sample'].isin(set(args.samples)) + + if not args.complement: + merged = merged.ix[mask] + else: + merged = merged.ix[~mask] + + # Write output. merged.to_csv(str(args.output), sep=native_str('\t'), index=False) From 3743c9088ea47934434d925dd064e4e989c8349f Mon Sep 17 00:00:00 2001 From: jrderuiter Date: Thu, 16 Jul 2015 17:00:05 +0200 Subject: [PATCH 007/100] Add local hopping exclusion argument. --- pyim/cis/cimpl.py | 2 -- pyim/main/cis.py | 7 +++++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pyim/cis/cimpl.py b/pyim/cis/cimpl.py index 57086c9..b1e3f7b 100644 --- a/pyim/cis/cimpl.py +++ b/pyim/cis/cimpl.py @@ -1,5 +1,3 @@ -__author__ = 'Julian' - import pandas as pd import readline diff --git a/pyim/main/cis.py b/pyim/main/cis.py index 380247d..518f86e 100644 --- a/pyim/main/cis.py +++ b/pyim/main/cis.py @@ -24,6 +24,8 @@ def setup_parser(): group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--pattern', default=None) group.add_argument('--system', choices={'SB'}, default=None) + group.add_argument('--lhc_method', choices={'none', 'exclude'}, + default='exclude') parser.add_argument('--genome', choices={'mm10'}, default='mm10') parser.add_argument('--chromosomes', nargs='+', default=None) @@ -49,8 +51,9 @@ def main(): # Run cimpl. cimpl_obj = cimpl(ins_frame, scales=args.scales, genome=args.genome, system=args.system, pattern=args.pattern, - chromosomes=args.chromosomes, iterations=args.iterations, - threads=args.threads, verbose=args.verbose) + lhc_method=args.lhc_method, chromosomes=args.chromosomes, + iterations=args.iterations, threads=args.threads, + verbose=args.verbose) # Extract cis and cis mapping from object. cis = get_cis(cimpl_obj, alpha=args.alpha, mul_test=True) From 78733c2de27a2824792a49efbd19c9c0c872aa93 Mon Sep 17 00:00:00 2001 From: jrderuiter Date: Fri, 17 Jul 2015 08:59:51 +0200 Subject: [PATCH 008/100] Allow merge without name. --- pyim/main/merge.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyim/main/merge.py b/pyim/main/merge.py index 174fc26..42d631c 100644 --- a/pyim/main/merge.py +++ b/pyim/main/merge.py @@ -50,8 +50,9 @@ def main(): samples = samples.union(frame_samples) # Augment ids to avoid duplicates in merged frame. 
- frame['insertion_id'] = ['{}.{}'.format(name, id_) - for id_ in frame['insertion_id']] + if name != '': + frame['insertion_id'] = ['{}.{}'.format(name, id_) + for id_ in frame['insertion_id']] ins_frames.append(frame) From fa37b5f032178384908722002f6bec55dedc0de1 Mon Sep 17 00:00:00 2001 From: jrderuiter Date: Fri, 17 Jul 2015 09:03:20 +0200 Subject: [PATCH 009/100] Add lhc_method as main argument (not in mutex group). --- pyim/main/cis.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyim/main/cis.py b/pyim/main/cis.py index 518f86e..ff25db4 100644 --- a/pyim/main/cis.py +++ b/pyim/main/cis.py @@ -24,8 +24,6 @@ def setup_parser(): group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--pattern', default=None) group.add_argument('--system', choices={'SB'}, default=None) - group.add_argument('--lhc_method', choices={'none', 'exclude'}, - default='exclude') parser.add_argument('--genome', choices={'mm10'}, default='mm10') parser.add_argument('--chromosomes', nargs='+', default=None) @@ -35,6 +33,9 @@ def setup_parser(): parser.add_argument('--alpha', type=float, default=0.05) parser.add_argument('--iterations', type=int, default=1000) + parser.add_argument('--lhc_method', choices={'none', 'exclude'}, + default='exclude') + parser.add_argument('--threads', type=int, default=1) parser.add_argument('--verbose', default=False, action='store_true') From 9a25cb58a86e9b2cea484adbd023176e828c1206 Mon Sep 17 00:00:00 2001 From: jrderuiter Date: Fri, 17 Jul 2015 10:00:04 +0200 Subject: [PATCH 010/100] Add contig_depth inclusion, check for presence. --- pyim/cis/cimpl.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pyim/cis/cimpl.py b/pyim/cis/cimpl.py index b1e3f7b..a6e6421 100644 --- a/pyim/cis/cimpl.py +++ b/pyim/cis/cimpl.py @@ -40,10 +40,17 @@ def cimpl(insertions, scales, genome, system=None, pattern=None, # Load genome object from R. genome_obj = _load_genome(genome) + # Convert insertions to cimpl format. + cimpl_frame = _convert_to_cimpl_dataframe(insertions) + + # Check if contig_depth is present (if doing hop exclusion). + if lhc_method == 'exclude' and 'contig_depth' not in cimpl_frame: + raise ValueError('Insertion depth is needed for lhc exclusion') + # Run CIMPL! cimpl_r = importr('cimpl') cimpl_obj = cimpl_r.doCimplAnalysis( - _convert_to_cimpl_dataframe(insertions), + pandas_to_dataframe(cimpl_frame), scales=scales, n_iterations=iterations, lhc_method=lhc_method, threads=threads, BSgenome=genome_obj, chromosomes=chromosomes, verbose=verbose, **extra_args) @@ -57,10 +64,13 @@ def _convert_to_cimpl_dataframe(insertions): 'location', 'sample']] cimpl_frame.columns = ['id', 'chr', 'location', 'sampleID'] + if 'depth_unique' in insertions: + cimpl_frame['contig_depth'] = insertions['depth_unique'] + # Add 'chr' prefix to the chromosome names if needed. cimpl_frame['chr'] = _prefix_chromosomes(cimpl_frame['chr']) - return pandas_to_dataframe(cimpl_frame) + return cimpl_frame def _prefix_chromosomes(series, prefix='chr'): From 771b6456742820e9e49e5cf5693c6353cf3358ef Mon Sep 17 00:00:00 2001 From: jrderuiter Date: Fri, 17 Jul 2015 10:03:03 +0200 Subject: [PATCH 011/100] Version bump. 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b26aa54..8d45aec 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name='pyim', - version='0.4.2', + version='0.4.3', url='', author='Julian de Ruiter', author_email='j.r.deruiter@icloud.com', From a854d83fbf9a5b599e268382938504a28ed228d5 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Fri, 11 Dec 2015 14:17:19 +0100 Subject: [PATCH 012/100] Before functional. --- pyim/annotation/base.py | 9 +++++---- pyim/annotation/kcrbm.py | 9 ++------- pyim/annotation/rbm.py | 13 +++++++------ pyim/annotation/window.py | 7 ++----- setup.py | 4 +--- 5 files changed, 17 insertions(+), 25 deletions(-) diff --git a/pyim/annotation/base.py b/pyim/annotation/base.py index 7a790a5..3d44273 100644 --- a/pyim/annotation/base.py +++ b/pyim/annotation/base.py @@ -22,10 +22,11 @@ def annotate(self, frame): raise NotImplementedError() -def closest_genes(frame, id_col='insertion_id', distance_col='distance'): - select_closest = lambda x: x.ix[ - x[distance_col] == x[distance_col].abs().min()] +def get_closest(frame, id_col='insertion_id', distance_col='distance'): + def _is_closest(x): + abs_dist = x[distance_col].abs() + return x.ix[abs_dist == abs_dist.min()] return (frame.groupby(id_col) - .apply(select_closest) + .apply(_is_closest) .reset_index(drop=True)) diff --git a/pyim/annotation/kcrbm.py b/pyim/annotation/kcrbm.py index d3e2a08..e6957fa 100644 --- a/pyim/annotation/kcrbm.py +++ b/pyim/annotation/kcrbm.py @@ -13,7 +13,7 @@ from tkgeno.util.rpy2 import importr, pandas_to_dataframe, dataframe_to_pandas -from .base import Annotator +from .base import Annotator, get_closest CHR_MAP = dict(zip( list(map(str, range(1, 19+1))) + ['X', 'Y'], @@ -56,12 +56,7 @@ def annotate(self, frame, type_='gene'): gene_mapping = self._parse_gene_result(kcrbm_result) if self._closest: - def closest(x): - return x.ix[x['distance'] == x['distance'].abs().min()] - - gene_mapping = (gene_mapping.groupby('insertion_id') - .apply(closest) - .reset_index(drop=True)) + gene_mapping = get_closest(gene_mapping) return pd.merge(frame, gene_mapping, on='insertion_id', how='left') diff --git a/pyim/annotation/rbm.py b/pyim/annotation/rbm.py index 44f7392..b3d4a38 100644 --- a/pyim/annotation/rbm.py +++ b/pyim/annotation/rbm.py @@ -1,20 +1,20 @@ from __future__ import (absolute_import, division, print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, +from builtins import (ascii, bytes, chr, dict, hex, input, int, map, next, oct, open, pow, range, round, - str, super, zip) + str, super, zip) # filter from pathlib import Path import pandas as pd from toolz import curry, pipe, merge_with, keymap -from toolz.curried import get, filter, map, valfilter, valmap +from toolz.curried import filter, valfilter, valmap from tkgeno.io import GtfFile from tkgeno.util.pandas import reorder_columns -from .base import Annotator, closest_genes +from .base import Annotator, get_closest from .window import Window, apply_window, fetch_features, annotate_features @@ -80,14 +80,15 @@ def annotate(self, frame, type_='gene'): ignore_index=True) return results if not self._closest \ - else closest_genes(results, id_col=self._id_column) + else get_closest(results, id_col=self._id_column) @staticmethod def _annotate_row(row, windows, gtf, feature_type='gene'): strand = row.strand if hasattr(row, 'strand') else None # Fetch features for orientation, or for the forward orientation. 
- apply_func = curry(apply_window, row.seqname, row.location, strand or 1) + apply_func = curry(apply_window, row.seqname, + row.location, strand or 1) windows_fwd = valmap(apply_func, windows) features = fetch_features_windows( diff --git a/pyim/annotation/window.py b/pyim/annotation/window.py index 318f4d2..47cc1e1 100644 --- a/pyim/annotation/window.py +++ b/pyim/annotation/window.py @@ -8,15 +8,12 @@ from functools import lru_cache from pathlib import Path -from toolz import pipe -from toolz.curried import get, filter, map, valmap, valfilter - import pandas as pd from tkgeno.io import GtfFile from tkgeno.util.pandas import reorder_columns -from .base import Annotator, closest_genes +from .base import Annotator, get_closest Window = namedtuple('Window', ['seqname', 'start', 'end', 'strand', @@ -78,7 +75,7 @@ def annotate(self, frame, type_='gene'): ignore_index=True) return results if self._closest is not None \ - else closest_genes(frame, id_col=self._id_column) + else get_closest(frame, id_col=self._id_column) @staticmethod def _annotate_row(row, window, gtf, feature_type='gene'): diff --git a/setup.py b/setup.py index 8d45aec..374a51f 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,9 @@ -__author__ = 'Julian' - import sys from setuptools import setup, find_packages install_requires = ['future', 'numpy', 'scipy', 'pandas', 'pysam', - 'natsort', 'rpy2', 'scikit-bio', 'tkgeno', 'toolz'] + 'natsort', 'rpy2', 'scikit-bio', 'toolz'] if not sys.version_info >= (3, ): install_requires += ['pathlib'] From f2cef60b32d7f3d81c8b9d79306ccb8d55c8fff6 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Fri, 11 Dec 2015 14:17:57 +0100 Subject: [PATCH 013/100] Start of functional pipeline re-write. --- functional.py | 515 ++++++++++++++++++++++++++++++++++++++ pyim/alignment/bowtie2.py | 47 ++++ pyim/alignment/genome.py | 1 - 3 files changed, 562 insertions(+), 1 deletion(-) create mode 100644 functional.py create mode 100644 pyim/alignment/bowtie2.py delete mode 100644 pyim/alignment/genome.py diff --git a/functional.py b/functional.py new file mode 100644 index 0000000..e84e290 --- /dev/null +++ b/functional.py @@ -0,0 +1,515 @@ +import collections +import heapq +import itertools +import operator +from enum import Enum + +import pysam +import numpy as np +import pandas as pd +from toolz import curry, map, pipe, merge_with +from toolz.curried import filter + +import skbio +from skbio import DNA +from skbio.alignment import local_pairwise_align_ssw + + +# --- Model --- # + +Alignment = collections.namedtuple( + 'Alignment', + ['query_id', 'query_start', 'query_end', 'query_len', + 'target_id', 'target_start', 'target_end', 'target_len', + 'strand', 'identity', 'coverage', 'score']) + +ExtractResult = collections.namedtuple( + 'ExtractResult', ['genomic_sequence', 'barcode', 'status']) + + +def reverse_alignment(aln): + target_len = aln.target_len + + return Alignment( + query_id=aln.query_id, query_start=aln.query_start, + query_end=aln.query_end, query_len=aln.query_len, + target_id=aln.target_id, target_start=target_len - aln.target_end, + target_end=target_len - aln.target_start, target_len=target_len, + strand=aln.strand * -1, type=aln.type, identity=aln.identity, + coverage=aln.coverage, score=aln.score) + + +Insertion = collections.namedtuple( + 'Insertion', ['id', 'seqname', 'location', + 'strand', 'sample', 'metadata']) + + +# --- Alignment --- # + +@curry +def align_exact(target, query, query_strand=1): + # Note that this alignment returns the first occurrence it finds, + # later 
occurrences will not be found and are not checked for. + try: + index = str(target).index(str(query)) + except ValueError: + return None + else: + q_len = len(query) + + return Alignment( + query_id=query.metadata.get('id', None), query_start=0, + query_end=q_len, query_len=q_len, + target_id=target.metadata.get('id', None), target_start=index, + target_end=index + q_len, target_len=len(target), + strand=query_strand, identity=1.0, coverage=1.0, score=100) + + +@curry +def align_ssw(target, query, query_strand=1): + ssw_aln = local_pairwise_align_ssw(target.sequence, query.sequence) + + # Extract positions. + pos = ssw_aln.start_end_positions() + q_start, q_end = pos[1] + t_start, t_end = pos[0] + + # Offset ends by one, making them exclusive + # to match python conventions. + q_end += 1 + t_end += 1 + + # Calculate basic metrics. + coverage = (q_end - q_start) / float(len(query)) + identity = ssw_aln[0].fraction_same(ssw_aln[1]) + + aln = Alignment( + query_id=query.id, query_start=q_start, query_end=q_end, + query_len=len(query), target_id=target.id, target_start=t_start, + target_end=t_end, target_len=len(target), strand=query_strand, + identity=identity, coverage=coverage, + score=int(identity * coverage * 100)) + + return aln + + +@curry +def align_with_reverse(target, query, align_func, query_strand=1, **kwargs): + aln_fwd = align_func(target, query, query_strand=query_strand, **kwargs) + aln_rev = align_func(target, query.reverse_complement(), + query_strand=query_strand * -1, **kwargs) + + if aln_fwd is None: + return aln_rev + elif aln_rev is None: + return aln_fwd + else: + return aln_rev if aln_rev.score > aln_fwd.score else aln_fwd + + +@curry +def align_multiple(target, queries, align_func, **kwargs): + alns = (align_func(target, query, **kwargs) for query in queries) + alns = list(filter(bool, alns)) + + if len(alns) == 0: + return None + elif len(alns) == 1: + return alns[0] + else: + raise ValueError('Multiple alignments') + + +# --- Filtering --- # + +def filter_alignment(alignment, filters): + for filter_ in filters: + if not filter_(alignment): + return False + return True + + +# --- Extract pipeline --- # + +def extract(read): + raise NotImplementedError() + + +def print_stats(results): + # Iterate over results, counting statuses. + status_counts = collections.defaultdict(int) + + for result in results: + status_counts[result.status.name] += 1 + yield result + + # We're done, so print frequencies! + total = sum(status_counts.values()) + for status, count in status_counts.items(): + percentage = (count / total) * 100 + print('{}: {} ({}%)'.format(status, count, percentage)) + + +@curry +def write_sequences(results, file_path, format, mode='w', + compression='auto', compresslevel=9): + """ Test docstring """ + with skbio.io.util.open(file_path, mode=mode, compression=compression, + compresslevel=compresslevel) as file_: + for result in results: + skbio.io.write(result.genomic_sequence, into=file_, format=format) + yield result + + +@curry +def build_barcode_map(results, sample_map=None): + if sample_map is None: + return {result.genomic_sequence.metadata['id']: + result.barcode + for result in results} + else: + return {result.genomic_sequence.metadata['id']: + sample_map[result.barcode] + for result in results} + + +def consume(iterator, n=None): + "Advance the iterator n-steps ahead. If n is none, consume entirely." + # Use functions that consume iterators at C speed. 
+ if n is None: + # Feed the entire iterator into a zero-length deque + collections.deque(iterator, maxlen=0) + else: + # Advance to the empty slice starting at position n + next(itertools.islice(iterator, n, n), None) + + +# --- Identify pipeline --- # + +class PrioritySet(object): + + def __init__(self): + self._heap = [] + self._set = set() + + def push(self, item, priority): + if item not in self._set: + heapq.heappush(self._heap, (priority, item)) + self._set.add(item) + + def pop(self): + priority, item = heapq.heappop(self._heap) + self._set.remove(item) + return item + + def first(self): + _, item = min(self._heap) + return item + + def __len__(self): + return len(self._heap) + + def __str__(self): + return 'PrioritySet(heap={}, set={})'\ + .format(str(self._heap), str(self._set)) + + def __repr__(self): + return str(self) + + +@curry +def groupby_reference(alignments, alignment_file=None): + for reference, group in itertools.groupby( + alignments, operator.attrgetter('reference_id')): + if alignment_file is not None: + reference = alignment_file.getrname(reference) + yield reference, group + + +def groupby_position(alignments): + """ Groups alignments by their positions, grouping forward strand + alignments with the same start position and reverse strand + alignments with the same end position. Assumes alignments + are all on a single reference sequence. + """ + # Setup our collections for tracking reads and positions. + # + # The priority set is used to track positions with alignment groups, + # ensuring that no position is listed twice (the set part) and + # always giving the lowest position first (the priority part). + # + # The alignment dict contains two lists for each position with at + # least one alignment, one for forward reads and one for reverse. + # Any alignments encountered as position x in orientation o are added + # to the corresponding entry dict[x][o] in the list, in which + # o is encoded as {0,1}, with 1 being for reverse strand alignments. + position_set = PrioritySet() + aln_dict = collections.defaultdict(lambda: ([], [])) + + curr_pos = 0 + for aln in alignments: + # Check our ordering. + if aln.reference_start < curr_pos: + raise ValueError('Alignments not ordered by position') + + curr_pos = aln.reference_start + + # Add current read to collections. + is_reverse = aln.is_reverse + ref_pos = aln.reference_end if is_reverse else curr_pos + aln_dict[ref_pos][bool(is_reverse)].append(aln) + position_set.push(ref_pos, ref_pos) + + # Return any alignment groups before our current position. + try: + while position_set.first() < curr_pos: + first_pos = position_set.pop() + fwd_grp, rev_grp = aln_dict.pop(first_pos) + if len(fwd_grp) > 0: + yield (fwd_grp[0].reference_start, 1), fwd_grp + if len(rev_grp) > 0: + yield (rev_grp[0].reference_end, -1), rev_grp + except ValueError: + pass + + # We're done, yield any remaining alignment groups. + for _ in range(len(position_set)): + fwd_grp, rev_grp = aln_dict.pop(position_set.pop()) + if len(fwd_grp) > 0: + yield (fwd_grp[0].reference_start, 1), fwd_grp + if len(rev_grp) > 0: + yield (rev_grp[0].reference_end, -1), rev_grp + + +@curry +def groupby_barcode(alignments, barcode_map): + # Group alignments by barcodes. + groups = collections.defaultdict(list) + for aln in alignments: + barcode = barcode_map[aln.query_name] + groups[barcode].append(aln) + + # Yield group together with barcode. 
+ for barcode, group in groups.items(): + yield barcode, group + + +def chain_groupby(iterable, groupby_funcs): + grouped = groupby_funcs[0](iterable) + + if len(groupby_funcs) == 1: + for key, group in grouped: + if not isinstance(key, tuple): + key = (key,) + yield key, group + else: + for key, group in grouped: + for sub_key, sub_group in chain_groupby(group, groupby_funcs[1:]): + yield key + sub_key, sub_group + + +# --- ShearSplink --- # + +class ShearSplinkStatus(Enum): + contaminant = 1 + no_transposon = 2 + no_linker = 3 + no_barcode = 4 + multiple_barcodes = 5 + too_short = 6 + proper_read = 7 + + +@curry +def shearsplink_extract( + reads, transposon_sequence, barcode_sequences, linker_sequence, + contaminant_sequences=None, transposon_func=None, + barcode_func=None, linker_func=None, barcode_map=None): + + # Specify defaults for not provided aligners. + if transposon_func is None: + transposon_func = align_with_reverse(align_func=align_exact) + + if barcode_func is None: + barcode_func = align_multiple(align_func=align_exact) + + if linker_func is None: + linker_func = align_exact + + # Setup contaminant aligner if sequences are provided. + if contaminant_sequences is not None: + contaminant_func = align_multiple(queries=contaminant_sequences, + align_func=align_exact) + else: + contaminant_func = None + + # Prime aligners with their respective sequences. + transposon_func = transposon_func(query=transposon_sequence) + barcode_func = barcode_func(queries=barcode_sequences) + linker_func = linker_func(query=linker_sequence) + + # Extract and return results. + extract_func = curry(_shearsplink_extract, + transposon_func=transposon_func, + barcode_func=barcode_func, + linker_func=linker_func, + contaminant_func=contaminant_func) + + for result in map(extract_func, reads): + yield result + + +def _shearsplink_extract( + read, transposon_func, barcode_func, + linker_func, contaminant_func=None): + """ Extracts the genomic sequence and barcode from the passed + read. Reads containing contaminants are dropped. Reads are + expected to look as follows: + + [barcode][transposon][genomic-sequence][linker] + + Each of these sequences is recognized by their corresponding + alignment function. The barcode alignment identifies the + barcode (and thus the sample) of the read, whilst the transposon + and linker alignments are used to delineate the genomic sequence. + + The function returns an ExactResult tuple that contains the + genomic sequence, barcode and a status flag. If any errors + occur during the extraction, the genomic sequence and barcode + values are None and the status flag indicates the underlying reason. + """ + + # Drop read if it contains a contaminant. + if contaminant_func is not None and len(contaminant_func(read)) > 0: + return ExtractResult(None, None, ShearSplinkStatus.contaminant) + + # Identify location of the transposon. + transposon_aln = transposon_func(read) + if transposon_aln is None: + return ExtractResult(None, None, ShearSplinkStatus.no_transposon) + + # If transposon is on the reverse strand, flip the read and the + # alignment to bring everything into the same (fwd) orientation. + if transposon_aln.strand == -1: + read = read.reverse_complement() + transposon_aln = reverse_alignment(transposon_aln) + + # Identify barcode of the read. 
+ try: + barcode_aln = barcode_func(read) + if barcode_aln is None: + return ExtractResult(None, None, ShearSplinkStatus.no_barcode) + except ValueError: + return ExtractResult(None, None, ShearSplinkStatus.multiple_barcodes) + + barcode = barcode_aln.query_id + + # Identify location of linker. + linker_aln = linker_func(read) + if linker_aln is None: + return ExtractResult(None, None, ShearSplinkStatus.no_linker) + + # Extract genomic sequence using previous alignments. + genomic = read[transposon_aln.target_end:linker_aln.target_start] + + return ExtractResult(genomic, barcode, ShearSplinkStatus.proper_read) + + +def shearsplink_identify(alignments): + pass + + +def insertion_from_group(info, group): + ref, pos, strand, bc = info + + # Get positions of the non-transposon ends of the alignment. + end_field = 'reference_end' if strand == 1 else 'reference_start' + end_positions = map(operator.attrgetter(end_field), group) + + # Calulate overall depth and unique end depth. + depth = len(group) + depth_unique = len(set(end_positions)) + + metadata = dict(depth=depth, depth_unique=depth_unique) + + return Insertion(id=None, seq_name=ref, location=pos, strand=strand, + sample=bc, metadata=metadata) + + +def group_insertions(insertions, distance): + # for insertion in insertions: + # check if we have an insertion from this sample in our collection + # if so, add to collection + + # - When did we last see SAMPLE_X? + # - Which sample have we not seen within distance? + pass + + +def merge_insertions(insertions): + # Summarize location as mean. + location = np.average([ins.location for ins in insertions]) + + # Merge metadata by summing depths. + metadata = merge_with(sum, *[ins.metadata for ins in insertions]) + + # Take first insertion as reference for other attributes. + ref = insertions[0] + + return Insertion(id=None, seqname=ref.seqname, location=location, + strand=ref.strand, sample=ref.sample, metadata=metadata) + + +# --- Main --- # + +# Extraction. + +seq1 = DNA('CACTGGCCACGCGAAGGTGC') +seq2 = DNA('GACCACTGGCCACGCGAAGG').reverse_complement() +seq3 = DNA('CGTTGGTCACTCTACCCACA') + +transposon = DNA('TTTG', metadata=dict(id='transposon')) +barcodes = [DNA('AAAT', metadata=dict(id='BC01')), + DNA('AAAA', metadata=dict(id='BC02'))] +linker = DNA('CCCG', metadata=dict(id='linker')) + +reads = [DNA(str(barcodes[0]) + str(transposon) + + str(seq1) + str(linker), metadata=dict(id='read_1')), + DNA(str(transposon) + str(seq1) + str(linker))] + +genomic_path = '/Users/Julian/Scratch/pyim/functional/genomic.fasta.gz' +barcode_path = '/Users/Julian/Scratch/pyim/functional/barcodes.txt' + +barcode_map = pipe( + reads, + shearsplink_extract(transposon_sequence=transposon, + barcode_sequences=barcodes, + linker_sequence=linker), + print_stats, + filter(lambda r: r.status == ShearSplinkStatus.proper_read), + filter(lambda r: len(r.genomic_sequence) >= 15), + write_sequences(file_path=genomic_path, format='fasta', + compression='gzip', compresslevel=9), + build_barcode_map) + +barcode_frame = pd.DataFrame.from_records( + iter(barcode_map.items()), columns=['read_id', 'barcode']) +barcode_frame.to_csv(barcode_path, sep='\t', index=False) + + +# Grouping. 
+ +bam = pysam.AlignmentFile('/Volumes/Datastore/Scratch/' + 'lam-pcr-sjors/out/alignment.bam') +alns = itertools.islice(bam.fetch(), 0, 1000) + +it = chain_groupby( + itertools.islice(bam.fetch(multiple_iterators=True), 0, 1000), + [curry(groupby_reference, alignment_file=bam), groupby_position]) + +barcode_map = collections.defaultdict(lambda: 'BC01') +it2 = chain_groupby( + itertools.islice(bam.fetch(multiple_iterators=True), 0, 1000), + [groupby_reference(alignment_file=bam), + groupby_position, + groupby_barcode(barcode_map=barcode_map)]) diff --git a/pyim/alignment/bowtie2.py b/pyim/alignment/bowtie2.py new file mode 100644 index 0000000..14a210f --- /dev/null +++ b/pyim/alignment/bowtie2.py @@ -0,0 +1,47 @@ +from __future__ import (absolute_import, division, + print_function, unicode_literals) +from builtins import (ascii, bytes, chr, dict, filter, hex, input, + int, map, next, oct, open, pow, range, round, + str, super, zip) + +import subprocess + + +def align(m1, index, output, m2=None, options=None, log=None): + options = {} or options + + # Inject inputs into options. + if m2 is None: + options['-U'] = m1 + else: + options['-1'] = m1 + options['-2'] = m2 + + # Inject index and output. + options['-x'] = index + options['-S'] = output + + # Format into arguments. + args = ['bowtie2'] + dict_to_args(options) + + if log is not None: + with open(log, 'w') as log_file: + subprocess.check_call(args, stderr=log_file) + else: + subprocess.check_call(args) + + return output + + +def dict_to_args(arg_dict): + args = [] + + for key, value in arg_dict.items(): + if type(value) == bool: + if value: + args.append(key) + else: + args.append(key) + args.append(str(value)) + + return args diff --git a/pyim/alignment/genome.py b/pyim/alignment/genome.py deleted file mode 100644 index 99782da..0000000 --- a/pyim/alignment/genome.py +++ /dev/null @@ -1 +0,0 @@ -from tkgeno.alignment.genome import Bowtie2Aligner From 06ced745429e1e705d287828cf3d14ee235c1d80 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 21 Dec 2015 13:25:10 +0100 Subject: [PATCH 014/100] Intermediate refactoring. 
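The _vector.py module added below carries over the exact-match alignment strategy from vector.py; stripped of the skbio sequence objects and the VectorAlignment record, the core idea reduces to something like the following (plain strings, illustrative only):

    def reverse_complement(seq):
        return seq.translate(str.maketrans('ACGT', 'TGCA'))[::-1]

    def align_exact(target, query):
        # Try the query in both orientations and report the first (and only
        # checked) occurrence, as str.index does in ExactAligner below.
        for oriented, strand in ((query, 1), (reverse_complement(query), -1)):
            index = target.find(oriented)
            if index != -1:
                return dict(target_start=index, target_end=index + len(query),
                            target_strand=strand, identity=1.0, coverage=1.0)
        return None

    print(align_exact('AAAATTTGCACTG', 'TTTG'))   # forward hit at position 4
    print(align_exact('AAAACAAAGCACTG', 'TTTG'))  # reverse-complement hit (CAAA)
    print(align_exact('AAAAGGGGCACTG', 'TTTG'))   # None, no occurrence
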
--- pyim/alignment/_vector.py | 203 +++++++++++ pyim/alignment/vector.py | 254 ++++--------- pyim/pipelines/_helpers.py | 147 ++++++++ pyim/pipelines/_model.py | 10 + pyim/pipelines/_shear_splink.py | 321 +++++++++++++++++ pyim/pipelines/shear_splink.py | 613 ++++++++++++++++---------------- 6 files changed, 1069 insertions(+), 479 deletions(-) create mode 100644 pyim/alignment/_vector.py create mode 100644 pyim/pipelines/_helpers.py create mode 100644 pyim/pipelines/_model.py create mode 100644 pyim/pipelines/_shear_splink.py diff --git a/pyim/alignment/_vector.py b/pyim/alignment/_vector.py new file mode 100644 index 0000000..9ad6d40 --- /dev/null +++ b/pyim/alignment/_vector.py @@ -0,0 +1,203 @@ +from __future__ import (absolute_import, division, + print_function, unicode_literals) +from builtins import (ascii, bytes, chr, dict, filter, hex, input, + int, map, next, oct, open, pow, range, round, + str, super, zip) + +from skbio.alignment import local_pairwise_align_ssw + + +class VectorAlignment(object): + + def __init__(self, query_id, query_start, query_end, query_len, + target_id, target_start, target_end, target_strand, + target_len, type, identity, coverage): + self.query_id = query_id + self.query_start = query_start + self.query_end = query_end + self.query_len = query_len + self.target_id = target_id + self.target_start = target_start + self.target_end = target_end + self.target_strand = target_strand + self.target_len = target_len + self.type = type + self.identity = identity + self.coverage = coverage + + @property + def score(self): + return self.identity * self.coverage + + def reverse(self, read): + read_len = len(read) + + return self.__class__( + query_id=self.query_id, + query_start=self.query_start, + query_end=self.query_end, + query_len=self.query_len, + target_id=self.target_id, + target_start=read_len - self.target_end, + target_end=read_len - self.target_start, + target_len=self.target_len, + target_strand=1 if self.target_strand == -1 else 1, + type=self.type, identity=self.identity, coverage=self.coverage + ) + + +class VectorAligner(object): + + def __init__(self, **kwargs): + pass + + def align(self, query, target): + raise NotImplementedError() + + def align_multiple(self, queries, target, how='unique'): + alignments = filter(bool, (self.align(q, target) for q in queries)) + alignments = list(alignments) + + num_alignments = len(alignments) + if num_alignments == 0: + return None + elif num_alignments == 1: + return alignments[0] + else: + if how == 'unique': + raise ValueError('Multiple matching queries for target {}' + .format(target.metadata['id'])) + elif how == 'any': + return alignments[0] + else: + raise ValueError('Unknown value for how ({})'.format(how)) + + +class ExactAligner(VectorAligner): + + def __init__(self, try_reverse=False): + super().__init__() + self._try_reverse = try_reverse + + def align(self, query, target): + alignment = self._align_exact(query, target, query_ori=1) + + # Try reverse complement if first alignment failed. + if alignment is None and self._try_reverse: + alignment = self._align_exact( + query.reverse_complement(), target, query_ori=-1) + + return alignment + + @staticmethod + def _align_exact(query, target, query_ori): + # Note that this alignment returns the first occurrence it finds, + # later occurrences will not be found and are not checked for. 
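+        # E.g. query 'TTTG' in target 'CCTTTGAA' gives index 2; str.index
+        # raises ValueError when the query does not occur at all.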
+ try: + index = str(target).index(str(query)) + except ValueError: + return None + else: + q_len = len(query) + + return VectorAlignment( + query_id=query.metadata['id'], query_start=0, query_end=q_len, + query_len=q_len, target_id=target.metadata['id'], + target_start=index, target_end=index + q_len, + target_strand=query_ori, target_len=len(target), type='exact', + identity=1.0, coverage=1.0) + + +class SswAligner(VectorAligner): + + def __init__(self, try_reverse=False, filters=None): + super().__init__() + self._try_reverse = try_reverse + self._filters = filters + + def align(self, query, target): + fwd_alignment = self._align_ssw(query, target, query_ori=1) + + if self._try_reverse: + rev_alignment = self._align_ssw( + query.reverse_complement(), target, query_ori=-1) + + if fwd_alignment is None: + # Default to reverse if no forward. + alignment = rev_alignment + elif rev_alignment is None: + # Default to forward if no reverse. + alignment = fwd_alignment + else: + # Otherwise choose the best of the two. + if rev_alignment.score > fwd_alignment.score: + alignment = rev_alignment + else: + alignment = fwd_alignment + else: + alignment = fwd_alignment + + return alignment + + def _align_ssw(self, query, target, query_ori): + ssw_aln = local_pairwise_align_ssw(target, query) + + # Extract positions. + pos = ssw_aln.start_end_positions() + q_start, q_end = pos[1] + t_start, t_end = pos[0] + + # Offset ends by one, making them exclusive + # to match python conventions. + q_end += 1 + t_end += 1 + + # Calculate basic metrics. + coverage = (q_end - q_start) / float(len(query)) + identity = 1.0 - ssw_aln[0].distance(ssw_aln[1]) + + aln = VectorAlignment( + query_id=query.metadata['id'], query_start=q_start, + query_end=q_end, query_len=len(query), + target_id=target.metadata['id'], target_start=t_start, + target_end=t_end, target_strand=query_ori, target_len=len(target), + type='ssw', identity=identity, coverage=coverage) + + # Check if alignment passes any filter. 
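+        # Filters act as alternatives: the alignment is kept as soon as any
+        # single filter accepts it and only rejected if all of them fail.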
+ if self._filters is None: + return aln + else: + for filter_ in self._filters: + if filter_(aln): + return aln + return None + + +class ChainedAligner(VectorAligner): + + def __init__(self, aligners): + super().__init__() + self._aligners = aligners + + def align(self, query, target): + aln = None + + for aligner in self._aligners: + aln = aligner.align(query, target) + if aln is not None: + break + + return aln + + +def filter_identity(aln, min_identity): + return aln.identity >= min_identity + + +def filter_score(aln, min_score): + return aln.score >= min_score + + +def filter_end_match(aln, min_coverage=0.5, min_identity=1.0): + return aln.target_end == aln.target_len and \ + aln.coverage >= min_coverage and aln.identity >= min_identity diff --git a/pyim/alignment/vector.py b/pyim/alignment/vector.py index 9ad6d40..517f928 100644 --- a/pyim/alignment/vector.py +++ b/pyim/alignment/vector.py @@ -1,146 +1,51 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) +import collections from skbio.alignment import local_pairwise_align_ssw +from toolz import curry -class VectorAlignment(object): - - def __init__(self, query_id, query_start, query_end, query_len, - target_id, target_start, target_end, target_strand, - target_len, type, identity, coverage): - self.query_id = query_id - self.query_start = query_start - self.query_end = query_end - self.query_len = query_len - self.target_id = target_id - self.target_start = target_start - self.target_end = target_end - self.target_strand = target_strand - self.target_len = target_len - self.type = type - self.identity = identity - self.coverage = coverage - - @property - def score(self): - return self.identity * self.coverage - - def reverse(self, read): - read_len = len(read) - - return self.__class__( - query_id=self.query_id, - query_start=self.query_start, - query_end=self.query_end, - query_len=self.query_len, - target_id=self.target_id, - target_start=read_len - self.target_end, - target_end=read_len - self.target_start, - target_len=self.target_len, - target_strand=1 if self.target_strand == -1 else 1, - type=self.type, identity=self.identity, coverage=self.coverage - ) - - -class VectorAligner(object): - - def __init__(self, **kwargs): - pass - - def align(self, query, target): - raise NotImplementedError() - - def align_multiple(self, queries, target, how='unique'): - alignments = filter(bool, (self.align(q, target) for q in queries)) - alignments = list(alignments) - - num_alignments = len(alignments) - if num_alignments == 0: - return None - elif num_alignments == 1: - return alignments[0] - else: - if how == 'unique': - raise ValueError('Multiple matching queries for target {}' - .format(target.metadata['id'])) - elif how == 'any': - return alignments[0] - else: - raise ValueError('Unknown value for how ({})'.format(how)) - - -class ExactAligner(VectorAligner): - - def __init__(self, try_reverse=False): - super().__init__() - self._try_reverse = try_reverse - - def align(self, query, target): - alignment = self._align_exact(query, target, query_ori=1) - - # Try reverse complement if first alignment failed. 
- if alignment is None and self._try_reverse: - alignment = self._align_exact( - query.reverse_complement(), target, query_ori=-1) - - return alignment - - @staticmethod - def _align_exact(query, target, query_ori): - # Note that this alignment returns the first occurrence it finds, - # later occurrences will not be found and are not checked for. - try: - index = str(target).index(str(query)) - except ValueError: - return None - else: - q_len = len(query) - - return VectorAlignment( - query_id=query.metadata['id'], query_start=0, query_end=q_len, - query_len=q_len, target_id=target.metadata['id'], - target_start=index, target_end=index + q_len, - target_strand=query_ori, target_len=len(target), type='exact', - identity=1.0, coverage=1.0) - - -class SswAligner(VectorAligner): - - def __init__(self, try_reverse=False, filters=None): - super().__init__() - self._try_reverse = try_reverse - self._filters = filters - - def align(self, query, target): - fwd_alignment = self._align_ssw(query, target, query_ori=1) - - if self._try_reverse: - rev_alignment = self._align_ssw( - query.reverse_complement(), target, query_ori=-1) - - if fwd_alignment is None: - # Default to reverse if no forward. - alignment = rev_alignment - elif rev_alignment is None: - # Default to forward if no reverse. - alignment = fwd_alignment - else: - # Otherwise choose the best of the two. - if rev_alignment.score > fwd_alignment.score: - alignment = rev_alignment - else: - alignment = fwd_alignment - else: - alignment = fwd_alignment - - return alignment - - def _align_ssw(self, query, target, query_ori): - ssw_aln = local_pairwise_align_ssw(target, query) +Alignment = collections.namedtuple( + 'Alignment', + ['query_id', 'query_start', 'query_end', 'query_len', + 'target_id', 'target_start', 'target_end', 'target_len', + 'strand', 'identity', 'coverage', 'score']) + + +def reverse_alignment(aln): + """Reverses strand of alignment object.""" + target_len = aln.target_len + + return Alignment( + query_id=aln.query_id, query_start=aln.query_start, + query_end=aln.query_end, query_len=aln.query_len, + target_id=aln.target_id, target_start=target_len - aln.target_end, + target_end=target_len - aln.target_start, target_len=target_len, + strand=aln.strand * -1, type=aln.type, identity=aln.identity, + coverage=aln.coverage, score=aln.score) + + +@curry +def align_exact(target, query, query_strand=1): + # Note that this alignment returns the first occurrence it finds, + # later occurrences will not be found and are not checked for. + try: + index = str(target).index(str(query)) + except ValueError: + return None + else: + q_len = len(query) + + return Alignment( + query_id=query.metadata.get('id', None), query_start=0, + query_end=q_len, query_len=q_len, + target_id=target.metadata.get('id', None), target_start=index, + target_end=index + q_len, target_len=len(target), + strand=query_strand, identity=1.0, coverage=1.0, score=100) + + +@curry +def align_ssw(target, query, query_strand=1): + ssw_aln = local_pairwise_align_ssw(target.sequence, query.sequence) # Extract positions. pos = ssw_aln.start_end_positions() @@ -154,50 +59,49 @@ def _align_ssw(self, query, target, query_ori): # Calculate basic metrics. 
coverage = (q_end - q_start) / float(len(query)) - identity = 1.0 - ssw_aln[0].distance(ssw_aln[1]) - - aln = VectorAlignment( - query_id=query.metadata['id'], query_start=q_start, - query_end=q_end, query_len=len(query), - target_id=target.metadata['id'], target_start=t_start, - target_end=t_end, target_strand=query_ori, target_len=len(target), - type='ssw', identity=identity, coverage=coverage) - - # Check if alignment passes any filter. - if self._filters is None: - return aln - else: - for filter_ in self._filters: - if filter_(aln): - return aln - return None + identity = ssw_aln[0].fraction_same(ssw_aln[1]) + aln = Alignment( + query_id=query.id, query_start=q_start, query_end=q_end, + query_len=len(query), target_id=target.id, target_start=t_start, + target_end=t_end, target_len=len(target), strand=query_strand, + identity=identity, coverage=coverage, + score=int(identity * coverage * 100)) -class ChainedAligner(VectorAligner): - - def __init__(self, aligners): - super().__init__() - self._aligners = aligners + return aln - def align(self, query, target): - aln = None - for aligner in self._aligners: - aln = aligner.align(query, target) - if aln is not None: - break +@curry +def align_with_reverse(target, query, align_func, query_strand=1, **kwargs): + aln_fwd = align_func(target, query, query_strand=query_strand, **kwargs) + aln_rev = align_func(target, query.reverse_complement(), + query_strand=query_strand * -1, **kwargs) - return aln + if aln_fwd is None: + return aln_rev + elif aln_rev is None: + return aln_fwd + else: + return aln_rev if aln_rev.score > aln_fwd.score else aln_fwd -def filter_identity(aln, min_identity): - return aln.identity >= min_identity +@curry +def align_multiple(target, queries, align_func, return_first=False, **kwargs): + alns = (align_func(target, query, **kwargs) for query in queries) + alns = list(filter(bool, alns)) + if len(alns) == 0: + return None + elif len(alns) == 1 or return_first: + return alns[0] + else: + raise ValueError('Multiple alignments') -def filter_score(aln, min_score): - return aln.score >= min_score +# --- Filtering --- # -def filter_end_match(aln, min_coverage=0.5, min_identity=1.0): - return aln.target_end == aln.target_len and \ - aln.coverage >= min_coverage and aln.identity >= min_identity +def filter_alignment(alignment, filters): + for filter_ in filters: + if not filter_(alignment): + return False + return True \ No newline at end of file diff --git a/pyim/pipelines/_helpers.py b/pyim/pipelines/_helpers.py new file mode 100644 index 0000000..bd73f0c --- /dev/null +++ b/pyim/pipelines/_helpers.py @@ -0,0 +1,147 @@ +import collections +import itertools +import operator + +import skbio +from toolz import curry + +from pyim.util import PrioritySet + + +def print_stats(results): + # Iterate over results, counting statuses. + status_counts = collections.defaultdict(int) + + for result in results: + status_counts[result.status.name] += 1 + yield result + + # We're done, so print frequencies! 
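+    # One line is printed per status, e.g. "proper_read: 1020 (85.00%)" for
+    # a run in which 1020 of 1200 reads yielded a usable genomic sequence.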
+ print('\nExtract statistics:') + + total = sum(status_counts.values()) + for status, count in status_counts.items(): + percentage = (count / total) * 100 + print('{:>18}: {:>8} ({:05.2f}%)'.format(status, count, percentage)) + + +@curry +def write_genomic_sequences(results, file_path, format='fastq', + mode='w', **io_kwargs): + """ Test docstring """ + with skbio.io.open(file_path, mode, **io_kwargs) as file_: + for result in results: + skbio.io.write(result.genomic_sequence, into=file_, format=format) + yield result + + +@curry +def build_barcode_map(results, sample_map=None): + if sample_map is None: + return {result.genomic_sequence.metadata['id']: + result.barcode + for result in results} + else: + return {result.genomic_sequence.metadata['id']: + sample_map[result.barcode] + for result in results} + + +@curry +def groupby_reference(alignments, alignment_file=None): + for reference, group in itertools.groupby( + alignments, operator.attrgetter('reference_id')): + if alignment_file is not None: + reference = alignment_file.getrname(reference) + yield reference, group + + +def groupby_position(alignments): + """ Groups alignments by their positions, grouping forward strand + alignments with the same start position and reverse strand + alignments with the same end position. Assumes alignments + are all on a single reference sequence. + """ + # Setup our collections for tracking reads and positions. + # + # The priority set is used to track positions with alignment groups, + # ensuring that no position is listed twice (the set part) and + # always giving the lowest position first (the priority part). + # + # The alignment dict contains two lists for each position with at + # least one alignment, one for forward reads and one for reverse. + # Any alignments encountered as position x in orientation o are added + # to the corresponding entry dict[x][o] in the list, in which + # o is encoded as {0,1}, with 1 being for reverse strand alignments. + position_set = PrioritySet() + aln_dict = collections.defaultdict(lambda: ([], [])) + + curr_pos = 0 + for aln in alignments: + # Check our ordering. + if aln.reference_start < curr_pos: + raise ValueError('Alignments not ordered by position') + + curr_pos = aln.reference_start + + # Add current read to collections. + is_reverse = aln.is_reverse + ref_pos = aln.reference_end if is_reverse else curr_pos + aln_dict[ref_pos][bool(is_reverse)].append(aln) + position_set.push(ref_pos, ref_pos) + + # Return any alignment groups before our current position. + try: + while position_set.first() < curr_pos: + first_pos = position_set.pop() + fwd_grp, rev_grp = aln_dict.pop(first_pos) + if len(fwd_grp) > 0: + yield (fwd_grp[0].reference_start, 1), fwd_grp + if len(rev_grp) > 0: + yield (rev_grp[0].reference_end, -1), rev_grp + except ValueError: + pass + + # We're done, yield any remaining alignment groups. + for _ in range(len(position_set)): + fwd_grp, rev_grp = aln_dict.pop(position_set.pop()) + if len(fwd_grp) > 0: + yield (fwd_grp[0].reference_start, 1), fwd_grp + if len(rev_grp) > 0: + yield (rev_grp[0].reference_end, -1), rev_grp + + +@curry +def groupby_reference_position(alignments, alignment_file=None): + chained = chain_groupby( + alignments, [groupby_reference(alignment_file=alignment_file), + groupby_position]) + for res in chained: + yield res + + +@curry +def groupby_barcode(alignments, barcode_map): + # Group alignments by barcodes. 
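+    # barcode_map is expected to map read names (aln.query_name) to the
+    # barcode (or sample) that was assigned to the read during extraction.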
+ groups = collections.defaultdict(list) + for aln in alignments: + barcode = barcode_map[aln.query_name] + groups[barcode].append(aln) + + # Yield group together with barcode. + for barcode, group in groups.items(): + yield barcode, group + + +def chain_groupby(iterable, groupby_funcs): + grouped = groupby_funcs[0](iterable) + + if len(groupby_funcs) == 1: + for key, group in grouped: + if not isinstance(key, tuple): + key = (key,) + yield key, group + else: + for key, group in grouped: + for sub_key, sub_group in chain_groupby(group, groupby_funcs[1:]): + yield key + sub_key, sub_group diff --git a/pyim/pipelines/_model.py b/pyim/pipelines/_model.py new file mode 100644 index 0000000..ad3bbcc --- /dev/null +++ b/pyim/pipelines/_model.py @@ -0,0 +1,10 @@ +import collections + + +ExtractResult = collections.namedtuple( + 'ExtractResult', ['genomic_sequence', 'barcode', 'status']) + + +Insertion = collections.namedtuple( + 'Insertion', ['id', 'seqname', 'location', + 'strand', 'sample', 'metadata']) diff --git a/pyim/pipelines/_shear_splink.py b/pyim/pipelines/_shear_splink.py new file mode 100644 index 0000000..c477b8b --- /dev/null +++ b/pyim/pipelines/_shear_splink.py @@ -0,0 +1,321 @@ +from __future__ import (absolute_import, division, + print_function, unicode_literals) +from builtins import (ascii, bytes, chr, dict, filter, hex, input, + int, map, next, oct, open, pow, range, round, + str, super, zip) +from future.utils import native_str + +from enum import Enum +from functools import partial +from pathlib import Path + +import numpy as np +import pandas as pd +from skbio import DNA, SequenceCollection + +from pyim.alignment.genome import Bowtie2Aligner +from pyim.alignment._vector import (ExactAligner, SswAligner, ChainedAligner, + filter_score, filter_end_match) +from pyim.cluster import cluster_frame_merged + +from ._base import (Pipeline, ParallelGenomicExtractor, + InsertionIdentifier, genomic_distance) + + +class ShearSplinkPipeline(Pipeline): + + @classmethod + def configure_argparser(cls, subparsers, name='shear_splink'): + parser = subparsers.add_parser(name, help=name + ' help') + + parser.add_argument('input', type=Path) + parser.add_argument('output_dir', type=Path) + parser.add_argument('reference', type=Path) + parser.add_argument('transposon', type=Path) + parser.add_argument('barcodes', type=Path) + parser.add_argument('linker', type=Path) + + parser.add_argument('--contaminants', type=Path) + parser.add_argument('--barcode_mapping', type=Path) + parser.add_argument('--min_genomic_length', type=int, default=15) + parser.add_argument('--min_depth', type=int, default=2) + parser.add_argument('--min_mapq', type=int, default=37) + + parser.add_argument('--threads', type=int, default=1) + + return parser + + @classmethod + def from_args(cls, args): + # Read transposon, barcode and linker sequences. + transposon_seq = DNA.read(str(args['transposon'])) + + linker_seq = DNA.read(str(args['linker'])) + + barcode_seqs = SequenceCollection.read( + str(args['barcodes']), constructor=DNA) + + # Read contaminants if supplied. + contaminant_seqs = SequenceCollection.read( + str(args['contaminants']), constructor=DNA) \ + if args['contaminants'] is not None else None + + # Read barcode map if supplied. 
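+        # The mapping is read as a tab-separated table with 'barcode' and
+        # 'sample' columns and converted into a barcode -> sample dict.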
+ if barcode_seqs is not None and args['barcode_mapping'] is not None: + barcode_map = pd.read_csv(str(args['barcode_mapping']), + sep=native_str('\t')) + barcode_map = dict(zip(barcode_map['barcode'], + barcode_map['sample'])) + else: + barcode_map = None + + # Setup transposon aligner. + + transposon_filters = [ + # Require at least 90% of the sequence to be matched. + partial(filter_score, min_score=0.9) + ] + + transposon_aligner = ChainedAligner( + [ExactAligner(try_reverse=True), + SswAligner(try_reverse=True, filters=transposon_filters)]) + + # Setup linker aligner. + linker_filters = [ + # Require at least 90% of the sequence to be matched. + partial(filter_score, min_score=0.9), + + # Perfect match at the end of the read? + partial(filter_end_match, min_coverage=0.5, min_identity=0.9) + ] + + linker_aligner = ChainedAligner( + [ExactAligner(try_reverse=False), + SswAligner(try_reverse=False, filters=linker_filters)] + ) + + # Setup extractor and identifier for pipeline. + extractor = ShearSplinkExtractor( + transposon_sequence=transposon_seq, + transposon_aligner=transposon_aligner, + barcode_sequences=barcode_seqs, + barcode_map=barcode_map, + barcode_aligner=ExactAligner(try_reverse=False), + linker_sequence=linker_seq, + linker_aligner=linker_aligner, + contaminant_sequences=contaminant_seqs, + min_length=args['min_genomic_length'], + threads=args['threads']) + + aligner = Bowtie2Aligner(args['reference'], bam_output=True, + threads=args['threads']) + identifier = ShearSplinkIdentifier( + min_mapq=args['min_mapq'], min_depth=args['min_depth']) + + return cls(extractor=extractor, aligner=aligner, identifier=identifier) + + +class ShearSplinkStatus(Enum): + contaminant = 1 + no_transposon = 2 + no_linker = 3 + no_barcode = 4 + multiple_barcodes = 5 + too_short = 6 + proper_read = 7 + + +class ShearSplinkExtractor(ParallelGenomicExtractor): + + STATUS = ShearSplinkStatus + + def __init__(self, transposon_sequence, barcode_sequences, linker_sequence, + contaminant_sequences=None, transposon_aligner=None, + barcode_aligner=None, linker_aligner=None, barcode_map=None, + min_length=1, threads=1, chunk_size=1000): + super().__init__(min_length=min_length, + threads=threads, + chunk_size=chunk_size) + + # Sequences. + self._transposon = transposon_sequence + self._contaminants = contaminant_sequences + self._barcodes = barcode_sequences + self._linker = linker_sequence + + # Aligners. + self._transposon_aligner = transposon_aligner \ + if transposon_aligner is not None \ + else ExactAligner(try_reverse=True) + + self._contaminant_aligner = ExactAligner(try_reverse=True) + + self._barcode_aligner = barcode_aligner \ + if barcode_aligner is not None else ExactAligner() + + self._linker_aligner = linker_aligner \ + if linker_aligner is not None else ExactAligner() + + # Barcode map if given (maps barcodes to samples). + self._barcode_map = barcode_map + + def extract_read(self, read): + # Check for contaminants. + if self._contaminants is not None: + contaminant_aln = self._contaminant_aligner.\ + align_multiple(self._contaminants, read, how='any') + + if contaminant_aln is not None: + return None, self.STATUS.contaminant + + # Check for transposon sequence. + transposon_aln = self._transposon_aligner.align( + self._transposon, read) + + if transposon_aln is None: + # Missing transposon sequence. + return None, self.STATUS.no_transposon + else: + # If we have a transposon sequence, continue. 
+ if transposon_aln.target_strand == -1: + # If transposon is on the reverse strand, flip the + # read and the alignment to bring everything downstream + # into the same (fwd) orientation. + read = read.reverse_complement() + transposon_aln = transposon_aln.reverse(read) + + linker_aln = self._linker_aligner.align( + self._linker, read) + + if linker_aln is None: + # Missing linker sequence. + return None, self.STATUS.no_linker + else: + try: + barcode_aln = self._barcode_aligner.\ + align_multiple(self._barcodes, read) + except ValueError: + return None, self.STATUS.multiple_barcodes + + if barcode_aln is None: + # Missing barcode sequence. + return None, self.STATUS.no_barcode + else: + # Read is complete, return genomic part and barcode. + genomic = read[transposon_aln.target_end: + linker_aln.target_start] + + if len(genomic) < self._min_length: + return None, self.STATUS.too_short + else: + barcode = barcode_aln.query_id + + if self._barcode_map is not None: + barcode = self._barcode_map[barcode] + + return ((genomic, barcode), + self.STATUS.proper_read) + + +class ShearSplinkIdentifier(InsertionIdentifier): + + def __init__(self, min_depth=0, min_mapq=37, merge_distance=10): + super().__init__() + + self._min_depth = min_depth + self._min_mapq = min_mapq + self._merge_distance = merge_distance + + def identify(self, alignment_path, barcode_map=None): + insertions = [] + + groups = self._group_by_position_bam( + alignment_path, min_mapq=self._min_mapq, barcode_map=barcode_map) + for (ref_id, pos, strand, bc), alns in groups: + # Determine depth as the number of reads at this position. + depth = len(alns) + + # Determine depth_unique by looking at differences in the + # other position (end for fwd strand, start for rev strand). + other_pos = (a.reference_end for a in alns) if strand == 1 \ + else (a.reference_start for a in alns) + depth_unique = len(set(other_pos)) + + insertions.append( + {'insertion_id': np.nan, 'seqname': ref_id, + 'location': pos, 'strand': strand, 'sample': bc, + 'depth': depth, 'depth_unique': depth_unique}) + + # Create insertion frame. + insertions = pd.DataFrame.from_records( + insertions, columns=['insertion_id', 'seqname', 'location', + 'strand', 'sample', 'depth', 'depth_unique']) + # Merge insertions in close proximity to account for sequencing errors. + if self._merge_distance > 0: + insertions = cluster_frame_merged( + insertions, groupby=['seqname', 'sample', 'strand'], + dist_func=genomic_distance, merge_func=self._merge_insertions, + linkage='complete', criterion='distance', + t=self._merge_distance) + + # Filter by min_depth. + insertions = insertions.ix[ + insertions['depth_unique'] >= self._min_depth] + + # Add clonality annotation. + insertions = self._annotate_clonality(insertions) + + # Sort by coordinate and add identifiers. + insertions = insertions.sort(['seqname', 'location']) + + insertions['insertion_id'] = ['INS_{}'.format(i+1) + for i in range(len(insertions))] + + return insertions + + @classmethod + def _merge_insertions(cls, frame): + if len(frame) == 0: + return frame.iloc[0] + else: + # Check if merging is sane. + assert len(set(frame['seqname'])) == 1 + assert len(set(frame['strand'])) == 1 + assert len(set(frame['sample'].astype(str))) == 1 + + # Pick first row as reference for shared fields. + ref = frame.iloc[0] + + # Calculate new location as mean, biased towards + # insertions with more weight (a higher ULP). 
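+            # E.g. locations [100, 110] with unique depths [9, 1] merge to
+            # a weighted location of 101.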
+ weighted_loc = np.average(frame.location, + weights=frame['depth_unique']) + weighted_loc = int(round(weighted_loc)) + + return pd.Series( + {'insertion_id': np.nan, + 'seqname': ref['seqname'], + 'location': weighted_loc, + 'strand': ref['strand'], + 'sample': ref['sample'], + 'depth': frame['depth'].sum(), + 'depth_unique': frame['depth_unique'].sum()}, + index=ref.index) + + @staticmethod + def _annotate_clonality(ins_frame): + groups = ins_frame.groupby('sample') + + if len(groups) > 0: + clonality = groups.apply(lambda grp: grp['depth_unique'] / + grp['depth_unique'].max()) + + clonality.index = clonality.index.droplevel() + clonality.name = 'clonality' + else: + clonality = pd.Series({'clonality': np.NaN}, + index=ins_frame.index) + + ins_frame_clonality = pd.concat([ins_frame, clonality], axis=1) + + return ins_frame_clonality diff --git a/pyim/pipelines/shear_splink.py b/pyim/pipelines/shear_splink.py index 0a5ef4f..80bb635 100644 --- a/pyim/pipelines/shear_splink.py +++ b/pyim/pipelines/shear_splink.py @@ -1,119 +1,130 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) -from future.utils import native_str - +import collections +import itertools +import operator from enum import Enum -from functools import partial -from pathlib import Path +from os import path +import pysam import numpy as np import pandas as pd -from skbio import DNA, SequenceCollection - -from pyim.alignment.genome import Bowtie2Aligner -from pyim.alignment.vector import (ExactAligner, SswAligner, ChainedAligner, - filter_score, filter_end_match) -from pyim.cluster import cluster_frame_merged - -from ._base import (Pipeline, ParallelGenomicExtractor, - InsertionIdentifier, genomic_distance) - - -class ShearSplinkPipeline(Pipeline): - - @classmethod - def configure_argparser(cls, subparsers, name='shear_splink'): - parser = subparsers.add_parser(name, help=name + ' help') - - parser.add_argument('input', type=Path) - parser.add_argument('output_dir', type=Path) - parser.add_argument('reference', type=Path) - parser.add_argument('transposon', type=Path) - parser.add_argument('barcodes', type=Path) - parser.add_argument('linker', type=Path) - - parser.add_argument('--contaminants', type=Path) - parser.add_argument('--barcode_mapping', type=Path) - parser.add_argument('--min_genomic_length', type=int, default=15) - parser.add_argument('--min_depth', type=int, default=2) - parser.add_argument('--min_mapq', type=int, default=37) - - parser.add_argument('--threads', type=int, default=1) - - return parser - - @classmethod - def from_args(cls, args): - # Read transposon, barcode and linker sequences. - transposon_seq = DNA.read(str(args['transposon'])) - - linker_seq = DNA.read(str(args['linker'])) - - barcode_seqs = SequenceCollection.read( - str(args['barcodes']), constructor=DNA) - - # Read contaminants if supplied. - contaminant_seqs = SequenceCollection.read( - str(args['contaminants']), constructor=DNA) \ - if args['contaminants'] is not None else None - - # Read barcode map if supplied. - if barcode_seqs is not None and args['barcode_mapping'] is not None: - barcode_map = pd.read_csv(str(args['barcode_mapping']), - sep=native_str('\t')) - barcode_map = dict(zip(barcode_map['barcode'], - barcode_map['sample'])) - else: - barcode_map = None - - # Setup transposon aligner. 
- - transposon_filters = [ - # Require at least 90% of the sequence to be matched. - partial(filter_score, min_score=0.9) - ] - - transposon_aligner = ChainedAligner( - [ExactAligner(try_reverse=True), - SswAligner(try_reverse=True, filters=transposon_filters)]) - - # Setup linker aligner. - linker_filters = [ - # Require at least 90% of the sequence to be matched. - partial(filter_score, min_score=0.9), - - # Perfect match at the end of the read? - partial(filter_end_match, min_coverage=0.5, min_identity=0.9) - ] - - linker_aligner = ChainedAligner( - [ExactAligner(try_reverse=False), - SswAligner(try_reverse=False, filters=linker_filters)] - ) - - # Setup extractor and identifier for pipeline. - extractor = ShearSplinkExtractor( - transposon_sequence=transposon_seq, - transposon_aligner=transposon_aligner, - barcode_sequences=barcode_seqs, - barcode_map=barcode_map, - barcode_aligner=ExactAligner(try_reverse=False), - linker_sequence=linker_seq, - linker_aligner=linker_aligner, - contaminant_sequences=contaminant_seqs, - min_length=args['min_genomic_length'], - threads=args['threads']) - - aligner = Bowtie2Aligner(args['reference'], bam_output=True, - threads=args['threads']) - identifier = ShearSplinkIdentifier( - min_mapq=args['min_mapq'], min_depth=args['min_depth']) - - return cls(extractor=extractor, aligner=aligner, identifier=identifier) +from toolz import curry, map, pipe, merge_with +from toolz.curried import filter + +import skbio +from tqdm import tqdm + +from pyim.alignment.bowtie2 import align as bowtie_align +from pyim.alignment.vector import (align_exact, align_multiple, + align_with_reverse, reverse_alignment) +from pyim.util import count_lines + +from pyim.pipelines._model import ExtractResult, Insertion +from pyim.pipelines._helpers import ( + print_stats, write_genomic_sequences, build_barcode_map, + chain_groupby, groupby_barcode, + groupby_reference_position) + + +# --- Register pipeline --- # + +def register(subparsers, name='shear_splink'): + parser = subparsers.add_parser(name, help=name + ' help') + + # Required arguments. + parser.add_argument('input') + parser.add_argument('output_dir') + parser.add_argument('bowtie_index') + parser.add_argument('transposon') + parser.add_argument('barcodes') + parser.add_argument('linker') + + # Optional arguments. + parser.add_argument('--contaminants', default=None) + parser.add_argument('--sample_map', default=None) + parser.add_argument('--min_genomic_length', type=int, default=15) + parser.add_argument('--min_depth', type=int, default=2) + parser.add_argument('--min_mapq', type=int, default=37) + + # Set main for dispatch. + parser.set_defaults(main=main) + + return parser + + +def main(args): + + # Setup input reads. + reads = tqdm(skbio.io.read(args.input, format='fasta'), + total=count_lines(args.input) // 2, leave=True) + + # Read transposon, linker and barcode sequences. + transposon = skbio.io.read(args.transposon, format='fasta', into=skbio.DNA) + linker = skbio.io.read(args.linker, format='fasta', into=skbio.DNA) + + barcodes = list(skbio.io.read(args.barcodes, format='fasta', + constructor=skbio.DNA)) + + if args.contaminants is not None: + contaminants = list(skbio.io.read(args.contaminants, format='fasta', + constructor=skbio.DNA)) + else: + contaminants = None + + # Read barcode --> sample map if given. + if args.sample_map is not None: + sample_map = pd.read_csv(args.sample_map, sep='\t') + else: + sample_map = None + # Run pipeline! 
+ shear_splink(reads, transposon, linker, barcodes, + args.bowtie_index, args.output_dir, + contaminants=contaminants, sample_map=sample_map, + min_genomic_length=args.min_genomic_length) + + +# --- Overall pipeline --- # + +def shear_splink(reads, transposon, linker, barcodes, bowtie_index, output_dir, + contaminants=None, sample_map=None, min_genomic_length=15): + # seq1 = DNA('CACTGGCCACGCGAAGGTGC') + # seq2 = DNA('GACCACTGGCCACGCGAAGG').reverse_complement() + # seq3 = DNA('CGTTGGTCACTCTACCCACA') + + # transposon = DNA('TTTG', metadata=dict(id='transposon')) + # barcodes = [DNA('AAAT', metadata=dict(id='BC01')), + # DNA('AAAA', metadata=dict(id='BC02'))] + # linker = DNA('CCCG', metadata=dict(id='linker')) + + # genomic_path = '/Users/Julian/Scratch/pyim/functional/genomic.fasta.gz' + # barcode_path = '/Users/Julian/Scratch/pyim/functional/barcodes.txt' + + # index_path = '/path/to/index' + # alignment_path = '/Users/Julian/Scratch/pyim/functional/alignment.bam' + + genomic_path = path.join(output_dir, 'genomic.fna') + barcode_path = path.join(output_dir, 'barcodes.txt') + alignment_path = path.join(output_dir, 'alignment.bam') + + # Extract genomic sequences and barcodes + _, barcode_map = extract_genomic( + reads, transposon=transposon, barcodes=barcodes, + linker=linker, output_path=genomic_path, + contaminants=contaminants, min_length=min_genomic_length) + barcode_map.to_csv(barcode_path, sep='\t', index=False) + + # Align to reference with Bowtie2. + bowtie_align(genomic_path, bowtie_index, alignment_path, + options={}, log=alignment_path + '.log') + + # Identify insertions from alignment. + # insertions = identify_insertions(alignment_path, barcode_map=barcode_map) + # print(insertions) + + # return insertions + + +# --- Genomic sequence extraction --- # class ShearSplinkStatus(Enum): contaminant = 1 @@ -125,197 +136,191 @@ class ShearSplinkStatus(Enum): proper_read = 7 -class ShearSplinkExtractor(ParallelGenomicExtractor): - - STATUS = ShearSplinkStatus +def extract_genomic(reads, transposon, barcodes, linker, output_path, + sample_map=None, contaminants=None, min_length=15, + io_kwargs=None): + io_kwargs = io_kwargs or {} + + # Extract and write genomic sequences. + barcode_map = pipe( + reads, + _extract_reads(transposon=transposon, + barcodes=barcodes, + linker=linker, + contaminants=contaminants, + sample_map=sample_map), + print_stats, + filter(lambda r: r.status == ShearSplinkStatus.proper_read), + filter(lambda r: len(r.genomic_sequence) >= min_length), + write_genomic_sequences(file_path=output_path, + format='fasta', **io_kwargs), + build_barcode_map) + + # Build frame mapping reads to barcodes. + barcode_frame = pd.DataFrame.from_records( + iter(barcode_map.items()), columns=['read_id', 'barcode']) + + return output_path, barcode_frame + + +@curry +def _extract_reads(reads, transposon, barcodes, linker, contaminants=None, + transposon_func=None, barcode_func=None, + linker_func=None, sample_map=None): + + # Specify defaults for not provided aligners. + if transposon_func is None: + transposon_func = align_with_reverse(align_func=align_exact) + + if barcode_func is None: + barcode_func = align_multiple(align_func=align_exact) + + if linker_func is None: + linker_func = align_exact + + # Setup contaminant aligner if sequences are provided. + if contaminants is not None: + contaminant_func = align_multiple(queries=contaminants, + align_func=align_exact, + return_first=True) + else: + contaminant_func = None + + # Prime aligners with their respective sequences. 
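+    # The align functions are curried (toolz.curry), so supplying only the
+    # query/queries here returns a partial function that still expects the
+    # read as its target argument.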
+ transposon_func = transposon_func(query=transposon) + barcode_func = barcode_func(queries=barcodes) + linker_func = linker_func(query=linker) + + # Extract and return results. + extract_func = curry(_extract_read, + transposon_func=transposon_func, + barcode_func=barcode_func, + linker_func=linker_func, + contaminant_func=contaminant_func) + + for result in map(extract_func, reads): + yield result + + +def _extract_read( + read, transposon_func, barcode_func, + linker_func, contaminant_func=None): + """ Extracts the genomic sequence and barcode from the passed + read. Reads containing contaminants are dropped. Reads are + expected to look as follows: + + [barcode][transposon][genomic-sequence][linker] + + Each of these sequences is recognized by their corresponding + alignment function. The barcode alignment identifies the + barcode (and thus the sample) of the read, whilst the transposon + and linker alignments are used to delineate the genomic sequence. + + The function returns an ExactResult tuple that contains the + genomic sequence, barcode and a status flag. If any errors + occur during the extraction, the genomic sequence and barcode + values are None and the status flag indicates the underlying reason. + """ + + # Drop read if it contains a contaminant. + if contaminant_func is not None and contaminant_func(read) is not None: + return ExtractResult(None, None, ShearSplinkStatus.contaminant) + + # Identify location of the transposon. + transposon_aln = transposon_func(read) + if transposon_aln is None: + return ExtractResult(None, None, ShearSplinkStatus.no_transposon) + + # If transposon is on the reverse strand, flip the read and the + # alignment to bring everything into the same (fwd) orientation. + if transposon_aln.strand == -1: + read = read.reverse_complement() + transposon_aln = reverse_alignment(transposon_aln) + + # Identify barcode of the read. + try: + barcode_aln = barcode_func(read) + if barcode_aln is None: + return ExtractResult(None, None, ShearSplinkStatus.no_barcode) + except ValueError: + return ExtractResult(None, None, ShearSplinkStatus.multiple_barcodes) + + barcode = barcode_aln.query_id + + # Identify location of linker. + linker_aln = linker_func(read) + if linker_aln is None: + return ExtractResult(None, None, ShearSplinkStatus.no_linker) + + # Extract genomic sequence using previous alignments. + genomic = read[transposon_aln.target_end:linker_aln.target_start] + + return ExtractResult(genomic, barcode, ShearSplinkStatus.proper_read) + + +# --- Insertion identification --- # + +def identify_insertions(alignment_path, barcode_map): + + bam = pysam.AlignmentFile(alignment_path) + alns = bam.fetch(multiple_iterators=True) + + # Group alignments by barcode and position. + aln_groups = chain_groupby( + alns, + [groupby_reference_position(alignment_file=bam), + groupby_barcode(barcode_map=barcode_map)]) + + # Convert groups into insertion frame. + insertions = pd.DataFrame.from_records( + (_alignments_to_insertion(info, alns) + for info, alns in aln_groups) , + columns=['id', 'chrom', 'position', 'strand', + 'barcode', 'depth', 'depth_unique']) + + # Cluster and merge close insertions + + + return insertions + + +def _alignments_to_insertion(info, alignments, id_=None): + ref, pos, strand, bc = info + + # Get positions of the non-transposon ends of the alignment. + end_field = 'reference_end' if strand == 1 else 'reference_start' + end_positions = map(operator.attrgetter(end_field), alignments) + + # Calculate overall depth and unique end depth. 
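+    # depth counts every supporting read, while depth_unique counts distinct
+    # non-transposon end positions (unique ligation points, ULP).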
+ depth = len(alignments) + depth_unique = len(set(end_positions)) + + return id_, ref, pos, strand, bc, depth, depth_unique + + # metadata = dict(depth=depth, depth_unique=depth_unique) + #return Insertion(id=id_, seq_name=ref, location=pos, strand=strand, + # sample=bc, metadata=metadata) + + +def group_insertions(insertions, distance): + # for insertion in insertions: + # check if we have an insertion from this sample in our collection + # if so, add to collection + + # - When did we last see SAMPLE_X? + # - Which sample have we not seen within distance? + pass + + +def merge_insertions(insertions): + # Summarize location as mean. + location = np.average([ins.location for ins in insertions]) + + # Merge metadata by summing depths. + metadata = merge_with(sum, *[ins.metadata for ins in insertions]) + + # Take first insertion as reference for other attributes. + ref = insertions[0] - def __init__(self, transposon_sequence, barcode_sequences, linker_sequence, - contaminant_sequences=None, transposon_aligner=None, - barcode_aligner=None, linker_aligner=None, barcode_map=None, - min_length=1, threads=1, chunk_size=1000): - super().__init__(min_length=min_length, - threads=threads, - chunk_size=chunk_size) - - # Sequences. - self._transposon = transposon_sequence - self._contaminants = contaminant_sequences - self._barcodes = barcode_sequences - self._linker = linker_sequence - - # Aligners. - self._transposon_aligner = transposon_aligner \ - if transposon_aligner is not None \ - else ExactAligner(try_reverse=True) - - self._contaminant_aligner = ExactAligner(try_reverse=True) - - self._barcode_aligner = barcode_aligner \ - if barcode_aligner is not None else ExactAligner() - - self._linker_aligner = linker_aligner \ - if linker_aligner is not None else ExactAligner() - - # Barcode map if given (maps barcodes to samples). - self._barcode_map = barcode_map - - def extract_read(self, read): - # Check for contaminants. - if self._contaminants is not None: - contaminant_aln = self._contaminant_aligner.\ - align_multiple(self._contaminants, read, how='any') - - if contaminant_aln is not None: - return None, self.STATUS.contaminant - - # Check for transposon sequence. - transposon_aln = self._transposon_aligner.align( - self._transposon, read) - - if transposon_aln is None: - # Missing transposon sequence. - return None, self.STATUS.no_transposon - else: - # If we have a transposon sequence, continue. - if transposon_aln.target_strand == -1: - # If transposon is on the reverse strand, flip the - # read and the alignment to bring everything downstream - # into the same (fwd) orientation. - read = read.reverse_complement() - transposon_aln = transposon_aln.reverse(read) - - linker_aln = self._linker_aligner.align( - self._linker, read) - - if linker_aln is None: - # Missing linker sequence. - return None, self.STATUS.no_linker - else: - try: - barcode_aln = self._barcode_aligner.\ - align_multiple(self._barcodes, read) - except ValueError: - return None, self.STATUS.multiple_barcodes - - if barcode_aln is None: - # Missing barcode sequence. - return None, self.STATUS.no_barcode - else: - # Read is complete, return genomic part and barcode. 
- genomic = read[transposon_aln.target_end: - linker_aln.target_start] - - if len(genomic) < self._min_length: - return None, self.STATUS.too_short - else: - barcode = barcode_aln.query_id - - if self._barcode_map is not None: - barcode = self._barcode_map[barcode] - - return ((genomic, barcode), - self.STATUS.proper_read) - - -class ShearSplinkIdentifier(InsertionIdentifier): - - def __init__(self, min_depth=0, min_mapq=37, merge_distance=10): - super().__init__() - - self._min_depth = min_depth - self._min_mapq = min_mapq - self._merge_distance = merge_distance - - def identify(self, alignment_path, barcode_map=None): - insertions = [] - - groups = self._group_by_position_bam( - alignment_path, min_mapq=self._min_mapq, barcode_map=barcode_map) - for (ref_id, pos, strand, bc), alns in groups: - # Determine depth as the number of reads at this position. - depth = len(alns) - - # Determine depth_unique by looking at differences in the - # other position (end for fwd strand, start for rev strand). - other_pos = (a.reference_end for a in alns) if strand == 1 \ - else (a.reference_start for a in alns) - depth_unique = len(set(other_pos)) - - insertions.append( - {'insertion_id': np.nan, 'seqname': ref_id, - 'location': pos, 'strand': strand, 'sample': bc, - 'depth': depth, 'depth_unique': depth_unique}) - - # Create insertion frame. - insertions = pd.DataFrame.from_records( - insertions, columns=['insertion_id', 'seqname', 'location', - 'strand', 'sample', 'depth', 'depth_unique']) - # Merge insertions in close proximity to account for sequencing errors. - if self._merge_distance > 0: - insertions = cluster_frame_merged( - insertions, groupby=['seqname', 'sample', 'strand'], - dist_func=genomic_distance, merge_func=self._merge_insertions, - linkage='complete', criterion='distance', - t=self._merge_distance) - - # Filter by min_depth. - insertions = insertions.ix[ - insertions['depth_unique'] >= self._min_depth] - - # Add clonality annotation. - insertions = self._annotate_clonality(insertions) - - # Sort by coordinate and add identifiers. - insertions = insertions.sort(['seqname', 'location']) - - insertions['insertion_id'] = ['INS_{}'.format(i+1) - for i in range(len(insertions))] - - return insertions - - @classmethod - def _merge_insertions(cls, frame): - if len(frame) == 0: - return frame.iloc[0] - else: - # Check if merging is sane. - assert len(set(frame['seqname'])) == 1 - assert len(set(frame['strand'])) == 1 - assert len(set(frame['sample'].astype(str))) == 1 - - # Pick first row as reference for shared fields. - ref = frame.iloc[0] - - # Calculate new location as mean, biased towards - # insertions with more weight (a higher ULP). 
- weighted_loc = np.average(frame.location, - weights=frame['depth_unique']) - weighted_loc = int(round(weighted_loc)) - - return pd.Series( - {'insertion_id': np.nan, - 'seqname': ref['seqname'], - 'location': weighted_loc, - 'strand': ref['strand'], - 'sample': ref['sample'], - 'depth': frame['depth'].sum(), - 'depth_unique': frame['depth_unique'].sum()}, - index=ref.index) - - @staticmethod - def _annotate_clonality(ins_frame): - groups = ins_frame.groupby('sample') - - if len(groups) > 0: - clonality = groups.apply(lambda grp: grp['depth_unique'] / - grp['depth_unique'].max()) - - clonality.index = clonality.index.droplevel() - clonality.name = 'clonality' - else: - clonality = pd.Series({'clonality': np.NaN}, - index=ins_frame.index) - - ins_frame_clonality = pd.concat([ins_frame, clonality], axis=1) - - return ins_frame_clonality + return Insertion(id=None, seqname=ref.seqname, location=location, + strand=ref.strand, sample=ref.sample, metadata=metadata) From 5cc83ca224ed7ca7b6bf75525f19a25160555879 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 21 Dec 2015 16:40:51 +0100 Subject: [PATCH 015/100] Basic implementation of 'functional' ShearSplink pipeline. --- pyim/alignment/bowtie2.py | 46 ++- pyim/cluster.py | 57 ---- pyim/main/align.py | 44 +-- pyim/pipelines/_base.py | 313 ----------------- pyim/pipelines/_helpers/__init__.py | 0 pyim/pipelines/_helpers/clustering.py | 64 ++++ .../{_helpers.py => _helpers/grouping.py} | 66 ++-- pyim/pipelines/_helpers/pipeline.py | 55 +++ pyim/pipelines/_model.py | 5 - pyim/pipelines/_shear_splink.py | 321 ------------------ pyim/pipelines/shear_splink.py | 158 ++++----- pyim/util.py | 44 ++- 12 files changed, 289 insertions(+), 884 deletions(-) delete mode 100644 pyim/cluster.py delete mode 100644 pyim/pipelines/_base.py create mode 100644 pyim/pipelines/_helpers/__init__.py create mode 100644 pyim/pipelines/_helpers/clustering.py rename pyim/pipelines/{_helpers.py => _helpers/grouping.py} (73%) create mode 100644 pyim/pipelines/_helpers/pipeline.py delete mode 100644 pyim/pipelines/_shear_splink.py diff --git a/pyim/alignment/bowtie2.py b/pyim/alignment/bowtie2.py index 14a210f..2c156fa 100644 --- a/pyim/alignment/bowtie2.py +++ b/pyim/alignment/bowtie2.py @@ -4,10 +4,13 @@ int, map, next, oct, open, pow, range, round, str, super, zip) +import os import subprocess +from os import path -def align(m1, index, output, m2=None, options=None, log=None): +def align(m1, index, output, m2=None, options=None, + log=None, bam_output=False): options = {} or options # Inject inputs into options. @@ -18,6 +21,12 @@ def align(m1, index, output, m2=None, options=None, log=None): options['-2'] = m2 # Inject index and output. + if not output.endswith('.sam'): + if output.endswith('.bam'): + output = output.replace('.bam', '.sam') + else: + output = output + '.sam' + options['-x'] = index options['-S'] = output @@ -30,6 +39,11 @@ def align(m1, index, output, m2=None, options=None, log=None): else: subprocess.check_call(args) + # Convert to bam if needed. + if bam_output: + output = sam_to_bam(output, sort=True, + index=True, delete_sam=True) + return output @@ -45,3 +59,33 @@ def dict_to_args(arg_dict): args.append(str(value)) return args + + +def sam_to_bam(sam_path, bam_path=None, sort=False, + index=False, delete_sam=False): + if bam_path is None: + # Default output name replaces .sam with .bam. + bam_path = path.splitext(sam_path)[0] + '.bam' + + if sort: + # Pipe bam into samtools sort for sorting. 
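+        # Roughly equivalent to the shell pipeline:
+        #   samtools view -b <sam> | samtools sort - <bam prefix>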
+ p1 = subprocess.Popen(['samtools', 'view', '-b', sam_path], + stdout=subprocess.PIPE) + p2 = subprocess.Popen(['samtools', 'sort', '-', + path.splitext(bam_path)[0]], stdin=p1.stdout) + p1.stdout.close() + p2.communicate() + + if index: + # Indexing bam file if needed. + subprocess.check_call(['samtools', 'index', bam_path]) + else: + # Only convert to bam. + subprocess.check_call(['samtools', 'view', '-b', + '-o', bam_path, sam_path]) + + if delete_sam: + # Delete original sam if requested. + os.unlink(sam_path) + + return bam_path \ No newline at end of file diff --git a/pyim/cluster.py b/pyim/cluster.py deleted file mode 100644 index 74c394a..0000000 --- a/pyim/cluster.py +++ /dev/null @@ -1,57 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (bytes, dict, int, list, object, range, str, - ascii, chr, hex, input, next, oct, open, - pow, round, super, filter, map, zip) - -import numpy as np -import pandas as pd -from scipy.cluster.hierarchy import fcluster, complete, average - -LINKAGE_MAP = { - 'complete': complete, - 'average': average -} - - -def cluster_frame(frame, dist_func, groupby=None, - linkage='complete', criterion='distance', t=0): - # Lookup linkage type. - try: - linkage = LINKAGE_MAP[linkage] - except KeyError: - raise ValueError('Unknown linkage type {}'.format(linkage)) - - # Group by columns if any are given. - if groupby is None: - groups = [(None, frame)] - else: - # Replace NaNs to avoid dropping entries. - frame = frame.fillna('NaN') - groups = frame.groupby(groupby) - - # Determine clusters and use to sub-group frame. - for _, grp in groups: - if len(grp) == 1: - yield grp.replace('NaN', np.nan) - else: - dists = dist_func(grp) - clusters = fcluster(complete(dists), t=t, criterion=criterion) - for _, cluster_grp in grp.groupby(clusters, group_keys=False): - if groupby is not None: - cluster_grp = cluster_grp.replace('NaN', np.nan) - yield cluster_grp - - -def cluster_frame_merged(frame, dist_func, merge_func, **kwargs): - # Otherwise we merge each group into a single row (series) - # and return the summarized dataframe. - groups = list(cluster_frame(frame, dist_func, **kwargs)) - - frame = pd.DataFrame.from_records( - (merge_func(grp) for grp in groups), - columns=frame.columns) - - return frame - - diff --git a/pyim/main/align.py b/pyim/main/align.py index 41d96e6..149aad0 100644 --- a/pyim/main/align.py +++ b/pyim/main/align.py @@ -6,51 +6,23 @@ import argparse -from pyim.pipelines.lam_pcr import LamPcrPipeline -from pyim.pipelines.shear_splink import ShearSplinkPipeline +# from pyim.pipelines.lam_pcr import LamPcrPipeline +from pyim.pipelines import shear_splink -PIPELINES = { - 'lam_pcr': LamPcrPipeline, - 'shear_splink': ShearSplinkPipeline -} - -def setup_parser(): +def main(): + # Setup main parser. parser = argparse.ArgumentParser(prog='pyim-align') subparsers = parser.add_subparsers(dest='pipeline') subparsers.required = True - for name, class_ in PIPELINES.items(): - class_.configure_argparser(subparsers, name=name) - - return parser + # Register pipelines. + shear_splink.register(subparsers) - -def main(): - parser = setup_parser() + # Parse args and dispatch. args = parser.parse_args() - - # Check if a sub-parser was chosen. - if args.pipeline is None: - raise ValueError('No pipeline was specified as sub-command (choose ' - 'from {})' .format(', '.join(PIPELINES.keys()))) - - # Parse options and extract main input/output parameters. 
- arg_dict = vars(args) - - pipeline_name = arg_dict.pop('pipeline') - input_path = arg_dict.pop('input') - output_path = arg_dict.pop('output_dir') - - # Instantiate chosen pipeline and run! - try: - pipeline_class = PIPELINES[pipeline_name] - except KeyError: - raise ValueError('Pipeline \'{}\' does not exist'.format(pipeline_name)) - else: - pipeline = pipeline_class.from_args(arg_dict) - pipeline.run(input_path, output_path) + args.main(args) if __name__ == '__main__': diff --git a/pyim/pipelines/_base.py b/pyim/pipelines/_base.py deleted file mode 100644 index 8b84739..0000000 --- a/pyim/pipelines/_base.py +++ /dev/null @@ -1,313 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -# noinspection PyUnresolvedReferences -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) -from future.utils import native_str - -import logging -import pkg_resources -from collections import defaultdict -from multiprocessing import Pool - -import pysam -import pandas as pd -import numpy as np -from scipy.spatial.distance import pdist - -from skbio import DNA -from skbio import io as skbio_io - -from pyim.util import PrioritySet - -logging.basicConfig( - format='%(asctime)-15s %(message)s', - datefmt='[%Y-%m-%d %H:%M:%S]', - level=logging.INFO) - - -# --- Pipelines --- # - -class Pipeline(object): - - def __init__(self, extractor, aligner, identifier): - super().__init__() - self._extractor = extractor - self._aligner = aligner - self._identifier = identifier - - @classmethod - def configure_argparser(cls, parser): - raise NotImplementedError() - - @classmethod - def from_args(cls, args): - raise NotImplementedError() - - def run(self, input_path, output_dir): - logger = logging.getLogger() - - version = pkg_resources.get_distribution('pyim').version - logger.info('--- PyIM v{} ---'.format(version)) - - logger.info('Starting {} pipeline'.format( - self.__class__.__name__.replace('Pipeline', ''))) - - # Create directories if needed. - if not output_dir.exists(): - output_dir.mkdir() - - if input_path.suffix not in {'.bam', '.sam'}: - genomic_path = output_dir / ('genomic' + - ''.join(input_path.suffixes)) - barcode_path = output_dir / 'genomic.barcodes.txt' - - # Extract genomic reads from input. - logger.info('Extracting genomic sequences from reads') - - _, barcodes = self._extractor.extract_file( - input_path=input_path, output_path=genomic_path) - - # Log statistics. - total_reads = sum(self._extractor.stats.values()) - - logger.info('- Processed {} reads'.format(total_reads)) - logger.info('- Read statistics') - for status in self._extractor.STATUS: - count = self._extractor.stats[status] - logger.info('\t- {}: {} ({:3.2f}%)' - .format(status.name, count, - (count / total_reads) * 100)) - - # Write out barcodes as frame. - barcode_frame = pd.DataFrame.from_records( - iter(barcodes.items()), columns=['read_id', 'barcode']) - barcode_frame.to_csv( - str(barcode_path), sep=native_str('\t'), index=False) - - # Align to reference genome. 
- logger.info('Aligning genomic sequences to reference') - logger.info('- Using {} aligner (v{})'.format( - self._aligner.__class__.__name__.replace('Aligner', ''), - self._aligner.get_version())) - - aln_path = self._aligner.align_file( - file=genomic_path, output_dir=output_dir) - else: - aln_path, barcodes = input_path, None - - barcode_map = pd.read_csv( - str(output_dir / 'genomic.barcodes.txt'), sep='\t') - barcodes = dict(zip(barcode_map['read_id'], barcode_map['barcode'])) - - # Identify transposon insertions. - logger.info('Identifying insertions from alignment') - - insertions = self._identifier.identify(aln_path, barcode_map=barcodes) - insertions.to_csv(str(output_dir / 'insertions.txt'), - sep=native_str('\t'), index=False) - - logger.info('--- Done! ---') - - -# --- Extractors --- # - -# noinspection PyShadowingBuiltins -class GenomicExtractor(object): - - DEFAULT_IN_FORMAT = 'fasta' - DEFAULT_OUT_FORMAT = 'fasta' - - def __init__(self, min_length=1, **kwargs): - super().__init__() - self._min_length = min_length - self._stats = None - - self.reset_stats() - - @property - def stats(self): - return self._stats - - def reset_stats(self): - self._stats = defaultdict(int) - - def extract(self, reads): - for read in reads: - result, status = self.extract_read(read) - self._stats[status] += 1 - if result is not None: - yield result - - def extract_read(self, read): - raise NotImplementedError() - - def extract_from_file(self, file_path, format=None): - format = self.DEFAULT_IN_FORMAT if format is None else format - - reads = skbio_io.read( - str(file_path), format=format, constructor=DNA) - for genomic, barcode in self.extract(reads): - yield genomic, barcode - - def extract_to_file(self, reads, file_path, format=None): - format = self.DEFAULT_OUT_FORMAT if format is None else format - - barcodes = {} - with open(str(file_path), 'w') as file_: - for genomic, barcode in self.extract(reads): - barcodes[genomic.metadata['id']] = barcode - skbio_io.write(obj=genomic, format=format, into=file_) - - return file_path, barcodes - - def extract_file(self, input_path, output_path, - format_in=None, format_out=None): - format_in = self.DEFAULT_IN_FORMAT if format_in is None else format_in - format_out = self.DEFAULT_OUT_FORMAT \ - if format_out is None else format_out - - reads = skbio_io.read( - str(input_path), format=format_in, constructor=DNA) - return self.extract_to_file(reads, output_path, format=format_out) - - -class ParallelGenomicExtractor(GenomicExtractor): - - def __init__(self, min_length=1, threads=1, chunk_size=1000, **kwargs): - super().__init__(min_length=min_length) - - self._threads = threads - self._chunk_size = chunk_size - - def extract(self, reads): - if self._threads == 1: - for result in super().extract(reads): - yield result - else: - pool = Pool(self._threads) - - for result, status in pool.imap_unordered( - self.extract_read, reads, chunksize=self._chunk_size): - self._stats[status] += 1 - if result is not None: - yield result - - pool.close() - pool.join() - - def extract_read(self, read): - raise NotImplementedError() - - -# --- Identifiers --- # - -class InsertionIdentifier(object): - - def __init__(self, **kwargs): - super().__init__() - - def identify(self, alignment): - raise NotImplementedError() - - @classmethod - def _group_by_position_bam(cls, bam_path, barcode_map=None, min_mapq=0): - bam_file = pysam.AlignmentFile(str(bam_path), 'rb') - - # Collect insertions from alignments. 
- for ref_id in bam_file.references: - alignments = bam_file.fetch(reference=ref_id) - alignments = (aln for aln in alignments - if aln.mapping_quality >= min_mapq) - - # Group alignments by genomic position. - aln_groups = cls._group_by_position_barcode( - alignments, barcode_map=barcode_map) - - for (pos, strand, bc), alns in aln_groups: - yield (ref_id, pos, strand, bc), alns - - @staticmethod - def _group_by_position(alignments): - """ Groups alignments by their positions, grouping forward strand - alignments with the same start position and reverse strand - alignments with the same end position. Assumes alignments - are all on a single reference sequence. - """ - # Setup our collections for tracking reads and positions. - # - # The priority set is used to track positions with alignment groups, - # ensuring that no position is listed twice (the set part) and - # always giving the lowest position first (the priority part). - # - # The alignment dict contains two lists for each position with at - # least one alignment, one for forward reads and one for reverse. - # Any alignments encountered as position x in orientation o are added - # to the corresponding entry dict[x][o] in the list, in which - # o is encoded as {0,1}, with 1 being for reverse strand alignments. - position_set = PrioritySet() - aln_dict = defaultdict(lambda: ([], [])) - - curr_pos = 0 - for aln in alignments: - # Check our ordering. - if aln.reference_start < curr_pos: - raise ValueError('Alignments not ordered by position') - - curr_pos = aln.reference_start - - # Add current read to collections. - is_reverse = aln.is_reverse - ref_pos = aln.reference_end if is_reverse else curr_pos - aln_dict[ref_pos][bool(is_reverse)].append(aln) - position_set.push(ref_pos, ref_pos) - - # Return any alignment groups before our current position. - try: - while position_set.first() < curr_pos: - first_pos = position_set.pop() - fwd_grp, rev_grp = aln_dict.pop(first_pos) - if len(fwd_grp) > 0: - yield (fwd_grp[0].reference_start, 1), fwd_grp - if len(rev_grp) > 0: - yield (rev_grp[0].reference_end, -1), rev_grp - except ValueError: - pass - - # We're done, yield any remaining alignment groups. 
- for _ in range(len(position_set)): - fwd_grp, rev_grp = aln_dict.pop(position_set.pop()) - if len(fwd_grp) > 0: - yield (fwd_grp[0].reference_start, 1), fwd_grp - if len(rev_grp) > 0: - yield (rev_grp[0].reference_end, -1), rev_grp - - @classmethod - def _group_by_position_barcode(cls, alignments, barcode_map=None): - grouped = cls._group_by_position(alignments) - - if barcode_map is None: - for tup, grp in grouped: - yield tup + (np.nan, ), grp - else: - for tup, grp in grouped: - for bc, bc_grp in cls._split_by_barcode(grp, barcode_map): - yield tup + (bc, ), bc_grp - - @staticmethod - def _split_by_barcode(alignments, barcode_map): - split_groups = defaultdict(list) - for aln in alignments: - barcode = barcode_map[aln.query_name] - split_groups[barcode].append(aln) - - for k, v in split_groups.items(): - yield k, v - - -def genomic_distance(insertions): - loc = insertions['location'] - loc_2d = np.vstack([loc, np.zeros_like(loc)]).T - dist = pdist(loc_2d, lambda u, v: np.abs(u-v).sum()) - return dist diff --git a/pyim/pipelines/_helpers/__init__.py b/pyim/pipelines/_helpers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pyim/pipelines/_helpers/clustering.py b/pyim/pipelines/_helpers/clustering.py new file mode 100644 index 0000000..a05a019 --- /dev/null +++ b/pyim/pipelines/_helpers/clustering.py @@ -0,0 +1,64 @@ +import toolz +import numpy as np +import pandas as pd +import scipy.cluster.hierarchy as sch +import scipy.spatial.distance as ssd + + +def merge_within_distance(insertions, max_dist=2000, agg_funcs=None): + clustered = cluster_insertions(insertions, max_dist=max_dist) + return merge_insertions(clustered, by='cluster', agg_funcs=agg_funcs) + + +def cluster_insertions(insertions, max_dist=2000, method='single'): + prev_n_clusters = 0 + + clustered_grps = [] + for _, group in insertions.groupby(['chrom', 'barcode', 'strand']): + clusters = _cluster_group(group, max_dist, method) + clusters += prev_n_clusters + + group = group.copy() + group['cluster'] = clusters + clustered_grps.append(group) + + prev_n_clusters = np.max(clusters) + + return pd.concat(clustered_grps, ignore_index=True) + + +def _cluster_group(insertions, max_dist, method): + if len(insertions) == 1: + clusters = np.array([1], dtype=np.int32) + else: + dists = genomic_distance(insertions) + z = sch.linkage(dists, method=method) + clusters = sch.fcluster(z, criterion='distance', t=max_dist) + + return clusters + + +def genomic_distance(insertions): + # Sanity check insertions (for debugging). + assert(insertions['chrom'].nunique() == 1) + assert(insertions['barcode'].nunique() == 1) + assert(insertions['strand'].nunique() == 1) + + # Calculate 1d distances. + loc = insertions['position'] + loc_2d = np.vstack([loc, np.zeros_like(loc)]).T + dist = ssd.pdist(loc_2d, lambda u, v: np.abs(u-v).sum()) + + return dist + + +def merge_insertions(insertions, by='cluster', agg_funcs=None): + # TODO: use weighted median. 
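+    # Defaults keep the first id/chrom/strand/barcode per cluster and take
+    # the median position; caller-supplied agg_funcs override these defaults
+    # (later keys win in toolz.merge).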
+ default_agg = {'id': 'first', 'chrom': 'first', 'position': 'median', + 'strand': 'first', 'barcode': 'first'} + agg_funcs = toolz.merge(default_agg, agg_funcs or {}) + + col_order = [c for c in insertions.columns if c in agg_funcs] + merged = insertions.groupby(by).agg(agg_funcs)[col_order] + + return merged diff --git a/pyim/pipelines/_helpers.py b/pyim/pipelines/_helpers/grouping.py similarity index 73% rename from pyim/pipelines/_helpers.py rename to pyim/pipelines/_helpers/grouping.py index bd73f0c..abf91e3 100644 --- a/pyim/pipelines/_helpers.py +++ b/pyim/pipelines/_helpers/grouping.py @@ -2,52 +2,42 @@ import itertools import operator -import skbio -from toolz import curry +import heapq +import toolz -from pyim.util import PrioritySet +class PrioritySet(object): -def print_stats(results): - # Iterate over results, counting statuses. - status_counts = collections.defaultdict(int) + def __init__(self): + self._heap = [] + self._set = set() - for result in results: - status_counts[result.status.name] += 1 - yield result + def push(self, item, priority): + if item not in self._set: + heapq.heappush(self._heap, (priority, item)) + self._set.add(item) - # We're done, so print frequencies! - print('\nExtract statistics:') + def pop(self): + priority, item = heapq.heappop(self._heap) + self._set.remove(item) + return item - total = sum(status_counts.values()) - for status, count in status_counts.items(): - percentage = (count / total) * 100 - print('{:>18}: {:>8} ({:05.2f}%)'.format(status, count, percentage)) + def first(self): + _, item = min(self._heap) + return item + def __len__(self): + return len(self._heap) -@curry -def write_genomic_sequences(results, file_path, format='fastq', - mode='w', **io_kwargs): - """ Test docstring """ - with skbio.io.open(file_path, mode, **io_kwargs) as file_: - for result in results: - skbio.io.write(result.genomic_sequence, into=file_, format=format) - yield result + def __str__(self): + return 'PrioritySet(heap={}, set={})'\ + .format(str(self._heap), str(self._set)) - -@curry -def build_barcode_map(results, sample_map=None): - if sample_map is None: - return {result.genomic_sequence.metadata['id']: - result.barcode - for result in results} - else: - return {result.genomic_sequence.metadata['id']: - sample_map[result.barcode] - for result in results} + def __repr__(self): + return str(self) -@curry +@toolz.curry def groupby_reference(alignments, alignment_file=None): for reference, group in itertools.groupby( alignments, operator.attrgetter('reference_id')): @@ -111,7 +101,7 @@ def groupby_position(alignments): yield (rev_grp[0].reference_end, -1), rev_grp -@curry +@toolz.curry def groupby_reference_position(alignments, alignment_file=None): chained = chain_groupby( alignments, [groupby_reference(alignment_file=alignment_file), @@ -120,7 +110,7 @@ def groupby_reference_position(alignments, alignment_file=None): yield res -@curry +@toolz.curry def groupby_barcode(alignments, barcode_map): # Group alignments by barcodes. 
groups = collections.defaultdict(list) @@ -143,5 +133,7 @@ def chain_groupby(iterable, groupby_funcs): yield key, group else: for key, group in grouped: + if not isinstance(key, tuple): + key = (key,) for sub_key, sub_group in chain_groupby(group, groupby_funcs[1:]): yield key + sub_key, sub_group diff --git a/pyim/pipelines/_helpers/pipeline.py b/pyim/pipelines/_helpers/pipeline.py new file mode 100644 index 0000000..e604dc7 --- /dev/null +++ b/pyim/pipelines/_helpers/pipeline.py @@ -0,0 +1,55 @@ +import collections +import itertools + +import skbio +import toolz + + +def print_stats(results): + # Iterate over results, counting statuses. + status_counts = collections.defaultdict(int) + + for result in results: + status_counts[result.status.name] += 1 + yield result + + # We're done, so print frequencies! + print('\nExtract statistics:') + + total = sum(status_counts.values()) + for status, count in status_counts.items(): + percentage = (count / total) * 100 + print('{:>18}: {:>8} ({:05.2f}%)'.format(status, count, percentage)) + + +@toolz.curry +def write_genomic_sequences(results, file_path, format='fastq', + mode='w', **io_kwargs): + """ Test docstring """ + with skbio.io.open(file_path, mode, **io_kwargs) as file_: + for result in results: + skbio.io.write(result.genomic_sequence, into=file_, format=format) + yield result + + +@toolz.curry +def build_barcode_map(results, sample_map=None): + if sample_map is None: + return {result.genomic_sequence.metadata['id']: + result.barcode + for result in results} + else: + return {result.genomic_sequence.metadata['id']: + sample_map[result.barcode] + for result in results} + + +def consume(iterator, n=None): + "Advance the iterator n-steps ahead. If n is none, consume entirely." + # Use functions that consume iterators at C speed. 
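+    # (This follows the 'consume' recipe from the itertools documentation.)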
+ if n is None: + # Feed the entire iterator into a zero-length deque + collections.deque(iterator, maxlen=0) + else: + # Advance to the empty slice starting at position n + next(itertools.islice(iterator, n, n), None) diff --git a/pyim/pipelines/_model.py b/pyim/pipelines/_model.py index ad3bbcc..d50f909 100644 --- a/pyim/pipelines/_model.py +++ b/pyim/pipelines/_model.py @@ -3,8 +3,3 @@ ExtractResult = collections.namedtuple( 'ExtractResult', ['genomic_sequence', 'barcode', 'status']) - - -Insertion = collections.namedtuple( - 'Insertion', ['id', 'seqname', 'location', - 'strand', 'sample', 'metadata']) diff --git a/pyim/pipelines/_shear_splink.py b/pyim/pipelines/_shear_splink.py deleted file mode 100644 index c477b8b..0000000 --- a/pyim/pipelines/_shear_splink.py +++ /dev/null @@ -1,321 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) -from future.utils import native_str - -from enum import Enum -from functools import partial -from pathlib import Path - -import numpy as np -import pandas as pd -from skbio import DNA, SequenceCollection - -from pyim.alignment.genome import Bowtie2Aligner -from pyim.alignment._vector import (ExactAligner, SswAligner, ChainedAligner, - filter_score, filter_end_match) -from pyim.cluster import cluster_frame_merged - -from ._base import (Pipeline, ParallelGenomicExtractor, - InsertionIdentifier, genomic_distance) - - -class ShearSplinkPipeline(Pipeline): - - @classmethod - def configure_argparser(cls, subparsers, name='shear_splink'): - parser = subparsers.add_parser(name, help=name + ' help') - - parser.add_argument('input', type=Path) - parser.add_argument('output_dir', type=Path) - parser.add_argument('reference', type=Path) - parser.add_argument('transposon', type=Path) - parser.add_argument('barcodes', type=Path) - parser.add_argument('linker', type=Path) - - parser.add_argument('--contaminants', type=Path) - parser.add_argument('--barcode_mapping', type=Path) - parser.add_argument('--min_genomic_length', type=int, default=15) - parser.add_argument('--min_depth', type=int, default=2) - parser.add_argument('--min_mapq', type=int, default=37) - - parser.add_argument('--threads', type=int, default=1) - - return parser - - @classmethod - def from_args(cls, args): - # Read transposon, barcode and linker sequences. - transposon_seq = DNA.read(str(args['transposon'])) - - linker_seq = DNA.read(str(args['linker'])) - - barcode_seqs = SequenceCollection.read( - str(args['barcodes']), constructor=DNA) - - # Read contaminants if supplied. - contaminant_seqs = SequenceCollection.read( - str(args['contaminants']), constructor=DNA) \ - if args['contaminants'] is not None else None - - # Read barcode map if supplied. - if barcode_seqs is not None and args['barcode_mapping'] is not None: - barcode_map = pd.read_csv(str(args['barcode_mapping']), - sep=native_str('\t')) - barcode_map = dict(zip(barcode_map['barcode'], - barcode_map['sample'])) - else: - barcode_map = None - - # Setup transposon aligner. - - transposon_filters = [ - # Require at least 90% of the sequence to be matched. - partial(filter_score, min_score=0.9) - ] - - transposon_aligner = ChainedAligner( - [ExactAligner(try_reverse=True), - SswAligner(try_reverse=True, filters=transposon_filters)]) - - # Setup linker aligner. - linker_filters = [ - # Require at least 90% of the sequence to be matched. 
- partial(filter_score, min_score=0.9), - - # Perfect match at the end of the read? - partial(filter_end_match, min_coverage=0.5, min_identity=0.9) - ] - - linker_aligner = ChainedAligner( - [ExactAligner(try_reverse=False), - SswAligner(try_reverse=False, filters=linker_filters)] - ) - - # Setup extractor and identifier for pipeline. - extractor = ShearSplinkExtractor( - transposon_sequence=transposon_seq, - transposon_aligner=transposon_aligner, - barcode_sequences=barcode_seqs, - barcode_map=barcode_map, - barcode_aligner=ExactAligner(try_reverse=False), - linker_sequence=linker_seq, - linker_aligner=linker_aligner, - contaminant_sequences=contaminant_seqs, - min_length=args['min_genomic_length'], - threads=args['threads']) - - aligner = Bowtie2Aligner(args['reference'], bam_output=True, - threads=args['threads']) - identifier = ShearSplinkIdentifier( - min_mapq=args['min_mapq'], min_depth=args['min_depth']) - - return cls(extractor=extractor, aligner=aligner, identifier=identifier) - - -class ShearSplinkStatus(Enum): - contaminant = 1 - no_transposon = 2 - no_linker = 3 - no_barcode = 4 - multiple_barcodes = 5 - too_short = 6 - proper_read = 7 - - -class ShearSplinkExtractor(ParallelGenomicExtractor): - - STATUS = ShearSplinkStatus - - def __init__(self, transposon_sequence, barcode_sequences, linker_sequence, - contaminant_sequences=None, transposon_aligner=None, - barcode_aligner=None, linker_aligner=None, barcode_map=None, - min_length=1, threads=1, chunk_size=1000): - super().__init__(min_length=min_length, - threads=threads, - chunk_size=chunk_size) - - # Sequences. - self._transposon = transposon_sequence - self._contaminants = contaminant_sequences - self._barcodes = barcode_sequences - self._linker = linker_sequence - - # Aligners. - self._transposon_aligner = transposon_aligner \ - if transposon_aligner is not None \ - else ExactAligner(try_reverse=True) - - self._contaminant_aligner = ExactAligner(try_reverse=True) - - self._barcode_aligner = barcode_aligner \ - if barcode_aligner is not None else ExactAligner() - - self._linker_aligner = linker_aligner \ - if linker_aligner is not None else ExactAligner() - - # Barcode map if given (maps barcodes to samples). - self._barcode_map = barcode_map - - def extract_read(self, read): - # Check for contaminants. - if self._contaminants is not None: - contaminant_aln = self._contaminant_aligner.\ - align_multiple(self._contaminants, read, how='any') - - if contaminant_aln is not None: - return None, self.STATUS.contaminant - - # Check for transposon sequence. - transposon_aln = self._transposon_aligner.align( - self._transposon, read) - - if transposon_aln is None: - # Missing transposon sequence. - return None, self.STATUS.no_transposon - else: - # If we have a transposon sequence, continue. - if transposon_aln.target_strand == -1: - # If transposon is on the reverse strand, flip the - # read and the alignment to bring everything downstream - # into the same (fwd) orientation. - read = read.reverse_complement() - transposon_aln = transposon_aln.reverse(read) - - linker_aln = self._linker_aligner.align( - self._linker, read) - - if linker_aln is None: - # Missing linker sequence. - return None, self.STATUS.no_linker - else: - try: - barcode_aln = self._barcode_aligner.\ - align_multiple(self._barcodes, read) - except ValueError: - return None, self.STATUS.multiple_barcodes - - if barcode_aln is None: - # Missing barcode sequence. - return None, self.STATUS.no_barcode - else: - # Read is complete, return genomic part and barcode. 
- genomic = read[transposon_aln.target_end: - linker_aln.target_start] - - if len(genomic) < self._min_length: - return None, self.STATUS.too_short - else: - barcode = barcode_aln.query_id - - if self._barcode_map is not None: - barcode = self._barcode_map[barcode] - - return ((genomic, barcode), - self.STATUS.proper_read) - - -class ShearSplinkIdentifier(InsertionIdentifier): - - def __init__(self, min_depth=0, min_mapq=37, merge_distance=10): - super().__init__() - - self._min_depth = min_depth - self._min_mapq = min_mapq - self._merge_distance = merge_distance - - def identify(self, alignment_path, barcode_map=None): - insertions = [] - - groups = self._group_by_position_bam( - alignment_path, min_mapq=self._min_mapq, barcode_map=barcode_map) - for (ref_id, pos, strand, bc), alns in groups: - # Determine depth as the number of reads at this position. - depth = len(alns) - - # Determine depth_unique by looking at differences in the - # other position (end for fwd strand, start for rev strand). - other_pos = (a.reference_end for a in alns) if strand == 1 \ - else (a.reference_start for a in alns) - depth_unique = len(set(other_pos)) - - insertions.append( - {'insertion_id': np.nan, 'seqname': ref_id, - 'location': pos, 'strand': strand, 'sample': bc, - 'depth': depth, 'depth_unique': depth_unique}) - - # Create insertion frame. - insertions = pd.DataFrame.from_records( - insertions, columns=['insertion_id', 'seqname', 'location', - 'strand', 'sample', 'depth', 'depth_unique']) - # Merge insertions in close proximity to account for sequencing errors. - if self._merge_distance > 0: - insertions = cluster_frame_merged( - insertions, groupby=['seqname', 'sample', 'strand'], - dist_func=genomic_distance, merge_func=self._merge_insertions, - linkage='complete', criterion='distance', - t=self._merge_distance) - - # Filter by min_depth. - insertions = insertions.ix[ - insertions['depth_unique'] >= self._min_depth] - - # Add clonality annotation. - insertions = self._annotate_clonality(insertions) - - # Sort by coordinate and add identifiers. - insertions = insertions.sort(['seqname', 'location']) - - insertions['insertion_id'] = ['INS_{}'.format(i+1) - for i in range(len(insertions))] - - return insertions - - @classmethod - def _merge_insertions(cls, frame): - if len(frame) == 0: - return frame.iloc[0] - else: - # Check if merging is sane. - assert len(set(frame['seqname'])) == 1 - assert len(set(frame['strand'])) == 1 - assert len(set(frame['sample'].astype(str))) == 1 - - # Pick first row as reference for shared fields. - ref = frame.iloc[0] - - # Calculate new location as mean, biased towards - # insertions with more weight (a higher ULP). 
- weighted_loc = np.average(frame.location, - weights=frame['depth_unique']) - weighted_loc = int(round(weighted_loc)) - - return pd.Series( - {'insertion_id': np.nan, - 'seqname': ref['seqname'], - 'location': weighted_loc, - 'strand': ref['strand'], - 'sample': ref['sample'], - 'depth': frame['depth'].sum(), - 'depth_unique': frame['depth_unique'].sum()}, - index=ref.index) - - @staticmethod - def _annotate_clonality(ins_frame): - groups = ins_frame.groupby('sample') - - if len(groups) > 0: - clonality = groups.apply(lambda grp: grp['depth_unique'] / - grp['depth_unique'].max()) - - clonality.index = clonality.index.droplevel() - clonality.name = 'clonality' - else: - clonality = pd.Series({'clonality': np.NaN}, - index=ins_frame.index) - - ins_frame_clonality = pd.concat([ins_frame, clonality], axis=1) - - return ins_frame_clonality diff --git a/pyim/pipelines/shear_splink.py b/pyim/pipelines/shear_splink.py index 80bb635..3295cfc 100644 --- a/pyim/pipelines/shear_splink.py +++ b/pyim/pipelines/shear_splink.py @@ -1,31 +1,29 @@ -import collections -import itertools +import os import operator from enum import Enum from os import path import pysam -import numpy as np import pandas as pd -from toolz import curry, map, pipe, merge_with -from toolz.curried import filter - import skbio -from tqdm import tqdm +import toolz +import tqdm +from toolz.curried import filter as curried_filter from pyim.alignment.bowtie2 import align as bowtie_align from pyim.alignment.vector import (align_exact, align_multiple, align_with_reverse, reverse_alignment) -from pyim.util import count_lines +from pyim.util import count_fasta_entries -from pyim.pipelines._model import ExtractResult, Insertion -from pyim.pipelines._helpers import ( - print_stats, write_genomic_sequences, build_barcode_map, - chain_groupby, groupby_barcode, - groupby_reference_position) +from ._model import ExtractResult +from ._helpers.pipeline import (print_stats, build_barcode_map, + write_genomic_sequences) +from ._helpers.grouping import (chain_groupby, groupby_barcode, + groupby_reference_position) +from ._helpers.clustering import merge_within_distance -# --- Register pipeline --- # +# --- Pipeline register hook + main --- # def register(subparsers, name='shear_splink'): parser = subparsers.add_parser(name, help=name + ' help') @@ -52,11 +50,6 @@ def register(subparsers, name='shear_splink'): def main(args): - - # Setup input reads. - reads = tqdm(skbio.io.read(args.input, format='fasta'), - total=count_lines(args.input) // 2, leave=True) - # Read transposon, linker and barcode sequences. transposon = skbio.io.read(args.transposon, format='fasta', into=skbio.DNA) linker = skbio.io.read(args.linker, format='fasta', into=skbio.DNA) @@ -76,52 +69,65 @@ def main(args): else: sample_map = None - # Run pipeline! - shear_splink(reads, transposon, linker, barcodes, - args.bowtie_index, args.output_dir, - contaminants=contaminants, sample_map=sample_map, - min_genomic_length=args.min_genomic_length) + # Create output_dir if it does not exist. + if not path.exists(args.output_dir): + os.mkdir(args.output_dir) + # Run pipeline! 
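+    # shear_splink writes its intermediates into output_dir and returns the
+    # final insertion frame, which is saved as insertions.txt below.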
+ insertions = shear_splink( + args.input, transposon, linker, barcodes, + args.bowtie_index, args.output_dir, + contaminants=contaminants, sample_map=sample_map, + min_genomic_length=args.min_genomic_length) -# --- Overall pipeline --- # - -def shear_splink(reads, transposon, linker, barcodes, bowtie_index, output_dir, - contaminants=None, sample_map=None, min_genomic_length=15): - # seq1 = DNA('CACTGGCCACGCGAAGGTGC') - # seq2 = DNA('GACCACTGGCCACGCGAAGG').reverse_complement() - # seq3 = DNA('CGTTGGTCACTCTACCCACA') + # Write insertion output. + insertions.to_csv(path.join(args.output_dir, 'insertions.txt'), + sep='\t', index=False) - # transposon = DNA('TTTG', metadata=dict(id='transposon')) - # barcodes = [DNA('AAAT', metadata=dict(id='BC01')), - # DNA('AAAA', metadata=dict(id='BC02'))] - # linker = DNA('CCCG', metadata=dict(id='linker')) - # genomic_path = '/Users/Julian/Scratch/pyim/functional/genomic.fasta.gz' - # barcode_path = '/Users/Julian/Scratch/pyim/functional/barcodes.txt' +# --- Overall pipeline --- # - # index_path = '/path/to/index' - # alignment_path = '/Users/Julian/Scratch/pyim/functional/alignment.bam' +def shear_splink(read_path, transposon, linker, barcodes, + bowtie_index, output_dir, contaminants=None, + sample_map=None, min_genomic_length=15): + # Determine paths for intermediates/outputs. genomic_path = path.join(output_dir, 'genomic.fna') - barcode_path = path.join(output_dir, 'barcodes.txt') - alignment_path = path.join(output_dir, 'alignment.bam') + barcode_path = path.join(output_dir, 'genomic.barcodes.txt') + alignment_base = path.join(output_dir, 'alignment') + + # Log progress with progressbar. + reads = skbio.read(read_path, format='fasta') + reads = tqdm.tqdm(reads, total=count_fasta_entries(read_path), + leave=True, ncols=80, desc='Test') # Extract genomic sequences and barcodes - _, barcode_map = extract_genomic( + _, barcode_frame = extract_genomic( reads, transposon=transposon, barcodes=barcodes, linker=linker, output_path=genomic_path, contaminants=contaminants, min_length=min_genomic_length) - barcode_map.to_csv(barcode_path, sep='\t', index=False) + barcode_frame.to_csv(barcode_path, sep='\t', index=False) # Align to reference with Bowtie2. - bowtie_align(genomic_path, bowtie_index, alignment_path, - options={}, log=alignment_path + '.log') + aln_path = bowtie_align(genomic_path, bowtie_index, alignment_base, + bam_output=True, options={'-f': True}, + log=alignment_base + '.log') # Identify insertions from alignment. - # insertions = identify_insertions(alignment_path, barcode_map=barcode_map) - # print(insertions) + barcode_map = dict(zip(barcode_frame['read_id'], + barcode_frame['barcode'])) + insertions = identify_insertions(aln_path, barcode_map=barcode_map) + + # Cluster and merge close insertions + agg_funcs = {'depth': 'sum', 'depth_unique': 'sum'} + insertions = merge_within_distance( + insertions, max_dist=2000, agg_funcs=agg_funcs) + + # Assign ids to insertions. + insertions['id'] = ['INS_{}'.format(i) + for i in range(1, len(insertions) + 1)] - # return insertions + return insertions # --- Genomic sequence extraction --- # @@ -142,7 +148,7 @@ def extract_genomic(reads, transposon, barcodes, linker, output_path, io_kwargs = io_kwargs or {} # Extract and write genomic sequences. 
- barcode_map = pipe( + barcode_map = toolz.pipe( reads, _extract_reads(transposon=transposon, barcodes=barcodes, @@ -150,8 +156,8 @@ def extract_genomic(reads, transposon, barcodes, linker, output_path, contaminants=contaminants, sample_map=sample_map), print_stats, - filter(lambda r: r.status == ShearSplinkStatus.proper_read), - filter(lambda r: len(r.genomic_sequence) >= min_length), + curried_filter(lambda r: r.status == ShearSplinkStatus.proper_read), + curried_filter(lambda r: len(r.genomic_sequence) >= min_length), write_genomic_sequences(file_path=output_path, format='fasta', **io_kwargs), build_barcode_map) @@ -163,7 +169,7 @@ def extract_genomic(reads, transposon, barcodes, linker, output_path, return output_path, barcode_frame -@curry +@toolz.curry def _extract_reads(reads, transposon, barcodes, linker, contaminants=None, transposon_func=None, barcode_func=None, linker_func=None, sample_map=None): @@ -192,11 +198,12 @@ def _extract_reads(reads, transposon, barcodes, linker, contaminants=None, linker_func = linker_func(query=linker) # Extract and return results. - extract_func = curry(_extract_read, - transposon_func=transposon_func, - barcode_func=barcode_func, - linker_func=linker_func, - contaminant_func=contaminant_func) + extract_func = toolz.curry( + _extract_read, + transposon_func=transposon_func, + barcode_func=barcode_func, + linker_func=linker_func, + contaminant_func=contaminant_func) for result in map(extract_func, reads): yield result @@ -261,13 +268,13 @@ def _extract_read( # --- Insertion identification --- # def identify_insertions(alignment_path, barcode_map): - + # Get alignments from bowtie. bam = pysam.AlignmentFile(alignment_path) - alns = bam.fetch(multiple_iterators=True) + alignments = bam.fetch(multiple_iterators=True) # Group alignments by barcode and position. aln_groups = chain_groupby( - alns, + alignments, [groupby_reference_position(alignment_file=bam), groupby_barcode(barcode_map=barcode_map)]) @@ -278,9 +285,6 @@ def identify_insertions(alignment_path, barcode_map): columns=['id', 'chrom', 'position', 'strand', 'barcode', 'depth', 'depth_unique']) - # Cluster and merge close insertions - - return insertions @@ -296,31 +300,3 @@ def _alignments_to_insertion(info, alignments, id_=None): depth_unique = len(set(end_positions)) return id_, ref, pos, strand, bc, depth, depth_unique - - # metadata = dict(depth=depth, depth_unique=depth_unique) - #return Insertion(id=id_, seq_name=ref, location=pos, strand=strand, - # sample=bc, metadata=metadata) - - -def group_insertions(insertions, distance): - # for insertion in insertions: - # check if we have an insertion from this sample in our collection - # if so, add to collection - - # - When did we last see SAMPLE_X? - # - Which sample have we not seen within distance? - pass - - -def merge_insertions(insertions): - # Summarize location as mean. - location = np.average([ins.location for ins in insertions]) - - # Merge metadata by summing depths. - metadata = merge_with(sum, *[ins.metadata for ins in insertions]) - - # Take first insertion as reference for other attributes. 
- ref = insertions[0] - - return Insertion(id=None, seqname=ref.seqname, location=location, - strand=ref.strand, sample=ref.sample, metadata=metadata) diff --git a/pyim/util.py b/pyim/util.py index 5491d4b..82a568e 100644 --- a/pyim/util.py +++ b/pyim/util.py @@ -1,31 +1,29 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (bytes, dict, int, list, object, range, str, - ascii, chr, hex, input, next, oct, open, - pow, round, super, filter, map, zip) +import pysam +from functools import reduce -import heapq +def _make_gen(reader): + b = reader(1024 * 1024) + while b: + yield b + b = reader(1024*1024) -class PrioritySet(object): - def __init__(self): - self._heap = [] - self._set = set() +def count_lines(file_path): + f = open(file_path, 'rb') + f_gen = _make_gen(f.raw.read) + return sum(buf.count(b'\n') for buf in f_gen) - def push(self, item, priority): - if item not in self._set: - heapq.heappush(self._heap, (priority, item)) - self._set.add(item) - def pop(self): - priority, item = heapq.heappop(self._heap) - self._set.remove(item) - return item +def count_fasta_entries(file_path): + f = open(file_path, 'rb') + f_gen = _make_gen(f.raw.read) + return sum(buf.count(b'>') for buf in f_gen) - def first(self): - _, item = min(self._heap) - return item - def __len__(self): - return len(self._heap) +def count_bam_entries(file_path): + # From Biostars at https://www.biostars.org/p/1890/. + # Could be faster for sorted/index bam files using idxstats. + reduce(lambda x, y: x + y, + [eval('+'.join(l.rstrip('\n').split('\t')[2:])) + for l in pysam.idxstats(file_path)]) From fcb38b0009d1623c56df7514c3d1665e20b658a2 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 21 Dec 2015 16:42:45 +0100 Subject: [PATCH 016/100] Drop shippable for now. 
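
This drops the Shippable configuration and build badge, together with the
functional.py prototype script. functional.py was a scratch implementation of
the ShearSplink extraction and grouping steps; its logic now lives largely in
pyim/pipelines and the _helpers modules. For reference, a minimal sketch of
the toolz.pipe-based extraction chain that replaced it (names and arguments
are illustrative, not the exact calls used in shear_splink.py):

    import toolz
    from toolz.curried import filter as curried_filter

    def run_extraction(reads, extract_func, write_func, build_map_func,
                       proper_status, min_length=15):
        # reads: iterator of skbio.DNA sequences; each step is a generator
        # yielding ExtractResult tuples, so the whole chain stays lazy.
        return toolz.pipe(
            reads,
            extract_func,                                # -> ExtractResult per read
            curried_filter(lambda r: r.status == proper_status),
            curried_filter(lambda r: len(r.genomic_sequence) >= min_length),
            write_func,                                  # writes genomic FASTA, yields results
            build_map_func)                              # -> {read_id: barcode}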
--- functional.py | 515 -------------------------------------------------- readme.md | 2 - shippable.yml | 55 ------ 3 files changed, 572 deletions(-) delete mode 100644 functional.py delete mode 100644 shippable.yml diff --git a/functional.py b/functional.py deleted file mode 100644 index e84e290..0000000 --- a/functional.py +++ /dev/null @@ -1,515 +0,0 @@ -import collections -import heapq -import itertools -import operator -from enum import Enum - -import pysam -import numpy as np -import pandas as pd -from toolz import curry, map, pipe, merge_with -from toolz.curried import filter - -import skbio -from skbio import DNA -from skbio.alignment import local_pairwise_align_ssw - - -# --- Model --- # - -Alignment = collections.namedtuple( - 'Alignment', - ['query_id', 'query_start', 'query_end', 'query_len', - 'target_id', 'target_start', 'target_end', 'target_len', - 'strand', 'identity', 'coverage', 'score']) - -ExtractResult = collections.namedtuple( - 'ExtractResult', ['genomic_sequence', 'barcode', 'status']) - - -def reverse_alignment(aln): - target_len = aln.target_len - - return Alignment( - query_id=aln.query_id, query_start=aln.query_start, - query_end=aln.query_end, query_len=aln.query_len, - target_id=aln.target_id, target_start=target_len - aln.target_end, - target_end=target_len - aln.target_start, target_len=target_len, - strand=aln.strand * -1, type=aln.type, identity=aln.identity, - coverage=aln.coverage, score=aln.score) - - -Insertion = collections.namedtuple( - 'Insertion', ['id', 'seqname', 'location', - 'strand', 'sample', 'metadata']) - - -# --- Alignment --- # - -@curry -def align_exact(target, query, query_strand=1): - # Note that this alignment returns the first occurrence it finds, - # later occurrences will not be found and are not checked for. - try: - index = str(target).index(str(query)) - except ValueError: - return None - else: - q_len = len(query) - - return Alignment( - query_id=query.metadata.get('id', None), query_start=0, - query_end=q_len, query_len=q_len, - target_id=target.metadata.get('id', None), target_start=index, - target_end=index + q_len, target_len=len(target), - strand=query_strand, identity=1.0, coverage=1.0, score=100) - - -@curry -def align_ssw(target, query, query_strand=1): - ssw_aln = local_pairwise_align_ssw(target.sequence, query.sequence) - - # Extract positions. - pos = ssw_aln.start_end_positions() - q_start, q_end = pos[1] - t_start, t_end = pos[0] - - # Offset ends by one, making them exclusive - # to match python conventions. - q_end += 1 - t_end += 1 - - # Calculate basic metrics. 
- coverage = (q_end - q_start) / float(len(query)) - identity = ssw_aln[0].fraction_same(ssw_aln[1]) - - aln = Alignment( - query_id=query.id, query_start=q_start, query_end=q_end, - query_len=len(query), target_id=target.id, target_start=t_start, - target_end=t_end, target_len=len(target), strand=query_strand, - identity=identity, coverage=coverage, - score=int(identity * coverage * 100)) - - return aln - - -@curry -def align_with_reverse(target, query, align_func, query_strand=1, **kwargs): - aln_fwd = align_func(target, query, query_strand=query_strand, **kwargs) - aln_rev = align_func(target, query.reverse_complement(), - query_strand=query_strand * -1, **kwargs) - - if aln_fwd is None: - return aln_rev - elif aln_rev is None: - return aln_fwd - else: - return aln_rev if aln_rev.score > aln_fwd.score else aln_fwd - - -@curry -def align_multiple(target, queries, align_func, **kwargs): - alns = (align_func(target, query, **kwargs) for query in queries) - alns = list(filter(bool, alns)) - - if len(alns) == 0: - return None - elif len(alns) == 1: - return alns[0] - else: - raise ValueError('Multiple alignments') - - -# --- Filtering --- # - -def filter_alignment(alignment, filters): - for filter_ in filters: - if not filter_(alignment): - return False - return True - - -# --- Extract pipeline --- # - -def extract(read): - raise NotImplementedError() - - -def print_stats(results): - # Iterate over results, counting statuses. - status_counts = collections.defaultdict(int) - - for result in results: - status_counts[result.status.name] += 1 - yield result - - # We're done, so print frequencies! - total = sum(status_counts.values()) - for status, count in status_counts.items(): - percentage = (count / total) * 100 - print('{}: {} ({}%)'.format(status, count, percentage)) - - -@curry -def write_sequences(results, file_path, format, mode='w', - compression='auto', compresslevel=9): - """ Test docstring """ - with skbio.io.util.open(file_path, mode=mode, compression=compression, - compresslevel=compresslevel) as file_: - for result in results: - skbio.io.write(result.genomic_sequence, into=file_, format=format) - yield result - - -@curry -def build_barcode_map(results, sample_map=None): - if sample_map is None: - return {result.genomic_sequence.metadata['id']: - result.barcode - for result in results} - else: - return {result.genomic_sequence.metadata['id']: - sample_map[result.barcode] - for result in results} - - -def consume(iterator, n=None): - "Advance the iterator n-steps ahead. If n is none, consume entirely." - # Use functions that consume iterators at C speed. 
- if n is None: - # Feed the entire iterator into a zero-length deque - collections.deque(iterator, maxlen=0) - else: - # Advance to the empty slice starting at position n - next(itertools.islice(iterator, n, n), None) - - -# --- Identify pipeline --- # - -class PrioritySet(object): - - def __init__(self): - self._heap = [] - self._set = set() - - def push(self, item, priority): - if item not in self._set: - heapq.heappush(self._heap, (priority, item)) - self._set.add(item) - - def pop(self): - priority, item = heapq.heappop(self._heap) - self._set.remove(item) - return item - - def first(self): - _, item = min(self._heap) - return item - - def __len__(self): - return len(self._heap) - - def __str__(self): - return 'PrioritySet(heap={}, set={})'\ - .format(str(self._heap), str(self._set)) - - def __repr__(self): - return str(self) - - -@curry -def groupby_reference(alignments, alignment_file=None): - for reference, group in itertools.groupby( - alignments, operator.attrgetter('reference_id')): - if alignment_file is not None: - reference = alignment_file.getrname(reference) - yield reference, group - - -def groupby_position(alignments): - """ Groups alignments by their positions, grouping forward strand - alignments with the same start position and reverse strand - alignments with the same end position. Assumes alignments - are all on a single reference sequence. - """ - # Setup our collections for tracking reads and positions. - # - # The priority set is used to track positions with alignment groups, - # ensuring that no position is listed twice (the set part) and - # always giving the lowest position first (the priority part). - # - # The alignment dict contains two lists for each position with at - # least one alignment, one for forward reads and one for reverse. - # Any alignments encountered as position x in orientation o are added - # to the corresponding entry dict[x][o] in the list, in which - # o is encoded as {0,1}, with 1 being for reverse strand alignments. - position_set = PrioritySet() - aln_dict = collections.defaultdict(lambda: ([], [])) - - curr_pos = 0 - for aln in alignments: - # Check our ordering. - if aln.reference_start < curr_pos: - raise ValueError('Alignments not ordered by position') - - curr_pos = aln.reference_start - - # Add current read to collections. - is_reverse = aln.is_reverse - ref_pos = aln.reference_end if is_reverse else curr_pos - aln_dict[ref_pos][bool(is_reverse)].append(aln) - position_set.push(ref_pos, ref_pos) - - # Return any alignment groups before our current position. - try: - while position_set.first() < curr_pos: - first_pos = position_set.pop() - fwd_grp, rev_grp = aln_dict.pop(first_pos) - if len(fwd_grp) > 0: - yield (fwd_grp[0].reference_start, 1), fwd_grp - if len(rev_grp) > 0: - yield (rev_grp[0].reference_end, -1), rev_grp - except ValueError: - pass - - # We're done, yield any remaining alignment groups. - for _ in range(len(position_set)): - fwd_grp, rev_grp = aln_dict.pop(position_set.pop()) - if len(fwd_grp) > 0: - yield (fwd_grp[0].reference_start, 1), fwd_grp - if len(rev_grp) > 0: - yield (rev_grp[0].reference_end, -1), rev_grp - - -@curry -def groupby_barcode(alignments, barcode_map): - # Group alignments by barcodes. - groups = collections.defaultdict(list) - for aln in alignments: - barcode = barcode_map[aln.query_name] - groups[barcode].append(aln) - - # Yield group together with barcode. 
- for barcode, group in groups.items(): - yield barcode, group - - -def chain_groupby(iterable, groupby_funcs): - grouped = groupby_funcs[0](iterable) - - if len(groupby_funcs) == 1: - for key, group in grouped: - if not isinstance(key, tuple): - key = (key,) - yield key, group - else: - for key, group in grouped: - for sub_key, sub_group in chain_groupby(group, groupby_funcs[1:]): - yield key + sub_key, sub_group - - -# --- ShearSplink --- # - -class ShearSplinkStatus(Enum): - contaminant = 1 - no_transposon = 2 - no_linker = 3 - no_barcode = 4 - multiple_barcodes = 5 - too_short = 6 - proper_read = 7 - - -@curry -def shearsplink_extract( - reads, transposon_sequence, barcode_sequences, linker_sequence, - contaminant_sequences=None, transposon_func=None, - barcode_func=None, linker_func=None, barcode_map=None): - - # Specify defaults for not provided aligners. - if transposon_func is None: - transposon_func = align_with_reverse(align_func=align_exact) - - if barcode_func is None: - barcode_func = align_multiple(align_func=align_exact) - - if linker_func is None: - linker_func = align_exact - - # Setup contaminant aligner if sequences are provided. - if contaminant_sequences is not None: - contaminant_func = align_multiple(queries=contaminant_sequences, - align_func=align_exact) - else: - contaminant_func = None - - # Prime aligners with their respective sequences. - transposon_func = transposon_func(query=transposon_sequence) - barcode_func = barcode_func(queries=barcode_sequences) - linker_func = linker_func(query=linker_sequence) - - # Extract and return results. - extract_func = curry(_shearsplink_extract, - transposon_func=transposon_func, - barcode_func=barcode_func, - linker_func=linker_func, - contaminant_func=contaminant_func) - - for result in map(extract_func, reads): - yield result - - -def _shearsplink_extract( - read, transposon_func, barcode_func, - linker_func, contaminant_func=None): - """ Extracts the genomic sequence and barcode from the passed - read. Reads containing contaminants are dropped. Reads are - expected to look as follows: - - [barcode][transposon][genomic-sequence][linker] - - Each of these sequences is recognized by their corresponding - alignment function. The barcode alignment identifies the - barcode (and thus the sample) of the read, whilst the transposon - and linker alignments are used to delineate the genomic sequence. - - The function returns an ExactResult tuple that contains the - genomic sequence, barcode and a status flag. If any errors - occur during the extraction, the genomic sequence and barcode - values are None and the status flag indicates the underlying reason. - """ - - # Drop read if it contains a contaminant. - if contaminant_func is not None and len(contaminant_func(read)) > 0: - return ExtractResult(None, None, ShearSplinkStatus.contaminant) - - # Identify location of the transposon. - transposon_aln = transposon_func(read) - if transposon_aln is None: - return ExtractResult(None, None, ShearSplinkStatus.no_transposon) - - # If transposon is on the reverse strand, flip the read and the - # alignment to bring everything into the same (fwd) orientation. - if transposon_aln.strand == -1: - read = read.reverse_complement() - transposon_aln = reverse_alignment(transposon_aln) - - # Identify barcode of the read. 
- try: - barcode_aln = barcode_func(read) - if barcode_aln is None: - return ExtractResult(None, None, ShearSplinkStatus.no_barcode) - except ValueError: - return ExtractResult(None, None, ShearSplinkStatus.multiple_barcodes) - - barcode = barcode_aln.query_id - - # Identify location of linker. - linker_aln = linker_func(read) - if linker_aln is None: - return ExtractResult(None, None, ShearSplinkStatus.no_linker) - - # Extract genomic sequence using previous alignments. - genomic = read[transposon_aln.target_end:linker_aln.target_start] - - return ExtractResult(genomic, barcode, ShearSplinkStatus.proper_read) - - -def shearsplink_identify(alignments): - pass - - -def insertion_from_group(info, group): - ref, pos, strand, bc = info - - # Get positions of the non-transposon ends of the alignment. - end_field = 'reference_end' if strand == 1 else 'reference_start' - end_positions = map(operator.attrgetter(end_field), group) - - # Calulate overall depth and unique end depth. - depth = len(group) - depth_unique = len(set(end_positions)) - - metadata = dict(depth=depth, depth_unique=depth_unique) - - return Insertion(id=None, seq_name=ref, location=pos, strand=strand, - sample=bc, metadata=metadata) - - -def group_insertions(insertions, distance): - # for insertion in insertions: - # check if we have an insertion from this sample in our collection - # if so, add to collection - - # - When did we last see SAMPLE_X? - # - Which sample have we not seen within distance? - pass - - -def merge_insertions(insertions): - # Summarize location as mean. - location = np.average([ins.location for ins in insertions]) - - # Merge metadata by summing depths. - metadata = merge_with(sum, *[ins.metadata for ins in insertions]) - - # Take first insertion as reference for other attributes. - ref = insertions[0] - - return Insertion(id=None, seqname=ref.seqname, location=location, - strand=ref.strand, sample=ref.sample, metadata=metadata) - - -# --- Main --- # - -# Extraction. - -seq1 = DNA('CACTGGCCACGCGAAGGTGC') -seq2 = DNA('GACCACTGGCCACGCGAAGG').reverse_complement() -seq3 = DNA('CGTTGGTCACTCTACCCACA') - -transposon = DNA('TTTG', metadata=dict(id='transposon')) -barcodes = [DNA('AAAT', metadata=dict(id='BC01')), - DNA('AAAA', metadata=dict(id='BC02'))] -linker = DNA('CCCG', metadata=dict(id='linker')) - -reads = [DNA(str(barcodes[0]) + str(transposon) + - str(seq1) + str(linker), metadata=dict(id='read_1')), - DNA(str(transposon) + str(seq1) + str(linker))] - -genomic_path = '/Users/Julian/Scratch/pyim/functional/genomic.fasta.gz' -barcode_path = '/Users/Julian/Scratch/pyim/functional/barcodes.txt' - -barcode_map = pipe( - reads, - shearsplink_extract(transposon_sequence=transposon, - barcode_sequences=barcodes, - linker_sequence=linker), - print_stats, - filter(lambda r: r.status == ShearSplinkStatus.proper_read), - filter(lambda r: len(r.genomic_sequence) >= 15), - write_sequences(file_path=genomic_path, format='fasta', - compression='gzip', compresslevel=9), - build_barcode_map) - -barcode_frame = pd.DataFrame.from_records( - iter(barcode_map.items()), columns=['read_id', 'barcode']) -barcode_frame.to_csv(barcode_path, sep='\t', index=False) - - -# Grouping. 
- -bam = pysam.AlignmentFile('/Volumes/Datastore/Scratch/' - 'lam-pcr-sjors/out/alignment.bam') -alns = itertools.islice(bam.fetch(), 0, 1000) - -it = chain_groupby( - itertools.islice(bam.fetch(multiple_iterators=True), 0, 1000), - [curry(groupby_reference, alignment_file=bam), groupby_position]) - -barcode_map = collections.defaultdict(lambda: 'BC01') -it2 = chain_groupby( - itertools.islice(bam.fetch(multiple_iterators=True), 0, 1000), - [groupby_reference(alignment_file=bam), - groupby_position, - groupby_barcode(barcode_map=barcode_map)]) diff --git a/readme.md b/readme.md index 4550e29..a953cc1 100644 --- a/readme.md +++ b/readme.md @@ -1,8 +1,6 @@ PyIM ======================= -[![Build Status](https://api.shippable.com/projects/550e9fef5ab6cc1352a74bf1/badge?branchName=master)](https://app.shippable.com/projects/550e9fef5ab6cc1352a74bf1/builds/latest) - PyIM is a software package for implementing pipelines that identify transposon integration sites from targeted DNA-sequencing of transposon insertions. The package implements a number of standard pipelines used in our group, but also diff --git a/shippable.yml b/shippable.yml deleted file mode 100644 index 46d5468..0000000 --- a/shippable.yml +++ /dev/null @@ -1,55 +0,0 @@ -language: python - -python: - - '2.7' - - '3.4' - -before_install: - # Get the Python version setup by shippable (for use in our env later). - - export CONDA_PYTHON_VERSION=$(python -c 'import sys; print("{0}.{1}".format(*sys.version_info))') - - # Install and setup miniconda. - - wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - - bash Miniconda3-latest-Linux-x86_64.sh -b - - export PATH=/home/shippable/miniconda3/bin:$PATH - - # Create and activate environment. - - conda create -y -n env python=$CONDA_PYTHON_VERSION pip - - source activate env - - # Install any system dependencies. - - sudo apt-get update - - sudo apt-get install -y tabix - - sudo apt-get install -y --no-install-recommends r-base - - # Install Python package dependencies. - - conda install -y numpy scipy pandas natsort scikit-bio - - conda install -y -c https://conda.binstar.org/jrderuiter pysam - - pip install rpy2 - - # Install tkgeno dependency + required packages. - - conda install -y matplotlib statsmodels - - pip install git+ssh://git@bitbucket.org/jrderuiter/genomics-toolkit.git - - # Install packages required for tests. - - conda install -y pytest coverage - -install: - - python setup.py install - -before_script: - - mkdir -p shippable/testresults shippable/codecoverage - -script: - - py.test --junit-xml=shippable/testresults/results.xml - - coverage run --branch --source pyim -m py.test - - coverage xml -o shippable/codecoverage/coverage.xml - -notifications: - email: - recipients: - - jrderuiter@fastmail.fm - -branches: - only: - - master From 54ec72a190d71393f21250c189d7ced57b8a5ea0 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 21 Dec 2015 17:14:56 +0100 Subject: [PATCH 017/100] Add logger support to print_statistics. --- pyim/pipelines/_helpers/pipeline.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pyim/pipelines/_helpers/pipeline.py b/pyim/pipelines/_helpers/pipeline.py index e604dc7..0c82b81 100644 --- a/pyim/pipelines/_helpers/pipeline.py +++ b/pyim/pipelines/_helpers/pipeline.py @@ -5,7 +5,10 @@ import toolz -def print_stats(results): +@toolz.curry +def print_stats(results, logger=None, header=False): + print_ = print if logger is None else logger.info + # Iterate over results, counting statuses. 
status_counts = collections.defaultdict(int) @@ -14,12 +17,14 @@ def print_stats(results): yield result # We're done, so print frequencies! - print('\nExtract statistics:') + if header: + print_('Extraction stats:') total = sum(status_counts.values()) for status, count in status_counts.items(): percentage = (count / total) * 100 - print('{:>18}: {:>8} ({:05.2f}%)'.format(status, count, percentage)) + print_('{:>18}: {:>8} ({:05.2f}%)' + .format(status, count, percentage)) @toolz.curry From fea79891c794a84b8a3334ff3d044c706214dfde Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 21 Dec 2015 17:15:08 +0100 Subject: [PATCH 018/100] Add logging, mapping to samples. --- pyim/pipelines/shear_splink.py | 52 ++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/pyim/pipelines/shear_splink.py b/pyim/pipelines/shear_splink.py index 3295cfc..4482b52 100644 --- a/pyim/pipelines/shear_splink.py +++ b/pyim/pipelines/shear_splink.py @@ -1,5 +1,6 @@ import os import operator +import logging from enum import Enum from os import path @@ -23,6 +24,12 @@ from ._helpers.clustering import merge_within_distance +logging.basicConfig( + format='%(asctime)-15s %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]', + level=logging.INFO) + + # --- Pipeline register hook + main --- # def register(subparsers, name='shear_splink'): @@ -66,6 +73,8 @@ def main(args): # Read barcode --> sample map if given. if args.sample_map is not None: sample_map = pd.read_csv(args.sample_map, sep='\t') + sample_map = dict(zip(sample_map['barcode'], + sample_map['sample'])) else: sample_map = None @@ -91,34 +100,45 @@ def shear_splink(read_path, transposon, linker, barcodes, bowtie_index, output_dir, contaminants=None, sample_map=None, min_genomic_length=15): + logger = logging.getLogger() + # Determine paths for intermediates/outputs. genomic_path = path.join(output_dir, 'genomic.fna') barcode_path = path.join(output_dir, 'genomic.barcodes.txt') alignment_base = path.join(output_dir, 'alignment') # Log progress with progressbar. + logger.info('Extracting genomic sequences') + reads = skbio.read(read_path, format='fasta') reads = tqdm.tqdm(reads, total=count_fasta_entries(read_path), - leave=True, ncols=80, desc='Test') + leave=False, ncols=80) # Extract genomic sequences and barcodes _, barcode_frame = extract_genomic( - reads, transposon=transposon, barcodes=barcodes, - linker=linker, output_path=genomic_path, - contaminants=contaminants, min_length=min_genomic_length) + reads, transposon=transposon, barcodes=barcodes, linker=linker, + output_path=genomic_path, contaminants=contaminants, + min_length=min_genomic_length, logger=logger) + barcode_frame.to_csv(barcode_path, sep='\t', index=False) # Align to reference with Bowtie2. + logger.info('Aligning to reference genome') + aln_path = bowtie_align(genomic_path, bowtie_index, alignment_base, bam_output=True, options={'-f': True}, log=alignment_base + '.log') # Identify insertions from alignment. 
+ logger.info('Identifying insertions') + barcode_map = dict(zip(barcode_frame['read_id'], barcode_frame['barcode'])) insertions = identify_insertions(aln_path, barcode_map=barcode_map) # Cluster and merge close insertions + logger.info('Merging close insertions') + agg_funcs = {'depth': 'sum', 'depth_unique': 'sum'} insertions = merge_within_distance( insertions, max_dist=2000, agg_funcs=agg_funcs) @@ -127,6 +147,10 @@ def shear_splink(read_path, transposon, linker, barcodes, insertions['id'] = ['INS_{}'.format(i) for i in range(1, len(insertions) + 1)] + # Map barcodes to samples. + if sample_map is not None: + insertions['sample'] = insertions['barcode'].map(sample_map) + return insertions @@ -142,9 +166,9 @@ class ShearSplinkStatus(Enum): proper_read = 7 -def extract_genomic(reads, transposon, barcodes, linker, output_path, - sample_map=None, contaminants=None, min_length=15, - io_kwargs=None): +def extract_genomic(reads, transposon, barcodes, linker, + output_path, contaminants=None, min_length=15, + io_kwargs=None, logger=None): io_kwargs = io_kwargs or {} # Extract and write genomic sequences. @@ -153,9 +177,8 @@ def extract_genomic(reads, transposon, barcodes, linker, output_path, _extract_reads(transposon=transposon, barcodes=barcodes, linker=linker, - contaminants=contaminants, - sample_map=sample_map), - print_stats, + contaminants=contaminants), + print_stats(logger=logger), curried_filter(lambda r: r.status == ShearSplinkStatus.proper_read), curried_filter(lambda r: len(r.genomic_sequence) >= min_length), write_genomic_sequences(file_path=output_path, @@ -171,8 +194,7 @@ def extract_genomic(reads, transposon, barcodes, linker, output_path, @toolz.curry def _extract_reads(reads, transposon, barcodes, linker, contaminants=None, - transposon_func=None, barcode_func=None, - linker_func=None, sample_map=None): + transposon_func=None, barcode_func=None, linker_func=None): # Specify defaults for not provided aligners. if transposon_func is None: @@ -209,9 +231,8 @@ def _extract_reads(reads, transposon, barcodes, linker, contaminants=None, yield result -def _extract_read( - read, transposon_func, barcode_func, - linker_func, contaminant_func=None): +def _extract_read(read, transposon_func, barcode_func, + linker_func, contaminant_func=None): """ Extracts the genomic sequence and barcode from the passed read. Reads containing contaminants are dropped. Reads are expected to look as follows: @@ -289,6 +310,7 @@ def identify_insertions(alignment_path, barcode_map): def _alignments_to_insertion(info, alignments, id_=None): + # Extract group info. ref, pos, strand, bc = info # Get positions of the non-transposon ends of the alignment. From 2afdc6a1f5941cc8b06c9dce50c8b47cf011fa46 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 22 Dec 2015 09:16:19 +0100 Subject: [PATCH 019/100] Bump version number in preparation for first release. 
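
The version string is now defined once in pyim/__init__.py and imported by
setup.py, so the installed distribution and the importable package report the
same value. A quick check (assuming the package is installed):

    import pyim
    print(pyim.__version__)   # '1.0.0-beta' as of this commit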
--- pyim/__init__.py | 2 ++ setup.py | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pyim/__init__.py b/pyim/__init__.py index e69de29..234b32d 100644 --- a/pyim/__init__.py +++ b/pyim/__init__.py @@ -0,0 +1,2 @@ + +__version__ = '1.0.0-beta' diff --git a/setup.py b/setup.py index 374a51f..6961c8b 100644 --- a/setup.py +++ b/setup.py @@ -2,6 +2,9 @@ from setuptools import setup, find_packages +from pyim import __version__ + + install_requires = ['future', 'numpy', 'scipy', 'pandas', 'pysam', 'natsort', 'rpy2', 'scikit-bio', 'toolz'] @@ -10,10 +13,10 @@ setup( name='pyim', - version='0.4.3', - url='', + version=__version__, + url='https://bitbucket.org/jrderuiter/pyim', author='Julian de Ruiter', - author_email='j.r.deruiter@icloud.com', + author_email='julianderuiter@gmail.com', description='Predicts transposon insertion sites from DNA-seq data.', license='BSD', packages=find_packages(), From 0a4beb279d7f0bd33af75563836ec85d569340b9 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 22 Dec 2015 09:45:42 +0100 Subject: [PATCH 020/100] Flag short reads properly. --- pyim/pipelines/shear_splink.py | 42 +++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/pyim/pipelines/shear_splink.py b/pyim/pipelines/shear_splink.py index 4482b52..3e86dce 100644 --- a/pyim/pipelines/shear_splink.py +++ b/pyim/pipelines/shear_splink.py @@ -9,7 +9,8 @@ import skbio import toolz import tqdm -from toolz.curried import filter as curried_filter +from toolz.curried import (filter as curried_filter, + map as curried_map) from pyim.alignment.bowtie2 import align as bowtie_align from pyim.alignment.vector import (align_exact, align_multiple, @@ -74,7 +75,7 @@ def main(args): if args.sample_map is not None: sample_map = pd.read_csv(args.sample_map, sep='\t') sample_map = dict(zip(sample_map['barcode'], - sample_map['sample'])) + sample_map['sample'])) else: sample_map = None @@ -174,13 +175,13 @@ def extract_genomic(reads, transposon, barcodes, linker, # Extract and write genomic sequences. barcode_map = toolz.pipe( reads, - _extract_reads(transposon=transposon, - barcodes=barcodes, - linker=linker, - contaminants=contaminants), + _extract_from_reads(transposon=transposon, + barcodes=barcodes, + linker=linker, + contaminants=contaminants), + curried_map(_check_minimum_length(min_length=15)), print_stats(logger=logger), - curried_filter(lambda r: r.status == ShearSplinkStatus.proper_read), - curried_filter(lambda r: len(r.genomic_sequence) >= min_length), + curried_filter(_proper_filter), write_genomic_sequences(file_path=output_path, format='fasta', **io_kwargs), build_barcode_map) @@ -193,8 +194,9 @@ def extract_genomic(reads, transposon, barcodes, linker, @toolz.curry -def _extract_reads(reads, transposon, barcodes, linker, contaminants=None, - transposon_func=None, barcode_func=None, linker_func=None): +def _extract_from_reads( + reads, transposon, barcodes, linker, contaminants=None, + transposon_func=None, barcode_func=None, linker_func=None): # Specify defaults for not provided aligners. if transposon_func is None: @@ -221,7 +223,7 @@ def _extract_reads(reads, transposon, barcodes, linker, contaminants=None, # Extract and return results. 
extract_func = toolz.curry( - _extract_read, + _extract_from_read, transposon_func=transposon_func, barcode_func=barcode_func, linker_func=linker_func, @@ -231,8 +233,8 @@ def _extract_reads(reads, transposon, barcodes, linker, contaminants=None, yield result -def _extract_read(read, transposon_func, barcode_func, - linker_func, contaminant_func=None): +def _extract_from_read(read, transposon_func, barcode_func, + linker_func, contaminant_func=None): """ Extracts the genomic sequence and barcode from the passed read. Reads containing contaminants are dropped. Reads are expected to look as follows: @@ -286,6 +288,20 @@ def _extract_read(read, transposon_func, barcode_func, return ExtractResult(genomic, barcode, ShearSplinkStatus.proper_read) +@toolz.curry +def _check_minimum_length(result, min_length): + """Flags proper reads if shorter than min_length.""" + if (result.status == ShearSplinkStatus.proper_read and + len(result.genomic_sequence) < min_length): + result.status = ShearSplinkStatus.too_short + return result + + +def _proper_filter(result): + """Filters extraction results for proper reads.""" + return result.status == ShearSplinkStatus.proper_read + + # --- Insertion identification --- # def identify_insertions(alignment_path, barcode_map): From b69614205f3a511a07e535e5fa0716a9aa1da20e Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 22 Dec 2015 10:06:05 +0100 Subject: [PATCH 021/100] Drop leading zero in percentage formatting. --- pyim/pipelines/_helpers/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyim/pipelines/_helpers/pipeline.py b/pyim/pipelines/_helpers/pipeline.py index 0c82b81..8b0ce89 100644 --- a/pyim/pipelines/_helpers/pipeline.py +++ b/pyim/pipelines/_helpers/pipeline.py @@ -23,7 +23,7 @@ def print_stats(results, logger=None, header=False): total = sum(status_counts.values()) for status, count in status_counts.items(): percentage = (count / total) * 100 - print_('{:>18}: {:>8} ({:05.2f}%)' + print_('{:>18}: {:>8} ({:5.2f}%)' .format(status, count, percentage)) From 9740dc9bfdbed1dd9330dca49094c5aeda846abe Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 22 Dec 2015 10:06:27 +0100 Subject: [PATCH 022/100] Add header/footer logging. --- pyim/main/align.py | 15 ++++++++++++++- pyim/pipelines/shear_splink.py | 6 ------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pyim/main/align.py b/pyim/main/align.py index 149aad0..e743bc8 100644 --- a/pyim/main/align.py +++ b/pyim/main/align.py @@ -5,12 +5,20 @@ str, super, zip) import argparse +import logging -# from pyim.pipelines.lam_pcr import LamPcrPipeline +from pyim import __version__ from pyim.pipelines import shear_splink +logging.basicConfig( + format='%(asctime)-15s %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]', + level=logging.INFO) + def main(): + logger = logging.getLogger() + # Setup main parser. parser = argparse.ArgumentParser(prog='pyim-align') @@ -21,9 +29,14 @@ def main(): shear_splink.register(subparsers) # Parse args and dispatch. + header_str = ' PyIM ({}) '.format(__version__) + logger.info('{:-^40}'.format(header_str)) + args = parser.parse_args() args.main(args) + logger.info('{:-^40}'.format(' Done! 
')) + if __name__ == '__main__': main() diff --git a/pyim/pipelines/shear_splink.py b/pyim/pipelines/shear_splink.py index 3e86dce..35ecca9 100644 --- a/pyim/pipelines/shear_splink.py +++ b/pyim/pipelines/shear_splink.py @@ -25,12 +25,6 @@ from ._helpers.clustering import merge_within_distance -logging.basicConfig( - format='%(asctime)-15s %(message)s', - datefmt='[%Y-%m-%d %H:%M:%S]', - level=logging.INFO) - - # --- Pipeline register hook + main --- # def register(subparsers, name='shear_splink'): From 2fc469140abd149dc91e1732b572fe9f1e726225 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 22 Dec 2015 15:26:39 +0100 Subject: [PATCH 023/100] Add shear_splink_sb pipeline with paper defaults. --- pyim/main/align.py | 11 ++-- pyim/pipelines/shear_splink_sb.py | 92 +++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 5 deletions(-) create mode 100644 pyim/pipelines/shear_splink_sb.py diff --git a/pyim/main/align.py b/pyim/main/align.py index e743bc8..0c66b26 100644 --- a/pyim/main/align.py +++ b/pyim/main/align.py @@ -8,7 +8,7 @@ import logging from pyim import __version__ -from pyim.pipelines import shear_splink +from pyim.pipelines import shear_splink, shear_splink_sb logging.basicConfig( format='%(asctime)-15s %(message)s', @@ -27,14 +27,15 @@ def main(): # Register pipelines. shear_splink.register(subparsers) + shear_splink_sb.register(subparsers) - # Parse args and dispatch. + # Parse args. + args = parser.parse_args() + + # Dispatch to pipeline. header_str = ' PyIM ({}) '.format(__version__) logger.info('{:-^40}'.format(header_str)) - - args = parser.parse_args() args.main(args) - logger.info('{:-^40}'.format(' Done! ')) diff --git a/pyim/pipelines/shear_splink_sb.py b/pyim/pipelines/shear_splink_sb.py new file mode 100644 index 0000000..c931c91 --- /dev/null +++ b/pyim/pipelines/shear_splink_sb.py @@ -0,0 +1,92 @@ +import os +from os import path + +import pandas as pd +import skbio +from .shear_splink import shear_splink + +from pyim.alignment import vector as vec + + +# --- Pipeline register hook + main --- # + +def register(subparsers, name='shear_splink_sb'): + parser = subparsers.add_parser(name, help=name + ' help') + + # Required arguments. + parser.add_argument('input') + parser.add_argument('output_dir') + parser.add_argument('--bowtie_index', required=True) + parser.add_argument('--transposon', required=True) + parser.add_argument('--barcodes', required=True) + parser.add_argument('--linker', required=True) + + # Optional arguments. + parser.add_argument('--contaminants', default=None) + parser.add_argument('--sample_map', default=None) + parser.add_argument('--min_genomic_length', type=int, default=15) + parser.add_argument('--min_depth', type=int, default=2) + parser.add_argument('--min_mapq', type=int, default=37) + + # Set main for dispatch. + parser.set_defaults(main=main) + + return parser + + +def main(args): + # Read transposon, linker and barcode sequences. + transposon = skbio.io.read(args.transposon, format='fasta', into=skbio.DNA) + linker = skbio.io.read(args.linker, format='fasta', into=skbio.DNA) + + barcodes = list(skbio.io.read(args.barcodes, format='fasta', + constructor=skbio.DNA)) + + if args.contaminants is not None: + contaminants = list(skbio.io.read(args.contaminants, format='fasta', + constructor=skbio.DNA)) + else: + contaminants = None + + # Read barcode --> sample map if given. 
+ if args.sample_map is not None: + sample_map = pd.read_csv(args.sample_map, sep='\t') + sample_map = dict(zip(sample_map['barcode'], + sample_map['sample'])) + else: + sample_map = None + + # Create output_dir if it does not exist. + if not path.exists(args.output_dir): + os.mkdir(args.output_dir) + + # Setup custom aligners. + transposon_aligner = vec.align_chained( + align_funcs=[vec.compose(vec.align_exact, try_reverse=True), + vec.compose(vec.align_ssw, try_reverse=True, + filters=[vec.filter_score(min_score=90)])]) + + linker_ssw_filters = [ + vec.filter_score(min_score=90), + vec.filter_and(filters=[ + vec.filter_end_match(), + vec.filter_coverage(min_coverage=0.5, min_identity=0.9)])] + + linker_aligner = vec.align_chained( + align_funcs=[vec.compose(vec.align_exact, try_reverse=True), + vec.compose(vec.align_ssw, try_reverse=True, + filters=linker_ssw_filters)]) + + extract_kws = {'linker_func': linker_aligner, + 'transposon_func': transposon_aligner} + + # Run pipeline! + insertions = shear_splink( + args.input, transposon, linker, barcodes, + args.bowtie_index, args.output_dir, + contaminants=contaminants, sample_map=sample_map, + min_genomic_length=args.min_genomic_length, extract_kws=extract_kws) + + # Write insertion output. + insertions.to_csv(path.join(args.output_dir, 'insertions.txt'), + sep='\t', index=False) From 19ec49b0f7d93cbb48b49c3ca0f15c1c516b6c9c Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 22 Dec 2015 15:28:10 +0100 Subject: [PATCH 024/100] Add environment.yml. --- environment.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 environment.yml diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..6085bc5 --- /dev/null +++ b/environment.yml @@ -0,0 +1,12 @@ +name: pyim +dependencies: +- python3.5 +- future +- numpy +- scipy +- pandas +- scikit-bio +- toolz +- pip: + - pysam + - rpy2 From 0577a7146759c0ba19b7f3aba7680b43f3c1b511 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 22 Dec 2015 15:28:50 +0100 Subject: [PATCH 025/100] Drop natsort dependency. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6961c8b..a0a8c4b 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ install_requires = ['future', 'numpy', 'scipy', 'pandas', 'pysam', - 'natsort', 'rpy2', 'scikit-bio', 'toolz'] + 'rpy2', 'scikit-bio', 'toolz'] if not sys.version_info >= (3, ): install_requires += ['pathlib'] From 0e9c93b4ac77381d6572465e4963c7994e8a7ca1 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 22 Dec 2015 15:32:03 +0100 Subject: [PATCH 026/100] Replace ExtractResult and Alignment with classes. 
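The Alignment class introduced here replaces the earlier namedtuple: the score is now derived from identity and coverage in __init__, and reverse() re-expresses the target coordinates relative to the opposite strand. A small sketch of the resulting behaviour, assuming the class is imported from pyim.alignment.vector as defined below (the coordinate values are made up):

    from pyim.alignment.vector import Alignment

    # A 10 bp query matching positions 5-15 of a 50 bp read, forward strand.
    aln = Alignment(query_id='transposon', query_start=0, query_end=10,
                    query_len=10, target_id='read_1', target_start=5,
                    target_end=15, target_len=50, strand=1,
                    identity=1.0, coverage=1.0, type='exact')

    print(aln.score)  # 100, i.e. int(1.0 * 1.0 * 100)

    # reverse() flips the target coordinates and negates the strand.
    rev = aln.reverse()
    print(rev.target_start, rev.target_end, rev.strand)  # 35 45 -1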
--- pyim/alignment/vector.py | 210 ++++++++++++++++++++++++--------- pyim/pipelines/_model.py | 11 +- pyim/pipelines/shear_splink.py | 62 +++++----- 3 files changed, 197 insertions(+), 86 deletions(-) diff --git a/pyim/alignment/vector.py b/pyim/alignment/vector.py index 517f928..57d3810 100644 --- a/pyim/alignment/vector.py +++ b/pyim/alignment/vector.py @@ -4,28 +4,51 @@ from toolz import curry -Alignment = collections.namedtuple( - 'Alignment', - ['query_id', 'query_start', 'query_end', 'query_len', - 'target_id', 'target_start', 'target_end', 'target_len', - 'strand', 'identity', 'coverage', 'score']) - - -def reverse_alignment(aln): - """Reverses strand of alignment object.""" - target_len = aln.target_len - - return Alignment( - query_id=aln.query_id, query_start=aln.query_start, - query_end=aln.query_end, query_len=aln.query_len, - target_id=aln.target_id, target_start=target_len - aln.target_end, - target_end=target_len - aln.target_start, target_len=target_len, - strand=aln.strand * -1, type=aln.type, identity=aln.identity, - coverage=aln.coverage, score=aln.score) +class Alignment(object): + + __slots__ = ('query_id', 'query_start', 'query_end', 'query_len', + 'target_id', 'target_start', 'target_end', 'target_len', + 'strand', 'identity', 'coverage', 'score', 'type') + + def __init__(self, query_id, query_start, query_end, query_len, + target_id, target_start, target_end, target_len, + strand, identity, coverage, type): + self.query_id = query_id + self.query_start = query_start + self.query_end = query_end + self.query_len = query_len + + self.target_id = target_id + self.target_start = target_start + self.target_end = target_end + self.target_len = target_len + + self.strand = strand + self.identity = identity + self.coverage = coverage + self.type = type + + self.score = int(identity * coverage * 100) + + def reverse(self): + return Alignment(query_id=self.query_id, + query_start=self.query_start, + query_end=self.query_end, + query_len=self.query_len, + target_id=self.target_id, + target_start=self.target_len - self.target_end, + target_end=self.target_len - self.target_start, + target_len=self.target_len, + strand=self.strand * -1, + identity=self.identity, + coverage=self.coverage, + type=self.type) @curry def align_exact(target, query, query_strand=1): + """Aligns query to target using exact matching.""" + # Note that this alignment returns the first occurrence it finds, # later occurrences will not be found and are not checked for. try: @@ -40,68 +63,145 @@ def align_exact(target, query, query_strand=1): query_end=q_len, query_len=q_len, target_id=target.metadata.get('id', None), target_start=index, target_end=index + q_len, target_len=len(target), - strand=query_strand, identity=1.0, coverage=1.0, score=100) + strand=query_strand, identity=1.0, coverage=1.0, type='exact') @curry def align_ssw(target, query, query_strand=1): - ssw_aln = local_pairwise_align_ssw(target.sequence, query.sequence) + """Aligns query to target using ssw aligner.""" + + # Perform actual alignment. + ssw_aln = local_pairwise_align_ssw(str(target), str(query)) - # Extract positions. - pos = ssw_aln.start_end_positions() - q_start, q_end = pos[1] - t_start, t_end = pos[0] + # Extract positions. + pos = ssw_aln.start_end_positions() + q_start, q_end = pos[1] + t_start, t_end = pos[0] - # Offset ends by one, making them exclusive - # to match python conventions. - q_end += 1 - t_end += 1 + # Offset ends by one, making them exclusive + # to match python conventions. 
+ q_end += 1 + t_end += 1 - # Calculate basic metrics. - coverage = (q_end - q_start) / float(len(query)) - identity = ssw_aln[0].fraction_same(ssw_aln[1]) + # Calculate basic metrics. + coverage = (q_end - q_start) / float(len(query)) + identity = ssw_aln[0].match_frequency(ssw_aln[1], relative=True) - aln = Alignment( - query_id=query.id, query_start=q_start, query_end=q_end, - query_len=len(query), target_id=target.id, target_start=t_start, - target_end=t_end, target_len=len(target), strand=query_strand, - identity=identity, coverage=coverage, - score=int(identity * coverage * 100)) + aln = Alignment( + query_id=query.metadata.get('id', None), query_start=q_start, + query_end=q_end, query_len=len(query), + target_id=target.metadata.get('id', None), target_start=t_start, + target_end=t_end, target_len=len(target), strand=query_strand, + identity=identity, coverage=coverage, type='ssw') - return aln + return aln @curry def align_with_reverse(target, query, align_func, query_strand=1, **kwargs): + """Aligns query in both orientations to target sequence.""" + aln_fwd = align_func(target, query, query_strand=query_strand, **kwargs) aln_rev = align_func(target, query.reverse_complement(), query_strand=query_strand * -1, **kwargs) - - if aln_fwd is None: - return aln_rev - elif aln_rev is None: - return aln_fwd - else: - return aln_rev if aln_rev.score > aln_fwd.score else aln_fwd + return _pick_best(list(filter(bool, [aln_fwd, aln_rev]))) @curry -def align_multiple(target, queries, align_func, return_first=False, **kwargs): - alns = (align_func(target, query, **kwargs) for query in queries) - alns = list(filter(bool, alns)) +def align_multiple(target, queries, align_func, raise_error=False, **kwargs): + """Aligns multiple queries to target sequence.""" + + alignments = (align_func(target, query, **kwargs) + for query in queries) + alignments = list(filter(bool, alignments)) + + if len(alignments) > 1 and raise_error: + raise ValueError('Multiple alignments') - if len(alns) == 0: + return _pick_best(alignments) + + +def _pick_best(alignments): + """Picks best alignment from list (based on score).""" + + if len(alignments) == 0: return None - elif len(alns) == 1 or return_first: - return alns[0] + if len(alignments) == 1: + return alignments[0] else: - raise ValueError('Multiple alignments') + best = alignments[0] + for aln in alignments: + if aln.score > best.score: + best = aln + return best + + +@curry +def align_chained(target, query, align_funcs, **kwargs): + """Chains multiple vector alignment functions.""" + + for func in align_funcs: + aln = func(target, query, **kwargs) + if aln is not None: + return aln + return None + + +def compose(align_func, try_reverse=False, + filter='and', filters=None, **kwargs): + """Helper function to build an aligner.""" + + if try_reverse: + align_func = align_with_reverse(align_func=align_func) + + if filters is not None: + if filter == 'and': + align_func = filter_and(align_func=align_func, filters=filters) + elif filter == 'or': + align_func = filter_or(align_func=align_func, filters=filters) + else: + raise ValueError('Filter should be either "or" or "and" (not {})' + .format(filter)) + + return align_func(**kwargs) # --- Filtering --- # -def filter_alignment(alignment, filters): +@curry +def filter_and(target, query, align_func, filters, **kwargs): + """Performs AND of filters on resulting alignments.""" + + alignment = align_func(target, query, **kwargs) for filter_ in filters: if not filter_(alignment): - return False - return True \ No newline at 
end of file + return None + return alignment + + +@curry +def filter_or(target, query, align_func, filters, **kwargs): + """Performs OR of filters on resulting alignments.""" + + return not filter_and(target, query, align_func, filters, **kwargs) + + +@curry +def filter_score(alignment, min_score): + """Checks if alignment has minimum score.""" + + return alignment.score >= min_score + + +@curry +def filter_coverage(alignment, min_coverage, min_identity): + """Checks if alignment is at end of read.""" + + return ((alignment.coverage >= min_coverage) and + (alignment.identity >= min_identity)) + +@curry +def filter_end_match(alignment): + """Checks if alignment is at end of read.""" + + return alignment.target_end == alignment.target_len diff --git a/pyim/pipelines/_model.py b/pyim/pipelines/_model.py index d50f909..2728c60 100644 --- a/pyim/pipelines/_model.py +++ b/pyim/pipelines/_model.py @@ -1,5 +1,10 @@ -import collections +class ExtractResult(object): -ExtractResult = collections.namedtuple( - 'ExtractResult', ['genomic_sequence', 'barcode', 'status']) + __slots__ = ('genomic_sequence', 'barcode', 'status') + + def __init__(self, genomic_sequence, barcode, status): + super().__init__() + self.genomic_sequence = genomic_sequence + self.barcode = barcode + self.status = status diff --git a/pyim/pipelines/shear_splink.py b/pyim/pipelines/shear_splink.py index 35ecca9..179c6af 100644 --- a/pyim/pipelines/shear_splink.py +++ b/pyim/pipelines/shear_splink.py @@ -14,7 +14,7 @@ from pyim.alignment.bowtie2 import align as bowtie_align from pyim.alignment.vector import (align_exact, align_multiple, - align_with_reverse, reverse_alignment) + align_with_reverse) from pyim.util import count_fasta_entries from ._model import ExtractResult @@ -33,10 +33,10 @@ def register(subparsers, name='shear_splink'): # Required arguments. parser.add_argument('input') parser.add_argument('output_dir') - parser.add_argument('bowtie_index') - parser.add_argument('transposon') - parser.add_argument('barcodes') - parser.add_argument('linker') + parser.add_argument('--bowtie_index', required=True) + parser.add_argument('--transposon', required=True) + parser.add_argument('--barcodes', required=True) + parser.add_argument('--linker', required=True) # Optional arguments. parser.add_argument('--contaminants', default=None) @@ -82,7 +82,7 @@ def main(args): args.input, transposon, linker, barcodes, args.bowtie_index, args.output_dir, contaminants=contaminants, sample_map=sample_map, - min_genomic_length=args.min_genomic_length) + min_genomic_length=args.min_genomic_length, min_mapq=args.min_mapq) # Write insertion output. insertions.to_csv(path.join(args.output_dir, 'insertions.txt'), @@ -93,7 +93,8 @@ def main(args): def shear_splink(read_path, transposon, linker, barcodes, bowtie_index, output_dir, contaminants=None, - sample_map=None, min_genomic_length=15): + sample_map=None, min_genomic_length=15, + min_mapq=37, extract_kws=None): logger = logging.getLogger() @@ -105,15 +106,15 @@ def shear_splink(read_path, transposon, linker, barcodes, # Log progress with progressbar. 
logger.info('Extracting genomic sequences') - reads = skbio.read(read_path, format='fasta') + reads = skbio.read(read_path, format='fasta', constructor=skbio.DNA) reads = tqdm.tqdm(reads, total=count_fasta_entries(read_path), - leave=False, ncols=80) + leave=False, ncols=60) # Extract genomic sequences and barcodes _, barcode_frame = extract_genomic( reads, transposon=transposon, barcodes=barcodes, linker=linker, output_path=genomic_path, contaminants=contaminants, - min_length=min_genomic_length, logger=logger) + min_length=min_genomic_length, logger=logger, extract_kws=extract_kws) barcode_frame.to_csv(barcode_path, sep='\t', index=False) @@ -138,14 +139,15 @@ def shear_splink(read_path, transposon, linker, barcodes, insertions = merge_within_distance( insertions, max_dist=2000, agg_funcs=agg_funcs) - # Assign ids to insertions. - insertions['id'] = ['INS_{}'.format(i) - for i in range(1, len(insertions) + 1)] - # Map barcodes to samples. if sample_map is not None: insertions['sample'] = insertions['barcode'].map(sample_map) + # Sort and assign ids to insertions. + insertions.sort_values(by=['chrom', 'position'], inplace=True) + insertions['id'] = ['INS_{}'.format(i) + for i in range(1, len(insertions) + 1)] + return insertions @@ -163,8 +165,9 @@ class ShearSplinkStatus(Enum): def extract_genomic(reads, transposon, barcodes, linker, output_path, contaminants=None, min_length=15, - io_kwargs=None, logger=None): - io_kwargs = io_kwargs or {} + logger=None, extract_kws=None): + + extract_kws = extract_kws or {} # Extract and write genomic sequences. barcode_map = toolz.pipe( @@ -172,12 +175,12 @@ def extract_genomic(reads, transposon, barcodes, linker, _extract_from_reads(transposon=transposon, barcodes=barcodes, linker=linker, - contaminants=contaminants), - curried_map(_check_minimum_length(min_length=15)), + contaminants=contaminants, + **extract_kws), + curried_map(_check_minimum_length(min_length=min_length)), print_stats(logger=logger), curried_filter(_proper_filter), - write_genomic_sequences(file_path=output_path, - format='fasta', **io_kwargs), + write_genomic_sequences(file_path=output_path, format='fasta'), build_barcode_map) # Build frame mapping reads to barcodes. @@ -206,7 +209,7 @@ def _extract_from_reads( if contaminants is not None: contaminant_func = align_multiple(queries=contaminants, align_func=align_exact, - return_first=True) + raise_error=False) else: contaminant_func = None @@ -259,7 +262,12 @@ def _extract_from_read(read, transposon_func, barcode_func, # alignment to bring everything into the same (fwd) orientation. if transposon_aln.strand == -1: read = read.reverse_complement() - transposon_aln = reverse_alignment(transposon_aln) + transposon_aln = transposon_aln.reverse() + + # Identify location of linker. + linker_aln = linker_func(read) + if linker_aln is None: + return ExtractResult(None, None, ShearSplinkStatus.no_linker) # Identify barcode of the read. try: @@ -271,11 +279,6 @@ def _extract_from_read(read, transposon_func, barcode_func, barcode = barcode_aln.query_id - # Identify location of linker. - linker_aln = linker_func(read) - if linker_aln is None: - return ExtractResult(None, None, ShearSplinkStatus.no_linker) - # Extract genomic sequence using previous alignments. 
genomic = read[transposon_aln.target_end:linker_aln.target_start] @@ -298,11 +301,14 @@ def _proper_filter(result): # --- Insertion identification --- # -def identify_insertions(alignment_path, barcode_map): +def identify_insertions(alignment_path, barcode_map, min_mapq=37): # Get alignments from bowtie. bam = pysam.AlignmentFile(alignment_path) alignments = bam.fetch(multiple_iterators=True) + # Filter by mapq. + alignments = filter(lambda a: a.mapq >= min_mapq, alignments) + # Group alignments by barcode and position. aln_groups = chain_groupby( alignments, From e27e07ea8ce6426b3669ad8d78b0e60f6fe306b0 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 22 Dec 2015 15:32:18 +0100 Subject: [PATCH 027/100] Ensure statuses are ordered in print_stats. --- pyim/pipelines/_helpers/pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyim/pipelines/_helpers/pipeline.py b/pyim/pipelines/_helpers/pipeline.py index 8b0ce89..bca38ec 100644 --- a/pyim/pipelines/_helpers/pipeline.py +++ b/pyim/pipelines/_helpers/pipeline.py @@ -21,7 +21,9 @@ def print_stats(results, logger=None, header=False): print_('Extraction stats:') total = sum(status_counts.values()) - for status, count in status_counts.items(): + + for status in sorted(status_counts.keys()): + count = status_counts[status] percentage = (count / total) * 100 print_('{:>18}: {:>8} ({:5.2f}%)' .format(status, count, percentage)) From 8b2f3454f9ccdcc5d506742be3f9659406efa09c Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 22 Dec 2015 15:32:38 +0100 Subject: [PATCH 028/100] Change argparser name to reflect binary. --- pyim/main/split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyim/main/split.py b/pyim/main/split.py index c8ef310..9f2ce94 100644 --- a/pyim/main/split.py +++ b/pyim/main/split.py @@ -12,7 +12,7 @@ def setup_parser(): - parser = ArgumentParser(prog='pyim-merge') + parser = ArgumentParser(prog='pyim-split') parser.add_argument('alignment_bam', type=Path) parser.add_argument('read_barcode_map', type=Path) From e844e0ebaf077a1a311468dd717cf3a3ab2e20cf Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 22 Dec 2015 15:56:14 +0100 Subject: [PATCH 029/100] Create parent directories for output-dir if needed. --- pyim/pipelines/shear_splink.py | 2 +- pyim/pipelines/shear_splink_sb.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyim/pipelines/shear_splink.py b/pyim/pipelines/shear_splink.py index 179c6af..633b7a5 100644 --- a/pyim/pipelines/shear_splink.py +++ b/pyim/pipelines/shear_splink.py @@ -75,7 +75,7 @@ def main(args): # Create output_dir if it does not exist. if not path.exists(args.output_dir): - os.mkdir(args.output_dir) + os.makedirs(args.output_dir, exist_ok=True) # Run pipeline! insertions = shear_splink( diff --git a/pyim/pipelines/shear_splink_sb.py b/pyim/pipelines/shear_splink_sb.py index c931c91..de2adc1 100644 --- a/pyim/pipelines/shear_splink_sb.py +++ b/pyim/pipelines/shear_splink_sb.py @@ -58,7 +58,7 @@ def main(args): # Create output_dir if it does not exist. if not path.exists(args.output_dir): - os.mkdir(args.output_dir) + os.makedirs(args.output_dir, exist_ok=True) # Setup custom aligners. transposon_aligner = vec.align_chained( From 7c5ae85df3c27184c8c54f31e3913c0f0b05ce7b Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Dec 2015 17:27:52 +0100 Subject: [PATCH 030/100] Degrade version number to reflect beta-status. 
--- pyim/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyim/__init__.py b/pyim/__init__.py index 234b32d..e9de292 100644 --- a/pyim/__init__.py +++ b/pyim/__init__.py @@ -1,2 +1,2 @@ -__version__ = '1.0.0-beta' +__version__ = '0.9.0' From b9bd9fd63cdcd3a9a3f10b5c9e0ad42bd9fce824 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Dec 2015 17:27:52 +0100 Subject: [PATCH 031/100] Degrade version number to reflect beta-status. --- pyim/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyim/__init__.py b/pyim/__init__.py index 234b32d..875c4ac 100644 --- a/pyim/__init__.py +++ b/pyim/__init__.py @@ -1,2 +1,2 @@ -__version__ = '1.0.0-beta' +__version__ = '0.9' From 01c5e337852782355ac8abfe1cab2d559179612d Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Dec 2015 17:46:28 +0100 Subject: [PATCH 032/100] Add automatic versioning. --- MANIFEST.IN | 1 + RELEASE-VERSION | 1 + pyim/__init__.py | 2 - setup.py | 4 +- version.py | 106 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 110 insertions(+), 4 deletions(-) create mode 100644 MANIFEST.IN create mode 100644 RELEASE-VERSION create mode 100644 version.py diff --git a/MANIFEST.IN b/MANIFEST.IN new file mode 100644 index 0000000..466cd00 --- /dev/null +++ b/MANIFEST.IN @@ -0,0 +1 @@ +include RELEASE-VERSION diff --git a/RELEASE-VERSION b/RELEASE-VERSION new file mode 100644 index 0000000..b63ba69 --- /dev/null +++ b/RELEASE-VERSION @@ -0,0 +1 @@ +0.9 diff --git a/pyim/__init__.py b/pyim/__init__.py index 875c4ac..e69de29 100644 --- a/pyim/__init__.py +++ b/pyim/__init__.py @@ -1,2 +0,0 @@ - -__version__ = '0.9' diff --git a/setup.py b/setup.py index a0a8c4b..731aae2 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages -from pyim import __version__ +from version import get_git_version install_requires = ['future', 'numpy', 'scipy', 'pandas', 'pysam', @@ -13,7 +13,7 @@ setup( name='pyim', - version=__version__, + version=get_git_version(), url='https://bitbucket.org/jrderuiter/pyim', author='Julian de Ruiter', author_email='julianderuiter@gmail.com', diff --git a/version.py b/version.py new file mode 100644 index 0000000..1c404ad --- /dev/null +++ b/version.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- +# Author: Douglas Creager +# This file is placed into the public domain. + +# Calculates the current version number. If possible, this is the +# output of “git describe”, modified to conform to the versioning +# scheme that setuptools uses. If “git describe” returns an error +# (most likely because we're in an unpacked copy of a release tarball, +# rather than in a git working copy), then we fall back on reading the +# contents of the RELEASE-VERSION file. +# +# To use this script, simply import it your setup.py file, and use the +# results of get_git_version() as your package version: +# +# from version import * +# +# setup( +# version=get_git_version(), +# . +# . +# . +# ) +# +# This will automatically update the RELEASE-VERSION file, if +# necessary. Note that the RELEASE-VERSION file should *not* be +# checked into git; please add it to your top-level .gitignore file. 
+# +# You'll probably want to distribute the RELEASE-VERSION file in your +# sdist tarballs; to do this, just create a MANIFEST.in file that +# contains the following line: +# +# include RELEASE-VERSION + +from __future__ import print_function + +__all__ = ("get_git_version") + +from subprocess import Popen, PIPE + + +def call_git_describe(abbrev=4): + try: + p = Popen(['git', 'describe', '--abbrev=%d' % abbrev], + stdout=PIPE, stderr=PIPE) + p.stderr.close() + line = p.stdout.readlines()[0] + return line.strip().decode() + + except: + return None + + +def read_release_version(): + try: + f = open("RELEASE-VERSION", "r") + + try: + version = f.readlines()[0] + return version.strip() + + finally: + f.close() + + except: + return None + + +def write_release_version(version): + f = open("RELEASE-VERSION", "w") + f.write("%s\n" % version) + f.close() + + +def get_git_version(abbrev=4): + # Read in the version that's currently in RELEASE-VERSION. + + release_version = read_release_version() + + # First try to get the current version using “git describe”. + + version = call_git_describe(abbrev) + + # If that doesn't work, fall back on the value that's in + # RELEASE-VERSION. + + if version is None: + version = release_version + + # If we still don't have anything, that's an error. + + if version is None: + raise ValueError("Cannot find the version number!") + + # If the current version is different from what's in the + # RELEASE-VERSION file, update the file to be current. + + if version != release_version: + write_release_version(version) + + # Finally, return the current version. + + return version + + +if __name__ == "__main__": + print(get_git_version()) \ No newline at end of file From 0cbcecf5aed95699b3e0342d4f84f767c83434ff Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Dec 2015 17:53:51 +0100 Subject: [PATCH 033/100] Remove __version__ reference. --- RELEASE-VERSION | 1 - pyim/main/align.py | 7 +++++-- 2 files changed, 5 insertions(+), 3 deletions(-) delete mode 100644 RELEASE-VERSION diff --git a/RELEASE-VERSION b/RELEASE-VERSION deleted file mode 100644 index b63ba69..0000000 --- a/RELEASE-VERSION +++ /dev/null @@ -1 +0,0 @@ -0.9 diff --git a/pyim/main/align.py b/pyim/main/align.py index 0c66b26..ca9a4fa 100644 --- a/pyim/main/align.py +++ b/pyim/main/align.py @@ -6,8 +6,8 @@ import argparse import logging +import pkg_resources -from pyim import __version__ from pyim.pipelines import shear_splink, shear_splink_sb logging.basicConfig( @@ -33,9 +33,12 @@ def main(): args = parser.parse_args() # Dispatch to pipeline. - header_str = ' PyIM ({}) '.format(__version__) + version = pkg_resources.require('pyim')[0].version + header_str = ' PyIM ({}) '.format(version) logger.info('{:-^40}'.format(header_str)) + args.main(args) + logger.info('{:-^40}'.format(' Done! ')) From 426c7317fba7c9f1892283add0c8ec28b3504788 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Dec 2015 17:54:23 +0100 Subject: [PATCH 034/100] Ignore RELEASE-VERSION. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 335ed4e..f17f4f9 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ htmlcov .coverage _legacy docs/_build +RELEASE-VERSION From 92b76685d526385d978dddd6ae2b007f09989b91 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Dec 2015 18:48:57 +0100 Subject: [PATCH 035/100] Supply reads as input to shear splink pipelines. 
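With this change the pipeline function no longer opens the input file itself; callers pass an iterable of skbio.DNA reads plus an optional read count that is only used to size the progress bar. A hedged sketch of driving the pipeline from Python rather than via pyim-align (all file paths below are placeholders, a bowtie2 index and the bowtie2 binary are assumed to be available, and the module paths are those in effect as of this patch):

    import skbio

    from pyim.util import count_fasta_entries
    from pyim.pipelines.shear_splink import shear_splink

    # Reads are supplied as a DNA-sequence generator, plus their total count.
    reads = skbio.read('reads.fna', format='fasta', constructor=skbio.DNA)
    total_reads = count_fasta_entries('reads.fna')

    transposon = skbio.io.read('transposon.fna', format='fasta', into=skbio.DNA)
    linker = skbio.io.read('linker.fna', format='fasta', into=skbio.DNA)
    barcodes = list(skbio.io.read('barcodes.fna', format='fasta',
                                  constructor=skbio.DNA))

    insertions = shear_splink(reads, transposon, linker, barcodes,
                              'indexes/mm10', 'out', total_reads=total_reads)
    insertions.to_csv('out/insertions.txt', sep='\t', index=False)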
--- pyim/main/align.py | 2 +- pyim/pipelines/shear_splink.py | 18 ++++++++++-------- pyim/pipelines/shear_splink_sb.py | 10 ++++++++-- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/pyim/main/align.py b/pyim/main/align.py index ca9a4fa..42aedca 100644 --- a/pyim/main/align.py +++ b/pyim/main/align.py @@ -38,7 +38,7 @@ def main(): logger.info('{:-^40}'.format(header_str)) args.main(args) - + logger.info('{:-^40}'.format(' Done! ')) diff --git a/pyim/pipelines/shear_splink.py b/pyim/pipelines/shear_splink.py index 633b7a5..4aaa3bd 100644 --- a/pyim/pipelines/shear_splink.py +++ b/pyim/pipelines/shear_splink.py @@ -52,6 +52,10 @@ def register(subparsers, name='shear_splink'): def main(args): + # Prepare reads, counting total for progress bar. + reads = skbio.read(args.input, format='fasta', constructor=skbio.DNA) + total_reads = count_fasta_entries(args.input) + # Read transposon, linker and barcode sequences. transposon = skbio.io.read(args.transposon, format='fasta', into=skbio.DNA) linker = skbio.io.read(args.linker, format='fasta', into=skbio.DNA) @@ -79,10 +83,11 @@ def main(args): # Run pipeline! insertions = shear_splink( - args.input, transposon, linker, barcodes, + reads, transposon, linker, barcodes, args.bowtie_index, args.output_dir, contaminants=contaminants, sample_map=sample_map, - min_genomic_length=args.min_genomic_length, min_mapq=args.min_mapq) + min_genomic_length=args.min_genomic_length, + min_mapq=args.min_mapq, total_reads=total_reads) # Write insertion output. insertions.to_csv(path.join(args.output_dir, 'insertions.txt'), @@ -91,10 +96,10 @@ def main(args): # --- Overall pipeline --- # -def shear_splink(read_path, transposon, linker, barcodes, +def shear_splink(reads, transposon, linker, barcodes, bowtie_index, output_dir, contaminants=None, sample_map=None, min_genomic_length=15, - min_mapq=37, extract_kws=None): + min_mapq=37, extract_kws=None, total_reads=None): logger = logging.getLogger() @@ -105,10 +110,7 @@ def shear_splink(read_path, transposon, linker, barcodes, # Log progress with progressbar. logger.info('Extracting genomic sequences') - - reads = skbio.read(read_path, format='fasta', constructor=skbio.DNA) - reads = tqdm.tqdm(reads, total=count_fasta_entries(read_path), - leave=False, ncols=60) + reads = tqdm.tqdm(reads, total=total_reads, leave=False, ncols=60) # Extract genomic sequences and barcodes _, barcode_frame = extract_genomic( diff --git a/pyim/pipelines/shear_splink_sb.py b/pyim/pipelines/shear_splink_sb.py index de2adc1..2616356 100644 --- a/pyim/pipelines/shear_splink_sb.py +++ b/pyim/pipelines/shear_splink_sb.py @@ -6,6 +6,7 @@ from .shear_splink import shear_splink from pyim.alignment import vector as vec +from pyim.util import count_fasta_entries # --- Pipeline register hook + main --- # @@ -35,6 +36,10 @@ def register(subparsers, name='shear_splink_sb'): def main(args): + # Prepare reads, counting total for progress bar. + reads = skbio.read(args.input, format='fasta', constructor=skbio.DNA) + total_reads = count_fasta_entries(args.input) + # Read transposon, linker and barcode sequences. transposon = skbio.io.read(args.transposon, format='fasta', into=skbio.DNA) linker = skbio.io.read(args.linker, format='fasta', into=skbio.DNA) @@ -82,10 +87,11 @@ def main(args): # Run pipeline! 
insertions = shear_splink( - args.input, transposon, linker, barcodes, + reads, transposon, linker, barcodes, args.bowtie_index, args.output_dir, contaminants=contaminants, sample_map=sample_map, - min_genomic_length=args.min_genomic_length, extract_kws=extract_kws) + min_genomic_length=args.min_genomic_length, + extract_kws=extract_kws, total_reads=total_reads) # Write insertion output. insertions.to_csv(path.join(args.output_dir, 'insertions.txt'), From ca783afe444ff6df26a43902032df30c104a4fab Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Dec 2015 19:05:54 +0100 Subject: [PATCH 036/100] Subset barcodes to sample map. --- pyim/pipelines/shear_splink.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pyim/pipelines/shear_splink.py b/pyim/pipelines/shear_splink.py index 4aaa3bd..b4f4beb 100644 --- a/pyim/pipelines/shear_splink.py +++ b/pyim/pipelines/shear_splink.py @@ -103,6 +103,14 @@ def shear_splink(reads, transposon, linker, barcodes, logger = logging.getLogger() + # Subset barcodes to sample map (if given). + if sample_map is not None: + barcodes = [bc for bc in barcodes + if bc.metadata['id'] in sample_map] + + if len(barcodes) != len(sample_map): + raise ValueError('Missing or duplicate barcodes') + # Determine paths for intermediates/outputs. genomic_path = path.join(output_dir, 'genomic.fna') barcode_path = path.join(output_dir, 'genomic.barcodes.txt') @@ -110,7 +118,8 @@ def shear_splink(reads, transposon, linker, barcodes, # Log progress with progressbar. logger.info('Extracting genomic sequences') - reads = tqdm.tqdm(reads, total=total_reads, leave=False, ncols=60) + reads = tqdm.tqdm(reads, total=total_reads, + unit='read', leave=False, ncols=60) # Extract genomic sequences and barcodes _, barcode_frame = extract_genomic( From 14e1347f9264a017844082b894c5117dc99e973a Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Dec 2015 19:12:52 +0100 Subject: [PATCH 037/100] Add missing __init__.py. --- pyim/main/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pyim/main/__init__.py diff --git a/pyim/main/__init__.py b/pyim/main/__init__.py new file mode 100644 index 0000000..e69de29 From c48bf91f313d440cbeb214f7e64e0b8a87f2f7ca Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Dec 2015 19:49:59 +0100 Subject: [PATCH 038/100] Refactored logging into separate module. --- pyim/main/_logging.py | 18 ++++++++++++++++++ pyim/main/align.py | 16 +++------------- 2 files changed, 21 insertions(+), 13 deletions(-) create mode 100644 pyim/main/_logging.py diff --git a/pyim/main/_logging.py b/pyim/main/_logging.py new file mode 100644 index 0000000..b6f0b0e --- /dev/null +++ b/pyim/main/_logging.py @@ -0,0 +1,18 @@ +import logging +import pkg_resources + + +logging.basicConfig( + format='%(asctime)-15s %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]', + level=logging.INFO) + + +def print_header(logger): + version = pkg_resources.require('pyim')[0].version + header_str = ' PyIM ({}) '.format(version) + logger.info('{:-^40}'.format(header_str)) + + +def print_footer(logger): + logger.info('{:-^40}'.format(' Done! 
')) diff --git a/pyim/main/align.py b/pyim/main/align.py index 42aedca..e828485 100644 --- a/pyim/main/align.py +++ b/pyim/main/align.py @@ -6,14 +6,9 @@ import argparse import logging -import pkg_resources from pyim.pipelines import shear_splink, shear_splink_sb - -logging.basicConfig( - format='%(asctime)-15s %(message)s', - datefmt='[%Y-%m-%d %H:%M:%S]', - level=logging.INFO) +from ._logging import print_header, print_footer def main(): @@ -21,7 +16,6 @@ def main(): # Setup main parser. parser = argparse.ArgumentParser(prog='pyim-align') - subparsers = parser.add_subparsers(dest='pipeline') subparsers.required = True @@ -33,13 +27,9 @@ def main(): args = parser.parse_args() # Dispatch to pipeline. - version = pkg_resources.require('pyim')[0].version - header_str = ' PyIM ({}) '.format(version) - logger.info('{:-^40}'.format(header_str)) - + print_header(logger) args.main(args) - - logger.info('{:-^40}'.format(' Done! ')) + print_footer(logger) if __name__ == '__main__': From 9f59a5e6450a721770cd4b0aca2432741479f579 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Dec 2015 23:07:26 +0100 Subject: [PATCH 039/100] Move pipelines to submodule of alignment. --- pyim/alignment/_vector.py | 203 ------------------ pyim/{ => alignment}/pipelines/__init__.py | 0 .../pipelines/_helpers/__init__.py | 0 .../pipelines/_helpers/clustering.py | 0 .../pipelines/_helpers/grouping.py | 0 .../pipelines/_helpers/pipeline.py | 0 pyim/{ => alignment}/pipelines/_model.py | 0 pyim/{ => alignment}/pipelines/lam_pcr.py | 0 .../{ => alignment}/pipelines/shear_splink.py | 0 .../pipelines/shear_splink_sb.py | 0 pyim/main/align.py | 7 +- pyim/util/__init__.py | 0 pyim/{util.py => util/file.py} | 0 13 files changed, 2 insertions(+), 208 deletions(-) delete mode 100644 pyim/alignment/_vector.py rename pyim/{ => alignment}/pipelines/__init__.py (100%) rename pyim/{ => alignment}/pipelines/_helpers/__init__.py (100%) rename pyim/{ => alignment}/pipelines/_helpers/clustering.py (100%) rename pyim/{ => alignment}/pipelines/_helpers/grouping.py (100%) rename pyim/{ => alignment}/pipelines/_helpers/pipeline.py (100%) rename pyim/{ => alignment}/pipelines/_model.py (100%) rename pyim/{ => alignment}/pipelines/lam_pcr.py (100%) rename pyim/{ => alignment}/pipelines/shear_splink.py (100%) rename pyim/{ => alignment}/pipelines/shear_splink_sb.py (100%) create mode 100644 pyim/util/__init__.py rename pyim/{util.py => util/file.py} (100%) diff --git a/pyim/alignment/_vector.py b/pyim/alignment/_vector.py deleted file mode 100644 index 9ad6d40..0000000 --- a/pyim/alignment/_vector.py +++ /dev/null @@ -1,203 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) - -from skbio.alignment import local_pairwise_align_ssw - - -class VectorAlignment(object): - - def __init__(self, query_id, query_start, query_end, query_len, - target_id, target_start, target_end, target_strand, - target_len, type, identity, coverage): - self.query_id = query_id - self.query_start = query_start - self.query_end = query_end - self.query_len = query_len - self.target_id = target_id - self.target_start = target_start - self.target_end = target_end - self.target_strand = target_strand - self.target_len = target_len - self.type = type - self.identity = identity - self.coverage = coverage - - @property - def score(self): - return self.identity * self.coverage - - def 
reverse(self, read): - read_len = len(read) - - return self.__class__( - query_id=self.query_id, - query_start=self.query_start, - query_end=self.query_end, - query_len=self.query_len, - target_id=self.target_id, - target_start=read_len - self.target_end, - target_end=read_len - self.target_start, - target_len=self.target_len, - target_strand=1 if self.target_strand == -1 else 1, - type=self.type, identity=self.identity, coverage=self.coverage - ) - - -class VectorAligner(object): - - def __init__(self, **kwargs): - pass - - def align(self, query, target): - raise NotImplementedError() - - def align_multiple(self, queries, target, how='unique'): - alignments = filter(bool, (self.align(q, target) for q in queries)) - alignments = list(alignments) - - num_alignments = len(alignments) - if num_alignments == 0: - return None - elif num_alignments == 1: - return alignments[0] - else: - if how == 'unique': - raise ValueError('Multiple matching queries for target {}' - .format(target.metadata['id'])) - elif how == 'any': - return alignments[0] - else: - raise ValueError('Unknown value for how ({})'.format(how)) - - -class ExactAligner(VectorAligner): - - def __init__(self, try_reverse=False): - super().__init__() - self._try_reverse = try_reverse - - def align(self, query, target): - alignment = self._align_exact(query, target, query_ori=1) - - # Try reverse complement if first alignment failed. - if alignment is None and self._try_reverse: - alignment = self._align_exact( - query.reverse_complement(), target, query_ori=-1) - - return alignment - - @staticmethod - def _align_exact(query, target, query_ori): - # Note that this alignment returns the first occurrence it finds, - # later occurrences will not be found and are not checked for. - try: - index = str(target).index(str(query)) - except ValueError: - return None - else: - q_len = len(query) - - return VectorAlignment( - query_id=query.metadata['id'], query_start=0, query_end=q_len, - query_len=q_len, target_id=target.metadata['id'], - target_start=index, target_end=index + q_len, - target_strand=query_ori, target_len=len(target), type='exact', - identity=1.0, coverage=1.0) - - -class SswAligner(VectorAligner): - - def __init__(self, try_reverse=False, filters=None): - super().__init__() - self._try_reverse = try_reverse - self._filters = filters - - def align(self, query, target): - fwd_alignment = self._align_ssw(query, target, query_ori=1) - - if self._try_reverse: - rev_alignment = self._align_ssw( - query.reverse_complement(), target, query_ori=-1) - - if fwd_alignment is None: - # Default to reverse if no forward. - alignment = rev_alignment - elif rev_alignment is None: - # Default to forward if no reverse. - alignment = fwd_alignment - else: - # Otherwise choose the best of the two. - if rev_alignment.score > fwd_alignment.score: - alignment = rev_alignment - else: - alignment = fwd_alignment - else: - alignment = fwd_alignment - - return alignment - - def _align_ssw(self, query, target, query_ori): - ssw_aln = local_pairwise_align_ssw(target, query) - - # Extract positions. - pos = ssw_aln.start_end_positions() - q_start, q_end = pos[1] - t_start, t_end = pos[0] - - # Offset ends by one, making them exclusive - # to match python conventions. - q_end += 1 - t_end += 1 - - # Calculate basic metrics. 
- coverage = (q_end - q_start) / float(len(query)) - identity = 1.0 - ssw_aln[0].distance(ssw_aln[1]) - - aln = VectorAlignment( - query_id=query.metadata['id'], query_start=q_start, - query_end=q_end, query_len=len(query), - target_id=target.metadata['id'], target_start=t_start, - target_end=t_end, target_strand=query_ori, target_len=len(target), - type='ssw', identity=identity, coverage=coverage) - - # Check if alignment passes any filter. - if self._filters is None: - return aln - else: - for filter_ in self._filters: - if filter_(aln): - return aln - return None - - -class ChainedAligner(VectorAligner): - - def __init__(self, aligners): - super().__init__() - self._aligners = aligners - - def align(self, query, target): - aln = None - - for aligner in self._aligners: - aln = aligner.align(query, target) - if aln is not None: - break - - return aln - - -def filter_identity(aln, min_identity): - return aln.identity >= min_identity - - -def filter_score(aln, min_score): - return aln.score >= min_score - - -def filter_end_match(aln, min_coverage=0.5, min_identity=1.0): - return aln.target_end == aln.target_len and \ - aln.coverage >= min_coverage and aln.identity >= min_identity diff --git a/pyim/pipelines/__init__.py b/pyim/alignment/pipelines/__init__.py similarity index 100% rename from pyim/pipelines/__init__.py rename to pyim/alignment/pipelines/__init__.py diff --git a/pyim/pipelines/_helpers/__init__.py b/pyim/alignment/pipelines/_helpers/__init__.py similarity index 100% rename from pyim/pipelines/_helpers/__init__.py rename to pyim/alignment/pipelines/_helpers/__init__.py diff --git a/pyim/pipelines/_helpers/clustering.py b/pyim/alignment/pipelines/_helpers/clustering.py similarity index 100% rename from pyim/pipelines/_helpers/clustering.py rename to pyim/alignment/pipelines/_helpers/clustering.py diff --git a/pyim/pipelines/_helpers/grouping.py b/pyim/alignment/pipelines/_helpers/grouping.py similarity index 100% rename from pyim/pipelines/_helpers/grouping.py rename to pyim/alignment/pipelines/_helpers/grouping.py diff --git a/pyim/pipelines/_helpers/pipeline.py b/pyim/alignment/pipelines/_helpers/pipeline.py similarity index 100% rename from pyim/pipelines/_helpers/pipeline.py rename to pyim/alignment/pipelines/_helpers/pipeline.py diff --git a/pyim/pipelines/_model.py b/pyim/alignment/pipelines/_model.py similarity index 100% rename from pyim/pipelines/_model.py rename to pyim/alignment/pipelines/_model.py diff --git a/pyim/pipelines/lam_pcr.py b/pyim/alignment/pipelines/lam_pcr.py similarity index 100% rename from pyim/pipelines/lam_pcr.py rename to pyim/alignment/pipelines/lam_pcr.py diff --git a/pyim/pipelines/shear_splink.py b/pyim/alignment/pipelines/shear_splink.py similarity index 100% rename from pyim/pipelines/shear_splink.py rename to pyim/alignment/pipelines/shear_splink.py diff --git a/pyim/pipelines/shear_splink_sb.py b/pyim/alignment/pipelines/shear_splink_sb.py similarity index 100% rename from pyim/pipelines/shear_splink_sb.py rename to pyim/alignment/pipelines/shear_splink_sb.py diff --git a/pyim/main/align.py b/pyim/main/align.py index e828485..567c9c2 100644 --- a/pyim/main/align.py +++ b/pyim/main/align.py @@ -1,13 +1,10 @@ from __future__ import (absolute_import, division, print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) import argparse import logging -from pyim.pipelines import shear_splink, shear_splink_sb +from 
pyim.alignment.pipelines import shear_splink, shear_splink_sb from ._logging import print_header, print_footer @@ -27,7 +24,7 @@ def main(): args = parser.parse_args() # Dispatch to pipeline. - print_header(logger) + print_header(logger, command='align') args.main(args) print_footer(logger) diff --git a/pyim/util/__init__.py b/pyim/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pyim/util.py b/pyim/util/file.py similarity index 100% rename from pyim/util.py rename to pyim/util/file.py From 308015c9d311a3282d78e214f4d895bc362d16d2 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Dec 2015 23:08:18 +0100 Subject: [PATCH 040/100] Add level name and command to logging. --- pyim/main/_logging.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pyim/main/_logging.py b/pyim/main/_logging.py index b6f0b0e..4ea49f8 100644 --- a/pyim/main/_logging.py +++ b/pyim/main/_logging.py @@ -3,16 +3,21 @@ logging.basicConfig( - format='%(asctime)-15s %(message)s', + format='%(asctime)-15s %(levelname)-10s %(message)s', datefmt='[%Y-%m-%d %H:%M:%S]', level=logging.INFO) -def print_header(logger): +def print_header(logger, command=None): version = pkg_resources.require('pyim')[0].version - header_str = ' PyIM ({}) '.format(version) - logger.info('{:-^40}'.format(header_str)) + + if command is None: + header_str = ' PyIM ({}) '.format(version) + else: + header_str = ' PyIM {} ({}) '.format(command, version) + + logger.info('{:-^60}'.format(header_str)) def print_footer(logger): - logger.info('{:-^40}'.format(' Done! ')) + logger.info('{:-^60}'.format(' Done! ')) From eb3665ee09595ed44e16d31586378e8d959c68f1 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Dec 2015 23:08:33 +0100 Subject: [PATCH 041/100] Add logging messages to merge. --- pyim/main/merge.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/pyim/main/merge.py b/pyim/main/merge.py index 42d631c..dea49b7 100644 --- a/pyim/main/merge.py +++ b/pyim/main/merge.py @@ -5,11 +5,14 @@ str, super, zip) from future.utils import native_str +import logging from argparse import ArgumentParser from pathlib import Path import pandas as pd +from ._logging import print_header, print_footer + def setup_parser(): parser = ArgumentParser(prog='pyim-merge') @@ -19,7 +22,7 @@ def setup_parser(): parser.add_argument('--names', nargs='+', default=None) parser.add_argument('--samples', nargs='+', default=None) - parser.add_argument('--complement', default=False, action='store_true') + # parser.add_argument('--complement', default=False, action='store_true') return parser @@ -28,6 +31,10 @@ def main(): parser = setup_parser() args = parser.parse_args() + # Get logger and print header. + logger = logging.getLogger() + print_header(logger, command='merge') + # Generate default names if none given. if args.names is None: names = ['Set{}'.format(i) for i in range(1, len(args.insertions) + 1)] @@ -51,31 +58,36 @@ def main(): # Augment ids to avoid duplicates in merged frame. if name != '': - frame['insertion_id'] = ['{}.{}'.format(name, id_) - for id_ in frame['insertion_id']] - + frame['id'] = ['{}.{}'.format(name, id_) + for id_ in frame['id']] ins_frames.append(frame) # Merge frames. merged = pd.concat(ins_frames, ignore_index=True) + logger.info('Merging insertions for {} datasets, containing {} samples' + .format(len(args.insertions), merged['sample'].nunique())) + # Filter samples if needed. 
if args.samples is not None: + logger.info('Subsetting dataset to {} samples' + .format(len(args.samples))) + merged_samples = set(merged['sample']) for sample in args.samples: if sample not in merged_samples: - print('WARNING: unknown sample {}'.format(sample)) + logging.warning('- Missing insertions for sample {}' + .format(sample)) mask = merged['sample'].isin(set(args.samples)) - - if not args.complement: - merged = merged.ix[mask] - else: - merged = merged.ix[~mask] + merged = merged.ix[mask] # Write output. + logging.info('Writing merged output') merged.to_csv(str(args.output), sep=native_str('\t'), index=False) + print_footer(logger) + if __name__ == '__main__': main() From e4effc23b733c251e9a6d3fca6da6d11cd0f34ec Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Dec 2015 23:08:57 +0100 Subject: [PATCH 042/100] Reinstate basic annotation functionality. --- pyim/annotation/__init__.py | 2 +- pyim/annotation/rbm.py | 21 +-- pyim/annotation/window.py | 13 +- pyim/main/annotate.py | 3 +- pyim/util/pandas.py | 13 ++ pyim/util/rpy2.py | 32 ++++ pyim/util/tabix.py | 295 ++++++++++++++++++++++++++++++++++++ 7 files changed, 361 insertions(+), 18 deletions(-) create mode 100644 pyim/util/pandas.py create mode 100644 pyim/util/rpy2.py create mode 100644 pyim/util/tabix.py diff --git a/pyim/annotation/__init__.py b/pyim/annotation/__init__.py index 5dc17da..189b88c 100644 --- a/pyim/annotation/__init__.py +++ b/pyim/annotation/__init__.py @@ -1,3 +1,3 @@ -from .kcrbm import KcRbmAnnotator +# from .kcrbm import KcRbmAnnotator from .rbm import RbmAnnotator from .window import WindowAnnotator diff --git a/pyim/annotation/rbm.py b/pyim/annotation/rbm.py index b3d4a38..be2b48b 100644 --- a/pyim/annotation/rbm.py +++ b/pyim/annotation/rbm.py @@ -11,8 +11,8 @@ from toolz import curry, pipe, merge_with, keymap from toolz.curried import filter, valfilter, valmap -from tkgeno.io import GtfFile -from tkgeno.util.pandas import reorder_columns +from pyim.util.tabix import GtfFile +from pyim.util.pandas import reorder_columns from .base import Annotator, get_closest from .window import Window, apply_window, fetch_features, annotate_features @@ -87,8 +87,8 @@ def _annotate_row(row, windows, gtf, feature_type='gene'): strand = row.strand if hasattr(row, 'strand') else None # Fetch features for orientation, or for the forward orientation. - apply_func = curry(apply_window, row.seqname, - row.location, strand or 1) + apply_func = curry(apply_window, row.chrom, + row.position, strand or 1) windows_fwd = valmap(apply_func, windows) features = fetch_features_windows( @@ -96,7 +96,7 @@ def _annotate_row(row, windows, gtf, feature_type='gene'): if strand is None: # Try again with reverse window orientation. 
- apply_func = curry(apply_window, row.seqname, row.location, -1) + apply_func = curry(apply_window, row.chrom, row.position, -1) windows_rev = valmap(apply_func, windows) features_rev = fetch_features_windows( @@ -112,14 +112,15 @@ def _annotate_row(row, windows, gtf, feature_type='gene'): if len(frames) == 2 else frames[0], features, features_rev) - if len(features) > 0: - annotated = {mech: annotate_features(row, features, mechanism=mech) - for mech, features in features.items()} + annotated = {mech: annotate_features(row, features, mechanism=mech) + for mech, features in features.items() + if len(features) > 0} + if len(annotated) > 0: frame = pd.concat(annotated.values(), ignore_index=True) return reorder_columns(frame, order=row.index) - - return None + else: + return None def fetch_features_windows(gtf, windows, feature_type): diff --git a/pyim/annotation/window.py b/pyim/annotation/window.py index 47cc1e1..1eb4ac7 100644 --- a/pyim/annotation/window.py +++ b/pyim/annotation/window.py @@ -10,13 +10,13 @@ import pandas as pd -from tkgeno.io import GtfFile -from tkgeno.util.pandas import reorder_columns +from pyim.util.tabix import GtfFile +from pyim.util.pandas import reorder_columns from .base import Annotator, get_closest -Window = namedtuple('Window', ['seqname', 'start', 'end', 'strand', +Window = namedtuple('Window', ['reference', 'start', 'end', 'strand', 'incl_left', 'incl_right']) @@ -96,14 +96,17 @@ def _annotate_row(row, window, gtf, feature_type='gene'): @lru_cache(maxsize=64) def fetch_features(gtf, window, feature_type): - return gtf.get_region(feature=feature_type, **window._asdict()) + dict_ = window._asdict() + strand = dict_.pop('strand') + return gtf.get_region(filters={'feature': feature_type, + 'strand': strand}, **dict_) def annotate_features(row, features, **kwargs): data = dict(row) data.update(dict( gene_id=features.gene_id, - distance=[feature_distance(s, e, row.location) + distance=[feature_distance(s, e, row.position) for s, e in zip(features.start, features.end)])) data.update(**kwargs) diff --git a/pyim/main/annotate.py b/pyim/main/annotate.py index 7f8549a..7e847f6 100644 --- a/pyim/main/annotate.py +++ b/pyim/main/annotate.py @@ -9,10 +9,9 @@ import pandas as pd -from pyim.annotation import KcRbmAnnotator, RbmAnnotator, WindowAnnotator +from pyim.annotation import RbmAnnotator, WindowAnnotator ANNOTATORS = { - 'kcrbm': KcRbmAnnotator, 'rbm': RbmAnnotator, 'window': WindowAnnotator } diff --git a/pyim/util/pandas.py b/pyim/util/pandas.py new file mode 100644 index 0000000..bdee1c6 --- /dev/null +++ b/pyim/util/pandas.py @@ -0,0 +1,13 @@ +from __future__ import (absolute_import, division, + print_function, unicode_literals) +from builtins import (ascii, bytes, chr, dict, filter, hex, input, + int, map, next, oct, open, pow, range, round, + str, super, zip) + + +def reorder_columns(frame, order, drop_extra=False): + if drop_extra: + return frame[order] + else: + extra_cols = [c for c in frame.columns if c not in set(order)] + return frame[list(order) + extra_cols] diff --git a/pyim/util/rpy2.py b/pyim/util/rpy2.py new file mode 100644 index 0000000..1906b36 --- /dev/null +++ b/pyim/util/rpy2.py @@ -0,0 +1,32 @@ +from rpy2 import robjects +from rpy2.robjects import pandas2ri +from rpy2.rinterface import RNULLType + + +pandas2ri.activate() + + +def pandas_to_dataframe(pd_frame, check_names=False): + r_frame = pandas2ri.py2ri_pandasdataframe(pd_frame) + r_frame.colnames = pd_frame.columns + + if not check_names: + r_frame.rownames = pd_frame.index + + return 
r_frame + + +def dataframe_to_pandas(r_frame): + pd_frame = pandas2ri.ri2py_dataframe(r_frame) + + # Extract column names if possible. + col_names = robjects.r.colnames(r_frame) + if not type(col_names) == RNULLType: + pd_frame.columns = col_names + + # Extract row names if possible. + index = robjects.r.rownames(r_frame) + if not type(index) == RNULLType: + pd_frame.index = index + + return pd_frame diff --git a/pyim/util/tabix.py b/pyim/util/tabix.py new file mode 100644 index 0000000..cf12230 --- /dev/null +++ b/pyim/util/tabix.py @@ -0,0 +1,295 @@ +from __future__ import (absolute_import, division, + print_function, unicode_literals) + +# noinspection PyUnresolvedReferences +from builtins import (ascii, bytes, chr, dict, filter, hex, input, + int, map, next, oct, open, pow, range, round, + str, super, zip) +from future.utils import native_str + +import contextlib +import itertools +import os +import subprocess + +import pysam +import numpy as np +import pandas as pd + + +def _parse_float(value): + try: + return float(value) + except ValueError: + return np.nan + + +def _reorder_columns(frame, order): + columns = list(order) + extra_columns = sorted([c for c in frame.columns + if c not in set(columns)]) + return frame[columns + extra_columns] + + +def _get_region(frame, reference, start=None, end=None, + filters=None, incl_left=True, incl_right=True, + ref_col='contig', start_col='start', end_col='end'): + # Filter on passed range. + mask = frame[ref_col] == reference + + if start is not None: + mask &= frame[start_col] <= end + + if end is not None: + mask &= frame[end_col] >= start + + # Filter for inclusiveness. + if not incl_left: + mask &= frame['start'] > start + + if not incl_right: + mask &= frame['end'] < end + + # Apply any additional filters. + if filters is not None: + for name, value in filters.items(): + mask &= frame[name] == value + + return frame.ix[mask] + + +def bgzip(file_path, out_path=None): + if out_path is None: + out_path = file_path + '.gz' + + with open(out_path, 'w') as out_file: + subprocess.check_call(['bgzip', '-c', str(file_path)], stdout=out_file) + + return out_path + + +def tabix(file_path, preset): + subprocess.check_call(['tabix', '-p', preset, file_path]) + + +class TabixIterator(object): + + def __init__(self, file_path, parser=None): + self._file_path = file_path + self._parser = parser + + def fetch(self, reference=None, start=None, end=None, + filters=None, incl_left=True, incl_right=True): + file_obj = pysam.TabixFile(native_str(self._file_path), + parser=self._parser) + + with contextlib.closing(file_obj) as tb_file: + if reference is not None: + reference = native_str(reference) + + records = self._fetch(tb_file, reference=reference, + start=start, end=end) + + # Filter records on additional filters. + if filters is not None: + for name, value in filters.items(): + records = (r for r in records + if hasattr(r, name) + and getattr(r, name) == value) + + # Filter inclusive/exclusive if needed. + if not incl_left: + records = filter(lambda r: r.start > start, records) + + if not incl_right: + records = filter(lambda r: r.end < end, records) + + # Yield records. + for record in records: + yield record + + + def _fetch(self, tb_file, reference=None, **kwargs): + # For some reason pysam does not fetch all records if reference + # is None under Python 2.7. To fix this, here we simply chain all + # the contig records into one iterable. 
+ if reference is None: + contigs = tb_file.contigs + records = itertools.chain.from_iterable( + (tb_file.fetch(reference=ref, **kwargs) + for ref in contigs)) + else: + records = tb_file.fetch(reference=reference, **kwargs) + + for record in records: + yield record + + +class TabixFile(object): + + def __init__(self, file_path, parser): + self._file_path = file_path + self._iterator = TabixIterator(file_path, parser=parser) + + def fetch(self, reference=None, start=None, end=None, + filters=None, incl_left=True, incl_right=True): + records = self._iterator.fetch( + reference=reference, start=start, end=end, + filters=filters, incl_left=incl_left, incl_right=incl_right) + + for record in (self._to_series(r) for r in records): + yield record + + def get_region(self, reference=None, start=None, end=None, + filters=None, incl_left=True, incl_right=True): + records = self.fetch(reference, start, end, filters=filters, + incl_left=incl_left, incl_right=incl_right) + return self._frame_constructor().from_records(records) + + @classmethod + def _to_series(cls, record): + raise NotImplementedError() + + @classmethod + def _frame_constructor(cls): + raise NotImplementedError() + + +class TabixFrame(pd.DataFrame): + + @property + def _constructor(self): + raise NotImplementedError() + + def fetch(self, reference=None, start=None, end=None, + filters=None, incl_left=True, incl_right=True): + raise NotImplementedError() + + def get_region(self, reference=None, start=None, end=None, + filters=None, incl_left=True, incl_right=True, **kwargs): + return _get_region(self, reference, start, end, filters=filters, + incl_left=incl_left, incl_right=incl_right, **kwargs) + + +class GtfFile(TabixFile): + + TYPE_MAP = {3: int, 4: int, 5: _parse_float} + + FIELDS = ('contig', 'source', 'feature', 'start', + 'end', 'score', 'strand', 'frame', 'attribute') + + def __init__(self, file_path): + file_path = str(file_path) + if not file_path.endswith('.gz'): + if os.path.exists(file_path + '.gz'): + file_path += '.gz' + else: + file_path = self.compress(file_path) + + super().__init__(file_path, parser=pysam.asGTF()) + + @classmethod + def _to_series(cls, record): + rec_values = tuple((cls.TYPE_MAP.get(i, lambda x: x)(val) + for i, val in enumerate(record))) + attr_keys, attr_values = zip(*dict(record).items()) + return pd.Series(rec_values[:-1] + attr_values, + index=cls.FIELDS[:-1] + attr_keys) + + @classmethod + def _frame_constructor(cls): + return GtfFrame + + def get_gene(self, gene_id, feature_type='gene', + field_name='gene_id', **kwargs): + # Add feature filter to filters (if given). + filters = kwargs.pop('filter', {}) + filters['feature'] = feature_type + + # Search for gene record. + records = self._iterator.fetch(filters=filters, **kwargs) + for record in records: + if record[native_str(field_name)] == gene_id: + return self._to_series(record) + + raise ValueError('Gene {} does not exist'.format(gene_id)) + + @classmethod + def compress(cls, file_path, out_path=None, sort=True, create_index=True): + """Compresses and indexes a gtf file using bgzip and tabix.""" + + # Base output path on original file name. + out_path = out_path or file_path + '.gz' + + if sort: + # Sort file before compressing and indexing. + file_path = cls.sort(file_path, out_path=out_path + '.tmp') + + # Gzip and index file. + gzipped_path = bgzip(file_path, out_path=out_path) + + if create_index: + tabix(gzipped_path, preset='gff') + + # Clean up temp file. 
+ if file_path.endswith('.tmp'): + os.unlink(file_path) + + return gzipped_path + + @classmethod + def sort(cls, file_path, out_path): + """Sorts a gtf file by position, as required for tabix.""" + with open(out_path, 'w') as out_file: + cmd = '(grep ^"#" {0}; grep -v ^"#" {0} ''| sort -k1,1 -k4,4n)' + subprocess.check_call(cmd.format(file_path), + stdout=out_file, shell=True) + return out_path + + def __repr__(self): + return ''.format(self._file_path) + + +class GtfFrame(TabixFrame): + + @property + def _constructor(self): + return GtfFrame + + @classmethod + def read_csv(cls, path, *args, **kwargs): + frame = pd.read_csv(path, *args, sep='\t', comment='#', **kwargs) + return cls._format_frame(frame) + + @classmethod + def from_records(cls, data, *args, **kwargs): + # Build frame. + frame = super().from_records(data, *args, **kwargs) + + # Handle empty case. + if len(frame) == 0: + frame = pd.DataFrame([], columns=GtfFile.FIELDS[:-1]) + + return cls._format_frame(frame) + + @classmethod + def _format_frame(cls, frame): + # Convert some columns to categorical. + frame['contig'] = frame['contig'].astype('category') + frame['feature'] = frame['feature'].astype('category') + frame['source'] = frame['source'].astype('category') + frame['frame'] = frame['frame'].astype('category') + + # Order columns to start with normal attributes. + frame = _reorder_columns(frame, GtfFile.FIELDS[:-1]) + + return frame + + def get_gene(self, gene_id): + result = self.ix[((self['feature'] == 'gene') & + (self['gene_id'] == gene_id))] + + if len(result) == 0: + raise ValueError('Gene {} does not exist'.format(gene_id)) + + return result From 81e1278ed69d00d7c2480cf5dfdd242ff14aec8b Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Thu, 24 Dec 2015 16:10:54 +0100 Subject: [PATCH 043/100] Initial rewrite of window annotator. 
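
This moves the window annotator from the class-based WindowAnnotator to a
plain module that registers its own argparse sub-command and looks up
overlapping genes in per-chromosome interval trees built from the GTF.
A minimal sketch of the lookup idea (toy coordinates, not the pipeline
code itself):

    from intervaltree import IntervalTree

    # Toy gene records; the real trees store the GTF record dict as payload.
    genes = [
        {'gene_name': 'GeneA', 'start': 100, 'end': 500, 'strand': '+'},
        {'gene_name': 'GeneB', 'start': 900, 'end': 1500, 'strand': '-'},
    ]

    # One tree per chromosome, keyed on (start, end).
    tree = IntervalTree.from_tuples(
        (g['start'], g['end'], g) for g in genes)

    # Querying a window returns every overlapping interval.
    hits = sorted(iv.data['gene_name'] for iv in tree[50:1000])
    print(hits)  # ['GeneA', 'GeneB']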
--- pyim/annotation/__init__.py | 3 - pyim/annotation/_model.py | 43 ++++++++++ pyim/annotation/_util.py | 18 ++++ pyim/annotation/base.py | 32 ------- pyim/annotation/window.py | 164 ++++++++++++++++++------------------ pyim/main/annotate.py | 58 +++---------- 6 files changed, 158 insertions(+), 160 deletions(-) create mode 100644 pyim/annotation/_model.py create mode 100644 pyim/annotation/_util.py delete mode 100644 pyim/annotation/base.py diff --git a/pyim/annotation/__init__.py b/pyim/annotation/__init__.py index 189b88c..e69de29 100644 --- a/pyim/annotation/__init__.py +++ b/pyim/annotation/__init__.py @@ -1,3 +0,0 @@ -# from .kcrbm import KcRbmAnnotator -from .rbm import RbmAnnotator -from .window import WindowAnnotator diff --git a/pyim/annotation/_model.py b/pyim/annotation/_model.py new file mode 100644 index 0000000..7e11283 --- /dev/null +++ b/pyim/annotation/_model.py @@ -0,0 +1,43 @@ + +class Window(object): + + def __init__(self, start, end, reference=None, strand=None, + incl_left=True, incl_right=True, name=None): + self.reference = reference + self.start = start + self.end = end + self.strand = strand + + self.incl_left = incl_left + self.incl_right = incl_right + self.name = name + + if not incl_left or not incl_right: + raise NotImplementedError() + + def apply(self, reference, location, strand=None): + """Applies window to specific location and strand""" + if strand is not None and self.strand is not None: + strand = self.strand * strand + else: + strand = self.strand + + return Window(self.start + location, self.end + location, + reference, strand, self.incl_left, + self.incl_right, name=self.name) + + +# def apply_window(seqname, location, strand, window): +# # TODO: Check strand logic! +# start = location + (window.start * strand) +# end = location + (window.end * strand) +# +# if strand == -1: +# start, end = end, start +# incl_left, incl_right = window.incl_right, window.incl_left +# else: +# incl_left, incl_right = window.incl_left, window.incl_right +# +# new_strand = strand * window.strand if window.strand is not None else None +# +# return Window(seqname, start, end, new_strand, incl_left, incl_right) diff --git a/pyim/annotation/_util.py b/pyim/annotation/_util.py new file mode 100644 index 0000000..1c68706 --- /dev/null +++ b/pyim/annotation/_util.py @@ -0,0 +1,18 @@ + +def get_closest(frame, id_col='insertion_id', distance_col='distance'): + def _is_closest(x): + abs_dist = x[distance_col].abs() + return x.ix[abs_dist == abs_dist.min()] + + return (frame.groupby(id_col) + .apply(_is_closest) + .reset_index(drop=True)) + + +def feature_distance(start, end, location): + if start <= location <= end: + return 0 + elif location > end: + return location - end + else: + return location - start diff --git a/pyim/annotation/base.py b/pyim/annotation/base.py deleted file mode 100644 index 3d44273..0000000 --- a/pyim/annotation/base.py +++ /dev/null @@ -1,32 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) - - -class Annotator(object): - - def __init__(self): - super().__init__() - - @classmethod - def configure_argparser(cls, subparsers, name='name'): - raise NotImplementedError() - - @classmethod - def from_args(cls, args): - return cls(**args) - - def annotate(self, frame): - raise NotImplementedError() - - -def get_closest(frame, id_col='insertion_id', distance_col='distance'): - def 
_is_closest(x): - abs_dist = x[distance_col].abs() - return x.ix[abs_dist == abs_dist.min()] - - return (frame.groupby(id_col) - .apply(_is_closest) - .reset_index(drop=True)) diff --git a/pyim/annotation/window.py b/pyim/annotation/window.py index 1eb4ac7..5c69e30 100644 --- a/pyim/annotation/window.py +++ b/pyim/annotation/window.py @@ -4,119 +4,123 @@ int, map, next, oct, open, pow, range, round, str, super, zip) -from collections import namedtuple -from functools import lru_cache -from pathlib import Path +import itertools +import logging import pandas as pd +from intervaltree import IntervalTree from pyim.util.tabix import GtfFile -from pyim.util.pandas import reorder_columns -from .base import Annotator, get_closest +from ._model import Window -Window = namedtuple('Window', ['reference', 'start', 'end', 'strand', - 'incl_left', 'incl_right']) +def register(subparsers, name='window'): + parser = subparsers.add_parser(name, help=name + ' help') + # Required arguments. + parser.add_argument('input') + parser.add_argument('output') + parser.add_argument('--gtf', required=True) -def apply_window(seqname, location, strand, window): - start = location + (window.start * strand) - end = location + (window.end * strand) + # Optional arguments. + # parser.add_argument('--feature_type', default='gene') + parser.add_argument('--window_size', default=20000, type=int) - if strand == -1: - start, end = end, start - incl_left, incl_right = window.incl_right, window.incl_left - else: - incl_left, incl_right = window.incl_left, window.incl_right + # Set main for dispatch. + parser.set_defaults(main=main) - new_strand = strand * window.strand if window.strand is not None else None + return parser - return Window(seqname, start, end, new_strand, incl_left, incl_right) +def main(args): + logger = logging.getLogger() -class WindowAnnotator(Annotator): + insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) + logger.info('Read {} insertions'.format(len(insertions))) - def __init__(self, gtf_path, window_size, feature_type='gene', - id_column='insertion_id', closest=False): - super().__init__() + logger.info('Building interval trees from gtf') + gtf = GtfFile(args.gtf) + trees = build_interval_trees(gtf) - self._gtf = GtfFile(gtf_path) - self._window = Window( - seqname=None, start=-1 * window_size, end=window_size, - strand=None, incl_left=True, incl_right=True) + logger.info('Annotating insertions') + half_size = args.window_size // 2 + window = Window(start=-half_size, end=half_size) - self._feature_type = feature_type - self._closest = closest - self._id_column = id_column + annotation = annotate_for_window(insertions, trees, window) - @classmethod - def configure_argparser(cls, subparsers, name='window'): - parser = subparsers.add_parser(name, help=name + ' help') + logger.info('Merging') + merged = pd.merge(insertions, annotation, on='id', how='left') + merged.to_csv(args.output, sep='\t', index=False) - parser.add_argument('input', type=Path) - parser.add_argument('output', type=Path) - parser.add_argument('gtf') - parser.add_argument('--feature_type', default='gene') - parser.add_argument('--window_size', default=20000, type=int) +def annotate_for_window(insertions, trees, window): + """Annotates insertions for features in trees using given window.""" + return annotate_for_windows(insertions, trees, [window]) - parser.add_argument('--id_column', default='insertion_id') - parser.add_argument('--closest', default=False, action='store_true') - return parser +def 
annotate_for_windows(insertions, trees, windows): + """Annotates insertions for features in trees using given windows.""" - def annotate(self, frame, type_='gene'): - results = [self._annotate_row(row, self._window, - self._gtf, self._feature_type) - for _, row in frame.iterrows()] + return pd.concat((_annotate_for_windows(row, trees, windows) + for _, row in insertions.iterrows()), ignore_index=True) - results = pd.concat(filter(lambda x: x is not None, results), - ignore_index=True) - return results if self._closest is not None \ - else get_closest(frame, id_col=self._id_column) +def _annotate_for_windows(insertion, trees, windows): + """Annotates insertion for features in trees using given windows.""" - @staticmethod - def _annotate_row(row, window, gtf, feature_type='gene'): - # Apply window for row. - window = apply_window(row.seqname, row.location, - row.seqname, window) + return pd.concat((_annotate_for_window(insertion, trees, w) + for w in windows), ignore_index=True) - # Fetch features for row. - features = fetch_features(gtf, window, feature_type=feature_type) - # Annotate row with features, if any were found. - if len(features) > 0: - frame = annotate_features(row, features) - return reorder_columns(frame, order=row.index) +def _annotate_for_window(insertion, trees, window): + """Annotates insertion for features in trees using given window.""" - return None + # Apply window for insertion. + applied = window.apply( + insertion['chrom'], insertion['position'], insertion['strand']) + # Fetch features within window. + features = fetch_in_window(trees, applied) -@lru_cache(maxsize=64) -def fetch_features(gtf, window, feature_type): - dict_ = window._asdict() - strand = dict_.pop('strand') - return gtf.get_region(filters={'feature': feature_type, - 'strand': strand}, **dict_) + # Convert to frame. + frame = pd.DataFrame({ + 'id': insertion['id'], + 'gene_name': [f['gene_name'] for f in features]}) + # Include window name if known. + if window.name is not None: + frame['window'] = window.name -def annotate_features(row, features, **kwargs): - data = dict(row) - data.update(dict( - gene_id=features.gene_id, - distance=[feature_distance(s, e, row.position) - for s, e in zip(features.start, features.end)])) - data.update(**kwargs) + return frame - return reorder_columns(pd.DataFrame(data), order=row.index) +def fetch_in_window(trees, window): + """Fetches features within given window in the interval trees.""" -def feature_distance(start, end, location): - if start <= location <= end: - return 0 - elif location > end: - return location - end - else: - return location - start + if window.strand is not None: + raise NotImplementedError() + + try: + tree = trees[window.reference] + overlap = tree[window.start:window.end] + except KeyError: + overlap = [] + + return [interval[2] for interval in overlap] + + +def build_interval_trees(gtf): + """Builds an interval tree of genes for each chromosome in gtf.""" + + # Only select gene features for now. + genes = gtf.fetch(filters={'feature': 'gene'}) + + trees = {} + for contig, grp in itertools.groupby(genes, lambda r: r.contig): + # Build a tree for each individual chromosome. + intervals = ((g.start, g.end, dict(g)) for g in grp + if g.end > g.start) # Avoid null intervals. 
+ trees[contig] = IntervalTree.from_tuples(intervals) + + return trees diff --git a/pyim/main/annotate.py b/pyim/main/annotate.py index 7e847f6..2075766 100644 --- a/pyim/main/annotate.py +++ b/pyim/main/annotate.py @@ -1,63 +1,31 @@ from __future__ import (absolute_import, division, print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) -from future.utils import native_str import argparse +import logging -import pandas as pd +from pyim.annotation import window +from ._logging import print_header, print_footer -from pyim.annotation import RbmAnnotator, WindowAnnotator - -ANNOTATORS = { - 'rbm': RbmAnnotator, - 'window': WindowAnnotator -} +def main(): + logger = logging.getLogger() -def setup_parser(): + # Setup main parser. parser = argparse.ArgumentParser(prog='pyim-annotate') - subparsers = parser.add_subparsers(dest='annotator') subparsers.required = True - for name, class_ in ANNOTATORS.items(): - class_.configure_argparser(subparsers, name=name) + # Register pipelines. + window.register(subparsers) - return parser - - -def main(): - parser = setup_parser() + # Parse args. args = parser.parse_args() - # Check if a sub-parser was chosen. - if args.annotator is None: - raise ValueError('No annotator was specified as sub-command (choose ' - 'from {})' .format(', '.join(ANNOTATORS.keys()))) - - # Parse options and extract main input/output parameters. - arg_dict = vars(args) - - annotator_name = arg_dict.pop('annotator') - input_path = arg_dict.pop('input') - output_path = arg_dict.pop('output') - - # Instantiate chosen annotator and use to annotate input! - try: - annotator_class = ANNOTATORS[annotator_name] - except KeyError: - raise ValueError('Pipeline \'{}\' does not exist' - .format(annotator_name)) - else: - annotator = annotator_class.from_args(arg_dict) - - in_frame = pd.read_csv(str(input_path), sep=native_str('\t')) - out_frame = annotator.annotate(in_frame) - - out_frame.to_csv(str(output_path), sep=native_str('\t'), index=False) + # Dispatch to pipeline. + print_header(logger, command='annotate') + args.main(args) + print_footer(logger) if __name__ == '__main__': From 5379018277c458789dc9de55b0691d350979684c Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Thu, 24 Dec 2015 20:38:45 +0100 Subject: [PATCH 044/100] Faster annotation lookup. 
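
Rather than nesting a per-insertion loop inside a per-window loop, the
queries are now generated up front as the cartesian product of insertions
and windows, and the per-query frames are concatenated once. Roughly (with
a toy stand-in for the real interval-tree lookup):

    import itertools
    import pandas as pd

    insertions = pd.DataFrame({'id': ['INS_1', 'INS_2']})
    windows = ['us', 'ds']

    def annotate_one(ins, window):
        # Stand-in for the per-query lookup.
        return pd.DataFrame({'id': [ins['id']], 'window': [window]})

    queries = itertools.product(
        (row for _, row in insertions.iterrows()), windows)
    annotation = pd.concat(
        (annotate_one(ins, w) for ins, w in queries), ignore_index=True)
    print(annotation)  # one row per insertion/window combination (4 rows)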
--- pyim/annotation/window.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/pyim/annotation/window.py b/pyim/annotation/window.py index 5c69e30..ee240e7 100644 --- a/pyim/annotation/window.py +++ b/pyim/annotation/window.py @@ -39,7 +39,7 @@ def main(args): insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) logger.info('Read {} insertions'.format(len(insertions))) - logger.info('Building interval trees from gtf') + logger.info('Building interval trees') gtf = GtfFile(args.gtf) trees = build_interval_trees(gtf) @@ -47,41 +47,36 @@ def main(args): half_size = args.window_size // 2 window = Window(start=-half_size, end=half_size) - annotation = annotate_for_window(insertions, trees, window) + annotation = annotate_for_windows(insertions, trees, [window]) - logger.info('Merging') + logger.info('Merging annotation') merged = pd.merge(insertions, annotation, on='id', how='left') merged.to_csv(args.output, sep='\t', index=False) -def annotate_for_window(insertions, trees, window): - """Annotates insertions for features in trees using given window.""" - return annotate_for_windows(insertions, trees, [window]) - - def annotate_for_windows(insertions, trees, windows): """Annotates insertions for features in trees using given windows.""" - return pd.concat((_annotate_for_windows(row, trees, windows) - for _, row in insertions.iterrows()), ignore_index=True) + if isinstance(insertions, pd.DataFrame): + insertions = (row for _, row in insertions.iterrows()) + queries = itertools.product(insertions, windows) -def _annotate_for_windows(insertion, trees, windows): - """Annotates insertion for features in trees using given windows.""" + annotation = pd.concat((_annotate_for_window(ins, trees, window) + for ins, window in queries), ignore_index=True) - return pd.concat((_annotate_for_window(insertion, trees, w) - for w in windows), ignore_index=True) + return annotation def _annotate_for_window(insertion, trees, window): """Annotates insertion for features in trees using given window.""" # Apply window for insertion. - applied = window.apply( + applied_window = window.apply( insertion['chrom'], insertion['position'], insertion['strand']) # Fetch features within window. - features = fetch_in_window(trees, applied) + features = fetch_in_window(trees, applied_window) # Convert to frame. frame = pd.DataFrame({ @@ -107,7 +102,17 @@ def fetch_in_window(trees, window): except KeyError: overlap = [] - return [interval[2] for interval in overlap] + features = [interval[2] for interval in overlap] + + if window.strand is not None: + features = [f for f in features + if _strand_numeric(f['strand']) == window.strand] + + return features + + +def _strand_numeric(strand): + return 1 if strand == '+' else -1 def build_interval_trees(gtf): From c841dd8bc79d1ee9b5f7e7b4de1157d669e64861 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Thu, 24 Dec 2015 20:39:15 +0100 Subject: [PATCH 045/100] Proper window implementation, accounting for strand. 
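
Window.apply now accounts for strand: for reverse-strand insertions the
window is mirrored around the insertion site and the inclusive-left/right
flags swap sides. A worked example with a 20 kb upstream window
(illustrative numbers only):

    # Simplified version of the strand handling implemented below.
    def apply_window(location, start, end, strand):
        if strand == 1:
            return location + start, location + end
        elif strand == -1:
            return location - end, location - start
        raise ValueError('Unknown strand')

    # Upstream window (-20000, 0) applied at position 1000000:
    print(apply_window(1000000, -20000, 0, strand=1))   # (980000, 1000000)
    print(apply_window(1000000, -20000, 0, strand=-1))  # (1000000, 1020000)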
--- pyim/annotation/_model.py | 51 +++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/pyim/annotation/_model.py b/pyim/annotation/_model.py index 7e11283..41da04b 100644 --- a/pyim/annotation/_model.py +++ b/pyim/annotation/_model.py @@ -10,34 +10,37 @@ def __init__(self, start, end, reference=None, strand=None, self.incl_left = incl_left self.incl_right = incl_right + self.name = name if not incl_left or not incl_right: raise NotImplementedError() - def apply(self, reference, location, strand=None): + def apply(self, reference, location, strand): """Applies window to specific location and strand""" - if strand is not None and self.strand is not None: - strand = self.strand * strand + + # Determine start/end position. + if strand == 1: + start = location + self.start + end = location + self.end + + incl_left = self.incl_left + incl_right = self.incl_right + elif strand == -1: + start = location - self.end + end = location - self.start + + incl_right = self.incl_left + incl_left = self.incl_right else: - strand = self.strand - - return Window(self.start + location, self.end + location, - reference, strand, self.incl_left, - self.incl_right, name=self.name) - - -# def apply_window(seqname, location, strand, window): -# # TODO: Check strand logic! -# start = location + (window.start * strand) -# end = location + (window.end * strand) -# -# if strand == -1: -# start, end = end, start -# incl_left, incl_right = window.incl_right, window.incl_left -# else: -# incl_left, incl_right = window.incl_left, window.incl_right -# -# new_strand = strand * window.strand if window.strand is not None else None -# -# return Window(seqname, start, end, new_strand, incl_left, incl_right) + raise ValueError('Unknown value for strand ({})' + .format(strand)) + + # Determine new strand. + if self.strand is not None: + new_strand = self.strand * strand + else: + new_strand = None + + return Window(start, end, reference, new_strand, + incl_left, incl_right, name=self.name) From 2afc5f552f026f35a06967ab17369329712fcae7 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Thu, 24 Dec 2015 20:40:47 +0100 Subject: [PATCH 046/100] Initial RBM implementation. 
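
The preset tuples list the upstream-sense, upstream-antisense,
downstream-sense and downstream-antisense distances (in bp) that
build_windows expands into the six RBM windows (is/ia/us/ua/ds/da).
For example, assuming the module layout introduced here:

    from pyim.annotation.rbm import WINDOW_PRESETS, build_windows

    for window in build_windows(WINDOW_PRESETS['SB']):
        print(window.name, window.start, window.end, window.strand)
    # is 0 1 1, ia 0 1 -1, us -20000 0 1,
    # ua -10000 0 -1, ds 1 25000 1, da 1 5000 -1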
--- pyim/annotation/rbm.py | 151 +++++++++++++++-------------------------- pyim/main/annotate.py | 3 +- 2 files changed, 56 insertions(+), 98 deletions(-) diff --git a/pyim/annotation/rbm.py b/pyim/annotation/rbm.py index be2b48b..aed9a12 100644 --- a/pyim/annotation/rbm.py +++ b/pyim/annotation/rbm.py @@ -4,130 +4,87 @@ int, map, next, oct, open, pow, range, round, str, super, zip) # filter -from pathlib import Path +import logging import pandas as pd - -from toolz import curry, pipe, merge_with, keymap -from toolz.curried import filter, valfilter, valmap - from pyim.util.tabix import GtfFile -from pyim.util.pandas import reorder_columns -from .base import Annotator, get_closest -from .window import Window, apply_window, fetch_features, annotate_features +from ._model import Window +from .window import build_interval_trees, annotate_for_windows # Window format: (us, ua, ds, da) -WINDOW_SIZE_PRESETS = { +WINDOW_PRESETS = { 'SB': (20000, 10000, 25000, 5000), 'MULV': (20000, 120000, 40000, 5000), 'MMTV': (20000, 120000, 40000, 5000) } -class RbmAnnotator(Annotator): - - def __init__(self, gtf, window_sizes=None, preset=None, - feature_type='gene', closest=False, id_column='insertion_id'): - super().__init__() - - if window_sizes is None: - if preset is None: - raise ValueError('Either windows or preset must be given') - window_sizes = WINDOW_SIZE_PRESETS[preset] - - self._gtf = GtfFile(gtf) - self._feature_type = feature_type - - self._closest = closest - self._id_column = id_column - - self._windows = { - 'is': Window(None, 0, 1, 1, True, True), - 'ia': Window(None, 0, 1, -1, True, True), - 'us': Window(None, -window_sizes[0], 0, 1, True, False), - 'ua': Window(None, -window_sizes[1], 0, -1, True, False), - 'ds': Window(None, 1, window_sizes[2], 1, False, True), - 'da': Window(None, 1, window_sizes[3], -1, False, True) - } - - @classmethod - def configure_argparser(cls, subparsers, name='rbm'): - parser = subparsers.add_parser(name, help=name + ' help') - - parser.add_argument('input', type=Path) - parser.add_argument('output', type=Path) - parser.add_argument('gtf') - - parser.add_argument('--feature_type', default='gene', - choices={'gene', 'transcript'}) - parser.add_argument('--id_column', default='insertion_id') - parser.add_argument('--closest', default=False, action='store_true') - - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('--preset') - group.add_argument('--window_sizes', nargs=4, type=int) +def register(subparsers, name='rbm'): + parser = subparsers.add_parser(name, help=name + ' help') - return parser + # Required arguments. + parser.add_argument('input') + parser.add_argument('output') + parser.add_argument('gtf') - def annotate(self, frame, type_='gene'): - results = [self._annotate_row(row, self._windows, - self._gtf, self._feature_type) - for _, row in frame.iterrows()] + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('--preset', choices=WINDOW_PRESETS.keys()) + group.add_argument('--window_sizes', nargs=4, type=int) - results = pd.concat(filter(lambda x: x is not None, results), - ignore_index=True) + # Optional arguments. + # parser.add_argument('--feature_type', default='gene', + # choices={'gene', 'transcript'}) + # parser.add_argument('--id_column', default='insertion_id') + # parser.add_argument('--closest', default=False, action='store_true') - return results if not self._closest \ - else get_closest(results, id_col=self._id_column) + # Set main for dispatch. 
+ parser.set_defaults(main=main) - @staticmethod - def _annotate_row(row, windows, gtf, feature_type='gene'): - strand = row.strand if hasattr(row, 'strand') else None + return parser - # Fetch features for orientation, or for the forward orientation. - apply_func = curry(apply_window, row.chrom, - row.position, strand or 1) - windows_fwd = valmap(apply_func, windows) - features = fetch_features_windows( - gtf, windows_fwd, feature_type=feature_type) +def main(args): + logger = logging.getLogger() - if strand is None: - # Try again with reverse window orientation. - apply_func = curry(apply_window, row.chrom, row.position, -1) - windows_rev = valmap(apply_func, windows) + # Read insertions. + insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) + logger.info('Read {} insertions'.format(len(insertions))) - features_rev = fetch_features_windows( - gtf, windows_rev, feature_type=feature_type) + # Build annotation trees. + logger.info('Building interval trees') + gtf = GtfFile(args.gtf) + trees = build_interval_trees(gtf) - # Reflect sense/antisense to match fwd windows. - features_rev = keymap( - curry(str_translate, table=str.maketrans('sa', 'as')), - features_rev) + # Define windows. + if args.preset is not None: + window_sizes = WINDOW_PRESETS[args.preset] + else: + window_sizes = args.window_sizes - features = merge_with( - lambda frames: pd.merge(frames[0], frames[1], how='inner') - if len(frames) == 2 else frames[0], - features, features_rev) + windows = build_windows(window_sizes) - annotated = {mech: annotate_features(row, features, mechanism=mech) - for mech, features in features.items() - if len(features) > 0} + # Annotate insertions. + logger.info('Annotating insertions') + annotation = annotate_for_windows(insertions, trees, windows) - if len(annotated) > 0: - frame = pd.concat(annotated.values(), ignore_index=True) - return reorder_columns(frame, order=row.index) - else: - return None + # Merge annotation with insertion frame. + logger.info('Merging annotation') + merged = pd.merge(insertions, annotation, on='id', how='left') + merged.to_csv(args.output, sep='\t', index=False) -def fetch_features_windows(gtf, windows, feature_type): - return pipe(windows, - valmap(lambda w: fetch_features(gtf, w, feature_type)), - valfilter(lambda x: x is not None)) +def build_windows(ranges): + us, ua, ds, da = ranges + windows = [ + Window(0, 1, strand=1, incl_left=True, incl_right=True, name='is'), + Window(0, 1, strand=-1, incl_left=True, incl_right=True, name='ia'), + Window(-us, 0, strand=1, incl_left=True, incl_right=False, name='us'), + Window(-ua, 0, strand=-1, incl_left=True, incl_right=False, name='ua'), + Window(1, ds, strand=1, incl_left=False, incl_right=True, name='ds'), + Window(1, da, strand=-1, incl_left=False, incl_right=True, name='da') + ] -def str_translate(s, table): - return str.translate(s, table) + return windows diff --git a/pyim/main/annotate.py b/pyim/main/annotate.py index 2075766..009d287 100644 --- a/pyim/main/annotate.py +++ b/pyim/main/annotate.py @@ -4,7 +4,7 @@ import argparse import logging -from pyim.annotation import window +from pyim.annotation import window, rbm from ._logging import print_header, print_footer @@ -18,6 +18,7 @@ def main(): # Register pipelines. window.register(subparsers) + rbm.register(subparsers) # Parse args. 
args = parser.parse_args() From 403b952f33b7b6f870a8fe4352ace2f0a8d2c960 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Thu, 24 Dec 2015 20:42:09 +0100 Subject: [PATCH 047/100] Check for lacking incl_left/incl_right implementation. --- pyim/annotation/window.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyim/annotation/window.py b/pyim/annotation/window.py index ee240e7..5a98032 100644 --- a/pyim/annotation/window.py +++ b/pyim/annotation/window.py @@ -93,7 +93,7 @@ def _annotate_for_window(insertion, trees, window): def fetch_in_window(trees, window): """Fetches features within given window in the interval trees.""" - if window.strand is not None: + if not window.incl_left or not window.incl_right: raise NotImplementedError() try: From 96f830fbf0eee4895a504693d7cae365497d0586 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Fri, 25 Dec 2015 09:52:09 +0100 Subject: [PATCH 048/100] Add inclusive filtering. --- pyim/annotation/window.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/pyim/annotation/window.py b/pyim/annotation/window.py index 5a98032..d6738e0 100644 --- a/pyim/annotation/window.py +++ b/pyim/annotation/window.py @@ -93,25 +93,34 @@ def _annotate_for_window(insertion, trees, window): def fetch_in_window(trees, window): """Fetches features within given window in the interval trees.""" - if not window.incl_left or not window.incl_right: - raise NotImplementedError() - + # Find overlapping features. try: tree = trees[window.reference] overlap = tree[window.start:window.end] except KeyError: overlap = [] - features = [interval[2] for interval in overlap] + # Extract features. + features = (interval[2] for interval in overlap) + + # Filter inclusive/exclusive if needed. + if not window.incl_left: + features = (f for f in features if f.start > window.start) + if not window.incl_right: + features = (f for f in features if f.end < window.end) + + # Filter for strand if needed. if window.strand is not None: - features = [f for f in features - if _strand_numeric(f['strand']) == window.strand] + features = (f for f in features + if _strand_numeric(f['strand']) == window.strand) - return features + return list(features) def _strand_numeric(strand): + """Convert strand to numeric representation.""" + return 1 if strand == '+' else -1 From 6c205535c6805eaeb7adac0303f0320b91f9ccf2 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Sat, 26 Dec 2015 12:59:09 +0100 Subject: [PATCH 049/100] Iterate per query (ins/window) and closest selection. 
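
Every insertion/window pair now yields its own annotation rows, and the
new --closest flag afterwards reduces the result to the feature(s) with
the smallest absolute distance per insertion. A minimal sketch of that
reduction (toy frame; the helper below uses .ix, this sketch uses the
equivalent .loc):

    import pandas as pd

    annotation = pd.DataFrame({
        'id': ['INS_1', 'INS_1', 'INS_2'],
        'gene_name': ['GeneA', 'GeneB', 'GeneC'],
        'gene_distance': [-1200, 300, 0],
    })

    def select_closest(frame, id_col='id', col='distance'):
        def _is_closest(x):
            abs_dist = x[col].abs()
            return x.loc[abs_dist == abs_dist.min()]
        return (frame.groupby(id_col)
                     .apply(_is_closest)
                     .reset_index(drop=True))

    print(select_closest(annotation, col='gene_distance'))
    # Keeps GeneB for INS_1 (|300| < |-1200|) and GeneC for INS_2.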
--- pyim/annotation/_model.py | 3 -- pyim/annotation/_util.py | 27 ++++++++++++++---- pyim/annotation/rbm.py | 13 +++++++-- pyim/annotation/window.py | 60 ++++++++++++++++++++++++++++----------- pyim/main/annotate.py | 3 +- 5 files changed, 76 insertions(+), 30 deletions(-) diff --git a/pyim/annotation/_model.py b/pyim/annotation/_model.py index 41da04b..021c2d5 100644 --- a/pyim/annotation/_model.py +++ b/pyim/annotation/_model.py @@ -13,9 +13,6 @@ def __init__(self, start, end, reference=None, strand=None, self.name = name - if not incl_left or not incl_right: - raise NotImplementedError() - def apply(self, reference, location, strand): """Applies window to specific location and strand""" diff --git a/pyim/annotation/_util.py b/pyim/annotation/_util.py index 1c68706..14cefd6 100644 --- a/pyim/annotation/_util.py +++ b/pyim/annotation/_util.py @@ -1,7 +1,7 @@ -def get_closest(frame, id_col='insertion_id', distance_col='distance'): +def select_closest(frame, id_col='id', col='distance'): def _is_closest(x): - abs_dist = x[distance_col].abs() + abs_dist = x[col].abs() return x.ix[abs_dist == abs_dist.min()] return (frame.groupby(id_col) @@ -9,10 +9,25 @@ def _is_closest(x): .reset_index(drop=True)) -def feature_distance(start, end, location): +def feature_distance(feature, location, stranded=True): + start, end = feature['start'], feature['end'] + if start <= location <= end: - return 0 + dist = 0 elif location > end: - return location - end + dist = location - end else: - return location - start + dist = location - start + + if stranded: + dist *= numeric_strand(feature['strand']) + + return dist + + +def numeric_strand(strand): + """Convert strand to numeric representation.""" + + return 1 if strand == '+' else -1 + + diff --git a/pyim/annotation/rbm.py b/pyim/annotation/rbm.py index aed9a12..45b6d43 100644 --- a/pyim/annotation/rbm.py +++ b/pyim/annotation/rbm.py @@ -7,9 +7,11 @@ import logging import pandas as pd + from pyim.util.tabix import GtfFile from ._model import Window +from ._util import select_closest from .window import build_interval_trees, annotate_for_windows @@ -27,7 +29,7 @@ def register(subparsers, name='rbm'): # Required arguments. parser.add_argument('input') parser.add_argument('output') - parser.add_argument('gtf') + parser.add_argument('--gtf', required=True) group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--preset', choices=WINDOW_PRESETS.keys()) @@ -37,7 +39,7 @@ def register(subparsers, name='rbm'): # parser.add_argument('--feature_type', default='gene', # choices={'gene', 'transcript'}) # parser.add_argument('--id_column', default='insertion_id') - # parser.add_argument('--closest', default=False, action='store_true') + parser.add_argument('--closest', default=False, action='store_true') # Set main for dispatch. parser.set_defaults(main=main) @@ -67,7 +69,12 @@ def main(args): # Annotate insertions. logger.info('Annotating insertions') - annotation = annotate_for_windows(insertions, trees, windows) + annotation = annotate_for_windows( + insertions, trees, windows, progress=True) + + if args.closest: + logger.info('Reducing to closest features') + annotation = select_closest(annotation, col='gene_distance') # Merge annotation with insertion frame. 
logger.info('Merging annotation') diff --git a/pyim/annotation/window.py b/pyim/annotation/window.py index d6738e0..8354421 100644 --- a/pyim/annotation/window.py +++ b/pyim/annotation/window.py @@ -9,10 +9,12 @@ import pandas as pd from intervaltree import IntervalTree +from tqdm import tqdm from pyim.util.tabix import GtfFile from ._model import Window +from ._util import feature_distance, numeric_strand, select_closest def register(subparsers, name='window'): @@ -24,6 +26,7 @@ def register(subparsers, name='window'): parser.add_argument('--gtf', required=True) # Optional arguments. + parser.add_argument('--closest', default=False, action='store_true') # parser.add_argument('--feature_type', default='gene') parser.add_argument('--window_size', default=20000, type=int) @@ -36,34 +39,52 @@ def register(subparsers, name='window'): def main(args): logger = logging.getLogger() + # Read annotation. insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) logger.info('Read {} insertions'.format(len(insertions))) + # Build lookup trees. logger.info('Building interval trees') gtf = GtfFile(args.gtf) trees = build_interval_trees(gtf) + # Define windows. logger.info('Annotating insertions') half_size = args.window_size // 2 window = Window(start=-half_size, end=half_size) - annotation = annotate_for_windows(insertions, trees, [window]) + # Annotate insertions. + annotation = annotate_for_windows( + insertions, trees, [window], progress=True) + if args.closest: + # Sub-select for closest features. + logger.info('Reducing to closest features') + annotation = select_closest(annotation, col='gene_distance') + + # Merge annotation. logger.info('Merging annotation') merged = pd.merge(insertions, annotation, on='id', how='left') merged.to_csv(args.output, sep='\t', index=False) -def annotate_for_windows(insertions, trees, windows): +def annotate_for_windows(insertions, trees, windows, progress=False): """Annotates insertions for features in trees using given windows.""" - if isinstance(insertions, pd.DataFrame): - insertions = (row for _, row in insertions.iterrows()) + # Generate queries (insertion/window combinations). + ins_gen = (row for _, row in insertions.iterrows()) + queries = itertools.product(ins_gen, windows) + + if progress: + queries = tqdm(queries, unit='query', + total=len(insertions) * len(windows)) - queries = itertools.product(insertions, windows) + # Generate annotation for queries. + annotations = (_annotate_for_window(ins, trees, window) + for ins, window in queries) - annotation = pd.concat((_annotate_for_window(ins, trees, window) - for ins, window in queries), ignore_index=True) + # Merge annotations into single frame. + annotation = pd.concat(annotations, ignore_index=True) return annotation @@ -78,10 +99,21 @@ def _annotate_for_window(insertion, trees, window): # Fetch features within window. features = fetch_in_window(trees, applied_window) + # Extract feature values. + values = ((f['gene_name'], + feature_distance(f, insertion['position'])) + for f in features) + + try: + name, distance = zip(*values) + except ValueError: + name, distance = [], [] + # Convert to frame. frame = pd.DataFrame({ 'id': insertion['id'], - 'gene_name': [f['gene_name'] for f in features]}) + 'gene_name': name, + 'gene_distance': distance}) # Include window name if known. if window.name is not None: @@ -105,25 +137,19 @@ def fetch_in_window(trees, window): # Filter inclusive/exclusive if needed. 
if not window.incl_left: - features = (f for f in features if f.start > window.start) + features = (f for f in features if f['start'] > window.start) if not window.incl_right: - features = (f for f in features if f.end < window.end) + features = (f for f in features if f['end'] < window.end) # Filter for strand if needed. if window.strand is not None: features = (f for f in features - if _strand_numeric(f['strand']) == window.strand) + if numeric_strand(f['strand']) == window.strand) return list(features) -def _strand_numeric(strand): - """Convert strand to numeric representation.""" - - return 1 if strand == '+' else -1 - - def build_interval_trees(gtf): """Builds an interval tree of genes for each chromosome in gtf.""" diff --git a/pyim/main/annotate.py b/pyim/main/annotate.py index 009d287..80261ef 100644 --- a/pyim/main/annotate.py +++ b/pyim/main/annotate.py @@ -24,7 +24,8 @@ def main(): args = parser.parse_args() # Dispatch to pipeline. - print_header(logger, command='annotate') + cmd_str = '{} {}'.format('annotate', args.annotator) + print_header(logger, command=cmd_str) args.main(args) print_footer(logger) From 904eb3c9aef43825934c654848515494e305dce5 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Sun, 27 Dec 2015 15:30:29 +0100 Subject: [PATCH 050/100] Clean-up rbm/window. --- pyim/annotation/rbm.py | 3 --- pyim/annotation/window.py | 1 - 2 files changed, 4 deletions(-) diff --git a/pyim/annotation/rbm.py b/pyim/annotation/rbm.py index 45b6d43..fce57d5 100644 --- a/pyim/annotation/rbm.py +++ b/pyim/annotation/rbm.py @@ -36,9 +36,6 @@ def register(subparsers, name='rbm'): group.add_argument('--window_sizes', nargs=4, type=int) # Optional arguments. - # parser.add_argument('--feature_type', default='gene', - # choices={'gene', 'transcript'}) - # parser.add_argument('--id_column', default='insertion_id') parser.add_argument('--closest', default=False, action='store_true') # Set main for dispatch. diff --git a/pyim/annotation/window.py b/pyim/annotation/window.py index 8354421..9de3d22 100644 --- a/pyim/annotation/window.py +++ b/pyim/annotation/window.py @@ -27,7 +27,6 @@ def register(subparsers, name='window'): # Optional arguments. parser.add_argument('--closest', default=False, action='store_true') - # parser.add_argument('--feature_type', default='gene') parser.add_argument('--window_size', default=20000, type=int) # Set main for dispatch. From 0501bdf001284c02739bfa52e200267f34e23e83 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Sun, 27 Dec 2015 15:33:09 +0100 Subject: [PATCH 051/100] Initial rewrite of KCRBM annotator. 
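
The rewritten annotator hands insertions to the kcrbm R package through
rpy2 and converts chromosome names to a numeric encoding before calling it:
mouse chromosomes 1-19 keep their number, X and Y become 20 and 21.
For example:

    CHROM_MAP = dict(zip(
        list(map(str, range(1, 19 + 1))) + ['X', 'Y'],
        range(1, 21 + 1)))

    print(CHROM_MAP['1'], CHROM_MAP['19'], CHROM_MAP['X'], CHROM_MAP['Y'])
    # 1 19 20 21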
--- pyim/annotation/kcrbm.py | 199 +++++++++++++++++++++------------------ pyim/main/annotate.py | 3 +- 2 files changed, 110 insertions(+), 92 deletions(-) diff --git a/pyim/annotation/kcrbm.py b/pyim/annotation/kcrbm.py index e6957fa..d62ff52 100644 --- a/pyim/annotation/kcrbm.py +++ b/pyim/annotation/kcrbm.py @@ -4,141 +4,158 @@ int, map, next, oct, open, pow, range, round, str, super, zip) +import logging from itertools import chain, repeat -from pathlib import Path import pandas as pd -from numpy import issubdtype from rpy2 import robjects +from rpy2.robjects.packages import importr -from tkgeno.util.rpy2 import importr, pandas_to_dataframe, dataframe_to_pandas +from pyim.util.rpy2 import dataframe_to_pandas -from .base import Annotator, get_closest +from ._util import select_closest -CHR_MAP = dict(zip( + + +CHROM_MAP = dict(zip( list(map(str, range(1, 19+1))) + ['X', 'Y'], range(1, 21+1) )) -class KcRbmAnnotator(Annotator): +def register(subparsers, name='kcrbm'): + parser = subparsers.add_parser(name, help=name + ' help') + + # Required arguments. + parser.add_argument('input') + parser.add_argument('output') + + # Optional arguments. + parser.add_argument('--reference', default='mm10', choices={'mm10'}) + parser.add_argument('--method', default='genes', + choices={'genes', 'transcripts'}) + parser.add_argument('--system', default='SB', + choices={'MMTV', 'MuLV', 'SB'}) + parser.add_argument('--closest', default=False, action='store_true') + + # Set main for dispatch. + parser.set_defaults(main=main) + + return parser + + +def main(args): + logger = logging.getLogger() - def __init__(self, reference, system, closest=False): - super().__init__() + # Read insertions. + logger.info('Annotation insertions') + insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) + logger.info('Read {} insertions'.format(len(insertions))) - if system not in {'SB'}: - raise ValueError('Unknown system {}'.format(system)) + # Annotate with kcrbm. + annotation = annotate(insertions, args.reference, + args.system, args.method) - if reference not in {'mm10'}: - raise ValueError('Unsupported genome {}'.format(reference)) + if args.closest: + # Sub-select for closest features. + logger.info('Reducing to closest features') + annotation = select_closest(annotation, col='gene_distance') - self._reference = reference - self._system = system - self._closest = closest + # Merge annotation. + logger.info('Merging annotation') + merged = pd.merge(insertions, annotation, on='id', how='left') + merged.to_csv(args.output, sep='\t', index=False) - @classmethod - def configure_argparser(cls, subparsers, name='kcrbm'): - parser = subparsers.add_parser(name, help=name + ' help') - parser.add_argument('input', type=Path) - parser.add_argument('output', type=Path) - parser.add_argument('--reference', default='mm10') - parser.add_argument('--system', default='SB') - parser.add_argument('--closest', default=False, action='store_true') +def annotate(insertions, reference, system, method): + # Convert to kcrbm format. + ins_kcrbm = _convert_to_kcrbm(insertions) - return parser + # Run KCRBM. 
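+    # Note: `importr` loads the installed kcrbm R package through rpy2,
+    # so kcrbm must be available in the local R library first.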
+ kcrbm = importr('kcrbm') + genome = _load_genome(reference) - def annotate(self, frame, type_='gene'): - kcrbm_ins = self._convert_to_kcrbm_frame(frame) - kcrbm_result = self._run_kcrbm(kcrbm_ins, method='genes') + result = kcrbm.kcrbm(edata=genome, idata=ins_kcrbm, rules=system, + reference=reference, map_to=method) + result = dataframe_to_pandas(result) - gene_mapping = self._parse_gene_result(kcrbm_result) + # Convert to gene/transcript frame. + if method == 'gene': + result = _convert_gene_result(result) + elif method == 'transcript': + result = _convert_transcript_result(result) + else: + raise ValueError('Unknown method {}'.format(method)) - if self._closest: - gene_mapping = get_closest(gene_mapping) + return result - return pd.merge(frame, gene_mapping, on='insertion_id', how='left') - @staticmethod - def _convert_to_kcrbm_frame(frame): - # Extract and rename required columns. - kcrbm_frame = frame.ix[:, ['insertion_id', 'seqname', - 'location', 'strand']] - kcrbm_frame.columns = ['id', 'chr', 'base', 'ori'] +def _convert_to_kcrbm(insertion): + # Extract and rename required columns. + kcrbm_frame = insertion.ix[:, ['id', 'seqname', 'location', 'strand']] + kcrbm_frame.columns = ['id', 'chr', 'base', 'ori'] - # Remove any eccentric chromosomes from frame. - seq_mask = kcrbm_frame.chr.isin(CHR_MAP.keys()) - if any(~seq_mask): - dropped_chr = set(kcrbm_frame.ix[~seq_mask].chr) - print('Warning: dropped insertions not in regular ' - 'chromosomes ({})'.format(', '.join(dropped_chr))) + # Remove any eccentric chromosomes from frame. + seq_mask = kcrbm_frame.chr.isin(CHROM_MAP.keys()) + if any(~seq_mask): + dropped_chr = set(kcrbm_frame.ix[~seq_mask].chr) + print('Warning: dropped insertions not in regular ' + 'chromosomes ({})'.format(', '.join(dropped_chr))) - kcrbm_frame = kcrbm_frame.ix[seq_mask] + kcrbm_frame = kcrbm_frame.ix[seq_mask] - # Convert chr to numeric representation. - kcrbm_frame['chr'] = kcrbm_frame['chr'].map(CHR_MAP).astype(int) + # Convert chr to numeric representation. + kcrbm_frame['chr'] = kcrbm_frame['chr'].map(CHROM_MAP).astype(int) - # Convert orientation if required. - if not issubdtype(kcrbm_frame['ori'].dtype, int): - kcrbm_frame['ori'] = kcrbm_frame['ori'].map({'+': 1, '-': -1}) + # Copy insertion id to extra column. 
+ kcrbm_frame['ins_id'] = kcrbm_frame['id'] - return kcrbm_frame + return kcrbm_frame - def _run_kcrbm(self, kcrbm_frame, method): - kcrbm_frame['ins_id'] = kcrbm_frame['id'] - kcrbm_df = pandas_to_dataframe(kcrbm_frame) - kcrbm = importr('kcrbm') - genome = self._load_genome(self._reference) - res = kcrbm.kcrbm(edata=genome, idata=kcrbm_df, rules=self._system, - reference=self._reference, map_to=method) +def _load_genome(genome): + utils = importr("utils") - return dataframe_to_pandas(res) + if genome == 'mm10': + utils.data('edata.mm10', package='kcrbm') + genome_obj = robjects.r['edata.mm10'] + else: + raise ValueError('Unknown genome version {}'.format(genome)) - @staticmethod - def _parse_gene_result(result): - result = result.ix[result['ensid'].astype(str) != 'NA'] + return genome_obj - gene_distance = result[['d2gss', 'd2gts']]\ - .abs().min(axis=1).astype(int) - gene_distance.ix[result.mechanism.str.startswith('u')] *= -1 - return pd.DataFrame({ - 'insertion_id': result['ins_id'], - 'gene_id': result['ensid'], - 'distance': gene_distance, - 'mechanism': result['mechanism']}, - columns=['insertion_id', 'gene_id', 'distance', 'mechanism']) +def _convert_gene_result(result): + result = result.ix[result['ensid'].astype(str) != 'NA'] - @staticmethod - def _load_genome(genome): - utils = importr("utils") + gene_distance = result[['d2gss', 'd2gts']]\ + .abs().min(axis=1).astype(int) + gene_distance.ix[result.mechanism.str.startswith('u')] *= -1 - if genome == 'mm10': - utils.data('edata.mm10', package='kcrbm') - genome_obj = robjects.r['edata.mm10'] - else: - raise ValueError('Unknown genome version {}'.format(genome)) + return pd.DataFrame({ + 'insertion_id': result['ins_id'], + 'gene_id': result['ensid'], + 'distance': gene_distance, + 'mechanism': result['mechanism']}, + columns=['insertion_id', 'gene_id', 'distance', 'mechanism']) - return genome_obj - @staticmethod - def _parse_transcript_result(result): - result = result.ix[result['ensid'].astype(str) != 'NA'] +def _convert_transcript_result(result): + result = result.ix[result['ensid'].astype(str) != 'NA'] - tr_list = result['transid'].str.split('|') - mech_list = result['mechanism'].str.split('|') + transcripts = result['transid'].str.split('|') + mechanisms = result['mechanism'].str.split('|') - counts = list(map(len, tr_list)) + counts = list(map(len, transcripts)) - ins_id = list(_repeat_list(result['ins_id'], counts)) - ens_id = list(_repeat_list(result['ensid'], counts)) + ins_id = list(_repeat_list(result['ins_id'], counts)) + ens_id = list(_repeat_list(result['ensid'], counts)) - return pd.DataFrame({'id': ins_id, 'gene': ens_id, - 'transcript': _flatten_list(tr_list), - 'mechanism': _flatten_list(mech_list)}, - columns=['id', 'gene', 'transcript', 'mechanism']) + return pd.DataFrame({'id': ins_id, 'gene': ens_id, + 'transcript': _flatten_list(transcripts), + 'mechanism': _flatten_list(mechanisms)}, + columns=['id', 'gene', 'transcript', 'mechanism']) def _repeat_list(l, n): diff --git a/pyim/main/annotate.py b/pyim/main/annotate.py index 80261ef..665a82f 100644 --- a/pyim/main/annotate.py +++ b/pyim/main/annotate.py @@ -4,7 +4,7 @@ import argparse import logging -from pyim.annotation import window, rbm +from pyim.annotation import window, rbm, kcrbm from ._logging import print_header, print_footer @@ -19,6 +19,7 @@ def main(): # Register pipelines. window.register(subparsers) rbm.register(subparsers) + kcrbm.register(subparsers) # Parse args. 
args = parser.parse_args() From bc66d341bd07ca7db8037a4d99f88ecf2538ea41 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Sun, 27 Dec 2015 15:46:40 +0100 Subject: [PATCH 052/100] Add cimpl/kcrbm as external dependencies. --- .gitmodules | 6 ++++++ external/cimpl | 1 + external/kcrbm | 1 + 3 files changed, 8 insertions(+) create mode 100644 .gitmodules create mode 160000 external/cimpl create mode 160000 external/kcrbm diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..c3a38c0 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "external/kcrbm"] + path = external/kcrbm + url = git@bitbucket.org:jrderuiter/kcrbm.git +[submodule "external/cimpl"] + path = external/cimpl + url = git@bitbucket.org:jrderuiter/cimpl.git diff --git a/external/cimpl b/external/cimpl new file mode 160000 index 0000000..858f11b --- /dev/null +++ b/external/cimpl @@ -0,0 +1 @@ +Subproject commit 858f11b99a3c7153278bb16ab5dff668ea96cf63 diff --git a/external/kcrbm b/external/kcrbm new file mode 160000 index 0000000..47912b6 --- /dev/null +++ b/external/kcrbm @@ -0,0 +1 @@ +Subproject commit 47912b6954462f67ed1b0587014dea45db4d1688 From b9a22305ce383b5128f5b65febb8af0b68eb9848 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Sun, 27 Dec 2015 16:59:27 +0100 Subject: [PATCH 053/100] Fix moved import. --- pyim/alignment/pipelines/shear_splink.py | 2 +- pyim/alignment/pipelines/shear_splink_sb.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyim/alignment/pipelines/shear_splink.py b/pyim/alignment/pipelines/shear_splink.py index b4f4beb..b042060 100644 --- a/pyim/alignment/pipelines/shear_splink.py +++ b/pyim/alignment/pipelines/shear_splink.py @@ -15,7 +15,7 @@ from pyim.alignment.bowtie2 import align as bowtie_align from pyim.alignment.vector import (align_exact, align_multiple, align_with_reverse) -from pyim.util import count_fasta_entries +from pyim.util.file import count_fasta_entries from ._model import ExtractResult from ._helpers.pipeline import (print_stats, build_barcode_map, diff --git a/pyim/alignment/pipelines/shear_splink_sb.py b/pyim/alignment/pipelines/shear_splink_sb.py index 2616356..8d1dbf5 100644 --- a/pyim/alignment/pipelines/shear_splink_sb.py +++ b/pyim/alignment/pipelines/shear_splink_sb.py @@ -6,7 +6,7 @@ from .shear_splink import shear_splink from pyim.alignment import vector as vec -from pyim.util import count_fasta_entries +from pyim.util.file import count_fasta_entries # --- Pipeline register hook + main --- # From 916a35d6a59fdd0380cca57282d8e6185dfd47de Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Sun, 27 Dec 2015 22:38:05 +0100 Subject: [PATCH 054/100] Updated kcrbm. --- external/kcrbm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/kcrbm b/external/kcrbm index 47912b6..a4c7f21 160000 --- a/external/kcrbm +++ b/external/kcrbm @@ -1 +1 @@ -Subproject commit 47912b6954462f67ed1b0587014dea45db4d1688 +Subproject commit a4c7f218cd7bace313ebbffb98ad2d5eb3de83ea From afa942f5c5717aa7cfc291399a1c60da3b7a9a15 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Sun, 27 Dec 2015 22:54:09 +0100 Subject: [PATCH 055/100] Update kcrbm. 
--- external/kcrbm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/kcrbm b/external/kcrbm index a4c7f21..8e47247 160000 --- a/external/kcrbm +++ b/external/kcrbm @@ -1 +1 @@ -Subproject commit a4c7f218cd7bace313ebbffb98ad2d5eb3de83ea +Subproject commit 8e47247fca0c8627313ba7ba1dd47639d82d3386 From 22a6d433909f13aac73061a974120da9aa70edf5 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Sun, 27 Dec 2015 23:35:18 +0100 Subject: [PATCH 056/100] Remove tkgeno dependency. --- pyim/cis/cimpl.py | 3 ++- pyim/main/cis.py | 14 ++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pyim/cis/cimpl.py b/pyim/cis/cimpl.py index a6e6421..e22d275 100644 --- a/pyim/cis/cimpl.py +++ b/pyim/cis/cimpl.py @@ -2,8 +2,9 @@ import readline from rpy2 import robjects +from rpy2.robjects.packages import importr -from tkgeno.util.rpy2 import importr, pandas_to_dataframe, dataframe_to_pandas +from pyim.util.rpy2 import pandas_to_dataframe, dataframe_to_pandas R_GENOMES = { diff --git a/pyim/main/cis.py b/pyim/main/cis.py index ff25db4..fc32c39 100644 --- a/pyim/main/cis.py +++ b/pyim/main/cis.py @@ -6,7 +6,7 @@ from future.utils import native_str from argparse import ArgumentParser -from pathlib import Path +from os import path import numpy as np import pandas as pd @@ -18,8 +18,8 @@ def setup_parser(): parser = ArgumentParser(prog='pyim-cis') - parser.add_argument('input', type=Path) - parser.add_argument('output', type=Path) + parser.add_argument('input') + parser.add_argument('output') group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--pattern', default=None) @@ -47,7 +47,7 @@ def main(): args = parser.parse_args() # Read frame. - ins_frame = pd.read_csv(str(args.input), sep=native_str('\t')) + ins_frame = pd.read_csv(args.input, sep=native_str('\t')) # Run cimpl. cimpl_obj = cimpl(ins_frame, scales=args.scales, genome=args.genome, @@ -78,9 +78,11 @@ def main(): 'strand_mean', 'strand_homogeneity']] # Write out outputs. - cis.to_csv(str(args.output.with_suffix('.sites.txt')), + cis.to_csv(path.splitext(args.output)[0] + '.sites.txt', sep=native_str('\t'), index=False) - ins_annotated.to_csv(str(args.output), sep=native_str('\t'), index=False) + + ins_annotated.to_csv(args.output, sep=native_str('\t'), index=False) + def _strandedness(insertions, min_homogeneity): From 9b2a45f1467174c184ee77fb7bb069202cc32372 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 28 Dec 2015 09:29:57 +0100 Subject: [PATCH 057/100] Filter insertions on min_depth (shear_splink). --- pyim/alignment/pipelines/shear_splink.py | 10 ++++++++-- pyim/alignment/pipelines/shear_splink_sb.py | 3 ++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pyim/alignment/pipelines/shear_splink.py b/pyim/alignment/pipelines/shear_splink.py index b042060..770a36b 100644 --- a/pyim/alignment/pipelines/shear_splink.py +++ b/pyim/alignment/pipelines/shear_splink.py @@ -87,7 +87,8 @@ def main(args): args.bowtie_index, args.output_dir, contaminants=contaminants, sample_map=sample_map, min_genomic_length=args.min_genomic_length, - min_mapq=args.min_mapq, total_reads=total_reads) + min_mapq=args.min_mapq, min_depth=args.min_depth, + total_reads=total_reads) # Write insertion output. 
insertions.to_csv(path.join(args.output_dir, 'insertions.txt'), @@ -99,7 +100,8 @@ def main(args): def shear_splink(reads, transposon, linker, barcodes, bowtie_index, output_dir, contaminants=None, sample_map=None, min_genomic_length=15, - min_mapq=37, extract_kws=None, total_reads=None): + min_mapq=37, min_depth=None, + extract_kws=None, total_reads=None): logger = logging.getLogger() @@ -154,6 +156,10 @@ def shear_splink(reads, transposon, linker, barcodes, if sample_map is not None: insertions['sample'] = insertions['barcode'].map(sample_map) + # Filter on (unique) depth. + if min_depth is not None: + insertions = insertions.ix[insertions['depth_unique'] >= min_depth] + # Sort and assign ids to insertions. insertions.sort_values(by=['chrom', 'position'], inplace=True) insertions['id'] = ['INS_{}'.format(i) diff --git a/pyim/alignment/pipelines/shear_splink_sb.py b/pyim/alignment/pipelines/shear_splink_sb.py index 8d1dbf5..2921be0 100644 --- a/pyim/alignment/pipelines/shear_splink_sb.py +++ b/pyim/alignment/pipelines/shear_splink_sb.py @@ -91,7 +91,8 @@ def main(args): args.bowtie_index, args.output_dir, contaminants=contaminants, sample_map=sample_map, min_genomic_length=args.min_genomic_length, - extract_kws=extract_kws, total_reads=total_reads) + min_depth=args.min_depth, extract_kws=extract_kws, + total_reads=total_reads) # Write insertion output. insertions.to_csv(path.join(args.output_dir, 'insertions.txt'), From afec10350d6a5ee2bab3ec12f046c23e605c44cc Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 28 Dec 2015 10:09:54 +0100 Subject: [PATCH 058/100] Re-worked cimpl api, dropped strandedness for now. --- pyim/cis/cimpl.py | 84 ++++++++++++++++++++++++++++++++++++----------- pyim/main/cis.py | 67 +++++++++---------------------------- 2 files changed, 81 insertions(+), 70 deletions(-) diff --git a/pyim/cis/cimpl.py b/pyim/cis/cimpl.py index e22d275..fc18375 100644 --- a/pyim/cis/cimpl.py +++ b/pyim/cis/cimpl.py @@ -12,12 +12,30 @@ } +def map_insertions(insertions, scales, genome, alpha=0.05, **kwargs): + """Maps given insertions to CISs using CIMPL.""" + + # Convert insertion to cimpl format. + cimpl_ins = convert_to_cimpl(insertions) + + # Run cimpl. + cimpl_result = cimpl(cimpl_ins, scales, genome, **kwargs) + + # Extract cis sites and mapping. + cis = extract_cis(cimpl_result, alpha=alpha) + mapping = extract_mapping(cimpl_result, cis) + + return cis, mapping + + def cimpl(insertions, scales, genome, system=None, pattern=None, lhc_method='none', iterations=1000, chromosomes=None, verbose=False, threads=1): + """Runs CIMPL on insertions (in CIMPL format).""" + # Fill in chromosomes from data if not specified. if chromosomes is None: - chromosomes = list(insertions['seqname'].unique()) + chromosomes = list(insertions['chr'].unique()) # Determine if system or specific pattern was specified. if pattern is not None: @@ -31,8 +49,7 @@ def cimpl(insertions, scales, genome, system=None, pattern=None, # Prepare chromosomes argument, adding 'chr' prefix and # converting to StrVector to pass to R. if not chromosomes[0].startswith('chr'): - chromosomes = ['chr' + c for c in chromosomes] - chromosomes = robjects.vectors.StrVector(chromosomes) + chromosomes = ['chr' + c for c in chromosomes] # Convert scales to IntVector if supplied as list. if type(scales) == list: @@ -41,37 +58,34 @@ def cimpl(insertions, scales, genome, system=None, pattern=None, # Load genome object from R. genome_obj = _load_genome(genome) - # Convert insertions to cimpl format. 
- cimpl_frame = _convert_to_cimpl_dataframe(insertions) - # Check if contig_depth is present (if doing hop exclusion). - if lhc_method == 'exclude' and 'contig_depth' not in cimpl_frame: + if lhc_method == 'exclude' and 'contig_depth' not in insertions: raise ValueError('Insertion depth is needed for lhc exclusion') # Run CIMPL! cimpl_r = importr('cimpl') cimpl_obj = cimpl_r.doCimplAnalysis( - pandas_to_dataframe(cimpl_frame), + pandas_to_dataframe(insertions), scales=scales, n_iterations=iterations, lhc_method=lhc_method, threads=threads, BSgenome=genome_obj, - chromosomes=chromosomes, verbose=verbose, **extra_args) + chromosomes=robjects.vectors.StrVector(chromosomes), + verbose=verbose, **extra_args) return cimpl_obj -def _convert_to_cimpl_dataframe(insertions): +def convert_to_cimpl(insertions): # Extract and rename required columns. - cimpl_frame = insertions.ix[:, ['insertion_id', 'seqname', - 'location', 'sample']] - cimpl_frame.columns = ['id', 'chr', 'location', 'sampleID'] + cimpl_ins = insertions.ix[:, ['id', 'chrom', 'position', 'sample']] + cimpl_ins.columns = ['id', 'chr', 'location', 'sampleID'] if 'depth_unique' in insertions: - cimpl_frame['contig_depth'] = insertions['depth_unique'] + cimpl_ins['contig_depth'] = insertions['depth_unique'] # Add 'chr' prefix to the chromosome names if needed. - cimpl_frame['chr'] = _prefix_chromosomes(cimpl_frame['chr']) + cimpl_ins['chr'] = _prefix_chromosomes(cimpl_ins['chr']) - return cimpl_frame + return cimpl_ins def _prefix_chromosomes(series, prefix='chr'): @@ -95,7 +109,7 @@ def _load_genome(genome): return genome_obj -def get_cis(cimpl_obj, alpha=0.05, mul_test=True): +def extract_cis(cimpl_obj, alpha=0.05, mul_test=True): cimpl_r = importr('cimpl') cis_obj = cimpl_r.getCISs(cimpl_obj, alpha=alpha, mul_test=mul_test) @@ -116,15 +130,25 @@ def get_cis(cimpl_obj, alpha=0.05, mul_test=True): 'scale', 'p_value', 'n_insertions', 'peak_location', 'peak_height', 'width']] + # Rename and reshuffle cis columns. + cis_frame = cis_frame.rename( + columns={'seqname': 'chrom', + 'peak_location': 'position', + 'peak_height': 'height'}) + + cis_frame = cis_frame[['cis_id', 'chrom', 'position', 'scale', + 'n_insertions', 'p_value', 'start', 'end', + 'height', 'width']] + return cis_frame -def get_cis_mapping(cimpl_obj, cis_frame): +def extract_mapping(cimpl_obj, cis_frame): # Add cis_id as index to cis frame before passing to R, # ensures CIMPL uses cis id's instead of row indices. cis_frame = cis_frame.copy() cis_frame.set_index('cis_id', drop=False, inplace=True) - cis_frame['chromosomes'] = _prefix_chromosomes(cis_frame['seqname']) + cis_frame['chromosomes'] = _prefix_chromosomes(cis_frame['chrom']) cis_frame_r = pandas_to_dataframe(cis_frame) # Retrieve cis matrix from cimpl. @@ -178,3 +202,25 @@ def _expand_row(row, col, delimiter): row_dict[col] = [row[col]] return pd.DataFrame(row_dict) + + +# def cis_strandedness(insertions, min_homogeneity): +# strand_mean = insertions.strand.mean() +# strand = int(np.sign(strand_mean)) +# +# if strand != 0: +# homogeneity = (insertions.strand == strand).sum() / len(insertions) +# else: +# homogeneity = 0.5 +# +# if homogeneity < min_homogeneity: +# strand = 0 +# +# return pd.Series(dict(strand=strand, +# strand_mean=strand_mean, +# strand_homogeneity=homogeneity)) +# +# # Determine strand of cis sites. 
+# strand_func = curry(_strandedness, min_homogeneity=args.strand_homogeneity) +# cis_strand = insertions.groupby('cis_id').apply(strand_func) +# cis = pd.merge(cis, cis_strand.reset_index(), on='cis_id') diff --git a/pyim/main/cis.py b/pyim/main/cis.py index fc32c39..c47f859 100644 --- a/pyim/main/cis.py +++ b/pyim/main/cis.py @@ -12,7 +12,7 @@ import pandas as pd from toolz import curry -from pyim.cis.cimpl import cimpl, get_cis, get_cis_mapping +from pyim.cis.cimpl import map_insertions def setup_parser(): @@ -29,7 +29,7 @@ def setup_parser(): parser.add_argument('--chromosomes', nargs='+', default=None) parser.add_argument('--scales', nargs='+', type=int, default=30000) - parser.add_argument('--strand_homogeneity', type=float, default=0.75) + # parser.add_argument('--strand_homogeneity', type=float, default=0.75) parser.add_argument('--alpha', type=float, default=0.05) parser.add_argument('--iterations', type=int, default=1000) @@ -46,61 +46,26 @@ def main(): parser = setup_parser() args = parser.parse_args() - # Read frame. - ins_frame = pd.read_csv(args.input, sep=native_str('\t')) + # Read insertions.. + insertions = pd.read_csv(args.input, sep=native_str('\t'), + dtype={'chrom': str}) - # Run cimpl. - cimpl_obj = cimpl(ins_frame, scales=args.scales, genome=args.genome, - system=args.system, pattern=args.pattern, - lhc_method=args.lhc_method, chromosomes=args.chromosomes, - iterations=args.iterations, threads=args.threads, - verbose=args.verbose) - - # Extract cis and cis mapping from object. - cis = get_cis(cimpl_obj, alpha=args.alpha, mul_test=True) - cis_mapping = get_cis_mapping(cimpl_obj, cis_frame=cis) + # Run cimpl on insertions. + cis, mapping = map_insertions( + insertions, scales=args.scales, genome=args.genome, alpha=args.alpha, + system=args.system, pattern=args.pattern, lhc_method=args.lhc_method, + chromosomes=args.chromosomes, iterations=args.iterations, + threads=args.threads, verbose=args.verbose) # Annotate insertions with cis mapping. - ins_annotated = pd.merge(ins_frame, cis_mapping, on='insertion_id') - - # Determine strand of cis sites. - strand_func = curry(_strandedness, min_homogeneity=args.strand_homogeneity) - cis_strand = ins_annotated.groupby('cis_id').apply(strand_func) - - # Merge strand information with cis sites. - cis = pd.merge(cis, cis_strand.reset_index(), on='cis_id') - - # Rename and reshuffle cis columns. - cis = cis.rename(columns={'peak_location': 'location', - 'peak_height': 'height'}) - cis = cis[['cis_id', 'seqname', 'location', 'strand', 'scale', - 'n_insertions', 'p_value', 'start', 'end', 'height', 'width', - 'strand_mean', 'strand_homogeneity']] + mapping_tmp = mapping.rename(columns={'insertion_id': 'id'}) + insertions = pd.merge(insertions, mapping_tmp, on='id') # Write out outputs. 
- cis.to_csv(path.splitext(args.output)[0] + '.sites.txt', - sep=native_str('\t'), index=False) - - ins_annotated.to_csv(args.output, sep=native_str('\t'), index=False) - - - -def _strandedness(insertions, min_homogeneity): - strand_mean = insertions.strand.mean() - strand = int(np.sign(strand_mean)) - - if strand != 0: - homogeneity = (insertions.strand == strand).sum() / len(insertions) - else: - homogeneity = 0.5 - - if homogeneity < min_homogeneity: - strand = 0 - - return pd.Series(dict(strand=strand, - strand_mean=strand_mean, - strand_homogeneity=homogeneity)) + cis_path = path.splitext(args.output)[0] + '.sites.txt' + cis.to_csv(cis_path, sep=native_str('\t'), index=False) + insertions.to_csv(args.output, sep=native_str('\t'), index=False) if __name__ == '__main__': From 68770b664589bad7f740f3ae5a031de806877314 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 28 Dec 2015 11:16:03 +0100 Subject: [PATCH 059/100] Add sample selection for cis. --- pyim/cis/cimpl.py | 2 +- pyim/main/cis.py | 36 ++++++++++++++++++++++++++++++------ pyim/main/merge.py | 11 ++--------- pyim/util/insertions.py | 13 +++++++++++++ 4 files changed, 46 insertions(+), 16 deletions(-) create mode 100644 pyim/util/insertions.py diff --git a/pyim/cis/cimpl.py b/pyim/cis/cimpl.py index fc18375..36d4782 100644 --- a/pyim/cis/cimpl.py +++ b/pyim/cis/cimpl.py @@ -90,7 +90,7 @@ def convert_to_cimpl(insertions): def _prefix_chromosomes(series, prefix='chr'): # Add 'chr' prefix to the chromosome names if needed. - if not series[0].startswith('chr'): + if len(series) > 0 and not series.iloc[0].startswith('chr'): series = series.map(lambda c: prefix + c) return series diff --git a/pyim/main/cis.py b/pyim/main/cis.py index c47f859..c067df1 100644 --- a/pyim/main/cis.py +++ b/pyim/main/cis.py @@ -8,11 +8,13 @@ from argparse import ArgumentParser from os import path -import numpy as np +import logging import pandas as pd -from toolz import curry from pyim.cis.cimpl import map_insertions +from pyim.util.insertions import subset_samples + +from ._logging import print_header, print_footer def setup_parser(): @@ -28,14 +30,16 @@ def setup_parser(): parser.add_argument('--genome', choices={'mm10'}, default='mm10') parser.add_argument('--chromosomes', nargs='+', default=None) parser.add_argument('--scales', nargs='+', type=int, default=30000) + parser.add_argument('--samples', nargs='+', default=None) - # parser.add_argument('--strand_homogeneity', type=float, default=0.75) - - parser.add_argument('--alpha', type=float, default=0.05) parser.add_argument('--iterations', type=int, default=1000) parser.add_argument('--lhc_method', choices={'none', 'exclude'}, default='exclude') + # parser.add_argument('--strand_homogeneity', type=float, default=0.75) + + parser.add_argument('--alpha', type=float, default=0.05) + parser.add_argument('--threads', type=int, default=1) parser.add_argument('--verbose', default=False, action='store_true') @@ -43,14 +47,28 @@ def setup_parser(): def main(): + logger = logging.getLogger() + + # Parse arguments. parser = setup_parser() args = parser.parse_args() - # Read insertions.. + # Print header. + print_header(logger, command='cis') + + # Read insertions. insertions = pd.read_csv(args.input, sep=native_str('\t'), dtype={'chrom': str}) + logger.info('Read {} insertions'.format(len(insertions))) + + # Subset to samples if needed. 
+ if args.samples is not None: + logger.info('Subsetting to {} samples'.format(len(args.samples))) + insertions = subset_samples(insertions, args.samples, logger=logger) # Run cimpl on insertions. + logger.info('Running CIMPL in R') + cis, mapping = map_insertions( insertions, scales=args.scales, genome=args.genome, alpha=args.alpha, system=args.system, pattern=args.pattern, lhc_method=args.lhc_method, @@ -58,15 +76,21 @@ def main(): threads=args.threads, verbose=args.verbose) # Annotate insertions with cis mapping. + logger.info('Merging CIMPL annotation') + mapping_tmp = mapping.rename(columns={'insertion_id': 'id'}) insertions = pd.merge(insertions, mapping_tmp, on='id') # Write out outputs. + logger.info('Writing outputs') + cis_path = path.splitext(args.output)[0] + '.sites.txt' cis.to_csv(cis_path, sep=native_str('\t'), index=False) insertions.to_csv(args.output, sep=native_str('\t'), index=False) + print_footer(logger) + if __name__ == '__main__': main() diff --git a/pyim/main/merge.py b/pyim/main/merge.py index dea49b7..18b8bbf 100644 --- a/pyim/main/merge.py +++ b/pyim/main/merge.py @@ -11,6 +11,7 @@ import pandas as pd +from pyim.util.insertions import subset_samples from ._logging import print_header, print_footer @@ -72,15 +73,7 @@ def main(): if args.samples is not None: logger.info('Subsetting dataset to {} samples' .format(len(args.samples))) - - merged_samples = set(merged['sample']) - for sample in args.samples: - if sample not in merged_samples: - logging.warning('- Missing insertions for sample {}' - .format(sample)) - - mask = merged['sample'].isin(set(args.samples)) - merged = merged.ix[mask] + merged = subset_samples(merged, args.samples, logger=logger) # Write output. logging.info('Writing merged output') diff --git a/pyim/util/insertions.py b/pyim/util/insertions.py new file mode 100644 index 0000000..a9a9077 --- /dev/null +++ b/pyim/util/insertions.py @@ -0,0 +1,13 @@ + + +def subset_samples(insertions, samples, logger=None): + warn = print if logger is None else logger.warning + + # Check for missing samples. + ins_samples = set(insertions['sample']) + for sample in samples: + if sample not in ins_samples: + warn('- Missing insertions for sample {}'.format(sample)) + + # Actually subset insertions. + return insertions.ix[ insertions['sample'].isin(set(samples))] From 28807b030b3305265cf35d0a0d9a8dc7c7a842ce Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Mon, 28 Dec 2015 11:31:02 +0100 Subject: [PATCH 060/100] Add cis strand calculation. --- pyim/cis/_util.py | 32 ++++++++++++++++++++++++++++++++ pyim/cis/cimpl.py | 22 ---------------------- pyim/main/cis.py | 8 +++++++- 3 files changed, 39 insertions(+), 23 deletions(-) create mode 100644 pyim/cis/_util.py diff --git a/pyim/cis/_util.py b/pyim/cis/_util.py new file mode 100644 index 0000000..4574f97 --- /dev/null +++ b/pyim/cis/_util.py @@ -0,0 +1,32 @@ +import toolz + +import numpy as np +import pandas as pd + + +def annotate_cis_strand(cis, insertions, min_homogeneity): + # Determine strand of cis sites. 
+ func = toolz.curry(_cis_strand, min_homogeneity=min_homogeneity) + cis_strand = insertions.groupby('cis_id').apply(func) + + # Merge with cis annotation + cis = pd.merge(cis, cis_strand.reset_index(), on='cis_id') + + return cis + + +def _cis_strand(insertions, min_homogeneity): + strand_mean = insertions.strand.mean() + strand = int(np.sign(strand_mean)) + + if strand != 0: + homogeneity = (insertions.strand == strand).sum() / len(insertions) + else: + homogeneity = 0.5 + + if homogeneity < min_homogeneity: + strand = 0 + + return pd.Series(dict(strand=strand, + strand_mean=strand_mean, + strand_homogeneity=homogeneity)) \ No newline at end of file diff --git a/pyim/cis/cimpl.py b/pyim/cis/cimpl.py index 36d4782..f76d006 100644 --- a/pyim/cis/cimpl.py +++ b/pyim/cis/cimpl.py @@ -202,25 +202,3 @@ def _expand_row(row, col, delimiter): row_dict[col] = [row[col]] return pd.DataFrame(row_dict) - - -# def cis_strandedness(insertions, min_homogeneity): -# strand_mean = insertions.strand.mean() -# strand = int(np.sign(strand_mean)) -# -# if strand != 0: -# homogeneity = (insertions.strand == strand).sum() / len(insertions) -# else: -# homogeneity = 0.5 -# -# if homogeneity < min_homogeneity: -# strand = 0 -# -# return pd.Series(dict(strand=strand, -# strand_mean=strand_mean, -# strand_homogeneity=homogeneity)) -# -# # Determine strand of cis sites. -# strand_func = curry(_strandedness, min_homogeneity=args.strand_homogeneity) -# cis_strand = insertions.groupby('cis_id').apply(strand_func) -# cis = pd.merge(cis, cis_strand.reset_index(), on='cis_id') diff --git a/pyim/main/cis.py b/pyim/main/cis.py index c067df1..51849d0 100644 --- a/pyim/main/cis.py +++ b/pyim/main/cis.py @@ -12,6 +12,7 @@ import pandas as pd from pyim.cis.cimpl import map_insertions +from pyim.cis._util import annotate_cis_strand from pyim.util.insertions import subset_samples from ._logging import print_header, print_footer @@ -36,7 +37,7 @@ def setup_parser(): parser.add_argument('--lhc_method', choices={'none', 'exclude'}, default='exclude') - # parser.add_argument('--strand_homogeneity', type=float, default=0.75) + parser.add_argument('--strand_homogeneity', type=float, default=None) parser.add_argument('--alpha', type=float, default=0.05) @@ -81,6 +82,11 @@ def main(): mapping_tmp = mapping.rename(columns={'insertion_id': 'id'}) insertions = pd.merge(insertions, mapping_tmp, on='id') + # Determine strand of cis sites. + if args.strand_homogeneity is not None: + logging.info('Determining CIS strands') + cis = annotate_cis_strand(cis, insertions, args.strand_homogeneity) + # Write out outputs. logger.info('Writing outputs') From 8217d752ccf790cab7981446e50fa1a29de6660e Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 29 Dec 2015 09:39:20 +0100 Subject: [PATCH 061/100] Updated pyim-gff for column name changes. 
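
A minimal sketch of the conversion under the new column names. The insertion
values below are made up, and the expected output is derived by hand from
_ins_to_gff as committed here:

    import pandas as pd

    from pyim.main.gff import _ins_to_gff

    # Hypothetical insertion record using the renamed columns.
    ins = pd.Series({'id': 'INS_1', 'chrom': '1', 'position': 105000,
                     'strand': 1, 'sample': 'S1'})

    record = _ins_to_gff(ins, size=1000)
    # record['seqname'] == '1'
    # record['start'] == 104500, record['end'] == 105500
    # record['strand'] == '+'
    # record['attribute'] ==
    #     'chrom 1;id INS_1;name INS_1;position 105000;sample S1'
    # Note that chrom/position still end up in the attribute string, because
    # the exclusion set in _ins_to_gff still lists the old
    # 'seqname'/'location' names.
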
--- pyim/main/gff.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/pyim/main/gff.py b/pyim/main/gff.py index de75e2c..3459fae 100644 --- a/pyim/main/gff.py +++ b/pyim/main/gff.py @@ -6,7 +6,6 @@ from future.utils import native_str from argparse import ArgumentParser -from pathlib import Path import pandas as pd @@ -14,8 +13,8 @@ def setup_parser(): parser = ArgumentParser(prog='pyim-gff') - parser.add_argument('insertions', type=Path) - parser.add_argument('output_gff', type=Path) + parser.add_argument('insertions') + parser.add_argument('output') return parser @@ -24,22 +23,22 @@ def _ins_to_gff(ins, size=1000): assert isinstance(ins.strand, int) attrs = [i for i in ins.index if i not in - {'insertion_id', 'seqname', 'location', 'strand'}] + {'id', 'seqname', 'location', 'strand'}] attr_dict = {attr: ins[attr] for attr in attrs} - attr_dict['id'] = ins.insertion_id - attr_dict['name'] = ins.insertion_id + attr_dict['id'] = ins['id'] + attr_dict['name'] = ins['id'] attr_keys = sorted(attr_dict.keys()) attr_str = ';'.join(('{} {}'.format(k, attr_dict[k]) for k in attr_keys)) return { - 'seqname': ins.seqname, + 'seqname': ins['chrom'], 'source': '.', 'feature': 'insertion', - 'start': int(ins.location - (size / 2)), - 'end': int(ins.location + (size / 2)), + 'start': int(ins['position'] - (size / 2)), + 'end': int(ins['position'] + (size / 2)), 'score': '.', 'strand': '+' if ins.strand == 1 else '-', 'frame': '.', @@ -52,18 +51,18 @@ def main(): args = parser.parse_args() # Read input. - ins_frame = pd.read_csv(str(args.insertions), sep=native_str('\t'), - dtype={'seqname': str, 'location': int}) + ins_frame = pd.read_csv(args.insertions, sep=native_str('\t'), + dtype={'chrom': str, 'position': int}) # Transform to gff frame. gff_frame = pd.DataFrame.from_records( (_ins_to_gff(r) for _, r in ins_frame.iterrows()), columns=['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']) - gff_frame = gff_frame.sort(['seqname', 'start', 'end']) + gff_frame = gff_frame.sort_values(by=['seqname', 'start', 'end']) # Write output. - gff_frame.to_csv(str(args.output_gff), sep=native_str('\t'), + gff_frame.to_csv(args.output, sep=native_str('\t'), index=False, header=False) From 66c87932a8c391608c9b6cd0140845dcedd759f7 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 29 Dec 2015 09:39:59 +0100 Subject: [PATCH 062/100] Initial code for poisson significance checking. 
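
A rough usage sketch for the new module, expanding on the commented example
at the bottom of the file. The file paths are placeholders and the reference
is assumed to be a pyfaidx.Fasta-like object (the code only needs chromosome
lookup, len(), and slicing that returns an object with a .seq attribute);
chromosome names must match between the insertion table and the FASTA:

    import pandas as pd
    from pyfaidx import Fasta

    from pyim.cis.poisson import calc_significance

    insertions = pd.read_csv('insertions.txt', sep='\t',
                             dtype={'chrom': str})
    reference = Fasta('mm10.fa')

    # Test 10 kb windows around each insertion against a Poisson background,
    # using TA dinucleotides as the insertion opportunity space.
    result = calc_significance(insertions, reference, window_size=10000,
                               pattern='(AT|TA)',
                               chromosomes=[str(c) for c in range(1, 20)])

    significant = result.query('p_val_corr < 0.05')
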
--- pyim/cis/poisson.py | 96 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 pyim/cis/poisson.py diff --git a/pyim/cis/poisson.py b/pyim/cis/poisson.py new file mode 100644 index 0000000..e40ae6e --- /dev/null +++ b/pyim/cis/poisson.py @@ -0,0 +1,96 @@ +import itertools +import re + +import toolz +import pandas as pd +from intervaltree import IntervalTree +from scipy.stats import poisson +from statsmodels.stats.multitest import multipletests + + +def build_trees(insertions): + trees = {} + + for chrom, grp in insertions.groupby('chrom'): + intervals = zip(grp['position'], grp['position'] + 1, grp['id']) + trees[chrom] = IntervalTree.from_tuples(intervals) + + return trees + + +def count_pattern(record, pattern=None): + regex = re.compile(pattern) + return sum((1 for match in regex.finditer(record.seq))) + + +def count_matches(seq, regex): + return sum((1 for match in regex.finditer(seq))) + + +def generate_windows(insertions, window_size): + half_size = window_size // 2 + + # Generate list of windows for all insertions. + windows = (zip((chrom for _ in range(len(grp))), + grp['position'] - half_size, + grp['position'] + half_size) + for chrom, grp in insertions.groupby('chrom')) + windows = itertools.chain.from_iterable(windows) + + # Yield from windows. + for window in windows: + yield window + + +def calc_significance(insertions, reference, window_size, + pattern=None, chromosomes=None, total=None): + if chromosomes is None: + chromosomes = reference.keys() + + if pattern is not None: + regex = re.compile(pattern) + func = toolz.curry(count_matches, regex=regex) + else: + func = len + + if total is None: + total = sum((func(reference[c][0:len(reference[c])].seq) + for c in chromosomes)) + + # Subset insertions to chromosomes: + insertions = insertions.ix[ + insertions['chrom'].isin(chromosomes)] + + # Build lookup trees for insertions. + trees = build_trees(insertions) + + def _calc_for_window(window): + chrom, start, end = window + + # Calculate occurrence for region. + n_region = func(reference[chrom][int(start):int(end)].seq) + + # Calculate p-value. + x = len(trees[chrom][start:end]) + mu = len(insertions) * (n_region / total) + + p_val = poisson.sf(x, mu=mu, loc=1) + + return chrom, start, end, p_val + + # Generate windows. + windows = generate_windows(insertions, window_size=window_size) + + # Generate result. + res = pd.DataFrame.from_records( + (_calc_for_window(w) for w in windows), + columns=['chrom', 'start', 'end', 'p_val']) + res['p_val_corr'] = multipletests(res['p_val'], method='bonferroni')[1] + + return res + + +# result = calc_significance(insertions, ref, window_size=10000, +# pattern='(AT|TA)', chromosomes=chroms, +# total=genome_ta) +# result.query('p_val_corr < 0.05') From a961bc1721df3ad513c6dd020eabbff6c9d1cb8f Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 30 Dec 2015 16:55:52 +0100 Subject: [PATCH 063/100] Add gene id to annotation. --- pyim/annotation/window.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pyim/annotation/window.py b/pyim/annotation/window.py index 9de3d22..fcf3246 100644 --- a/pyim/annotation/window.py +++ b/pyim/annotation/window.py @@ -99,18 +99,20 @@ def _annotate_for_window(insertion, trees, window): features = fetch_in_window(trees, applied_window) # Extract feature values. 
- values = ((f['gene_name'], + values = ((f['gene_id'], + f['gene_name'], feature_distance(f, insertion['position'])) for f in features) try: - name, distance = zip(*values) + id_, name, distance = zip(*values) except ValueError: - name, distance = [], [] + id_, name, distance = [], [], [] # Convert to frame. frame = pd.DataFrame({ 'id': insertion['id'], + 'gene_id': id_, 'gene_name': name, 'gene_distance': distance}) @@ -159,7 +161,7 @@ def build_interval_trees(gtf): for contig, grp in itertools.groupby(genes, lambda r: r.contig): # Build a tree for each individual chromosome. intervals = ((g.start, g.end, dict(g)) for g in grp - if g.end > g.start) # Avoid null intervals. + if g.end > g.start) # Avoid null intervals. trees[contig] = IntervalTree.from_tuples(intervals) return trees From 0c594fe9804c2f166e8018315393b6a76acad79f Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 30 Dec 2015 16:56:02 +0100 Subject: [PATCH 064/100] Add support for unstranded insertions. --- pyim/annotation/rbm.py | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/pyim/annotation/rbm.py b/pyim/annotation/rbm.py index fce57d5..57adcd7 100644 --- a/pyim/annotation/rbm.py +++ b/pyim/annotation/rbm.py @@ -4,6 +4,7 @@ int, map, next, oct, open, pow, range, round, str, super, zip) # filter +import itertools import logging import pandas as pd @@ -51,6 +52,13 @@ def main(args): insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) logger.info('Read {} insertions'.format(len(insertions))) + # Replace unstranded if needed. + if (~insertions['strand'].isin({-1, 1})).any(): + logger.warning('Replacing unstranded insertions') + converted = replace_unstranded(insertions) + else: + converted = insertions + # Build annotation trees. logger.info('Building interval trees') gtf = GtfFile(args.gtf) @@ -67,9 +75,10 @@ def main(args): # Annotate insertions. logger.info('Annotating insertions') annotation = annotate_for_windows( - insertions, trees, windows, progress=True) + converted, trees, windows, progress=True) if args.closest: + import pdb; pdb.set_trace() logger.info('Reducing to closest features') annotation = select_closest(annotation, col='gene_distance') @@ -92,3 +101,29 @@ def build_windows(ranges): ] return windows + + +def replace_unstranded(insertions): + """Replaces unstranded insertions with two stranded insertions.""" + + # Split stranded and unstranded. + mask = insertions['strand'].isin({-1, 1}) + stranded = insertions.ix[mask] + unstranded = insertions.ix[~mask] + + # Convert unstranded into two stranded. + converted = (_to_stranded(ins) for _, ins in unstranded.iterrows()) + converted = pd.DataFrame.from_records( + itertools.chain.from_iterable(converted)) + + return pd.concat((stranded, converted), ignore_index=True) + + +def _to_stranded(insertion): + fwd = insertion.copy() + fwd['strand'] = 1 + + rev = insertion.copy() + rev['strand'] = -1 + + return [fwd, rev] From 6013710f1560ecf321d57e5421d4f5e79cd78aa3 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 16 Mar 2016 15:42:38 +0100 Subject: [PATCH 065/100] Add distance to annotation. --- pyim/annotation/window.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyim/annotation/window.py b/pyim/annotation/window.py index 9de3d22..4837c27 100644 --- a/pyim/annotation/window.py +++ b/pyim/annotation/window.py @@ -100,18 +100,20 @@ def _annotate_for_window(insertion, trees, window): # Extract feature values. 
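
A sketch of what the annotator does programmatically, mirroring main() below.
Paths are placeholders, and the CIS table is assumed to carry the 'id',
'chrom', 'position' and 'strand' columns that rbm() expects, with insertions
linked to it via a 'cis_id' column:

    import logging
    import pandas as pd

    from pyim.annotation.rbm import rbm, WINDOW_PRESETS

    logger = logging.getLogger()

    insertions = pd.read_csv('insertions.txt', sep='\t',
                             dtype={'chrom': str})
    cis_sites = pd.read_csv('insertions.sites.txt', sep='\t',
                            dtype={'chrom': str})

    # Annotate the CIS peaks (rather than the individual insertions) using
    # the Sleeping Beauty window preset, keeping only the closest gene.
    annotated_sites = rbm(cis_sites, 'genes.gtf.gz', WINDOW_PRESETS['SB'],
                          logger, closest=True)

    # Propagate the CIS-level annotation back to the insertions via cis_id.
    annotation = annotated_sites[['id', 'gene_id', 'gene_name',
                                  'gene_distance', 'window']]
    annotation = annotation.rename(columns={'id': 'cis_id'})
    annotated = pd.merge(insertions, annotation, on='cis_id', how='left')
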
values = ((f['gene_name'], + f['gene_id'], feature_distance(f, insertion['position'])) for f in features) try: - name, distance = zip(*values) + name, gene_id, distance = zip(*values) except ValueError: - name, distance = [], [] + name, gene_id, distance = [], [], [] # Convert to frame. frame = pd.DataFrame({ 'id': insertion['id'], 'gene_name': name, + 'gene_id': gene_id, 'gene_distance': distance}) # Include window name if known. @@ -159,7 +161,7 @@ def build_interval_trees(gtf): for contig, grp in itertools.groupby(genes, lambda r: r.contig): # Build a tree for each individual chromosome. intervals = ((g.start, g.end, dict(g)) for g in grp - if g.end > g.start) # Avoid null intervals. + if g.end > g.start) # Avoid null intervals. trees[contig] = IntervalTree.from_tuples(intervals) return trees From 8aec7f6ff0495b1231bc4d80bf92063c6207c6b9 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 16 Mar 2016 15:42:55 +0100 Subject: [PATCH 066/100] Updated call to samtools sort to include output path. --- pyim/alignment/bowtie2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyim/alignment/bowtie2.py b/pyim/alignment/bowtie2.py index 2c156fa..671338a 100644 --- a/pyim/alignment/bowtie2.py +++ b/pyim/alignment/bowtie2.py @@ -71,8 +71,8 @@ def sam_to_bam(sam_path, bam_path=None, sort=False, # Pipe bam into samtools sort for sorting. p1 = subprocess.Popen(['samtools', 'view', '-b', sam_path], stdout=subprocess.PIPE) - p2 = subprocess.Popen(['samtools', 'sort', '-', - path.splitext(bam_path)[0]], stdin=p1.stdout) + p2 = subprocess.Popen(['samtools', 'sort', '-o', bam_path, '-'], + stdin=p1.stdout) p1.stdout.close() p2.communicate() @@ -88,4 +88,4 @@ def sam_to_bam(sam_path, bam_path=None, sort=False, # Delete original sam if requested. os.unlink(sam_path) - return bam_path \ No newline at end of file + return bam_path From c5d43192ed0dd8f2b0af110cbeadcd6b3bd6664e Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 16 Mar 2016 15:43:16 +0100 Subject: [PATCH 067/100] Added tqdm dependency. --- pyim/annotation/kcrbm.py | 2 -- setup.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pyim/annotation/kcrbm.py b/pyim/annotation/kcrbm.py index d62ff52..2b666da 100644 --- a/pyim/annotation/kcrbm.py +++ b/pyim/annotation/kcrbm.py @@ -16,7 +16,6 @@ from ._util import select_closest - CHROM_MAP = dict(zip( list(map(str, range(1, 19+1))) + ['X', 'Y'], range(1, 21+1) @@ -67,7 +66,6 @@ def main(args): merged.to_csv(args.output, sep='\t', index=False) - def annotate(insertions, reference, system, method): # Convert to kcrbm format. ins_kcrbm = _convert_to_kcrbm(insertions) diff --git a/setup.py b/setup.py index 731aae2..673f838 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ install_requires = ['future', 'numpy', 'scipy', 'pandas', 'pysam', - 'rpy2', 'scikit-bio', 'toolz'] + 'rpy2', 'scikit-bio', 'toolz', 'tqdm'] if not sys.version_info >= (3, ): install_requires += ['pathlib'] From 6fd0a6ea40c898dbe2175d59f51db0ffb6765dc4 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 16 Mar 2016 16:50:38 +0100 Subject: [PATCH 068/100] Added rbm_cis annotator. 
--- pyim/annotation/rbm.py | 54 +++++++++++++++++------------ pyim/annotation/rbm_cis.py | 70 ++++++++++++++++++++++++++++++++++++++ pyim/main/annotate.py | 3 +- 3 files changed, 104 insertions(+), 23 deletions(-) create mode 100644 pyim/annotation/rbm_cis.py diff --git a/pyim/annotation/rbm.py b/pyim/annotation/rbm.py index 57adcd7..ee0bbd1 100644 --- a/pyim/annotation/rbm.py +++ b/pyim/annotation/rbm.py @@ -1,8 +1,8 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) # filter +from __future__ import absolute_import, division, print_function + +#pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin +from builtins import * +#pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin import itertools import logging @@ -11,10 +11,11 @@ from pyim.util.tabix import GtfFile +#pylint: disable=import-error from ._model import Window from ._util import select_closest from .window import build_interval_trees, annotate_for_windows - +#pylint: enable=import-error # Window format: (us, ua, ds, da) WINDOW_PRESETS = { @@ -50,9 +51,24 @@ def main(args): # Read insertions. insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) - logger.info('Read {} insertions'.format(len(insertions))) + logger.info('Read %d insertions', len(insertions)) + + # Define windows. + if args.window_sizes is not None: + window_sizes = args.window_sizes + else: + window_sizes = WINDOW_PRESETS[args.preset] + + # Annotate insertions. + annotated = rbm(insertions, args.gtf, window_sizes, logger, + closest=args.closest, verbose=True) + annotated.to_csv(args.output, sep='\t', index=False) + - # Replace unstranded if needed. +def rbm(insertions, gtf_path, window_sizes, logger, + closest=False, verbose=False): + + # Replace unstranded insertions with two stranded insertions. if (~insertions['strand'].isin({-1, 1})).any(): logger.warning('Replacing unstranded insertions') converted = replace_unstranded(insertions) @@ -61,35 +77,30 @@ def main(args): # Build annotation trees. logger.info('Building interval trees') - gtf = GtfFile(args.gtf) + gtf = GtfFile(gtf_path) trees = build_interval_trees(gtf) # Define windows. - if args.preset is not None: - window_sizes = WINDOW_PRESETS[args.preset] - else: - window_sizes = args.window_sizes - windows = build_windows(window_sizes) # Annotate insertions. logger.info('Annotating insertions') annotation = annotate_for_windows( - converted, trees, windows, progress=True) + converted, trees, windows, progress=verbose) - if args.closest: - import pdb; pdb.set_trace() + if closest: logger.info('Reducing to closest features') annotation = select_closest(annotation, col='gene_distance') # Merge annotation with insertion frame. 
logger.info('Merging annotation') merged = pd.merge(insertions, annotation, on='id', how='left') - merged.to_csv(args.output, sep='\t', index=False) + + return merged -def build_windows(ranges): - us, ua, ds, da = ranges +def build_windows(window_sizes): + us, ua, ds, da = window_sizes windows = [ Window(0, 1, strand=1, incl_left=True, incl_right=True, name='is'), @@ -97,8 +108,7 @@ def build_windows(ranges): Window(-us, 0, strand=1, incl_left=True, incl_right=False, name='us'), Window(-ua, 0, strand=-1, incl_left=True, incl_right=False, name='ua'), Window(1, ds, strand=1, incl_left=False, incl_right=True, name='ds'), - Window(1, da, strand=-1, incl_left=False, incl_right=True, name='da') - ] + Window(1, da, strand=-1, incl_left=False, incl_right=True, name='da')] return windows diff --git a/pyim/annotation/rbm_cis.py b/pyim/annotation/rbm_cis.py new file mode 100644 index 0000000..2f9c775 --- /dev/null +++ b/pyim/annotation/rbm_cis.py @@ -0,0 +1,70 @@ +from __future__ import absolute_import, division, print_function + +#pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin +from builtins import * +#pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin + +import logging +from os import path + +import pandas as pd + +#pylint: disable=import-error +from .rbm import rbm, WINDOW_PRESETS +#pylint: enable=import-error + + +def register(subparsers, name='rbm-cis'): + parser = subparsers.add_parser(name, help=name + ' help') + + # Required arguments. + parser.add_argument('input') + parser.add_argument('output') + parser.add_argument('--gtf', required=True) + parser.add_argument('--cis_sites', required=True) + + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('--preset', choices=WINDOW_PRESETS.keys()) + group.add_argument('--window_sizes', nargs=4, type=int) + + # Optional arguments. + parser.add_argument('--closest', default=False, action='store_true') + + # Set main for dispatch. + parser.set_defaults(main=main) + + return parser + + +def main(args): + logger = logging.getLogger() + + # Read insertions. + insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) + logger.info('Read %d insertions', len(insertions)) + + # Read cis sites. + cis_sites = pd.read_csv(args.cis_sites, sep='\t', dtype={'chrom': str}) + logger.info('Read %d cis sites', len(cis_sites)) + + # Define windows. + if args.window_sizes is not None: + window_sizes = args.window_sizes + else: + window_sizes = WINDOW_PRESETS[args.preset] + + # Annotate cis sites. + annotated_sites = rbm(cis_sites, args.gtf, window_sizes, logger, + closest=args.closest, verbose=True) + + # Extract and merge annotation with insertions. + annotation = annotated_sites[['id', 'gene_id', 'gene_name', + 'gene_distance', 'window']] + annotation = annotation.rename(columns={'id': 'cis_id'}) + + annotated_ins = pd.merge(insertions, annotation, on='cis_id', how='left') + + # Write outputs. + annotated_ins.to_csv(args.output, sep='\t', index=False) + annotated_sites.to_csv(path.splitext(args.output)[0] + '.sites.txt', + sep='\t', index=False) diff --git a/pyim/main/annotate.py b/pyim/main/annotate.py index 665a82f..af57507 100644 --- a/pyim/main/annotate.py +++ b/pyim/main/annotate.py @@ -4,7 +4,7 @@ import argparse import logging -from pyim.annotation import window, rbm, kcrbm +from pyim.annotation import window, rbm, kcrbm, rbm_cis from ._logging import print_header, print_footer @@ -19,6 +19,7 @@ def main(): # Register pipelines. 
window.register(subparsers) rbm.register(subparsers) + rbm_cis.register(subparsers) kcrbm.register(subparsers) # Parse args. From 4ae4dd07f659ef9d34997d856c49a6bf754e5a19 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Mar 2016 17:30:54 +0100 Subject: [PATCH 069/100] Added interval tree dependency. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 673f838..cbe4760 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ install_requires = ['future', 'numpy', 'scipy', 'pandas', 'pysam', - 'rpy2', 'scikit-bio', 'toolz', 'tqdm'] + 'rpy2', 'scikit-bio', 'toolz', 'tqdm', 'intervaltree'] if not sys.version_info >= (3, ): install_requires += ['pathlib'] From 6bff94ee589fc6ad11080805f80e9f8691bd896a Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Mar 2016 17:31:55 +0100 Subject: [PATCH 070/100] Restructured annotation, added blacklist support. --- pyim/annotation/_model.py | 43 ----- pyim/annotation/_util.py | 33 ---- pyim/annotation/annotator/__init__.py | 3 + pyim/annotation/{ => annotator}/kcrbm.py | 12 +- pyim/annotation/annotator/rbm.py | 163 ++++++++++++++++ pyim/annotation/annotator/rbm_cis.py | 159 +++++++++++++++ pyim/annotation/annotator/window.py | 236 +++++++++++++++++++++++ pyim/annotation/filtering.py | 45 +++++ pyim/annotation/metadata.py | 104 ++++++++++ pyim/annotation/rbm.py | 139 ------------- pyim/annotation/rbm_cis.py | 70 ------- pyim/annotation/util.py | 34 ++++ pyim/annotation/window.py | 168 ---------------- pyim/main/annotate.py | 14 +- 14 files changed, 760 insertions(+), 463 deletions(-) delete mode 100644 pyim/annotation/_model.py delete mode 100644 pyim/annotation/_util.py create mode 100644 pyim/annotation/annotator/__init__.py rename pyim/annotation/{ => annotator}/kcrbm.py (93%) create mode 100644 pyim/annotation/annotator/rbm.py create mode 100644 pyim/annotation/annotator/rbm_cis.py create mode 100644 pyim/annotation/annotator/window.py create mode 100644 pyim/annotation/filtering.py create mode 100644 pyim/annotation/metadata.py delete mode 100644 pyim/annotation/rbm.py delete mode 100644 pyim/annotation/rbm_cis.py create mode 100644 pyim/annotation/util.py delete mode 100644 pyim/annotation/window.py diff --git a/pyim/annotation/_model.py b/pyim/annotation/_model.py deleted file mode 100644 index 021c2d5..0000000 --- a/pyim/annotation/_model.py +++ /dev/null @@ -1,43 +0,0 @@ - -class Window(object): - - def __init__(self, start, end, reference=None, strand=None, - incl_left=True, incl_right=True, name=None): - self.reference = reference - self.start = start - self.end = end - self.strand = strand - - self.incl_left = incl_left - self.incl_right = incl_right - - self.name = name - - def apply(self, reference, location, strand): - """Applies window to specific location and strand""" - - # Determine start/end position. - if strand == 1: - start = location + self.start - end = location + self.end - - incl_left = self.incl_left - incl_right = self.incl_right - elif strand == -1: - start = location - self.end - end = location - self.start - - incl_right = self.incl_left - incl_left = self.incl_right - else: - raise ValueError('Unknown value for strand ({})' - .format(strand)) - - # Determine new strand. 
- if self.strand is not None: - new_strand = self.strand * strand - else: - new_strand = None - - return Window(start, end, reference, new_strand, - incl_left, incl_right, name=self.name) diff --git a/pyim/annotation/_util.py b/pyim/annotation/_util.py deleted file mode 100644 index 14cefd6..0000000 --- a/pyim/annotation/_util.py +++ /dev/null @@ -1,33 +0,0 @@ - -def select_closest(frame, id_col='id', col='distance'): - def _is_closest(x): - abs_dist = x[col].abs() - return x.ix[abs_dist == abs_dist.min()] - - return (frame.groupby(id_col) - .apply(_is_closest) - .reset_index(drop=True)) - - -def feature_distance(feature, location, stranded=True): - start, end = feature['start'], feature['end'] - - if start <= location <= end: - dist = 0 - elif location > end: - dist = location - end - else: - dist = location - start - - if stranded: - dist *= numeric_strand(feature['strand']) - - return dist - - -def numeric_strand(strand): - """Convert strand to numeric representation.""" - - return 1 if strand == '+' else -1 - - diff --git a/pyim/annotation/annotator/__init__.py b/pyim/annotation/annotator/__init__.py new file mode 100644 index 0000000..1cb1e9e --- /dev/null +++ b/pyim/annotation/annotator/__init__.py @@ -0,0 +1,3 @@ +from .rbm import annotate_rbm +from .rbm_cis import annotate_rbm_cis +from .window import annotate_windows, Window diff --git a/pyim/annotation/kcrbm.py b/pyim/annotation/annotator/kcrbm.py similarity index 93% rename from pyim/annotation/kcrbm.py rename to pyim/annotation/annotator/kcrbm.py index 2b666da..fda503f 100644 --- a/pyim/annotation/kcrbm.py +++ b/pyim/annotation/annotator/kcrbm.py @@ -1,8 +1,8 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) +from __future__ import absolute_import, division, print_function + +#pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin +from builtins import * +#pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin import logging from itertools import chain, repeat @@ -13,7 +13,7 @@ from pyim.util.rpy2 import dataframe_to_pandas -from ._util import select_closest +from ..filtering import select_closest CHROM_MAP = dict(zip( diff --git a/pyim/annotation/annotator/rbm.py b/pyim/annotation/annotator/rbm.py new file mode 100644 index 0000000..18b0505 --- /dev/null +++ b/pyim/annotation/annotator/rbm.py @@ -0,0 +1,163 @@ +from __future__ import absolute_import, division, print_function + +#pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin +from builtins import * +#pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin + +import itertools +import logging + +import pandas as pd + +#pylint: disable=import-error +from ..metadata import add_metadata +from ..filtering import filter_blacklist, select_closest +from .window import Window, annotate_windows +#pylint: enable=import-error + +# Window format: (us, ua, ds, da) +WINDOW_PRESETS = { + 'SB': (20000, 10000, 25000, 5000), + 'MULV': (20000, 120000, 40000, 5000), + 'MMTV': (20000, 120000, 40000, 5000) +} + + +def annotate_rbm(insertions, gtf, window_preset=None, window_sizes=None): + """Assigns insertions to genes using the rule-based-method (RBM) approach. + + Args: + insertions (pandas.DataFrame): Insertions to annotate in DataFrame + format. The frame is expected to contain at least the + following columns: id, position, strand. 
+ gtf (str or GtfFile): Path to gtf file containing gene features. + Alternatively, a GtfFile object may also be given instead of a path. + window_preset (str): Preset to use for the RBM window sizes. + Alternatively custom window sizes can be given using the + *window_sizes* argument. Note that either *window_preset* or + *window_sizes* must be provided. + window_sizes (tuple[int]): Tuple of window sizes to use in the + RBM mapping. Should specify four window sizes, for the following + categories of insertions: upstream-sense, upstream-antisense, + downstream-sense, downstream-antisense. + + Returns: + pandas.DataFrame: Dataframe containing annotated insertions. Annotations + are added as columns 'gene_id' and 'gene_name', which respectively contain the id and name of the annotated gene. An extra column + 'window' indicates which of the RBM windows was used for + the annotation. + + """ + + # Lookup windows. + if window_preset is not None: + window_sizes = WINDOW_PRESETS[window_preset] + elif window_sizes is None: + raise ValueError('Either window_sizes or window_preset must be given') + + # Replace unstranded insertions with two stranded insertions. + if (~insertions['strand'].isin({-1, 1})).any(): + logging.warning('Replacing unstranded insertions') + converted = _replace_unstranded(insertions) + else: + converted = insertions + + # Define windows. + windows = _build_windows(window_sizes) + + # Annotate insertions. + annotated = annotate_windows(converted, gtf, windows) + + return annotated + + +def _build_windows(window_sizes): + us, ua, ds, da = window_sizes + + windows = [ + Window(0, 1, strand=1, incl_left=True, incl_right=True, name='is'), + Window(0, 1, strand=-1, incl_left=True, incl_right=True, name='ia'), + Window(-us, 0, strand=1, incl_left=True, incl_right=False, name='us'), + Window(-ua, 0, strand=-1, incl_left=True, incl_right=False, name='ua'), + Window(1, ds, strand=1, incl_left=False, incl_right=True, name='ds'), + Window(1, da, strand=-1, incl_left=False, incl_right=True, name='da')] + + return windows + + +def _replace_unstranded(insertions): + """Replaces unstranded insertions with two stranded insertions.""" + + # Split stranded and unstranded. + mask = insertions['strand'].isin({-1, 1}) + stranded = insertions.ix[mask] + unstranded = insertions.ix[~mask] + + # Convert unstranded into two stranded. + converted = (_to_stranded(ins) for _, ins in unstranded.iterrows()) + converted = pd.DataFrame.from_records( + itertools.chain.from_iterable(converted)) + + return pd.concat((stranded, converted), ignore_index=True) + + +def _to_stranded(insertion): + fwd = insertion.copy() + fwd['strand'] = 1 + + rev = insertion.copy() + rev['strand'] = -1 + + return (fwd, rev) + + +def register(subparsers, name='rbm'): + """Registers the RBM annotator as a subparser.""" + + parser = subparsers.add_parser(name, help=name + ' help') + + # Required arguments. + parser.add_argument('input') + parser.add_argument('output') + parser.add_argument('--gtf', required=True) + + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('--preset', choices=WINDOW_PRESETS.keys()) + group.add_argument('--window_sizes', nargs=4, type=int) + + # Optional arguments. + parser.add_argument('--closest', default=False, action='store_true') + parser.add_argument('--blacklist', default=None, nargs='+') + + # Set main for dispatch. + parser.set_defaults(main=main) + + return parser + + +def main(args): + """Main function for the RBM annotator command-line tool.""" + + # Read insertions. 
+ insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) + logging.info('Read %d insertions', len(insertions)) + + # Annotate insertions. + logging.info('Annotating insertions') + annotated = annotate_rbm(insertions, args.gtf, window_preset=args.preset, + window_sizes=args.window_sizes) + + # Add metadata. + logging.info('Adding annotation metadata') + annotated = add_metadata(annotated, args.gtf) + + if args.blacklist is not None: + logging.info('Filtering blacklisted genes') + annotated = filter_blacklist(annotated, args.blacklist) + + if args.closest: + logging.info('Selecting closest genes') + annotated = select_closest(annotated) + + + annotated.to_csv(args.output, sep='\t', index=False) diff --git a/pyim/annotation/annotator/rbm_cis.py b/pyim/annotation/annotator/rbm_cis.py new file mode 100644 index 0000000..f2c50a9 --- /dev/null +++ b/pyim/annotation/annotator/rbm_cis.py @@ -0,0 +1,159 @@ +from __future__ import absolute_import, division, print_function + +#pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin +from builtins import * +#pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin + +import logging +from os import path + +import numpy as np +import pandas as pd + +#pylint: disable=import-error +from .rbm import annotate_rbm, WINDOW_PRESETS as RBM_WINDOW_PRESETS +from ..metadata import add_metadata +from ..filtering import filter_blacklist, select_closest +#pylint: enable=import-error + + +def annotate_rbm_cis(insertions, cis_sites, gtf, window_preset=None, + window_sizes=None, collapse=False): + """Assigns insertions to genes using the RBM approach via called CIS sites. + + Args: + insertions (pandas.DataFrame): Insertions to annotate in DataFrame + format. The frame is expected to contain at least the + following columns: id, position, strand. + cis_sites(pandas.DataFrame): Dataframe containing the CIS sites + for the given insertions. + gtf (str or GtfFile): Path to gtf file containing gene features. + Alternatively, a GtfFile object may also be given instead of a path. + window_preset (str): Preset to use for the RBM window sizes. + Alternatively custom window sizes can be given using the + *window_sizes* argument. Note that either *window_preset* or + *window_sizes* must be provided. + window_sizes (tuple[int]): Tuple of window sizes to use in the + RBM mapping. Should specify four window sizes, for the following + categories of insertions: upstream-sense, upstream-antisense, + downstream-sense, downstream-antisense. + + Returns: + tuple[pandas.DataFrame]: Returns two dataframes, the first + containing the annotated insertion sites, the second containing + the annotated CIS sites, which were used to annotate the insertions. + Annotations are added as columns 'gene_id' and 'gene_name', which + respectively contain the id and name of the annotated gene. An + extra column 'window' indicates which of the RBM windows was + used for the annotation. + + """ + + if 'strand' not in cis_sites: + # Add strand to cis sites if not present. + cis_sites = _determine_cis_strand(cis_sites, insertions) + + # Annotate cis sites. + cis_sites = cis_sites.rename(columns={'cis_id': 'id'}) + annotated_sites = annotate_rbm(cis_sites, gtf, + window_preset=window_preset, + window_sizes=window_sizes) + + # Extract and merge annotation with insertions. 
+ annotation = annotated_sites[['id', 'gene_id', 'gene_name']] + annotation = annotation.rename(columns={'id': 'cis_id'}) + annotated_ins = pd.merge(insertions, annotation, on='cis_id', how='left') + + if collapse: + # Collapse multiple insertion entries resulting from CIS annotation. + annotated_ins.drop(['cis_id'], axis=1, inplace=True) + annotated_ins.drop_duplicates(inplace=True) + + return annotated_ins, annotated_sites + + +def _determine_cis_strand(cis, cis_insertions, min_homogeneity=0.5): + """Determines the strand for CIS sites with homogeneous insertions.""" + + # Extract and clip strands at zero. + ins_strands = cis_insertions[['cis_id', 'strand']].copy() + ins_strands['strand'] = ins_strands['strand'].map({1: 1, -1: 0}) + + # Calculate fwd/rev ratio for each cis. + ratio = ins_strands.groupby('cis_id')['strand'].mean() + + # Determine closest strand and homogeneity. + cis_strands = pd.DataFrame( + {'strand': ratio.round().astype(int).map({1: 1, 0: -1}), + 'strand_homogeneity': np.maximum((1 - ratio), ratio)}, + columns=['strand', 'strand_homogeneity']) + + # Don't assign strand if low homogeneity. + homogeneity_mask = cis_strands['strand_homogeneity'] < min_homogeneity + cis_strands.ix[homogeneity_mask, 'strand'] = None + + return pd.merge(cis, cis_strands.reset_index()) + + +def register(subparsers, name='rbm-cis'): + """Registers the RBM-CIS annotator as a subparser.""" + + parser = subparsers.add_parser(name, help=name + ' help') + + # Required arguments. + parser.add_argument('input') + parser.add_argument('output') + parser.add_argument('--gtf', required=True) + parser.add_argument('--cis_sites', required=True) + + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('--preset', choices=RBM_WINDOW_PRESETS.keys()) + group.add_argument('--window_sizes', nargs=4, type=int) + + # Optional arguments. + parser.add_argument('--closest', default=False, action='store_true') + parser.add_argument('--collapse', default=False, action='store_true') + parser.add_argument('--blacklist', default=None, nargs='+') + + # Set main for dispatch. + parser.set_defaults(main=main) + + return parser + + +def main(args): + """Main function for the RBM-CIS annotator command-line tool.""" + + # Read insertions and cis sites. + insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) + cis_sites = pd.read_csv(args.cis_sites, sep='\t', dtype={'chrom': str}) + + logging.info('Read %d insertions and %d cis sites', + len(insertions), len(cis_sites)) + + # Annotate insertions. + logging.info('Annotating insertions') + + annotated_ins, annotated_sites = annotate_rbm_cis( + insertions, cis_sites, args.gtf, window_preset=args.preset, + window_sizes=args.window_sizes, collapse=args.collapse) + + # Add metadata to annotated insertions. + logging.info('Adding annotation metadata') + annotated_ins = add_metadata(annotated_ins, args.gtf) + + if args.blacklist is not None: + logging.info('Filtering blacklisted genes') + annotated_ins = filter_blacklist(annotated_ins, args.blacklist) + annotated_sites = filter_blacklist(annotated_sites, args.blacklist) + + if args.closest: + logging.info('Selecting closest insertions') + annotated_ins = select_closest(annotated_ins) + annotated_sites = add_metadata(annotated_sites, args.gtf) + annotated_sites = select_closest(annotated_sites) + + # Write outputs. 
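+    # Annotated insertions go to the given output path; the annotated CIS
+    # sites are written alongside, using the same base name with a
+    # '.sites.txt' suffix.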
+ annotated_ins.to_csv(args.output, sep='\t', index=False) + annotated_sites.to_csv(path.splitext(args.output)[0] + '.sites.txt', + sep='\t', index=False) diff --git a/pyim/annotation/annotator/window.py b/pyim/annotation/annotator/window.py new file mode 100644 index 0000000..3395078 --- /dev/null +++ b/pyim/annotation/annotator/window.py @@ -0,0 +1,236 @@ +from __future__ import absolute_import, division, print_function + +#pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin +from builtins import * +#pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin + +import itertools +import logging + +import pandas as pd +from intervaltree import IntervalTree +from tqdm import tqdm + +from pyim.util.tabix import GtfFile + +# pylint: disable=import-error +from ..filtering import filter_blacklist, select_closest +from ..util import build_interval_trees, numeric_strand +# pylint: enable=import-error + +def annotate_windows(insertions, gtf, windows): + """Assigns insertions to genes that fall within the given windows. + + Args: + insertions (pandas.DataFrame): Insertions to annotate in DataFrame + format. The frame is expected to contain at least the + following columns: id, position, strand. + gtf (str or GtfFile): Path to gtf file containing gene features. + Alternatively, a GtfFile object may also be given instead of a path. + windows (list[Window]): List of windows to inspect for genes. + + Returns: + pandas.DataFrame: Dataframe containing annotated insertions. Annotations + are added as columns 'gene_id' and 'gene_name', which respectively contain the id and name of the annotated gene. An extra column + 'window' indicates which of the RBM windows was used for + the annotation. + + """ + + if isinstance(gtf, str): + gtf = GtfFile(gtf) + + # Build lookup trees. + trees = build_interval_trees(gtf) + + # Generate queries (insertion/window combinations). + ins_gen = (row for _, row in insertions.iterrows()) + queries = itertools.product(ins_gen, windows) + + queries = tqdm(queries, unit='query', + total=len(insertions) * len(windows)) + + # Generate annotation for queries and merge into frame. + annotations = (_annotate_window(ins, window, trees) + for ins, window in queries) + annotation = pd.concat(annotations, ignore_index=True) + + # Merge annotation with insertions. + annotated = pd.merge(insertions, annotation, on='id', how='left') + + return annotated + + +class Window(object): + """Class representing a (relative) window to inspect for genes. + + The window may be an actual window corresponding to a real chromosome + location, in which case start and end represent the actual window + boundaries, and reference and strand represent the actual chromosome + and strand of the window. + + Alternatively, the window may also represent a relative window. In this + case start is typically negative and end is typically positive, whilst + reference is typically omitted and strand is optional. This relative window + can be applied to an actual position using the apply method, which + effectively calculates the given window around that position. + + Args: + start (int): Start of the window. + end (int): End of the window. + reference (str): Chromosome of the window (optional). + strand (int): Relative strand of window (optional). + incl_left (bool): Whether to include partially (left) + overlapping features. + incl_right (bool): Whether to include partially (right) + overlapping features. 
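+        name (str): Optional name for the window. When given, annotations
+            produced for this window are labelled with this name in the
+            'window' column of the result.
+
+    Example:
+        As a hypothetical illustration, a 2 kb window around a position on
+        the same strand could be constructed and applied as follows::
+
+            window = Window(start=-1000, end=1000, strand=1)
+            applied = window.apply('1', 30000, 1)  # spans 29000-31000 on 1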
+
+    """
+
+    def __init__(self, start, end, reference=None, strand=None,
+                 incl_left=True, incl_right=True, name=None):
+        self.reference = reference
+        self.start = start
+        self.end = end
+        self.strand = strand
+
+        self.incl_left = incl_left
+        self.incl_right = incl_right
+
+        self.name = name
+
+    def apply(self, reference, location, strand):
+        """Applies a relative window to a specific location and strand.
+
+        For example, a relative window of Window(start=-1000, end=1000,
+        strand=-1) applied to position (2, 3000, -1) will become
+        Window(ref=2, start=2000, end=4000, strand=1).
+
+        Args:
+            reference (str): Chromosome name of the reference position.
+            location (int): Reference genomic position.
+            strand (int): Reference genomic strand.
+
+        """
+
+        # Determine start/end position.
+        if strand == 1:
+            start = location + self.start
+            end = location + self.end
+
+            incl_left = self.incl_left
+            incl_right = self.incl_right
+        elif strand == -1:
+            start = location - self.end
+            end = location - self.start
+
+            incl_right = self.incl_left
+            incl_left = self.incl_right
+        else:
+            raise ValueError('Unknown value for strand ({})'
+                             .format(strand))
+
+        # Determine new strand.
+        if self.strand is not None:
+            new_strand = self.strand * strand
+        else:
+            new_strand = None
+
+        return Window(start, end, reference, new_strand,
+                      incl_left, incl_right, name=self.name)
+
+
+def _annotate_window(insertion, window, feature_trees):
+    """Annotates insertion for features in trees using given window."""
+
+    # Apply window for insertion.
+    applied_window = window.apply(
+        insertion['chrom'], insertion['position'], insertion['strand'])
+
+    # Fetch features within window.
+    features = _fetch_in_window(feature_trees, applied_window)
+
+    # Extract feature values.
+    frame = pd.DataFrame.from_records(
+        ({'id': insertion['id'],
+          'gene_id': feature['gene_id'],
+          'gene_name': feature['gene_name']}
+         for feature in features))
+
+    # Include window name if known.
+    if window.name is not None:
+        frame['window'] = window.name
+
+    return frame
+
+
+def _fetch_in_window(trees, window):
+    """Fetches features within given window in the interval trees."""
+
+    # Find overlapping features.
+    try:
+        tree = trees[window.reference]
+        overlap = tree[window.start:window.end]
+    except KeyError:
+        overlap = []
+
+    # Extract features.
+    features = (interval[2] for interval in overlap)
+
+    # Filter inclusive/exclusive if needed.
+    if not window.incl_left:
+        features = (f for f in features if f['start'] > window.start)
+
+    if not window.incl_right:
+        features = (f for f in features if f['end'] < window.end)
+
+    # Filter for strand if needed.
+    if window.strand is not None:
+        features = (f for f in features
+                    if numeric_strand(f['strand']) == window.strand)
+
+    return list(features)
+
+
+def register(subparsers, name='window'):
+    parser = subparsers.add_parser(name, help=name + ' help')
+
+    # Required arguments.
+    parser.add_argument('input')
+    parser.add_argument('output')
+    parser.add_argument('--gtf', required=True)
+
+    # Optional arguments.
+    parser.add_argument('--closest', default=False, action='store_true')
+    parser.add_argument('--window_size', default=20000, type=int)
+    parser.add_argument('--blacklist', default=None, nargs='+')
+
+    # Set main for dispatch.
+    parser.set_defaults(main=main)
+
+    return parser
+
+
+def main(args):
+    # Read annotation.
+    insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str})
+    logging.info('Read %d insertions', len(insertions))
+
+    # Define windows.
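+    # A single, unstranded window of --window_size bp is centered on each
+    # insertion (half of the window upstream, half downstream).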
+    logging.info('Annotating insertions')
+    half_size = args.window_size // 2
+    window = Window(start=-half_size, end=half_size)
+
+    # Annotate insertions.
+    annotated = annotate_windows(insertions, args.gtf, [window])
+
+    if args.blacklist is not None:
+        logging.info('Filtering blacklisted genes')
+        annotated = filter_blacklist(annotated, args.blacklist)
+
+    if args.closest:
+        logging.info('Selecting closest genes')
+        annotated = select_closest(annotated)
+
+    # Write output.
+    annotated.to_csv(args.output, sep='\t', index=False)
diff --git a/pyim/annotation/filtering.py b/pyim/annotation/filtering.py
new file mode 100644
index 0000000..7788ba7
--- /dev/null
+++ b/pyim/annotation/filtering.py
@@ -0,0 +1,45 @@
+
+def select_closest(insertions, id_col='id', dist_col='distance'):
+    """Selects genes that are closest to the annotated insertions.
+
+    Args:
+        insertions (pandas.DataFrame): Annotated insertions that are to
+            be filtered. The frame is expected to contain at least the
+            following columns: id, position, strand, *dist_col*.
+        id_col (str): Name of the column containing the id of the insertion.
+        dist_col (str): Name of the column containing the distance to
+            the gene or feature. Can be added using the add_metadata function.
+
+    Returns:
+        pandas.DataFrame: Filtered annotated insertions, which have been
+            reduced to only include the genes closest to the insertions.
+
+    """
+
+    def _is_closest(x):
+        abs_dist = x[col].abs()
+        return x.ix[abs_dist == abs_dist.min()]
+
+    return (insertions.groupby(id_col)
+                      .apply(_is_closest)
+                      .reset_index(drop=True))
+
+
+def filter_blacklist(insertions, blacklist, gene_col='gene_name'):
+    """Filters annotations that assign insertions to blacklisted genes.
+
+    Args:
+        insertions (pandas.DataFrame): Annotated insertions that are to
+            be filtered. The frame is expected to contain at least the
+            following columns: id, position, strand, *gene_col*.
+        blacklist (list[str]): List of blacklisted genes to filter, matched
+            against the values in *gene_col*.
+        gene_col (str): Name of the column containing the gene names.
+
+    Returns:
+        pandas.DataFrame: Filtered annotated insertions, which have been
+            reduced to remove blacklisted genes.
+
+    """
+
+    mask = insertions[gene_col].isin(set(blacklist))
+    return insertions.ix[~mask]
diff --git a/pyim/annotation/metadata.py b/pyim/annotation/metadata.py
new file mode 100644
index 0000000..63fac47
--- /dev/null
+++ b/pyim/annotation/metadata.py
@@ -0,0 +1,104 @@
+import pandas as pd
+from pyim.util.tabix import GtfFile, GtfFrame
+
+from .util import numeric_strand
+
+
+def add_metadata(insertions, gtf):
+    """Adds metadata to annotated insertions.
+
+    Adds extra metadata to already annotated insertions. This metadata
+    currently includes the following information: distance to the gene
+    ('distance' column) and relative orientation ('orientation' column).
+
+    Args:
+        insertions (pandas.DataFrame): Annotated insertions for which metadata
+            should be added. The frame is expected to contain at least the
+            following columns: id, position, strand, gene_id.
+        gtf (str or GtfFile): Path to gtf file containing gene features.
+            Alternatively, a GtfFile object may also be given instead of a path.
+            Used to look up the genes assigned to the insertions.
+
+    Returns:
+        pandas.DataFrame: Annotated insertions with extra metadata.
+
+    """
+
+    if isinstance(gtf, str):
+        gtf = GtfFile(gtf)
+
+    # Look-up genes in GTF file.
+    genes = GtfFrame.from_records(gtf.fetch(filters={'feature': 'gene'}))
+    genes.set_index('gene_id', drop=False, inplace=True)
+
+    # Generate metadata.
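+    # For each insertion with a known gene assignment, the distance to the
+    # assigned gene and their relative (sense/antisense) orientation are
+    # computed; insertions whose gene_id is missing from the gtf are skipped
+    # and end up without metadata after the merge below.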
+ metadata = pd.DataFrame.from_records( + (_annotate_insertion(ins, genes.ix[ins['gene_id']]) + for _, ins in insertions.iterrows() + if ins['gene_id'] in genes.index)) + + # Re-order columns. + extra_cols = set(metadata.columns) - {'id', 'gene_id'} + metadata = metadata[['id', 'gene_id'] + sorted(extra_cols)] + + return pd.merge(insertions, metadata, on=['id', 'gene_id'], how='left') + + +def _annotate_insertion(insertion, feature): + """Annotates a given insertion/feature combination.""" + + return { + 'id': insertion['id'], + 'gene_id': feature['gene_id'], + 'distance': feature_distance(insertion, feature), + 'orientation': feature_orientation(insertion, feature) + } + + +def feature_distance(insertion, feature): + """Calculates the genomic distance between an insertion and a feature. + + Args: + insertion (pandas.Series): Insertion of interest. Assumed to have + 'position' and 'strand' values. + feature (pandas.Series): Feature of interest. Assumed to have + 'start', 'end' and 'strand' values. + + Returns: + int: Distance between insertion and feature. + + """ + + feat_start, feat_end = feature['start'], feature['end'] + ins_location = insertion['position'] + + if feat_start <= ins_location <= feat_end: + dist = 0 + elif ins_location > feat_end: + dist = ins_location - feat_end + else: + dist = ins_location - feat_start + + dist *= numeric_strand(feature['strand']) + + return dist + +def feature_orientation(insertion, feature): + """Determines the relative orientation of an insertion and a feature. + + Args: + insertion (pandas.Series): Insertion of interest. Assumed to have + a 'strand' value. + feature (pandas.Series): Feature of interest. Assumed to have + a 'strand' value. + + Returns: + str: Returns 'sense' if features have the same orientation (i.e. are + on the same strand), or 'antisense' if this is not the case. + + """ + + ins_strand = numeric_strand(insertion['strand']) + feat_strand = numeric_strand(feature['strand']) + + return 'sense' if ins_strand == feat_strand else 'antisense' diff --git a/pyim/annotation/rbm.py b/pyim/annotation/rbm.py deleted file mode 100644 index ee0bbd1..0000000 --- a/pyim/annotation/rbm.py +++ /dev/null @@ -1,139 +0,0 @@ -from __future__ import absolute_import, division, print_function - -#pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin -from builtins import * -#pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin - -import itertools -import logging - -import pandas as pd - -from pyim.util.tabix import GtfFile - -#pylint: disable=import-error -from ._model import Window -from ._util import select_closest -from .window import build_interval_trees, annotate_for_windows -#pylint: enable=import-error - -# Window format: (us, ua, ds, da) -WINDOW_PRESETS = { - 'SB': (20000, 10000, 25000, 5000), - 'MULV': (20000, 120000, 40000, 5000), - 'MMTV': (20000, 120000, 40000, 5000) -} - - -def register(subparsers, name='rbm'): - parser = subparsers.add_parser(name, help=name + ' help') - - # Required arguments. - parser.add_argument('input') - parser.add_argument('output') - parser.add_argument('--gtf', required=True) - - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('--preset', choices=WINDOW_PRESETS.keys()) - group.add_argument('--window_sizes', nargs=4, type=int) - - # Optional arguments. - parser.add_argument('--closest', default=False, action='store_true') - - # Set main for dispatch. 
- parser.set_defaults(main=main) - - return parser - - -def main(args): - logger = logging.getLogger() - - # Read insertions. - insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) - logger.info('Read %d insertions', len(insertions)) - - # Define windows. - if args.window_sizes is not None: - window_sizes = args.window_sizes - else: - window_sizes = WINDOW_PRESETS[args.preset] - - # Annotate insertions. - annotated = rbm(insertions, args.gtf, window_sizes, logger, - closest=args.closest, verbose=True) - annotated.to_csv(args.output, sep='\t', index=False) - - -def rbm(insertions, gtf_path, window_sizes, logger, - closest=False, verbose=False): - - # Replace unstranded insertions with two stranded insertions. - if (~insertions['strand'].isin({-1, 1})).any(): - logger.warning('Replacing unstranded insertions') - converted = replace_unstranded(insertions) - else: - converted = insertions - - # Build annotation trees. - logger.info('Building interval trees') - gtf = GtfFile(gtf_path) - trees = build_interval_trees(gtf) - - # Define windows. - windows = build_windows(window_sizes) - - # Annotate insertions. - logger.info('Annotating insertions') - annotation = annotate_for_windows( - converted, trees, windows, progress=verbose) - - if closest: - logger.info('Reducing to closest features') - annotation = select_closest(annotation, col='gene_distance') - - # Merge annotation with insertion frame. - logger.info('Merging annotation') - merged = pd.merge(insertions, annotation, on='id', how='left') - - return merged - - -def build_windows(window_sizes): - us, ua, ds, da = window_sizes - - windows = [ - Window(0, 1, strand=1, incl_left=True, incl_right=True, name='is'), - Window(0, 1, strand=-1, incl_left=True, incl_right=True, name='ia'), - Window(-us, 0, strand=1, incl_left=True, incl_right=False, name='us'), - Window(-ua, 0, strand=-1, incl_left=True, incl_right=False, name='ua'), - Window(1, ds, strand=1, incl_left=False, incl_right=True, name='ds'), - Window(1, da, strand=-1, incl_left=False, incl_right=True, name='da')] - - return windows - - -def replace_unstranded(insertions): - """Replaces unstranded insertions with two stranded insertions.""" - - # Split stranded and unstranded. - mask = insertions['strand'].isin({-1, 1}) - stranded = insertions.ix[mask] - unstranded = insertions.ix[~mask] - - # Convert unstranded into two stranded. - converted = (_to_stranded(ins) for _, ins in unstranded.iterrows()) - converted = pd.DataFrame.from_records( - itertools.chain.from_iterable(converted)) - - return pd.concat((stranded, converted), ignore_index=True) - - -def _to_stranded(insertion): - fwd = insertion.copy() - fwd['strand'] = 1 - - rev = insertion.copy() - rev['strand'] = -1 - - return [fwd, rev] diff --git a/pyim/annotation/rbm_cis.py b/pyim/annotation/rbm_cis.py deleted file mode 100644 index 2f9c775..0000000 --- a/pyim/annotation/rbm_cis.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import absolute_import, division, print_function - -#pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin -from builtins import * -#pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin - -import logging -from os import path - -import pandas as pd - -#pylint: disable=import-error -from .rbm import rbm, WINDOW_PRESETS -#pylint: enable=import-error - - -def register(subparsers, name='rbm-cis'): - parser = subparsers.add_parser(name, help=name + ' help') - - # Required arguments. 
- parser.add_argument('input') - parser.add_argument('output') - parser.add_argument('--gtf', required=True) - parser.add_argument('--cis_sites', required=True) - - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('--preset', choices=WINDOW_PRESETS.keys()) - group.add_argument('--window_sizes', nargs=4, type=int) - - # Optional arguments. - parser.add_argument('--closest', default=False, action='store_true') - - # Set main for dispatch. - parser.set_defaults(main=main) - - return parser - - -def main(args): - logger = logging.getLogger() - - # Read insertions. - insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) - logger.info('Read %d insertions', len(insertions)) - - # Read cis sites. - cis_sites = pd.read_csv(args.cis_sites, sep='\t', dtype={'chrom': str}) - logger.info('Read %d cis sites', len(cis_sites)) - - # Define windows. - if args.window_sizes is not None: - window_sizes = args.window_sizes - else: - window_sizes = WINDOW_PRESETS[args.preset] - - # Annotate cis sites. - annotated_sites = rbm(cis_sites, args.gtf, window_sizes, logger, - closest=args.closest, verbose=True) - - # Extract and merge annotation with insertions. - annotation = annotated_sites[['id', 'gene_id', 'gene_name', - 'gene_distance', 'window']] - annotation = annotation.rename(columns={'id': 'cis_id'}) - - annotated_ins = pd.merge(insertions, annotation, on='cis_id', how='left') - - # Write outputs. - annotated_ins.to_csv(args.output, sep='\t', index=False) - annotated_sites.to_csv(path.splitext(args.output)[0] + '.sites.txt', - sep='\t', index=False) diff --git a/pyim/annotation/util.py b/pyim/annotation/util.py new file mode 100644 index 0000000..bc83abd --- /dev/null +++ b/pyim/annotation/util.py @@ -0,0 +1,34 @@ +import itertools +from intervaltree import IntervalTree + + +def build_interval_trees(gtf): + """Builds an interval tree of genes for each chromosome in gtf.""" + + # Only select gene features for now. + genes = gtf.fetch(filters={'feature': 'gene'}) + + trees = {} + for contig, grp in itertools.groupby(genes, lambda r: r.contig): + # Build a tree for each individual chromosome. + intervals = ((g.start, g.end, dict(g)) for g in grp + if g.end > g.start) # Avoid null intervals. + trees[contig] = IntervalTree.from_tuples(intervals) + + return trees + + +def numeric_strand(strand): + """Converts strand to its numeric (integer) representation.""" + + if isinstance(strand, int): + return strand + elif isinstance(strand, float): + return int(strand) + else: + if strand == '+': + return 1 + elif strand == '-': + return -1 + else: + raise ValueError('Unknown value {} for strand'.format(strand)) diff --git a/pyim/annotation/window.py b/pyim/annotation/window.py deleted file mode 100644 index 3c6bc65..0000000 --- a/pyim/annotation/window.py +++ /dev/null @@ -1,168 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) - -import itertools -import logging - -import pandas as pd -from intervaltree import IntervalTree -from tqdm import tqdm - -from pyim.util.tabix import GtfFile - -from ._model import Window -from ._util import feature_distance, numeric_strand, select_closest - - -def register(subparsers, name='window'): - parser = subparsers.add_parser(name, help=name + ' help') - - # Required arguments. 
- parser.add_argument('input') - parser.add_argument('output') - parser.add_argument('--gtf', required=True) - - # Optional arguments. - parser.add_argument('--closest', default=False, action='store_true') - parser.add_argument('--window_size', default=20000, type=int) - - # Set main for dispatch. - parser.set_defaults(main=main) - - return parser - - -def main(args): - logger = logging.getLogger() - - # Read annotation. - insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) - logger.info('Read {} insertions'.format(len(insertions))) - - # Build lookup trees. - logger.info('Building interval trees') - gtf = GtfFile(args.gtf) - trees = build_interval_trees(gtf) - - # Define windows. - logger.info('Annotating insertions') - half_size = args.window_size // 2 - window = Window(start=-half_size, end=half_size) - - # Annotate insertions. - annotation = annotate_for_windows( - insertions, trees, [window], progress=True) - - if args.closest: - # Sub-select for closest features. - logger.info('Reducing to closest features') - annotation = select_closest(annotation, col='gene_distance') - - # Merge annotation. - logger.info('Merging annotation') - merged = pd.merge(insertions, annotation, on='id', how='left') - merged.to_csv(args.output, sep='\t', index=False) - - -def annotate_for_windows(insertions, trees, windows, progress=False): - """Annotates insertions for features in trees using given windows.""" - - # Generate queries (insertion/window combinations). - ins_gen = (row for _, row in insertions.iterrows()) - queries = itertools.product(ins_gen, windows) - - if progress: - queries = tqdm(queries, unit='query', - total=len(insertions) * len(windows)) - - # Generate annotation for queries. - annotations = (_annotate_for_window(ins, trees, window) - for ins, window in queries) - - # Merge annotations into single frame. - annotation = pd.concat(annotations, ignore_index=True) - - return annotation - - -def _annotate_for_window(insertion, trees, window): - """Annotates insertion for features in trees using given window.""" - - # Apply window for insertion. - applied_window = window.apply( - insertion['chrom'], insertion['position'], insertion['strand']) - - # Fetch features within window. - features = fetch_in_window(trees, applied_window) - - # Extract feature values. - values = ((f['gene_name'], - f['gene_id'], - feature_distance(f, insertion['position'])) - for f in features) - - try: - name, gene_id, distance = zip(*values) - except ValueError: - name, gene_id, distance = [], [], [] - - # Convert to frame. - frame = pd.DataFrame({ - 'id': insertion['id'], - 'gene_id': id_, - 'gene_name': name, - 'gene_id': gene_id, - 'gene_distance': distance}) - - # Include window name if known. - if window.name is not None: - frame['window'] = window.name - - return frame - - -def fetch_in_window(trees, window): - """Fetches features within given window in the interval trees.""" - - # Find overlapping features. - try: - tree = trees[window.reference] - overlap = tree[window.start:window.end] - except KeyError: - overlap = [] - - # Extract features. - features = (interval[2] for interval in overlap) - - # Filter inclusive/exclusive if needed. - if not window.incl_left: - features = (f for f in features if f['start'] > window.start) - - if not window.incl_right: - features = (f for f in features if f['end'] < window.end) - - # Filter for strand if needed. 
- if window.strand is not None: - features = (f for f in features - if numeric_strand(f['strand']) == window.strand) - - return list(features) - - -def build_interval_trees(gtf): - """Builds an interval tree of genes for each chromosome in gtf.""" - - # Only select gene features for now. - genes = gtf.fetch(filters={'feature': 'gene'}) - - trees = {} - for contig, grp in itertools.groupby(genes, lambda r: r.contig): - # Build a tree for each individual chromosome. - intervals = ((g.start, g.end, dict(g)) for g in grp - if g.end > g.start) # Avoid null intervals. - trees[contig] = IntervalTree.from_tuples(intervals) - - return trees diff --git a/pyim/main/annotate.py b/pyim/main/annotate.py index af57507..5db915d 100644 --- a/pyim/main/annotate.py +++ b/pyim/main/annotate.py @@ -1,11 +1,17 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) +from __future__ import absolute_import, division, print_function + +#pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin +from builtins import * +#pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin import argparse import logging -from pyim.annotation import window, rbm, kcrbm, rbm_cis +from pyim.annotation.annotator import window, rbm, rbm_cis + +# pylint: disable=import-error from ._logging import print_header, print_footer +# pylint: enable=import-error def main(): @@ -20,7 +26,7 @@ def main(): window.register(subparsers) rbm.register(subparsers) rbm_cis.register(subparsers) - kcrbm.register(subparsers) + # kcrbm.register(subparsers) # Parse args. args = parser.parse_args() From eac37caa319dcdf955481a5943f7153062d3d7fa Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Mar 2016 17:32:27 +0100 Subject: [PATCH 071/100] ShearSplink pipeline - Added extra logging + clonality annotation. --- pyim/alignment/pipelines/shear_splink.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pyim/alignment/pipelines/shear_splink.py b/pyim/alignment/pipelines/shear_splink.py index 770a36b..70e5e87 100644 --- a/pyim/alignment/pipelines/shear_splink.py +++ b/pyim/alignment/pipelines/shear_splink.py @@ -154,12 +154,18 @@ def shear_splink(reads, transposon, linker, barcodes, # Map barcodes to samples. if sample_map is not None: + logger.info('Mapping insertions to samples') insertions['sample'] = insertions['barcode'].map(sample_map) # Filter on (unique) depth. if min_depth is not None: + logger.info('Filtering insertions with depth < {}'.format(min_depth)) insertions = insertions.ix[insertions['depth_unique'] >= min_depth] + # Annotate with clonality. + logger.info('Annotating insertions with (relative) clonality') + insertions = annotate_with_clonality(insertions) + # Sort and assign ids to insertions. 
insertions.sort_values(by=['chrom', 'position'], inplace=True) insertions['id'] = ['INS_{}'.format(i) @@ -355,3 +361,18 @@ def _alignments_to_insertion(info, alignments, id_=None): depth_unique = len(set(end_positions)) return id_, ref, pos, strand, bc, depth, depth_unique + + +# --- Further annotation --- # + +def annotate_with_clonality(insertions): + def _clonality(grp): + clonality = grp['depth_unique'] / grp['depth_unique'].max() + return grp.assign(clonality=clonality) + + if 'sample' in insertions.columns: + per_sample = insertions.groupby('sample') + else: + per_sample = insertions.groupby('barcode') + + return pd.concat(_clonality(grp) for _, grp in per_sample) From 5ae82557fc4a9638a3bfc5cffe6e0bd5986c5d96 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 23 Mar 2016 17:32:43 +0100 Subject: [PATCH 072/100] Removed python 3.5 requirement. --- environment.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environment.yml b/environment.yml index 6085bc5..2c052fa 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,5 @@ name: pyim dependencies: -- python3.5 - future - numpy - scipy From 709d4690c43e4c7edac63d6c8ddd56d6f0434405 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 8 Jun 2016 13:05:54 +0200 Subject: [PATCH 073/100] Change environment name. --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 2c052fa..d892052 100644 --- a/environment.yml +++ b/environment.yml @@ -1,4 +1,4 @@ -name: pyim +name: pyim-dev dependencies: - future - numpy From 0f4af0f08f6f44a75b56c61560e9a0efc5acf254 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 8 Jun 2016 13:07:56 +0200 Subject: [PATCH 074/100] Fixed bug in rbm-cis annotation. --- pyim/alignment/bowtie2.py | 1 + pyim/annotation/annotator/rbm.py | 2 +- pyim/annotation/annotator/rbm_cis.py | 34 ++++++++++++---------------- pyim/annotation/annotator/window.py | 2 +- pyim/annotation/filtering.py | 2 +- pyim/annotation/metadata.py | 1 + pyim/main/cis.py | 8 ------- 7 files changed, 20 insertions(+), 30 deletions(-) diff --git a/pyim/alignment/bowtie2.py b/pyim/alignment/bowtie2.py index 671338a..cd7139d 100644 --- a/pyim/alignment/bowtie2.py +++ b/pyim/alignment/bowtie2.py @@ -11,6 +11,7 @@ def align(m1, index, output, m2=None, options=None, log=None, bam_output=False): + """Alignment with bowtie2.""" options = {} or options # Inject inputs into options. diff --git a/pyim/annotation/annotator/rbm.py b/pyim/annotation/annotator/rbm.py index 18b0505..0e789ad 100644 --- a/pyim/annotation/annotator/rbm.py +++ b/pyim/annotation/annotator/rbm.py @@ -140,7 +140,7 @@ def main(args): # Read insertions. insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) - logging.info('Read %d insertions', len(insertions)) + logging.info('Read %d insertions', insertions['id'].nunique()) # Annotate insertions. logging.info('Annotating insertions') diff --git a/pyim/annotation/annotator/rbm_cis.py b/pyim/annotation/annotator/rbm_cis.py index f2c50a9..7b621c3 100644 --- a/pyim/annotation/annotator/rbm_cis.py +++ b/pyim/annotation/annotator/rbm_cis.py @@ -18,7 +18,8 @@ def annotate_rbm_cis(insertions, cis_sites, gtf, window_preset=None, - window_sizes=None, collapse=False): + window_sizes=None, blacklist=None, closest=False, + collapse=False): """Assigns insertions to genes using the RBM approach via called CIS sites. 
 Args:
@@ -59,11 +60,21 @@ def annotate_rbm_cis(insertions, cis_sites, gtf, window_preset=None,
                                    window_preset=window_preset,
                                    window_sizes=window_sizes)
 
+    if blacklist:
+        annotated_sites = filter_blacklist(annotated_sites, blacklist)
+
+    if closest:
+        annotated_sites = add_metadata(annotated_sites, gtf)
+        annotated_sites = select_closest(annotated_sites)
+
     # Extract and merge annotation with insertions.
     annotation = annotated_sites[['id', 'gene_id', 'gene_name']]
     annotation = annotation.rename(columns={'id': 'cis_id'})
     annotated_ins = pd.merge(insertions, annotation, on='cis_id', how='left')
 
+    # Add metadata to insertions.
+    annotated_ins = add_metadata(annotated_ins, gtf)
+
     if collapse:
         # Collapse multiple insertion entries resulting from CIS annotation.
         annotated_ins.drop(['cis_id'], axis=1, inplace=True)
@@ -129,29 +140,14 @@ def main(args):
     cis_sites = pd.read_csv(args.cis_sites, sep='\t', dtype={'chrom': str})
 
     logging.info('Read %d insertions and %d cis sites',
-                 len(insertions), len(cis_sites))
+                 insertions['id'].nunique(), len(cis_sites))
 
     # Annotate insertions.
    logging.info('Annotating insertions')
-
     annotated_ins, annotated_sites = annotate_rbm_cis(
         insertions, cis_sites, args.gtf, window_preset=args.preset,
-        window_sizes=args.window_sizes, collapse=args.collapse)
-
-    # Add metadata to annotated insertions.
-    logging.info('Adding annotation metadata')
-    annotated_ins = add_metadata(annotated_ins, args.gtf)
-
-    if args.blacklist is not None:
-        logging.info('Filtering blacklisted genes')
-        annotated_ins = filter_blacklist(annotated_ins, args.blacklist)
-        annotated_sites = filter_blacklist(annotated_sites, args.blacklist)
-
-    if args.closest:
-        logging.info('Selecting closest insertions')
-        annotated_ins = select_closest(annotated_ins)
-        annotated_sites = add_metadata(annotated_sites, args.gtf)
-        annotated_sites = select_closest(annotated_sites)
+        window_sizes=args.window_sizes, collapse=args.collapse,
+        blacklist=args.blacklist, closest=args.closest)
 
     # Write outputs.
     annotated_ins.to_csv(args.output, sep='\t', index=False)
diff --git a/pyim/annotation/annotator/window.py b/pyim/annotation/annotator/window.py
index 3395078..0122bfe 100644
--- a/pyim/annotation/annotator/window.py
+++ b/pyim/annotation/annotator/window.py
@@ -214,7 +214,7 @@ def register(subparsers, name='window'):
 def main(args):
     # Read annotation.
     insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str})
-    logging.info('Read %d insertions', len(insertions))
+    logging.info('Read %d insertions', insertions['id'].nunique())
 
     # Define windows.
     logging.info('Annotating insertions')
diff --git a/pyim/annotation/filtering.py b/pyim/annotation/filtering.py
index 7788ba7..324b439 100644
--- a/pyim/annotation/filtering.py
+++ b/pyim/annotation/filtering.py
@@ -17,7 +17,7 @@ def select_closest(insertions, id_col='id', dist_col='distance'):
     """
 
     def _is_closest(x):
-        abs_dist = x[col].abs()
+        abs_dist = x[dist_col].abs()
         return x.ix[abs_dist == abs_dist.min()]
 
     return (insertions.groupby(id_col)
diff --git a/pyim/annotation/metadata.py b/pyim/annotation/metadata.py
index 63fac47..04f804a 100644
--- a/pyim/annotation/metadata.py
+++ b/pyim/annotation/metadata.py
@@ -83,6 +83,7 @@ def feature_distance(insertion, feature):
 
     return dist
 
+
 def feature_orientation(insertion, feature):
     """Determines the relative orientation of an insertion and a feature.
diff --git a/pyim/main/cis.py b/pyim/main/cis.py index 51849d0..90eb4ce 100644 --- a/pyim/main/cis.py +++ b/pyim/main/cis.py @@ -12,7 +12,6 @@ import pandas as pd from pyim.cis.cimpl import map_insertions -from pyim.cis._util import annotate_cis_strand from pyim.util.insertions import subset_samples from ._logging import print_header, print_footer @@ -37,8 +36,6 @@ def setup_parser(): parser.add_argument('--lhc_method', choices={'none', 'exclude'}, default='exclude') - parser.add_argument('--strand_homogeneity', type=float, default=None) - parser.add_argument('--alpha', type=float, default=0.05) parser.add_argument('--threads', type=int, default=1) @@ -82,11 +79,6 @@ def main(): mapping_tmp = mapping.rename(columns={'insertion_id': 'id'}) insertions = pd.merge(insertions, mapping_tmp, on='id') - # Determine strand of cis sites. - if args.strand_homogeneity is not None: - logging.info('Determining CIS strands') - cis = annotate_cis_strand(cis, insertions, args.strand_homogeneity) - # Write out outputs. logger.info('Writing outputs') From d85c794786189a4c0cbb6c666fc0b01b8c547206 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 8 Jun 2016 13:25:15 +0200 Subject: [PATCH 075/100] Switch to versioneer for versioning. --- .gitattributes | 1 + MANIFEST.IN | 3 +- pyim/util/__init__.py | 0 setup.cfg | 7 + setup.py | 40 +- src/pyim/__init__.py | 4 + src/pyim/_version.py | 484 +++++ {pyim => src/pyim/alignment}/__init__.py | 0 {pyim => src/pyim}/alignment/bowtie2.py | 0 .../pyim/alignment/pipelines}/__init__.py | 0 .../alignment/pipelines/_helpers}/__init__.py | 0 .../pipelines/_helpers/clustering.py | 0 .../alignment/pipelines/_helpers/grouping.py | 0 .../alignment/pipelines/_helpers/pipeline.py | 0 .../pyim}/alignment/pipelines/_model.py | 0 .../pyim}/alignment/pipelines/lam_pcr.py | 0 .../pyim}/alignment/pipelines/shear_splink.py | 0 .../alignment/pipelines/shear_splink_sb.py | 0 {pyim => src/pyim}/alignment/vector.py | 0 .../pyim/annotation}/__init__.py | 0 .../pyim}/annotation/annotator/__init__.py | 0 .../pyim}/annotation/annotator/kcrbm.py | 0 .../pyim}/annotation/annotator/rbm.py | 0 .../pyim}/annotation/annotator/rbm_cis.py | 0 .../pyim}/annotation/annotator/window.py | 0 {pyim => src/pyim}/annotation/filtering.py | 0 {pyim => src/pyim}/annotation/metadata.py | 0 {pyim => src/pyim}/annotation/util.py | 0 {pyim/annotation => src/pyim/cis}/__init__.py | 0 {pyim => src/pyim}/cis/_util.py | 0 {pyim => src/pyim}/cis/cimpl.py | 0 {pyim => src/pyim}/cis/poisson.py | 0 {pyim/cis => src/pyim/main}/__init__.py | 0 {pyim => src/pyim}/main/_logging.py | 0 {pyim => src/pyim}/main/align.py | 0 {pyim => src/pyim}/main/annotate.py | 0 {pyim => src/pyim}/main/cis.py | 0 {pyim => src/pyim}/main/gff.py | 0 {pyim => src/pyim}/main/merge.py | 0 {pyim => src/pyim}/main/plot.py | 0 {pyim => src/pyim}/main/split.py | 0 {pyim/main => src/pyim/util}/__init__.py | 0 {pyim => src/pyim}/util/file.py | 0 {pyim => src/pyim}/util/insertions.py | 0 {pyim => src/pyim}/util/pandas.py | 0 {pyim => src/pyim}/util/rpy2.py | 0 {pyim => src/pyim}/util/tabix.py | 0 test/alignment/test_vector.py | 124 -- test/pipelines/test_base.py | 159 -- test/pipelines/test_lam_pcr.py | 85 - test/pipelines/test_shear_splink.py | 266 --- test/test_cluster.py | 100 - version.py | 106 - versioneer.py | 1774 +++++++++++++++++ 54 files changed, 2300 insertions(+), 853 deletions(-) create mode 100644 .gitattributes delete mode 100644 pyim/util/__init__.py create mode 100644 setup.cfg create mode 100644 src/pyim/__init__.py create mode 100644 
src/pyim/_version.py rename {pyim => src/pyim/alignment}/__init__.py (100%) rename {pyim => src/pyim}/alignment/bowtie2.py (100%) rename {pyim/alignment => src/pyim/alignment/pipelines}/__init__.py (100%) rename {pyim/alignment/pipelines => src/pyim/alignment/pipelines/_helpers}/__init__.py (100%) rename {pyim => src/pyim}/alignment/pipelines/_helpers/clustering.py (100%) rename {pyim => src/pyim}/alignment/pipelines/_helpers/grouping.py (100%) rename {pyim => src/pyim}/alignment/pipelines/_helpers/pipeline.py (100%) rename {pyim => src/pyim}/alignment/pipelines/_model.py (100%) rename {pyim => src/pyim}/alignment/pipelines/lam_pcr.py (100%) rename {pyim => src/pyim}/alignment/pipelines/shear_splink.py (100%) rename {pyim => src/pyim}/alignment/pipelines/shear_splink_sb.py (100%) rename {pyim => src/pyim}/alignment/vector.py (100%) rename {pyim/alignment/pipelines/_helpers => src/pyim/annotation}/__init__.py (100%) rename {pyim => src/pyim}/annotation/annotator/__init__.py (100%) rename {pyim => src/pyim}/annotation/annotator/kcrbm.py (100%) rename {pyim => src/pyim}/annotation/annotator/rbm.py (100%) rename {pyim => src/pyim}/annotation/annotator/rbm_cis.py (100%) rename {pyim => src/pyim}/annotation/annotator/window.py (100%) rename {pyim => src/pyim}/annotation/filtering.py (100%) rename {pyim => src/pyim}/annotation/metadata.py (100%) rename {pyim => src/pyim}/annotation/util.py (100%) rename {pyim/annotation => src/pyim/cis}/__init__.py (100%) rename {pyim => src/pyim}/cis/_util.py (100%) rename {pyim => src/pyim}/cis/cimpl.py (100%) rename {pyim => src/pyim}/cis/poisson.py (100%) rename {pyim/cis => src/pyim/main}/__init__.py (100%) rename {pyim => src/pyim}/main/_logging.py (100%) rename {pyim => src/pyim}/main/align.py (100%) rename {pyim => src/pyim}/main/annotate.py (100%) rename {pyim => src/pyim}/main/cis.py (100%) rename {pyim => src/pyim}/main/gff.py (100%) rename {pyim => src/pyim}/main/merge.py (100%) rename {pyim => src/pyim}/main/plot.py (100%) rename {pyim => src/pyim}/main/split.py (100%) rename {pyim/main => src/pyim/util}/__init__.py (100%) rename {pyim => src/pyim}/util/file.py (100%) rename {pyim => src/pyim}/util/insertions.py (100%) rename {pyim => src/pyim}/util/pandas.py (100%) rename {pyim => src/pyim}/util/rpy2.py (100%) rename {pyim => src/pyim}/util/tabix.py (100%) delete mode 100644 test/alignment/test_vector.py delete mode 100644 test/pipelines/test_base.py delete mode 100644 test/pipelines/test_lam_pcr.py delete mode 100644 test/pipelines/test_shear_splink.py delete mode 100644 test/test_cluster.py delete mode 100644 version.py create mode 100644 versioneer.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..02fdffb --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +src/pyim/_version.py export-subst diff --git a/MANIFEST.IN b/MANIFEST.IN index 466cd00..076fc55 100644 --- a/MANIFEST.IN +++ b/MANIFEST.IN @@ -1 +1,2 @@ -include RELEASE-VERSION +include versioneer.py +include src/pyim/_version.py diff --git a/pyim/util/__init__.py b/pyim/util/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..a2e923b --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[versioneer] +VCS = git +style = pep440 +versionfile_source = src/pyim/_version.py +versionfile_build = pyim/_version.py +tag_prefix = v +parentdir_prefix = pyim- diff --git a/setup.py b/setup.py index cbe4760..ad4cc45 100644 --- a/setup.py +++ b/setup.py @@ -1,25 +1,41 @@ import sys -from setuptools import 
setup, find_packages +import setuptools +import versioneer -from version import get_git_version +INSTALL_REQUIRES = ['future', 'numpy', 'scipy', 'pandas', 'pysam', + 'rpy2', 'scikit-bio', 'toolz', 'tqdm', 'intervaltree'] +EXTRAS_REQUIRE = { + 'dev': ['sphinx', 'pytest', 'pytest-mock', + 'pytest-datafiles', 'pytest-cov', + 'pytest-helpers-namespace'] +} -install_requires = ['future', 'numpy', 'scipy', 'pandas', 'pysam', - 'rpy2', 'scikit-bio', 'toolz', 'tqdm', 'intervaltree'] -if not sys.version_info >= (3, ): - install_requires += ['pathlib'] +# Check setuptools version, as recommended by: +# https://hynek.me/articles/conditional-python-dependencies/. +if int(setuptools.__version__.split('.', 1)[0]) < 18: + assert 'bdist_wheel' not in sys.argv + + # Add pathlib for Pythons before 3.4. + if sys.version_info[0:2] < (3, 4): + INSTALL_REQUIRES.append('pathlib2') +else: + EXTRAS_REQUIRE[":python_version<'3.4'"] = ['pathlib2'] + -setup( +setuptools.setup( name='pyim', - version=get_git_version(), + version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), url='https://bitbucket.org/jrderuiter/pyim', author='Julian de Ruiter', author_email='julianderuiter@gmail.com', description='Predicts transposon insertion sites from DNA-seq data.', license='BSD', - packages=find_packages(), + packages=setuptools.find_packages('src'), + package_dir={'': 'src'}, include_package_data=True, entry_points={'console_scripts': [ 'pyim-align = pyim.main.align:main', @@ -30,8 +46,8 @@ 'pyim-gff = pyim.main.gff:main', 'pyim-split = pyim.main.split:main' ]}, - extras_require={'test': 'pytest'}, + install_requires=INSTALL_REQUIRES, + extras_require=EXTRAS_REQUIRE, zip_safe=True, - classifiers=[], - install_requires=install_requires + classifiers=[] ) diff --git a/src/pyim/__init__.py b/src/pyim/__init__.py new file mode 100644 index 0000000..74f4e66 --- /dev/null +++ b/src/pyim/__init__.py @@ -0,0 +1,4 @@ + +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions diff --git a/src/pyim/_version.py b/src/pyim/_version.py new file mode 100644 index 0000000..25319d8 --- /dev/null +++ b/src/pyim/_version.py @@ -0,0 +1,484 @@ + +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.16 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
+ git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + keywords = {"refnames": git_refnames, "full": git_full} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "v" + cfg.parentdir_prefix = "pyim-" + cfg.versionfile_source = "src/pyim/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + return None + return stdout + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes + both the project name and a version string. + """ + dirname = os.path.basename(root) + if not dirname.startswith(parentdir_prefix): + if verbose: + print("guessing rootdir is '%s', but '%s' doesn't start with " + "prefix '%s'" % (root, dirname, parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None} + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs-tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None + } + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags"} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + if not os.path.exists(os.path.join(root, ".git")): + if verbose: + print("no .git in %s" % root) + raise NotThisMethod("no .git directory") + + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 
0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"]} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. 
If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. + for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree"} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version"} diff --git a/pyim/__init__.py b/src/pyim/alignment/__init__.py similarity index 100% rename from pyim/__init__.py rename to src/pyim/alignment/__init__.py diff --git a/pyim/alignment/bowtie2.py b/src/pyim/alignment/bowtie2.py similarity index 100% rename from pyim/alignment/bowtie2.py rename to src/pyim/alignment/bowtie2.py diff --git a/pyim/alignment/__init__.py b/src/pyim/alignment/pipelines/__init__.py similarity index 100% rename from pyim/alignment/__init__.py rename to src/pyim/alignment/pipelines/__init__.py diff --git a/pyim/alignment/pipelines/__init__.py b/src/pyim/alignment/pipelines/_helpers/__init__.py similarity index 100% rename from pyim/alignment/pipelines/__init__.py rename to src/pyim/alignment/pipelines/_helpers/__init__.py diff --git a/pyim/alignment/pipelines/_helpers/clustering.py b/src/pyim/alignment/pipelines/_helpers/clustering.py similarity index 100% rename from pyim/alignment/pipelines/_helpers/clustering.py rename to src/pyim/alignment/pipelines/_helpers/clustering.py diff --git a/pyim/alignment/pipelines/_helpers/grouping.py b/src/pyim/alignment/pipelines/_helpers/grouping.py similarity index 100% rename from pyim/alignment/pipelines/_helpers/grouping.py rename to src/pyim/alignment/pipelines/_helpers/grouping.py diff --git a/pyim/alignment/pipelines/_helpers/pipeline.py b/src/pyim/alignment/pipelines/_helpers/pipeline.py similarity index 100% rename from pyim/alignment/pipelines/_helpers/pipeline.py rename to src/pyim/alignment/pipelines/_helpers/pipeline.py diff --git a/pyim/alignment/pipelines/_model.py b/src/pyim/alignment/pipelines/_model.py similarity index 100% rename from pyim/alignment/pipelines/_model.py rename to src/pyim/alignment/pipelines/_model.py diff --git a/pyim/alignment/pipelines/lam_pcr.py b/src/pyim/alignment/pipelines/lam_pcr.py similarity index 100% rename from pyim/alignment/pipelines/lam_pcr.py rename to src/pyim/alignment/pipelines/lam_pcr.py diff --git a/pyim/alignment/pipelines/shear_splink.py b/src/pyim/alignment/pipelines/shear_splink.py similarity index 100% rename from pyim/alignment/pipelines/shear_splink.py rename to src/pyim/alignment/pipelines/shear_splink.py diff --git a/pyim/alignment/pipelines/shear_splink_sb.py b/src/pyim/alignment/pipelines/shear_splink_sb.py similarity index 100% rename from pyim/alignment/pipelines/shear_splink_sb.py 
rename to src/pyim/alignment/pipelines/shear_splink_sb.py diff --git a/pyim/alignment/vector.py b/src/pyim/alignment/vector.py similarity index 100% rename from pyim/alignment/vector.py rename to src/pyim/alignment/vector.py diff --git a/pyim/alignment/pipelines/_helpers/__init__.py b/src/pyim/annotation/__init__.py similarity index 100% rename from pyim/alignment/pipelines/_helpers/__init__.py rename to src/pyim/annotation/__init__.py diff --git a/pyim/annotation/annotator/__init__.py b/src/pyim/annotation/annotator/__init__.py similarity index 100% rename from pyim/annotation/annotator/__init__.py rename to src/pyim/annotation/annotator/__init__.py diff --git a/pyim/annotation/annotator/kcrbm.py b/src/pyim/annotation/annotator/kcrbm.py similarity index 100% rename from pyim/annotation/annotator/kcrbm.py rename to src/pyim/annotation/annotator/kcrbm.py diff --git a/pyim/annotation/annotator/rbm.py b/src/pyim/annotation/annotator/rbm.py similarity index 100% rename from pyim/annotation/annotator/rbm.py rename to src/pyim/annotation/annotator/rbm.py diff --git a/pyim/annotation/annotator/rbm_cis.py b/src/pyim/annotation/annotator/rbm_cis.py similarity index 100% rename from pyim/annotation/annotator/rbm_cis.py rename to src/pyim/annotation/annotator/rbm_cis.py diff --git a/pyim/annotation/annotator/window.py b/src/pyim/annotation/annotator/window.py similarity index 100% rename from pyim/annotation/annotator/window.py rename to src/pyim/annotation/annotator/window.py diff --git a/pyim/annotation/filtering.py b/src/pyim/annotation/filtering.py similarity index 100% rename from pyim/annotation/filtering.py rename to src/pyim/annotation/filtering.py diff --git a/pyim/annotation/metadata.py b/src/pyim/annotation/metadata.py similarity index 100% rename from pyim/annotation/metadata.py rename to src/pyim/annotation/metadata.py diff --git a/pyim/annotation/util.py b/src/pyim/annotation/util.py similarity index 100% rename from pyim/annotation/util.py rename to src/pyim/annotation/util.py diff --git a/pyim/annotation/__init__.py b/src/pyim/cis/__init__.py similarity index 100% rename from pyim/annotation/__init__.py rename to src/pyim/cis/__init__.py diff --git a/pyim/cis/_util.py b/src/pyim/cis/_util.py similarity index 100% rename from pyim/cis/_util.py rename to src/pyim/cis/_util.py diff --git a/pyim/cis/cimpl.py b/src/pyim/cis/cimpl.py similarity index 100% rename from pyim/cis/cimpl.py rename to src/pyim/cis/cimpl.py diff --git a/pyim/cis/poisson.py b/src/pyim/cis/poisson.py similarity index 100% rename from pyim/cis/poisson.py rename to src/pyim/cis/poisson.py diff --git a/pyim/cis/__init__.py b/src/pyim/main/__init__.py similarity index 100% rename from pyim/cis/__init__.py rename to src/pyim/main/__init__.py diff --git a/pyim/main/_logging.py b/src/pyim/main/_logging.py similarity index 100% rename from pyim/main/_logging.py rename to src/pyim/main/_logging.py diff --git a/pyim/main/align.py b/src/pyim/main/align.py similarity index 100% rename from pyim/main/align.py rename to src/pyim/main/align.py diff --git a/pyim/main/annotate.py b/src/pyim/main/annotate.py similarity index 100% rename from pyim/main/annotate.py rename to src/pyim/main/annotate.py diff --git a/pyim/main/cis.py b/src/pyim/main/cis.py similarity index 100% rename from pyim/main/cis.py rename to src/pyim/main/cis.py diff --git a/pyim/main/gff.py b/src/pyim/main/gff.py similarity index 100% rename from pyim/main/gff.py rename to src/pyim/main/gff.py diff --git a/pyim/main/merge.py b/src/pyim/main/merge.py similarity 
index 100% rename from pyim/main/merge.py rename to src/pyim/main/merge.py diff --git a/pyim/main/plot.py b/src/pyim/main/plot.py similarity index 100% rename from pyim/main/plot.py rename to src/pyim/main/plot.py diff --git a/pyim/main/split.py b/src/pyim/main/split.py similarity index 100% rename from pyim/main/split.py rename to src/pyim/main/split.py diff --git a/pyim/main/__init__.py b/src/pyim/util/__init__.py similarity index 100% rename from pyim/main/__init__.py rename to src/pyim/util/__init__.py diff --git a/pyim/util/file.py b/src/pyim/util/file.py similarity index 100% rename from pyim/util/file.py rename to src/pyim/util/file.py diff --git a/pyim/util/insertions.py b/src/pyim/util/insertions.py similarity index 100% rename from pyim/util/insertions.py rename to src/pyim/util/insertions.py diff --git a/pyim/util/pandas.py b/src/pyim/util/pandas.py similarity index 100% rename from pyim/util/pandas.py rename to src/pyim/util/pandas.py diff --git a/pyim/util/rpy2.py b/src/pyim/util/rpy2.py similarity index 100% rename from pyim/util/rpy2.py rename to src/pyim/util/rpy2.py diff --git a/pyim/util/tabix.py b/src/pyim/util/tabix.py similarity index 100% rename from pyim/util/tabix.py rename to src/pyim/util/tabix.py diff --git a/test/alignment/test_vector.py b/test/alignment/test_vector.py deleted file mode 100644 index 432bd2c..0000000 --- a/test/alignment/test_vector.py +++ /dev/null @@ -1,124 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) - - -import pytest - -from tkgeno.io.model import DNASequence -from pyim.alignment.vector import ExactAligner, SswAligner - - -@pytest.fixture(scope='module') -def test_data(): - reads = [DNASequence('AATGTGTACCAACTGTTG', 'READ1'), # Query present. - DNASequence('AATGTGTACCACAGTTTG', 'READ2'), # Query in reverse. - DNASequence('AATGTGTACCATAGTTTG', 'READ3')] # Query missing. 
- - query = DNASequence('ACTG', 'QUERY') - - return reads, query - - -# noinspection PyShadowingNames -# noinspection PyMethodMayBeStatic -class TestExactAligner(object): - - def test_simple(self, test_data): - reads, query = test_data - - aligner = ExactAligner() - aln = aligner.align(query=query, target=reads[0]) - - assert aln.query_id == 'QUERY' - assert aln.query_start == 0 - assert aln.query_end == 4 - - assert aln.target_id == 'READ1' - assert aln.target_start == 11 - assert aln.target_end == 15 - assert aln.target_strand == 1 - - assert aln.type == 'exact' - assert aln.coverage == 1.0 - assert aln.identity == 1.0 - - def test_reverse(self, test_data): - reads, query = test_data - - aligner = ExactAligner(try_reverse=True) - aln = aligner.align(query=query, target=reads[1]) - - assert aln.target_id == 'READ2' - assert aln.target_strand == -1 - assert aln.target_start == 11 - assert aln.target_end == 15 - - assert aln.query_id == 'QUERY' - assert aln.query_start == 0 - assert aln.query_end == 4 - - assert aln.type == 'exact' - assert aln.coverage == 1.0 - assert aln.identity == 1.0 - - def test_no_reverse(self, test_data): - reads, query = test_data - - aligner = ExactAligner(try_reverse=False) - aln = aligner.align(query=query, target=reads[1]) - - assert aln is None - - def test_missing(self, test_data): - reads, query = test_data - - aligner = ExactAligner() - aln = aligner.align(query=query, target=reads[2]) - - assert aln is None - - -# noinspection PyShadowingNames -# noinspection PyMethodMayBeStatic -class TestSswAligner(object): - - def test_simple(self, test_data): - reads, query = test_data - - aligner = SswAligner() - aln = aligner.align(query=query, target=reads[0]) - - assert aln.query_id == 'QUERY' - assert aln.query_start == 0 - assert aln.query_end == 4 - - assert aln.target_id == 'READ1' - assert aln.target_start == 11 - assert aln.target_end == 15 - assert aln.target_strand == 1 - - assert aln.type == 'ssw' - assert aln.coverage == 1.0 - assert aln.identity == 1.0 - - def test_reverse(self, test_data): - reads, query = test_data - - aligner = SswAligner(try_reverse=True) - aln = aligner.align(query=query, target=reads[1]) - - assert aln.target_id == 'READ2' - assert aln.target_strand == -1 - assert aln.target_start == 11 - assert aln.target_end == 15 - - assert aln.query_id == 'QUERY' - assert aln.query_start == 0 - assert aln.query_end == 4 - - assert aln.type == 'ssw' - assert aln.coverage == 1.0 - assert aln.identity == 1.0 \ No newline at end of file diff --git a/test/pipelines/test_base.py b/test/pipelines/test_base.py deleted file mode 100644 index 30d0638..0000000 --- a/test/pipelines/test_base.py +++ /dev/null @@ -1,159 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) - -from collections import namedtuple - -import pytest - -import numpy as np -from skbio import DNASequence - -from pyim.alignment.vector import ExactAligner -from pyim.pipelines._base import InsertionIdentifier - - -# -------- Base identifier tests ------- - -AlignedSegment = namedtuple( - 'AlignedSegment', - ['query_name', 'reference_id', 'reference_start', - 'reference_end', 'is_reverse']) - - -@pytest.fixture(scope='module') -def test_data(): - alignments = [ - AlignedSegment('READ1', 0, 20, 70, False), - AlignedSegment('READ2', 0, 20, 70, False), - AlignedSegment('READ3', 0, 20, 69, False), - 
AlignedSegment('READ4', 0, 20, 68, False), - AlignedSegment('READ5', 0, 21, 68, False) - ] - - bc_map = {'READ1': 'BC01', - 'READ2': 'BC01', - 'READ3': 'BC01', - 'READ4': 'BC02', - 'READ5': 'BC02'} - - return alignments, bc_map - - -@pytest.fixture(scope='module') -def test_data_stranded(): - return [ - AlignedSegment('READ1', 0, 20, 70, False), - AlignedSegment('READ2', 0, 20, 69, False), - AlignedSegment('READ3', 0, 20, 69, True), - AlignedSegment('READ4', 0, 21, 69, True), - AlignedSegment('READ5', 0, 21, 68, False), - ] - - -@pytest.fixture(scope='module') -def test_data_unordered(): - return [ - AlignedSegment('READ1', 0, 20, 70, False), - AlignedSegment('READ2', 0, 20, 69, False), - AlignedSegment('READ4', 0, 21, 69, True), - AlignedSegment('READ5', 0, 21, 68, False), - AlignedSegment('READ3', 0, 20, 69, True), - ] - -@pytest.fixture(scope='module') -def test_data_first_reverse(): - return [ - AlignedSegment('READ1', 0, 5, 21, True), - AlignedSegment('READ2', 0, 20, 69, False), - AlignedSegment('READ3', 0, 22, 69, False), - ] - - -# noinspection PyShadowingNames -# noinspection PyMethodMayBeStatic -class TestInsertionIdentifierGroupByPosition(object): - - def test_simple(self, test_data): - alignments, _ = test_data - - identifier = InsertionIdentifier() - groups = list(identifier._group_by_position_barcode(alignments)) - - # Should identify a total of two groups. - assert len(groups) == 2 - - # Check group 1 for position and membership. - (pos, strand, bc), alignments = groups[0] - assert pos == 20 - assert strand == 1 - assert np.isnan(bc) - assert len(alignments) == 4 - - def test_with_barcode(self, test_data): - alignments, bc_map = test_data - - identifier = InsertionIdentifier() - groups = list(identifier._group_by_position_barcode( - alignments, bc_map)) - - # Should identify a total of three groups. - assert len(groups) == 3 - - # Check first groups for position and membership. - for group in groups[0:1]: - (pos, strand, bc), alignments = group - assert pos == 20 - assert strand == 1 - - assert bc in {'BC01', 'BC02'} - - if bc == 'BC01': - assert len(alignments) == 3 - else: - assert len(alignments) == 1 - - def test_stranded(self, test_data_stranded): - alignments = test_data_stranded - - identifier = InsertionIdentifier() - groups = list(identifier._group_by_position_barcode(alignments)) - - # Should identify a total of three groups. - assert len(groups) == 3 - - # Check first group for position and membership. - (pos, strand, bc), alignments = groups[0] - assert pos == 20 - assert strand == 1 - assert len(alignments) == 2 - - # Check second group for position and membership. - (pos, strand, bc), alignments = groups[1] - assert pos == 21 - assert strand == 1 - assert len(alignments) == 1 - - # Check third (reverse) group for position and membership. - (pos, strand, bc), alignments = groups[2] - assert pos == 69 - assert strand == -1 - assert len(alignments) == 2 - - def test_unordered(self, test_data_unordered): - with pytest.raises(ValueError): - identifier = InsertionIdentifier() - list(identifier._group_by_position_barcode(test_data_unordered)) - - def test_first_reverse(self, test_data_first_reverse): - alignments = test_data_first_reverse - - identifier = InsertionIdentifier() - groups = list(identifier._group_by_position_barcode(alignments)) - - # Should have been returned in increasing order, even though - # the -1 cluster occurs earlier in the alignments. 
- positions = [grp[0][0] for grp in groups] - assert all(np.diff(positions) >= 0) diff --git a/test/pipelines/test_lam_pcr.py b/test/pipelines/test_lam_pcr.py deleted file mode 100644 index ad5c310..0000000 --- a/test/pipelines/test_lam_pcr.py +++ /dev/null @@ -1,85 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) - -from collections import namedtuple - -import pytest - -import pysam -from pyim.pipelines.lam_pcr import LamPcrIdentifier - - -# --- Identifier --- # - -AlignedSegment = namedtuple( - 'AlignedSegment', - ['query_name', 'reference_id', 'reference_start', - 'reference_end', 'is_reverse', 'mapping_quality']) - - -class AlignmentFile(object): - - def __init__(self, *args, **kwargs): - pass - - def fetch(self, reference=None): - raise NotImplementedError() - - @property - def references(self): - raise NotImplementedError() - - -@pytest.fixture() -def patch_alignment_file(monkeypatch): - alignments = {'1': [ - AlignedSegment('READ0', '1', 1, 21, True, 40), # 1st group on -1. - AlignedSegment('READ1', '1', 20, 70, False, 40), # Start of 2nd group. - AlignedSegment('READ2', '1', 20, 70, False, 40), - AlignedSegment('READ3', '1', 20, 69, False, 40), - AlignedSegment('READ5', '1', 21, 68, False, 40), # Within merge dist. - AlignedSegment('READ6', '1', 32, 68, False, 30) # Outside merge dist. - ]} - - # Monkeypatch mock class for fixture. - monkeypatch.setattr(AlignmentFile, 'fetch', - lambda self, reference: alignments[reference]) - monkeypatch.setattr(AlignmentFile, 'references', list(alignments.keys())) - - # Monkeypatch class into pysam. - monkeypatch.setattr(pysam, 'AlignmentFile', AlignmentFile) - - -# noinspection PyShadowingNames -# noinspection PyMethodMayBeStatic -class TestLamPcrIdentifier(object): - - def test_identify(self, patch_alignment_file): - identifier = LamPcrIdentifier(merge_distance=10, min_mapq=0) - insertions = identifier.identify('dummy.bam') - - assert len(insertions) == 3 - - def test_identify_mapq(self, patch_alignment_file): - identifier = LamPcrIdentifier(merge_distance=10, min_mapq=37) - insertions = identifier.identify('dummy.bam') - - # Should find two insertions, due to lower mapq of READ6. - assert len(insertions) == 2 - - def test_identify_large_merge(self, patch_alignment_file): - identifier = LamPcrIdentifier(merge_distance=20, min_mapq=0) - insertions = identifier.identify('dummy.bam') - - # Should find two insertions, due to larger merge_dist. - assert len(insertions) == 2 - - def test_identify_no_merge(self, patch_alignment_file): - identifier = LamPcrIdentifier(merge_distance=0, min_mapq=0) - insertions = identifier.identify('dummy.bam') - - # Should find four insertions, due to no merge_dist. 
- assert len(insertions) == 4 \ No newline at end of file diff --git a/test/pipelines/test_shear_splink.py b/test/pipelines/test_shear_splink.py deleted file mode 100644 index 0b33896..0000000 --- a/test/pipelines/test_shear_splink.py +++ /dev/null @@ -1,266 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) - -from collections import namedtuple - -import pytest - -import pysam -import pandas as pd -from skbio import DNASequence - -from pyim.pipelines.shear_splink import \ - ShearSplinkExtractor, ShearSplinkStatus, ShearSplinkIdentifier - - -# --- Extractor --- # - -GENOMIC_SEQ1 = DNASequence('CACTGGCCACGCGAAGGTGC') -GENOMIC_SEQ2 = DNASequence('GACCACTGGCCACGCGAAGG').reverse_complement() -GENOMIC_SEQ3 = DNASequence('CGTTGGTCACTCTACCCACA') - - -@pytest.fixture(scope='module') -def transposon_seq(): - return DNASequence('TTTG', id='transposon') - - -@pytest.fixture(scope='module') -def barcode_seqs(): - return [DNASequence('AAAT', 'BC01'), - DNASequence('AAAA', 'BC02')] - - -@pytest.fixture(scope='module') -def linker_seq(): - return DNASequence('CCCG', id='linker') - - -# noinspection PyShadowingNames -# noinspection PyMethodMayBeStatic -class TestShearSplinkExtractor(object): - - def test_forward(self, transposon_seq, barcode_seqs, linker_seq): - read = DNASequence(str(barcode_seqs[0]) + str(transposon_seq) + - str(GENOMIC_SEQ1) + str(linker_seq)) - - extractor = ShearSplinkExtractor( - transposon_sequence=transposon_seq, - barcode_sequences=barcode_seqs, - linker_sequence=linker_seq) - - result, status = extractor.extract_read(read) - assert status == ShearSplinkStatus.proper_read - assert result is not None - - genomic, barcode = result - assert genomic.sequence == GENOMIC_SEQ1.sequence - assert barcode == barcode_seqs[0].id - - def test_reverse(self, transposon_seq, - barcode_seqs, linker_seq): - read = DNASequence(str(barcode_seqs[0]) + str(transposon_seq) + - str(GENOMIC_SEQ1) + str(linker_seq)) - read = read.reverse_complement() - - extractor = ShearSplinkExtractor( - transposon_sequence=transposon_seq, - barcode_sequences=barcode_seqs, - linker_sequence=linker_seq) - - res, status = extractor.extract_read(read) - assert res is not None - assert status == ShearSplinkStatus.proper_read - - genomic, barcode = res - assert genomic.sequence == GENOMIC_SEQ1.sequence - assert barcode == barcode_seqs[0].id - - def test_missing_linker(self, transposon_seq, barcode_seqs, linker_seq): - read = DNASequence(str(barcode_seqs[0]) + str(transposon_seq) + - str(GENOMIC_SEQ1)) - - extractor = ShearSplinkExtractor( - transposon_sequence=transposon_seq, - barcode_sequences=barcode_seqs, - linker_sequence=linker_seq) - - res, status = extractor.extract_read(read) - assert res is None - assert status == ShearSplinkStatus.no_linker - - def test_missing_barcode(self, transposon_seq, - barcode_seqs, linker_seq): - read = DNASequence(str(transposon_seq) + str(GENOMIC_SEQ1) + - str(linker_seq)) - - extractor = ShearSplinkExtractor( - transposon_sequence=transposon_seq, - barcode_sequences=barcode_seqs, - linker_sequence=linker_seq) - - res, status = extractor.extract_read(read) - assert res is None - assert status == ShearSplinkStatus.no_barcode - - def test_missing_transposon(self, transposon_seq, - barcode_seqs, linker_seq): - read = DNASequence(str(barcode_seqs[0]) + str(GENOMIC_SEQ1) + - str(linker_seq)) - - extractor = 
ShearSplinkExtractor( - transposon_sequence=transposon_seq, - barcode_sequences=barcode_seqs, - linker_sequence=linker_seq) - - res, status = extractor.extract_read(read) - assert res is None - assert status == ShearSplinkStatus.no_transposon - - -# --- Identifier --- # - -AlignedSegment = namedtuple( - 'AlignedSegment', - ['query_name', 'reference_id', 'reference_start', - 'reference_end', 'is_reverse', 'mapping_quality']) - - -class AlignmentFile(object): - - def __init__(self, *args, **kwargs): - pass - - def fetch(self, reference=None): - raise NotImplementedError() - - @property - def references(self): - raise NotImplementedError() - - -@pytest.fixture() -def patch_alignment_file(monkeypatch): - alignments = {'1': [ - AlignedSegment('READ0', '1', 1, 21, True, 40), # 1st group on -1. - AlignedSegment('READ1', '1', 20, 70, False, 40), # Start of 2nd group. - AlignedSegment('READ2', '1', 20, 70, False, 40), - AlignedSegment('READ3', '1', 20, 69, False, 40), - AlignedSegment('READ5', '1', 21, 68, False, 40), # Within merge dist. - AlignedSegment('READ6', '1', 32, 68, False, 30) # Outside merge dist. - ]} - - # Monkeypatch mock class for fixture. - monkeypatch.setattr(AlignmentFile, 'fetch', - lambda self, reference: alignments[reference]) - monkeypatch.setattr(AlignmentFile, 'references', list(alignments.keys())) - - # Monkeypatch class into pysam. - monkeypatch.setattr(pysam, 'AlignmentFile', AlignmentFile) - - -@pytest.fixture() -def aln_barcode_map(): - return { - 'READ0': 'Sample1', - 'READ1': 'Sample1', - 'READ2': 'Sample1', - 'READ3': 'Sample2', - 'READ4': 'Sample2', - 'READ5': 'Sample2', - 'READ6': 'Sample2', - } - - -@pytest.fixture() -def ins_frame_merge(): - return pd.DataFrame({'insertion_id': ['INS_1', 'INS_2'], - 'seqname': ['1', '1'], - 'location': [100, 102], - 'strand': [1, 1], - 'sample': ['S1', 'S1'], - 'depth': [1, 200], - 'depth_unique': [1, 150]}) - - -# noinspection PyShadowingNames -# noinspection PyMethodMayBeStatic -class TestShearSplinkIdentifier(object): - - def test_identify(self, patch_alignment_file): - identifier = ShearSplinkIdentifier(merge_distance=10, min_mapq=0) - insertions = identifier.identify('dummy.bam') - - assert len(insertions) == 3 - - def test_identify_mapq(self, patch_alignment_file): - identifier = ShearSplinkIdentifier(merge_distance=10, min_mapq=37) - insertions = identifier.identify('dummy.bam') - - # Should find two insertions, due to lower mapq of READ6. - assert len(insertions) == 2 - - def test_identify_large_merge(self, patch_alignment_file): - identifier = ShearSplinkIdentifier(merge_distance=20, min_mapq=0) - insertions = identifier.identify('dummy.bam') - - # Should find two insertions, due to larger merge_dist. - assert len(insertions) == 2 - - def test_identify_no_merge(self, patch_alignment_file): - identifier = ShearSplinkIdentifier(merge_distance=0, min_mapq=0) - insertions = identifier.identify('dummy.bam') - - # Should find four insertions, due to no merge_dist. - assert len(insertions) == 4 - - def test_identify_barcodes(self, patch_alignment_file, aln_barcode_map): - identifier = ShearSplinkIdentifier( - merge_distance=10, min_mapq=0, min_depth=0) - insertions = identifier.identify( - 'dummy.bam', barcode_map=aln_barcode_map) - - # Should find four insertions, as reads (1,2) and 3 - # now belong to different samples. - assert len(insertions) == 4 - - # Test insertion membership. 
- assert insertions['sample'].iloc[0] == 'Sample1' - assert insertions['sample'].iloc[1] == 'Sample2' - assert insertions['sample'].iloc[2] == 'Sample1' - assert insertions['sample'].iloc[3] == 'Sample2' - - def test_merge(self, ins_frame_merge): - identifier = ShearSplinkIdentifier() - merged_ins = identifier._merge_insertions(ins_frame_merge) - - assert merged_ins.seqname == '1' - assert merged_ins.location == 102 # Location weighted towards INS_2. - assert merged_ins.strand == 1 - assert merged_ins.sample == 'S1' - assert merged_ins.depth == 201 - assert merged_ins.depth_unique == 151 - - def test_merge_diff_sample(self, ins_frame_merge): - ins_frame_merge.ix[1, 'sample'] = 'S2' - - identifier = ShearSplinkIdentifier() - with pytest.raises(AssertionError): - identifier._merge_insertions(ins_frame_merge) - - def test_merge_diff_strand(self, ins_frame_merge): - ins_frame_merge.ix[1, 'strand'] = -1 - - identifier = ShearSplinkIdentifier() - with pytest.raises(AssertionError): - identifier._merge_insertions(ins_frame_merge) - - def test_merge_diff_seqname(self, ins_frame_merge): - ins_frame_merge.ix[1, 'seqname'] = '2' - - identifier = ShearSplinkIdentifier() - with pytest.raises(AssertionError): - identifier._merge_insertions(ins_frame_merge) diff --git a/test/test_cluster.py b/test/test_cluster.py deleted file mode 100644 index c8ba4de..0000000 --- a/test/test_cluster.py +++ /dev/null @@ -1,100 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) - -import pytest - -import pandas as pd - -from pyim.pipelines._base import genomic_distance -from pyim.cluster import cluster_frame, cluster_frame_merged - - -@pytest.fixture(scope='module') -def test_data(): - frame = pd.DataFrame( - [('1', 10, 1, 10), ('1', 5, 1, 12), - ('1', 40, 1, 20), ('2', 50, 1, 10)], - columns=['seqname', 'location', 'strand', 'value']) - - def merge_func(grp): - ref = grp.iloc[0] - return pd.Series({ - 'seqname': ref.seqname, - 'location': int(grp.location.mean()), - 'strand': ref.strand, - 'value': grp.value.sum() - }, index=grp.columns) - - dist_func = genomic_distance - - return frame, dist_func, merge_func - - -# noinspection PyShadowingNames -# noinspection PyMethodMayBeStatic -class TestClusterFrameMerged(object): - - def test_simple(self, test_data): - frame, dist_func, merge_func = test_data - - res = cluster_frame_merged(frame, dist_func, merge_func, t=10) - - assert isinstance(res, pd.DataFrame) - assert len(res) == 2 - - # Should have grouped the first two entries. - assert tuple(res.iloc[0]) == ('1', 7, 1, 22) - - # Due to omission of groupby, merge should naively have - # merged the last two entries into a single one, using - # the non-summarized entries (seqname, strand) - # from the first row. - assert tuple(res.iloc[1]) == ('1', 45, 1, 30) - - def test_grouping(self, test_data): - frame, dist_func, merge_func = test_data - - res = cluster_frame_merged(frame, dist_func, merge_func, - groupby=['seqname'], t=10) - - assert isinstance(res, pd.DataFrame) - assert len(res) == 3 - - # Should have grouped the first two entries. - assert tuple(res.iloc[0]) == ('1', 7, 1, 22) - - # Should not have grouped the last entries, - # due to the difference in seqname. 
- assert tuple(res.iloc[1]) == ('1', 40, 1, 20) - assert tuple(res.iloc[2]) == ('2', 50, 1, 10) - - def test_single(self, test_data): - frame, dist_func, merge_func = test_data - - res = cluster_frame_merged(frame.iloc[0:1], dist_func, merge_func, - groupby=['seqname'], t=10) - - assert isinstance(res, pd.DataFrame) - assert len(res) == 1 - assert tuple(res.iloc[0]) == ('1', 10, 1, 10) - - def test_empty(self, test_data): - frame, dist_func, merge_func = test_data - - frame = pd.DataFrame.from_records([], columns=frame.columns) - res = cluster_frame_merged(frame, dist_func, merge_func, - groupby=['seqname'], t=10) - - assert isinstance(res, pd.DataFrame) - assert len(res) == 0 - assert all(res.columns == frame.columns) - - def test_invalid_linkage(self, test_data): - frame, dist_func, merge_func = test_data - - with pytest.raises(ValueError): - cluster_frame_merged(frame, dist_func, merge_func, - linkage='whatever', t=10) diff --git a/version.py b/version.py deleted file mode 100644 index 1c404ad..0000000 --- a/version.py +++ /dev/null @@ -1,106 +0,0 @@ -# -*- coding: utf-8 -*- -# Author: Douglas Creager -# This file is placed into the public domain. - -# Calculates the current version number. If possible, this is the -# output of “git describe”, modified to conform to the versioning -# scheme that setuptools uses. If “git describe” returns an error -# (most likely because we're in an unpacked copy of a release tarball, -# rather than in a git working copy), then we fall back on reading the -# contents of the RELEASE-VERSION file. -# -# To use this script, simply import it your setup.py file, and use the -# results of get_git_version() as your package version: -# -# from version import * -# -# setup( -# version=get_git_version(), -# . -# . -# . -# ) -# -# This will automatically update the RELEASE-VERSION file, if -# necessary. Note that the RELEASE-VERSION file should *not* be -# checked into git; please add it to your top-level .gitignore file. -# -# You'll probably want to distribute the RELEASE-VERSION file in your -# sdist tarballs; to do this, just create a MANIFEST.in file that -# contains the following line: -# -# include RELEASE-VERSION - -from __future__ import print_function - -__all__ = ("get_git_version") - -from subprocess import Popen, PIPE - - -def call_git_describe(abbrev=4): - try: - p = Popen(['git', 'describe', '--abbrev=%d' % abbrev], - stdout=PIPE, stderr=PIPE) - p.stderr.close() - line = p.stdout.readlines()[0] - return line.strip().decode() - - except: - return None - - -def read_release_version(): - try: - f = open("RELEASE-VERSION", "r") - - try: - version = f.readlines()[0] - return version.strip() - - finally: - f.close() - - except: - return None - - -def write_release_version(version): - f = open("RELEASE-VERSION", "w") - f.write("%s\n" % version) - f.close() - - -def get_git_version(abbrev=4): - # Read in the version that's currently in RELEASE-VERSION. - - release_version = read_release_version() - - # First try to get the current version using “git describe”. - - version = call_git_describe(abbrev) - - # If that doesn't work, fall back on the value that's in - # RELEASE-VERSION. - - if version is None: - version = release_version - - # If we still don't have anything, that's an error. - - if version is None: - raise ValueError("Cannot find the version number!") - - # If the current version is different from what's in the - # RELEASE-VERSION file, update the file to be current. 
- - if version != release_version: - write_release_version(version) - - # Finally, return the current version. - - return version - - -if __name__ == "__main__": - print(get_git_version()) \ No newline at end of file diff --git a/versioneer.py b/versioneer.py new file mode 100644 index 0000000..7ed2a21 --- /dev/null +++ b/versioneer.py @@ -0,0 +1,1774 @@ + +# Version: 0.16 + +"""The Versioneer - like a rocketeer, but for versions. + +The Versioneer +============== + +* like a rocketeer, but for versions! +* https://github.com/warner/python-versioneer +* Brian Warner +* License: Public Domain +* Compatible With: python2.6, 2.7, 3.3, 3.4, 3.5, and pypy +* [![Latest Version] +(https://pypip.in/version/versioneer/badge.svg?style=flat) +](https://pypi.python.org/pypi/versioneer/) +* [![Build Status] +(https://travis-ci.org/warner/python-versioneer.png?branch=master) +](https://travis-ci.org/warner/python-versioneer) + +This is a tool for managing a recorded version number in distutils-based +python projects. The goal is to remove the tedious and error-prone "update +the embedded version string" step from your release process. Making a new +release should be as easy as recording a new tag in your version-control +system, and maybe making new tarballs. + + +## Quick Install + +* `pip install versioneer` to somewhere to your $PATH +* add a `[versioneer]` section to your setup.cfg (see below) +* run `versioneer install` in your source tree, commit the results + +## Version Identifiers + +Source trees come from a variety of places: + +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI + +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: + +* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step + +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes. + +The version identifier is used for multiple purposes: + +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball + +## Theory of Operation + +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. 
This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. + +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. + +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. + +## Installation + +First, decide on values for the following configuration variables: + +* `VCS`: the version control system you use. Currently accepts "git". + +* `style`: the style of version string to be produced. See "Styles" below for + details. Defaults to "pep440", which looks like + `TAG[+DISTANCE.gSHORTHASH[.dirty]]`. + +* `versionfile_source`: + + A project-relative pathname into which the generated version strings should + be written. This is usually a `_version.py` next to your project's main + `__init__.py` file, so it can be imported at runtime. If your project uses + `src/myproject/__init__.py`, this should be `src/myproject/_version.py`. + This file should be checked in to your VCS as usual: the copy created below + by `setup.py setup_versioneer` will include code that parses expanded VCS + keywords in generated tarballs. The 'build' and 'sdist' commands will + replace it with a copy that has just the calculated version string. + + This must be set even if your project does not have any modules (and will + therefore never import `_version.py`), since "setup.py sdist" -based trees + still need somewhere to record the pre-calculated version strings. Anywhere + in the source tree should do. If there is a `__init__.py` next to your + `_version.py`, the `setup.py setup_versioneer` command (described below) + will append some `__version__`-setting assignments, if they aren't already + present. + +* `versionfile_build`: + + Like `versionfile_source`, but relative to the build directory instead of + the source directory. These will differ when your setup.py uses + 'package_dir='. If you have `package_dir={'myproject': 'src/myproject'}`, + then you will probably have `versionfile_build='myproject/_version.py'` and + `versionfile_source='src/myproject/_version.py'`. + + If this is set to None, then `setup.py build` will not attempt to rewrite + any `_version.py` in the built tree. If your project does not have any + libraries (e.g. if it only builds a script), then you should use + `versionfile_build = None`. To actually use the computed version string, + your `setup.py` will need to override `distutils.command.build_scripts` + with a subclass that explicitly inserts a copy of + `versioneer.get_version()` into your script file. See + `test/demoapp-script-only/setup.py` for an example. + +* `tag_prefix`: + + a string, like 'PROJECTNAME-', which appears at the start of all VCS tags. + If your tags look like 'myproject-1.2.0', then you should use + tag_prefix='myproject-'. If you use unprefixed tags like '1.2.0', this + should be an empty string, using either `tag_prefix=` or `tag_prefix=''`. 
+
+* `parentdir_prefix`:
+
+  an optional string, frequently the same as tag_prefix, which appears at the
+  start of all unpacked tarball filenames. If your tarball unpacks into
+  'myproject-1.2.0', this should be 'myproject-'. To disable this feature,
+  just omit the field from your `setup.cfg`.
+
+This tool provides one script, named `versioneer`. That script has one mode,
+"install", which writes a copy of `versioneer.py` into the current directory
+and runs `versioneer.py setup` to finish the installation.
+
+To versioneer-enable your project:
+
+* 1: Modify your `setup.cfg`, adding a section named `[versioneer]` and
+  populating it with the configuration values you decided earlier (note that
+  the option names are not case-sensitive):
+
+  ````
+  [versioneer]
+  VCS = git
+  style = pep440
+  versionfile_source = src/myproject/_version.py
+  versionfile_build = myproject/_version.py
+  tag_prefix =
+  parentdir_prefix = myproject-
+  ````
+
+* 2: Run `versioneer install`. This will do the following:
+
+  * copy `versioneer.py` into the top of your source tree
+  * create `_version.py` in the right place (`versionfile_source`)
+  * modify your `__init__.py` (if one exists next to `_version.py`) to define
+    `__version__` (by calling a function from `_version.py`)
+  * modify your `MANIFEST.in` to include both `versioneer.py` and the
+    generated `_version.py` in sdist tarballs
+
+  `versioneer install` will complain about any problems it finds with your
+  `setup.py` or `setup.cfg`. Run it multiple times until you have fixed all
+  the problems.
+
+* 3: add an `import versioneer` to your setup.py, and add the following
+  arguments to the setup() call:
+
+        version=versioneer.get_version(),
+        cmdclass=versioneer.get_cmdclass(),
+
+* 4: commit these changes to your VCS. To make sure you won't forget,
+  `versioneer install` will mark everything it touched for addition using
+  `git add`. Don't forget to add `setup.py` and `setup.cfg` too.
+
+## Post-Installation Usage
+
+Once established, all uses of your tree from a VCS checkout should get the
+current version string. All generated tarballs should include an embedded
+version string (so users who unpack them will not need a VCS tool installed).
+
+If you distribute your project through PyPI, then the release process should
+boil down to two steps:
+
+* 1: git tag 1.0
+* 2: python setup.py register sdist upload
+
+If you distribute it through github (i.e. users use github to generate
+tarballs with `git archive`), the process is:
+
+* 1: git tag 1.0
+* 2: git push; git push --tags
+
+Versioneer will report "0+untagged.NUMCOMMITS.gHASH" until your tree has at
+least one tag in its history.
+
+## Version-String Flavors
+
+Code which uses Versioneer can learn about its version string at runtime by
+importing `_version` from your main `__init__.py` file and running the
+`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can
+import the top-level `versioneer.py` and run `get_versions()`.
+
+Both functions return a dictionary with different flavors of version
+information:
+
+* `['version']`: A condensed version string, rendered using the selected
+  style. This is the most commonly used value for the project's version
+  string. The default "pep440" style yields strings like `0.11`,
+  `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section
+  below for alternative styles.
+
+* `['full-revisionid']`: detailed revision identifier. For Git, this is the
+  full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac".
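A minimal sketch of what this dictionary can look like, assuming a versioneer-enabled project that is two commits past a "0.11" tag with uncommitted changes (the same example values quoted above; actual values depend on your tags and tree state):

    import versioneer

    info = versioneer.get_versions()
    # Under the assumption above, with the default "pep440" style:
    # {'version': '0.11+2.g1076c97.dirty',
    #  'full-revisionid': '1076c978a8d3cfc70f408fe5974aa6c092c949ac',
    #  'dirty': True,
    #  'error': None}
    print(info['version'])  # -> 0.11+2.g1076c97.dirty

The remaining entries, `['dirty']` and `['error']`, are described next.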
+ +* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None + +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". + +Some variants are more useful than others. Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. + +The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions + +## Styles + +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. + +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the +tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and +that this commit is two revisions ("+2") beyond the "0.11" tag. For released +software (exactly equal to a known tag), the identifier will only contain the +stripped tag, e.g. "0.11". + +Other styles are available. See details.md in the Versioneer source tree for +descriptions. + +## Debugging + +Versioneer tries to avoid fatal errors: if something goes wrong, it will tend +to return a version of "0+unknown". To investigate the problem, run `setup.py +version`, which will run the version-lookup code in a verbose mode, and will +display the full contents of `get_versions()` (including the `error` string, +which may help identify what went wrong). + +## Updating Versioneer + +To upgrade your project to a new release of Versioneer, do the following: + +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes +* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files + +### Upgrading to 0.16 + +Nothing special. + +### Upgrading to 0.15 + +Starting with this version, Versioneer is configured with a `[versioneer]` +section in your `setup.cfg` file. Earlier versions required the `setup.py` to +set attributes on the `versioneer` module immediately after import. The new +version will refuse to run (raising an exception during import) until you +have provided the necessary `setup.cfg` section. + +In addition, the Versioneer package provides an executable named +`versioneer`, and the installation process is driven by running `versioneer +install`. In 0.14 and earlier, the executable was named +`versioneer-installer` and was run without an argument. + +### Upgrading to 0.14 + +0.14 changes the format of the version string. 
0.13 and earlier used
+hyphen-separated strings like "0.11-2-g1076c97-dirty". 0.14 and beyond use a
+plus-separated "local version" section, with dot-separated
+components, like "0.11+2.g1076c97". PEP440-strict tools did not like the old
+format, but should be ok with the new one.
+
+### Upgrading from 0.11 to 0.12
+
+Nothing special.
+
+### Upgrading from 0.10 to 0.11
+
+You must add a `versioneer.VCS = "git"` to your `setup.py` before re-running
+`setup.py setup_versioneer`. This will enable the use of additional
+version-control systems (SVN, etc) in the future.
+
+## Future Directions
+
+This tool is designed to be easily extended to other version-control
+systems: all VCS-specific components are in separate directories like
+src/git/ . The top-level `versioneer.py` script is assembled from these
+components by running make-versioneer.py . In the future, make-versioneer.py
+will take a VCS name as an argument, and will construct a version of
+`versioneer.py` that is specific to the given VCS. It might also take the
+configuration arguments that are currently provided manually during
+installation by editing setup.py . Alternatively, it might go the other
+direction and include code from all supported VCS systems, reducing the
+number of intermediate scripts.
+
+
+## License
+
+To make Versioneer easier to embed, all its code is dedicated to the public
+domain. The `_version.py` that it creates is also in the public domain.
+Specifically, both are released under the Creative Commons "Public Domain
+Dedication" license (CC0-1.0), as described in
+https://creativecommons.org/publicdomain/zero/1.0/ .
+
+"""
+
+from __future__ import print_function
+try:
+    import configparser
+except ImportError:
+    import ConfigParser as configparser
+import errno
+import json
+import os
+import re
+import subprocess
+import sys
+
+
+class VersioneerConfig:
+    """Container for Versioneer configuration parameters."""
+
+
+def get_root():
+    """Get the project root directory.
+
+    We require that all commands are run from the project root, i.e. the
+    directory that contains setup.py, setup.cfg, and versioneer.py .
+    """
+    root = os.path.realpath(os.path.abspath(os.getcwd()))
+    setup_py = os.path.join(root, "setup.py")
+    versioneer_py = os.path.join(root, "versioneer.py")
+    if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
+        # allow 'python path/to/setup.py COMMAND'
+        root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0])))
+        setup_py = os.path.join(root, "setup.py")
+        versioneer_py = os.path.join(root, "versioneer.py")
+    if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
+        err = ("Versioneer was unable to find the project root directory. "
+               "Versioneer requires setup.py to be executed from "
+               "its immediate directory (like 'python setup.py COMMAND'), "
+               "or in a way that lets it use sys.argv[0] to find the root "
+               "(like 'python path/to/setup.py COMMAND').")
+        raise VersioneerBadRootError(err)
+    try:
+        # Certain runtime workflows (setup.py install/develop in a setuptools
+        # tree) execute all dependencies in a single python process, so
+        # "versioneer" may be imported multiple times, and python's shared
+        # module-import table will cache the first one. So we can't use
+        # os.path.dirname(__file__), as that will find whichever
+        # versioneer.py was first imported, even in later projects.
+ me = os.path.realpath(os.path.abspath(__file__)) + if os.path.splitext(me)[0] != os.path.splitext(versioneer_py)[0]: + print("Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py)) + except NameError: + pass + return root + + +def get_config_from_root(root): + """Read the project setup.cfg file to determine Versioneer config.""" + # This might raise EnvironmentError (if setup.cfg is missing), or + # configparser.NoSectionError (if it lacks a [versioneer] section), or + # configparser.NoOptionError (if it lacks "VCS="). See the docstring at + # the top of versioneer.py for instructions on writing your setup.cfg . + setup_cfg = os.path.join(root, "setup.cfg") + parser = configparser.SafeConfigParser() + with open(setup_cfg, "r") as f: + parser.readfp(f) + VCS = parser.get("versioneer", "VCS") # mandatory + + def get(parser, name): + if parser.has_option("versioneer", name): + return parser.get("versioneer", name) + return None + cfg = VersioneerConfig() + cfg.VCS = VCS + cfg.style = get(parser, "style") or "" + cfg.versionfile_source = get(parser, "versionfile_source") + cfg.versionfile_build = get(parser, "versionfile_build") + cfg.tag_prefix = get(parser, "tag_prefix") + if cfg.tag_prefix in ("''", '""'): + cfg.tag_prefix = "" + cfg.parentdir_prefix = get(parser, "parentdir_prefix") + cfg.verbose = get(parser, "verbose") + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + +# these dictionaries contain VCS-specific tools +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + return None + return stdout +LONG_VERSION_PY['git'] = ''' +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. 
Generated by +# versioneer-0.16 (https://github.com/warner/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). + git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY = {} +HANDLERS = {} + + +def register_vcs_handler(vcs, method): # decorator + """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): + """Call the given command(s).""" + assert isinstance(commands, list) + p = None + for c in commands: + try: + dispcmd = str([c] + args) + # remember shell=False, so use git.cmd on windows, not just git + p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except EnvironmentError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None + stdout = p.communicate()[0].strip() + if sys.version_info[0] >= 3: + stdout = stdout.decode() + if p.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + return None + return stdout + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes + both the project name and a version string. + """ + dirname = os.path.basename(root) + if not dirname.startswith(parentdir_prefix): + if verbose: + print("guessing rootdir is '%%s', but '%%s' doesn't start with " + "prefix '%%s'" %% (root, dirname, parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None} + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. 
This function is not used from + # _version.py. + keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs-tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None + } + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags"} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + if not os.path.exists(os.path.join(root, ".git")): + if verbose: + print("no .git in %%s" %% root) + raise NotThisMethod("no .git directory") + + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%%s*" %% tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 
0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%%d" %% pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%%d" %% pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"]} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. 
If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. + for i in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree"} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version"} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + f = open(versionfile_abs, "r") + for line in f.readlines(): + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + f.close() + except EnvironmentError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if not keywords: + raise NotThisMethod("no keywords at all, weird") + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = set([r.strip() for r in refnames.strip("()").split(",")]) + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = set([r for r in refs if re.search(r'\d', r)]) + if verbose: + print("discarding '%s', no digits" % ",".join(refs-tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. 
"2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + if verbose: + print("picking %s" % r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None + } + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags"} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + if not os.path.exists(os.path.join(root, ".git")): + if verbose: + print("no .git in %s" % root) + raise NotThisMethod("no .git directory") + + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out = run_command(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", "%s*" % tag_prefix], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparseable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%s'" + % describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" + % (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], + cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + return pieces + + +def do_vcs_install(manifest_in, versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + + For Git, this means creating/changing .gitattributes to mark _version.py + for export-time keyword substitution. 
+ """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [manifest_in, versionfile_source] + if ipy: + files.append(ipy) + try: + me = __file__ + if me.endswith(".pyc") or me.endswith(".pyo"): + me = os.path.splitext(me)[0] + ".py" + versioneer_file = os.path.relpath(me) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + f = open(".gitattributes", "r") + for line in f.readlines(): + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + f.close() + except EnvironmentError: + pass + if not present: + f = open(".gitattributes", "a+") + f.write("%s export-subst\n" % versionfile_source) + f.close() + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes + both the project name and a version string. + """ + dirname = os.path.basename(root) + if not dirname.startswith(parentdir_prefix): + if verbose: + print("guessing rootdir is '%s', but '%s' doesn't start with " + "prefix '%s'" % (root, dirname, parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None} + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.16) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. + +import json +import sys + +version_json = ''' +%s +''' # END VERSION_JSON + + +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except EnvironmentError: + raise NotThisMethod("unable to read _version.py") + mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, re.M | re.S) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps(versions, sort_keys=True, + indent=1, separators=(",", ": ")) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set %s to '%s'" % (filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_pre(pieces): + """TAG[.post.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post.devDISTANCE + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += ".post.dev%d" % pieces["distance"] + else: + # exception #1 + rendered = "0.post.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Eexceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"]} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None} + + +class VersioneerBadRootError(Exception): + """The project root directory is unknown or missing key files.""" + + +def get_versions(verbose=False): + """Get the project version from whatever source is available. + + Returns dict with two keys: 'version' and 'full'. + """ + if "versioneer" in sys.modules: + # see the discussion in cmdclass.py:get_cmdclass() + del sys.modules["versioneer"] + + root = get_root() + cfg = get_config_from_root(root) + + assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" + handlers = HANDLERS.get(cfg.VCS) + assert handlers, "unrecognized VCS '%s'" % cfg.VCS + verbose = verbose or cfg.verbose + assert cfg.versionfile_source is not None, \ + "please set versioneer.versionfile_source" + assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" + + versionfile_abs = os.path.join(root, cfg.versionfile_source) + + # extract version from first of: _version.py, VCS command (e.g. 'git + # describe'), parentdir. This is meant to work for developers using a + # source checkout, for users of a tarball created by 'setup.py sdist', + # and for users of a tarball/zipball created by 'git archive' or github's + # download-from-tag feature or the equivalent in other VCSes. 
+ + get_keywords_f = handlers.get("get_keywords") + from_keywords_f = handlers.get("keywords") + if get_keywords_f and from_keywords_f: + try: + keywords = get_keywords_f(versionfile_abs) + ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) + if verbose: + print("got version from expanded keyword %s" % ver) + return ver + except NotThisMethod: + pass + + try: + ver = versions_from_file(versionfile_abs) + if verbose: + print("got version from file %s %s" % (versionfile_abs, ver)) + return ver + except NotThisMethod: + pass + + from_vcs_f = handlers.get("pieces_from_vcs") + if from_vcs_f: + try: + pieces = from_vcs_f(cfg.tag_prefix, root, verbose) + ver = render(pieces, cfg.style) + if verbose: + print("got version from VCS %s" % ver) + return ver + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + if verbose: + print("got version from parentdir %s" % ver) + return ver + except NotThisMethod: + pass + + if verbose: + print("unable to compute version") + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, "error": "unable to compute version"} + + +def get_version(): + """Get the short version string for this project.""" + return get_versions()["version"] + + +def get_cmdclass(): + """Get the custom setuptools/distutils subclasses used by Versioneer.""" + if "versioneer" in sys.modules: + del sys.modules["versioneer"] + # this fixes the "python setup.py develop" case (also 'install' and + # 'easy_install .'), in which subdependencies of the main project are + # built (using setup.py bdist_egg) in the same python process. Assume + # a main project A and a dependency B, which use different versions + # of Versioneer. A's setup.py imports A's Versioneer, leaving it in + # sys.modules by the time B's setup.py is executed, causing B to run + # with the wrong versioneer. Setuptools wraps the sub-dep builds in a + # sandbox that restores sys.modules to it's pre-build state, so the + # parent is protected against the child's "import versioneer". By + # removing ourselves from sys.modules here, before the child build + # happens, we protect the child from the parent's versioneer too. + # Also see https://github.com/warner/python-versioneer/issues/52 + + cmds = {} + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + if vers["error"]: + print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ? 
+ + # we override different "build_py" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.build_py import build_py as _build_py + else: + from distutils.command.build_py import build_py as _build_py + + class cmd_build_py(_build_py): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_py.run(self) + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if cfg.versionfile_build: + target_versionfile = os.path.join(self.build_lib, + cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py + + if "cx_Freeze" in sys.modules: # cx_freeze enabled? + from cx_Freeze.dist import build_exe as _build_exe + + class cmd_build_exe(_build_exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _build_exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % + {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + cmds["build_exe"] = cmd_build_exe + del cmds["build_py"] + + # we override different "sdist" commands for both environments + if "setuptools" in sys.modules: + from setuptools.command.sdist import sdist as _sdist + else: + from distutils.command.sdist import sdist as _sdist + + class cmd_sdist(_sdist): + def run(self): + versions = get_versions() + self._versioneer_generated_versions = versions + # unless we update this, the command will keep using the old + # version + self.distribution.metadata.version = versions["version"] + return _sdist.run(self) + + def make_release_tree(self, base_dir, files): + root = get_root() + cfg = get_config_from_root(root) + _sdist.make_release_tree(self, base_dir, files) + # now locate _version.py in the new base_dir directory + # (remembering that it may be a hardlink) and replace it with an + # updated value + target_versionfile = os.path.join(base_dir, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, + self._versioneer_generated_versions) + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. You need +a section like: + + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- + +You will also need to edit your setup.py to use the results: + + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) + +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. 
+ +[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = + +""" + +INIT_PY_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + + +def do_setup(): + """Main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except (EnvironmentError, configparser.NoSectionError, + configparser.NoOptionError) as e: + if isinstance(e, (EnvironmentError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", + file=sys.stderr) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write(LONG % {"DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + }) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), + "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy, "r") as f: + old = f.read() + except EnvironmentError: + old = "" + if INIT_PY_SNIPPET not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(INIT_PY_SNIPPET) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. + manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in, "r") as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except EnvironmentError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. + if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print(" appending versionfile_source ('%s') to MANIFEST.in" % + cfg.versionfile_source) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-time keyword + # substitution. 
+ do_vcs_install(manifest_in, cfg.versionfile_source, ipy) + return 0 + + +def scan_setup_py(): + """Validate the contents of setup.py against Versioneer's expectations.""" + found = set() + setters = False + errors = 0 + with open("setup.py", "r") as f: + for line in f.readlines(): + if "import versioneer" in line: + found.add("import") + if "versioneer.get_cmdclass()" in line: + found.add("cmdclass") + if "versioneer.get_version()" in line: + found.add("get_version") + if "versioneer.VCS" in line: + setters = True + if "versioneer.versionfile_source" in line: + setters = True + if len(found) != 3: + print("") + print("Your setup.py appears to be missing some important items") + print("(but I might be wrong). Please make sure it has something") + print("roughly like the following:") + print("") + print(" import versioneer") + print(" setup( version=versioneer.get_version(),") + print(" cmdclass=versioneer.get_cmdclass(), ...)") + print("") + errors += 1 + if setters: + print("You should remove lines like 'versioneer.VCS = ' and") + print("'versioneer.versionfile_source = ' . This configuration") + print("now lives in setup.cfg, and should be removed from setup.py") + print("") + errors += 1 + return errors + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "setup": + errors = do_setup() + errors += scan_setup_py() + if errors: + sys.exit(1) From eebaf901da0d3fbedecd3607d9633287c5936b91 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 1 Nov 2016 12:32:41 +0100 Subject: [PATCH 076/100] Intermediate version before new release. commit b31d795a631d76b0bb1cac7e26ba631e81475ae7 Author: Julian de Ruiter Date: Tue Nov 1 12:31:58 2016 +0100 Refactoring of pipelines. commit 86ae3673c128e40301d38952841618bb9c78a60a Author: Julian de Ruiter Date: Thu Sep 1 13:06:25 2016 +0200 Add docs + grouping. commit 318dfded36c50aaca45056df6649837c6c681d51 Author: Julian de Ruiter Date: Mon Aug 8 16:20:07 2016 +0200 Updates to annotator. 
--- .gitignore | 1 + ATP1_Cassette.dna | Bin 0 -> 29957 bytes data/sb.barcodes.fa.fai | 201 +++++++++++++ docs/Makefile | 51 +++- docs/api.rst | 53 ++++ docs/conf.py | 73 +++-- docs/index.rst | 10 +- docs/introduction.rst | 14 + docs/make.bat | 29 +- setup.py | 1 + shear_splink_cutadapt.py | 38 +++ src/pyim/{alignment => align}/__init__.py | 0 src/pyim/{alignment => align}/bowtie2.py | 0 .../pipelines => align/common}/__init__.py | 0 src/pyim/align/common/cutadapt.py | 107 +++++++ .../_helpers => align/pipelines}/__init__.py | 0 src/pyim/align/pipelines/_helpers/__init__.py | 0 .../pipelines/_helpers/clustering.py | 0 .../pipelines/_helpers/grouping.py | 96 +++++++ .../pipelines/_helpers/pipeline.py | 0 .../{alignment => align}/pipelines/_model.py | 0 .../{alignment => align}/pipelines/lam_pcr.py | 0 .../pipelines/shear_splink.py | 0 .../pipelines/shear_splink_sb.py | 0 src/pyim/{alignment => align}/vector.py | 0 src/pyim/annotation/__init__.py | 2 + src/pyim/annotation/_registry.py | 10 + src/pyim/annotation/annotator/__init__.py | 6 +- src/pyim/annotation/annotator/window.py | 268 ++++++------------ src/pyim/annotation/metadata.py | 8 +- src/pyim/cis/cimpl.py | 60 ++-- src/pyim/main/annotate.py | 53 +++- src/pyim/main/merge.py | 86 +++--- src/pyim/main/merge_sets.py | 86 ++++++ src/pyim/main/plot.py | 96 ------- src/pyim/model.py | 92 ++++++ 36 files changed, 1038 insertions(+), 403 deletions(-) create mode 100644 ATP1_Cassette.dna create mode 100644 data/sb.barcodes.fa.fai create mode 100644 docs/api.rst create mode 100644 docs/introduction.rst create mode 100644 shear_splink_cutadapt.py rename src/pyim/{alignment => align}/__init__.py (100%) rename src/pyim/{alignment => align}/bowtie2.py (100%) rename src/pyim/{alignment/pipelines => align/common}/__init__.py (100%) create mode 100644 src/pyim/align/common/cutadapt.py rename src/pyim/{alignment/pipelines/_helpers => align/pipelines}/__init__.py (100%) create mode 100644 src/pyim/align/pipelines/_helpers/__init__.py rename src/pyim/{alignment => align}/pipelines/_helpers/clustering.py (100%) rename src/pyim/{alignment => align}/pipelines/_helpers/grouping.py (52%) rename src/pyim/{alignment => align}/pipelines/_helpers/pipeline.py (100%) rename src/pyim/{alignment => align}/pipelines/_model.py (100%) rename src/pyim/{alignment => align}/pipelines/lam_pcr.py (100%) rename src/pyim/{alignment => align}/pipelines/shear_splink.py (100%) rename src/pyim/{alignment => align}/pipelines/shear_splink_sb.py (100%) rename src/pyim/{alignment => align}/vector.py (100%) create mode 100644 src/pyim/annotation/_registry.py create mode 100644 src/pyim/main/merge_sets.py delete mode 100644 src/pyim/main/plot.py create mode 100644 src/pyim/model.py diff --git a/.gitignore b/.gitignore index f17f4f9..6d26c7e 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ htmlcov _legacy docs/_build RELEASE-VERSION +/.vscode diff --git a/ATP1_Cassette.dna b/ATP1_Cassette.dna new file mode 100644 index 0000000000000000000000000000000000000000..5fbe90d117a968f764c236ba36753f6e7d123516 GIT binary patch literal 29957 zcmdsf1$-9A_UP=+`tGdZ?(Xgq0tDCK8bW{|0YcD1u|lC}u_A@y6ewOO?ydz21qxJX zu>z$=fkJub%zht)-roPe_ucpY_xIi=`|Zq`xZJkMTi3 zL>+++Bm)}SfNYUEDHg?`HfSzqj$JO4BcdLh9>+ieFyeHG%)^V40*rt!bdVZ(aqXZ7 zK}8clCb|Rl=S@rnP7GZ%qbnu|`X@vh7{R+Bm&-%XaP`o!XfNl*iJ%2?xTT;o;vpv# zh<<|>cnk)D0}~K|+zG%3m;n<|23bNYAVnPR9}vYgFM%+?L-SBozy(Xta=_#uZWOl(#sLvj zK?BiT^Z-v4AO{tKah@NjKK!DJzzrmV!gvY%!X3iJ!T{i*RVXn~CDcGL0Fw=JCWy&> zk9Gv{LNPEF{zV$J1UYj1xKjWJeUHY0*=QPXAP#atZaA+{22Vn+7+`T2cgsJs9!UTH 
zH=oBD5|u|Dyb2BQ3fhM3NHfgg+*3c=d|W;;-Ud?bMu4x<>HFrXGlz&0iYHiHpN3FskzBo5keL8e?3 zas(v;7iFM4e8MrmyJIz~E$I zerN;-y;KXhy(SR3BV&*QGC>mWp!(dSLE3X~;!*^L@u`Js!w(!l!>?RH4!{re1st3+ zGUO*-xPp8HZN`9u7_gszzWe|+@CPu05k4IHKo`FT4Uk|mp_2g#e^FtO1y>XTzwiTU z@+AwTqf-L604ShH{>fd9lbUOYANYg6{6tq_uAnv`N*uV^g0XiBPeBf9}AU7!BjpL{e z7ta}D2Z=aOP9AXLU~C5HxN<{_iwPkFx`y-wbgl@;=0=3{K-O>~C3pq&<_E9{wZYk7 z9|Rdd0SB%YeqgA%Vu3Dz7Lv?ma$OM%8hnIHK~$tclY$@zeL&%`@COnFdf@;!GUx|w zAV&u9i`oU=3IV{5P#SXp#1U}u6DAung#)J<=1QP2pDV}^eS_;kC>4M+_!RR81#v#% zKI(%S2E0)RP!3mFZbb0Or%T`wj>q|9jF1V(3){sViLMJ=6=uvmkNJzL^G{SCnPVgZ ztFRL!JdOo}ArN4pK^⁢5fGh69eZPjs<$7aX1JzzyPQRsPM^IA{8Fc!D)$#aP@JO zM+$Hy;sz!^|D0%{BTqcMCD2(=J|xPXZb>>@hq zib`|qU;vax8QcPFVD_SDl!sE$?Wh~BfjsnFGp-I-1eHaWKpBJ@P=N?V1T7dxQozF1 zicb!-5H$fR5CI+Rf^J+c8jK#~V9*N(@dK5ICeQ&VY(oz<{0B>b6_=00(1s7R4w-X} zxawSI)GuTfM*=Yj0s~Psq()0P6K)zRg>dv%$aVmshfohd<~CvcAULQta0~q+YdFy9 z;9s;SxNss&YQRUkxuZ~fWPx~qfESD~1_yx{OK>A-iI>2m=zc)ubf_Q%oJ#=)AnVlu zqL2&}#)!cM5ut@D0~V$qM}|+p1WDW(oH6GI2r$3}b55u;XvaAqlb1gLLRr8AhLI^( z3K9ryKzv|=hVa!FO@o62Q72T8YaZADUdPy>ThK@t;w z1a9SOqDE*rp9$Q5fj?1itFgEt236hyfx_ zh{+Vv2@DIwpg0&rdQOB5R2x~riBUj1xCYSXT^Ft$+cvf`ffOI$re7IKuA4WMxydWD874Qd4rkXun++dd7ET+7 zD(8s^;Fch8Vb29300UVgBIpH)g%?yF{e@Et8~{LM1)msTL_~MPD1b3ffdIQ`DD(n4 zuog|_Q1B$OLnnaKc$82Sh6e|DR8SJQ0=pR!**DIq({3@F3LySkv;>Qh2YE$ zC&Ex4#6^E0Z{&$MXd~){w8#f(@rrtfeTzOtT{6O%3r@sA9gt60KWs~(e%R(Pt_*O* zN4}^(%0u~xhbPKLUBY^xPG}#VIEHfb!HF`^PNYGb!ulc%`5`XyKtCaEemK!)q{9>M?pJ z{SAGKeqR4d|62c5f29AWKh{MfwUNWfY2-3;8%2#`##_cz<9%bDvEJBV>^1for;KyP zH^zPAf$_7k-TcvfZvJ7~mSee=XQi>SSuZTY#uXuhfU?uj*4(&=k$ohG<`EkF;`nb$zw|iT;iLL@#U9 zFvb|~8uN|S#xdixao)IQ+%kSKYM7JF8RiyqkNK0C*2-k{x4N*NY^Lox#hm3%dU8PM zFTE)x$;0HLN=j|G7NIxShv^seZ;girHM^Ke=1#LQD`-!!C)=y+Db78oweOIRg!jcq z;%q7zADQdTTFy4-2gh;KxdYs9+-I&uz7?m?#Z+W@*cx_{{bYA_esFtwV|4iwbPS9(m?rRrMH?+yQFp2Z|V>Av_@W|gfZOMWb85y81u~~ z>_hggo!-ghG;|g?N8Rk6MMlyk(qj1&xt6kD(bdxCS?dd{7}IQ@v?i5l0_`b{lzPY` zWKGXujkGqg4t6hll>NTF#Qw}a3;NU}y+oC+mp_-&DDi59F_cYUzMaD_LC&dPtM}9& z)ko^@s;H@&rFmLrt(aCj4j4- z(B@vwAGNvB4t*|crO zhh{o!fHl{eWqVF>r>9fh%|G3|z?tgYce;BzNu@=Yt=Sa2lhe)X<*w%okUP z55Nwv)>p;lAZgbtUg3?=x>EaPC3o&`0!F z>7=~SdT7mI51oPTO!tQS++FRfYovzwoj8^L zEU#tVoQLjMUm;8A7cyme*(r9y^VlQhI&EFD zMnO(HVm2{C%tKGmTlBbeQ@Sr-bIbd+eV4o~UKXE=ndwvc17)jHTHUWTFdwpyoDyDo zKi)s!GtyS+suWZYYo)CL_AvXAUDdhl6n1;L7u+<1@24Nr(-|3zl16dMbQ(ELoM@-H)6!|@ z=I}C;4B`Z_Fg>KE*Jf+=^*B9V@2|hD@77P~xAY%%-^gT)FeVzyje|xdbAfrp9BPfT zEOwVg+GQL{-k|-Q6J9D(g4`fwg~HNQX{l65o+2-ib)~lQu5v&z)Zeu_Mj5j>%jX^O z1X%MLll($kaTMJuzomSwG*(xuSFH|iYA=^p$-C`oBr{1VTvUEga;s75IZL-4l1Eq~ z>=oV;Gtl1j9j&V|${J%i_Go*leZgjq504q|3J(NBOr*7?TJj6IymC@&ZO^w$xv}mZ z?}3-cKL>uPPTG(`WCK|*oEM%6<;DJD75WYBB8`&1mg3}*aurS2)9ML!e%JOGX+nER zJ>@J)31yt}hMLoAW_`>qvW|8k-y|!A)$#`UlH6S#rByS0^MrZA$_Wy(3tfcn!eK!X z^NI=LdQqckl|jlHg{Uo5$sTJzcAI!}yl0;8Z-&gTPbOItSqgiNeZU#)DWs)&*&N0i z+ee*5zY>{A&#U*Y!AE6-giE6viY6yt|T2-pQ%i%r}fm%=zr*$%qr$AGY6|-clAfWN?VMSRmN$@ zwS0P4y@x(fAEO`9T_cMz-8g0zwOoIbpN)K@eyiS8i)$sdzFI$RskT-V^%441eW|`i z->C1^PwAKR^2TiAk)fDr%_wt_In`WZo;6onYpjawH#WgfNA8G6=~H?{dLo@ys;fuT z>{=mXyOGnZZLTz*n(3{h)?=%T{hocv?&Y-e;=H{O8&MoUD@zllO7aAGpf%5`%WAkE z_&TXfGKsCEmU4R4cS<ST>(MV@2$$_6obOv)GyBGBRJt zB4w7_sr4+IC9sY504K9s$IawT^3#wi!fbJixJTSaH7S?0M^fdS@_zX%d4wwI+w{Rk zKI^7c-#%?`a#FZ0y)ihM#jSq}FMrEtF^@0_*KXg8G z8hD9bJ3kYdEwrFr=$mvm-6<{C)@!5mYx)x7L*uY6eT`=05YD zsaZ9xHr6_8hZW7*v#(heJGcF&(*$%lCY}-R$hnn9N=GF@Ij;PycxpB^N$sdj*XC-Q zv`@4%rRDGwg}ZSMC{zBNK0kzly8Pb!Kih$&PZqcOE#uyFEOgv?4vEfzk-6 zyF5T1F0Yd>$_Z*(?Q?CXQIx6nJo~(z%}Yr>rW>V0()X&XHPlQawK>3CX>GHrvC4KW zn>YoXq0YNbMmN%RNouX2R?UpGR#@d&8}f!|Y89gsuv16Uy 
zj^Zc49jmH%oF0>YU{~G!a2Jgi)6x&bRZQVnz?v;V8|Av$irbv@z zO_{1(P#!3`)SBvXcvcxm77OcyJ;LvTE+&bi#m4k&T1)y$ijx13Gb_cF%32rW0xRh< zZ?|{fD+qG332{Qa@S$)>ki|S=b9JP4-1^8q;>`Ab2Y((@3u`^JKH61H&|B)m^(Fe} zx@=A`=b9B*19pwY`oqDd@5TP~2%R9kBNsFhjFhaj-Px<}FMt)Yvhcp}p150lBIc*% zrE>B<`Aa#$`pznBZ?I3<{hf`@dB=Ary7j%baJN4yHmAdAE~THEQ!A=xHFKNY%Eq~YsWI%x$Iw^8lFcs(GAjfav9w))|y+b2v*6SZojZ=I&V04oeXX@H>>BspHsQX zJM=s~CoQnbvQh3AZjAS)x5|_K_xzQ9dNP1~O-c(ZgtfwMK@xL|{lrmX8X6@{mTJnA zg0ACxntaS-97FvuJ5h&E_gqB#r=Wc z!Nue-IZ5u2ABiqx5}FFVg;T;W!gTRRu^hciBc)`z8$o~kks3)0DW)Vdw zAv6$13y+1WQWaTN-caT!C0S#4th?7;=WX{cdRM(ga3?B5x{$eKIr)r~7g`E8h0!~W{nn4x?@mRxh5Lv5wYL?X_gqqd%n&}5 zR>-E3S1V_>HxtY+omuX;?oXcKmw?$ih;~+{D}$|!Y&6S2J|IiT5%L}RiDVX{g+9WM z!mmOJafbMlSY4_oHI$l2k#b$Rf!tVDjB;*mZ-sZpyA01!r^sDm2>pc9!WH35@kjbl zick(KGgvx1w=>R>+%E1}x1d+f8wpP+B8e2r)9-0fX@+!HDk4vp>lqELJ@yXAA)7>* zPNIdC`pO2yR*S1$)KhA0t)%&}Io^(QQjjd-YOxJ%Pfye9^ptc>8m>rce%020u{%0H zId8ht-E-bu$nUCT9myrE72l$rr4OW7d4W7uX=;^n`Z{^t7zV% z``r7|)BLpXHCt2Cmy9593U3KV&rxpY~|CO4BG%D>5p$_nMF zlHW|}#5=m%(tY99^fQuJ(vJ)yyGTKyg3wCnB#ae~2{px%QVIDNSyWslgOX8gs&-U& zsefpjt>x~I?qqMK_l=i}bRb>HD6)dACY#ANa)CI)Eunx|TdYgxQ6{C9@=F~gOHL){ zliSNXmF4Ohb%VND-K);DD!CinwBC3xl|R*A>TmN?l6mA;(oF0uE)t{YyR?neT0Si= zwhmZ7S?kyi_RN{h@GxM?U+z4-x_p^5dX24P^ zvD0ju{grbP?vy*|Hfgh*LaC|jRaEt+=IMP6(Mn~Vu`XLJ{L#Kd9*Apcobo<=_mC03 zlp9P3D5I3s${$KYb+9_bI^?~8zj`Z@8Oo1JCiWIPXlL;o`MW`bEW$#evRF-=DgG?x zr48u?t*3R}9p>jE3&~P)l-wkO&`jtn+z@^e9tkv$h}SG+pl(TU_|Qb!mnd@Q))V3AUjo}`!P3F&j` zfm}(sq~uj=t6SVpJ(-LpTZQc6FtGx7x0N_fOhJ3lBgz4-FI#9|w<|e`?m4%-*B$1_ zRPu!|T)ZNFCH_Qzlzx}DDH`+GOm>cqch)%3?h<#DcMx)U9QmG@LTaJCkR)suQi~15 zEVMYSrYuxWD`nIwYD06goyU>gu5KZ(q*n?4+B{9p2-k%J;veD|^^EqJ^&?AfXLG-I zGeBG_(00lZt&BCtiGcZcj~0`@mx{`j!5!meAU(vP(i`$drG(m5J*^JWvm0fMJw~Fr z*-Wy=u@BfXwu%k0zp`)Ij+5Fc>MV7bn*>kB`-OSpWV(ocN57SdC|i^Q_B-}UyNi<+ zWGb{5T`Vnr!9!ind74Z}c=;u~?SFK5OT9Vw|6y)~@M&@AdS3 z(w!_5Qqd0dZTbiGrRS0-7t~YR73?<7aCeG3!=>JNcy{(bij*ZMTB2bMv{KASc#| zp4q~jXi>Y7J;wgc?F)DKq9lX%ww70Kpm))`>jU(0`iJ@+{iObr{#xaq%p~; zY<^%KHLKfO9LL`XS)Wn7Ef%3qXeFto^jNAWw~!C3&5a}0Z&n@Fk9}^p^@jLsVaB~9 z*i@A=OF1P)&LroMhpX{MU2B*%-eT;1_6I9wFSl#EeL??;VjMlD+)=Ws6SX6UG#{C&)!O==m30(iiQkCxl(zN+=QR99tC9rLNZ2l36@Q^WOVzC9)_d#e z-P~Sp?+*MG+Jp=z*GOR@N@ycA7E4Q|+$j4+W*-31nfDj{$5o(E+lugPfnQ~S+ulz_BlpV?wrHR@>{YYJ} zeyr|Mw=>OM=6>j|chh*2ycymUe+k%8i$s&*LOXGrd{q8Mxn|w7w%Q||s@_6xpZBeI z7`{`smK^Ife@j#@PV*N_(Z5A4iblo4d~}|u5?kFBUe;DSMsQ} z)E<@s-y%&Rsf2dIH6fL_U#zU2*3Mfl>%%^2}YV)w2qlTNB(*K^1gm z1BHFU3t_JKExjRqszj)YKHD0?T)T&DJE@%RPI`EvD@WqVQeljkk$yq1NC)K~<;QYA zBO>PAL>YHrGkMsA6nX%4A{lwEEvPtdv<1E9w~77e ze40^-R$pjs>{;G4AOBgGgLEcIWHd8JZPC&`fW$R*_%`B#}Jnbe}{I(3t}TaD6+o1fUKyTYC5P4V`6H@v*C zF26D=nuIyG~)Kl65`;J@1-wEFjbt9fIK=?yQE8d{CltwBb zeIqeBwVYr6P~ETA(Rb+0%=fGoc6Ymgo6c+Pr2&0wlD4EH8A3*r56MSlBiTZ>lLzDv zl2+ItR1gPL8DjN6T-?E7UFOC+b#ppISr9 zq!%?iTXEJbtGE4*m)hSBE9FIUQMfG>7VC&5X$0*mb&(e<<uc|I#zDrvPd*pwip|AZv?*;Xb(0dLHgZ?Fmz-4@ zr8c*wu;F0Q6q=|Q?h+An=1ZB!_+-mR=uOy$1G^owhmb>SPc7_33f_5 zpMBBk?yiRaM|?q2h=;_|bhh-KTu&WmR<)K|m)KhSptI1e<_+-{dnZBGCGtDDD`e0+ zSe>j%teM@(j zR$HX~tX0wv>f?=~5UsC-qGA_uiAd;DWxV>XI#)fbbu?O;SIjM}FgYpCqA8Wn)CcO%>SI;X zbZxLUOuMdqqy4H?(HrR<^i}$1{Y(89J(W@27-oz%rWo^#RmNK5sBzLbYg{$HHtrdd z%<1Mf^P$PC)K*sO59=uFXSam^>#P=1XrdWsHey?y2ku*NPpC^O(Ghfu9Hs12H1#|E z2feECzWJ$j#oEZ`JK5n|j-7P7lu65~71f$(&9s(U8?C)IK-;0+(tN$O{*FFdpQo?a zztAfgiN-)Jt&tzZ6LNX2m)664&-}s43BGSd*U>f7PWg&_LjB5`55B)7 zenhv?Poz!qJ-Lf=%KF?|?G*RA`6I!q_q6o)jcDtF~OK&EHf&ahs|Qv z8$OQYw8D6NtQDjz8?l;h?lbHCHr z8{~QZ$MF6347yq>qufxxQ%h)xT9URzTcaJ*-qNS&YxPa~Vg0mzS--DWFypbtn6$zo9ATp-vL>bNH?0ptYr>2pO}a2NPiDx-(I>~YO9vg-qIFmRDV}rrtj0o 
zu-{odyM;H=hyU~oO_dHxFJ-3klaf-srB2h9X&-9qw2j&pZKS@~_}oysg zXI?PZTA5iTc91=0PwkmbWAObiVt1*J6fgId@2VHI!p07xj=9SG-OOMWVcnTxf9XDj zxztmrB1VdR#AV_!dQ>W)rDN~12W+-|*^YD)oc3N9Zy#iLR-v1)NYKP?Vk7!HeJXvX zn;X&9X)-WBuFX7svzRL!B~GVhwbtS43`yOjMVtkYD?CDxQCNj2n&@>S)Y zQkB(XC+t6L%_;8I^;UXy{TTm4c%ofIj*+wE77>N!!neZD!X5g)bX%GSuVS<{-?zp( zpE{47hHg=Ko?Avn3BL)Y#7<%rDO##5HUhzwj3NpMW10 zkVN6KP+S~HAJPU=W9bK}zT8NTRR&q3*lwG;WxZPPd|H@PAvuN2;zsyl(4wd4NhzNi zr#7)#TI1}J&RLK7>tQ`jK_-#9!Z2a8FhtC*@jZ-g9TM^U5(eL(L^pJ_Yox17_?W_Kv$ z)7xa5P+x2*zA5^&80}6^DQ~GNJIg9Mecg(%7A#O(8Y9irtUo(q&vsrosYp-STuGzG zn0?JcRzGXFRfE0F7O-DfetWPz!d_&5Xn$!Jah5p!T#x)Nrlh%OYh|x?Q0rl|WSv+l zyR!4S)5D$Ye&n8adw>QFNo#SuSdc!^vgr%-T83**ux>NaW_E;K0lpopC{>V;DsQS| zwLJQCeTDu^w~c1Tucl(vu?}00tlBJ*J!eJjRPe&pVet#`Yw>6Llk`j;Z$?pm z7CX-_vvu}6&RXY?6X~4-557xAh-GMfYoOE2J>zEf-thW^E>}rSA*+-{-mW}W+N-hJ zHa5b!;VgBRc$57@Fup*zDx?(WNVDZiYJIi4<*?ptlbyw_>n?If!~fO47v_iuwN5_ggG;wSVC2ygRnzrDDD*_l_F{heS@{j+F~p24)>D#t5+Gm z!@ftJku*YoA&odi{6s84bt$)`$+_gOBr%wp{>_8zC3Ti7e*r67}qPlb!(D*6fC zES-~AYpL{h`ai#rLFH@>q4Z_O@BYs$~^m%~?lwgZ;#ao!5@GFF5nv58M>wOW}n$PMxaGQOjrx zwNJGlw4bz!`T@OzG0ynfyk+*WzGLlS)}0V5(&2Qw_KQ|oe^0-z4=`35M~trw!PL#= zW?AdGb%d#8j?j{RFPGC|jqYYKPa?ClcZ{XR=Vmt2mF}Utq|!Z<=QV|W%68cM?F!C&P9N7GIcYN`wHj?C zTEAMim}L*LhubERNe$sQrJ~wNi?_D2;m$YCX*UVh#>rw~HBy_beXkYQYw3l}rsjV0 znyFiDt@YMUYY@w7?{+?iXTb~NN;*rLDW8>RtE;phwf4qkW2|}KjIx@s4s1C4hUKxF z+Fv+Lf#-4YuH0DZq4?@d?S1WIZJRbppQV4VcQX1L#C&FDWrx{gJ0;1Z#3;R$3Cam2 zyE;IfpuVpzR?lgz&3*PU`FOoiWC| zY7S(duw70r@~L=MJSUzPm(vwAUg@smsngXN>P&T(I!|4wE>dS`^R%tn zUA=+P&}d{dHkuf%jW$MGBgS}Pj5gmgzcAtD*!=AUcpD7g zjE;&b7!?s685xD20BsQ!6&=AZh{DkkhoeI;)P`0-8^A++BDsKwh{&kuXuybyjslJx z22EhWxdA+?AmB8Bvq>w&;doJih$n)gfi$`x_QFUA0cOZLgo9*&!?~lVNN5M@f|4OE zU-iR!ARFWoki8U)ia{q38fb~)fMZB!a8N<;P%s2|$S(?d;J>hIcoTtP$ABln!AOS} z=mBO3hMkwZQH!vgS2iWHh>Hsc2H3rd5AbObunrp((Fz1M0Sr_%DmoC^s!4zZ_Mp;m z0760F6Lca`0J=5`sggSd6c0J&-A@JE&wF9D%b=>hb4n+Uv)qO z7znvLoENx&wc=(p4>BXLE?GNR27(C^;8a7LP6u_ff{TG%s!DS%q z3CMJ?E9BK6>oF@i8ytv^z)uJ)*ar&S16ORo6%q{4!A-apgK(4t>l9*e&m^CW+zot)=RK;%?<6nhD;mb{5@=Xp zW(eu^yGfWV8ccy4`KOXVRmdUk@+P6XNZ|kEX$vDL>~CZgW-Vk0&S{v$c)_X2KQVa$ z7M`9M0YJbN0o8*OCh?zV9D-h-abd`-85br4OF<&0JcJPZ48Pp};YL9W0mg;LB4_|2 zT7`27f`n9%2uKB*VKgBlcuoKu+My{11i*&}9r6IP0SrOrkSM?wObXU49xC93t1Zrn zP!IqgmrEXc7~u1q-|_f7gF9a!2Itw!4k!WbkS~J{VxehawttmHF!-mL@MrXw)L_o5 zdnZ>5SC~La3?@g!>GCoYu^E^KW8u!rwJrb!o(PSDPr(~pA#@FrydK*yU#lDmjxH)(<3uXjF6BK_%#q$90!Cn7+B7m!2t{D(fyajRVQB-&Yvz3noOBaqH z2(-jB4#WmTp;SR1g6{mn=K*9F^f(n}7*0;OU`BwMI90@0C?O$xTc3(5*6ALj!zkn=%QI19qO2AYUgoHKX~8R7+v!4dcg zGlZd+3|>51$bggpt^IC}%VK_1M1USx+z|4XkJ{^P7U_R;w z6#Us7=LAoTCIy>dfWR&=gPVnJ!#w9n-6S#!9=VYWm<2aGkcWT8pjeOv7o3{`0ZIty zT0}**Ky`6W7L05a8PxrfW3emxS-FF89ED?W7dF`Xi<|^zzhU{2rv^+&}aM(g@}P5 z?hv>+1U>F0#6*h#AEkrv$bzlV_9nm@W)6(uZ34zX7eK*OifVy9fPxKpt0@@dKLGIu zxrF5b0ia{{aXpa>*P8nifFZX=H37%s>;b`i5`}d^I*^0O1+2ll)uSS7p$h;LrxJ{z zC*T6s;;jQSqy?@BuT2IJ3#Q;01RN~Fz@gSa2lH<)1d*Db68QRx``?^(-ir_WW44uH;3Me29WT8A18x`FOpY@2&98NG@37Ys63_u^l&ZUzQ!EEghq$7zyP!W*Fofg0467R5K;*8CrEbA3q8hH z=U2%Ey1|UX%mimeMmEK007E3k%;QT2;P5Ggjsl!O7fb>$9ce)=&=nRN9OW7M5(iud zRA41=0GyYh1k7MH;JFVrfbC!oNQB!bVDcpueFL`fV59Y178;FlN4YrP;2w$)SlTct zaeajO&%MRhW?XUs2mJs>b2xGUZulA9l#jN+HrNh)BAQ}IK`X$=bmlf9E!v2Y2KFJ@ zKn~OxK2!xQg2-XWz)tQ=v>THMR~eYYFoQ9(Av(BPV0=MSR0Z%bbqhu`ZHjpm6#@U{ zhdtx39q>ieUmmz6^}jm)%qNTs-)`~apYaXj{nZtD{xiPG@&7xVunx&-lDAQ}|4zTZ z8viR;a=zG3j+6gS{p4fG>Cm3!V}I@^$NfLq#_h)c>_C0~J3s$3+W*`7|CL_JVaa`& zyqz2ebx%Hq{lAkph!;MPKFlM0#j)h=zv?3lkFY*r{O}dWu#H>7!$f%EV-YsjWFD@{#WwCHsBa;V-NFz=1V@w zqd=NemhTHGM_J{d~`5><_EV<1HL%v~uK$9O~T-?I{ 
z=OavyIJm7JWg`qvgoUrjH>@jOv5!8%D;|i8@bDP+@xVT|@kIMkchm!Yh+}vlkMJ3m zhj64pc=(ETC6@=?m*a?&TyNwZ?xUVKjtAZk(6?A}AnZrP#Wt2=z&4&?Srws;@xlXT zBhRo-2n)+W+YpX z;2p?NIi?TVx?D#JpA0xr&`^YZ9Kl*1Sc38-kS2ofbihNKD74bUtk*#U6-I4<$su|f zX14*b!1gRKFDn6cYPjN?6=`Ai`7n2D0JjtzmJMcWC7j`HD-hBSR;zY-;H*tZ#}lFRM7kZ3Np|VNBHTS7g%{eQt;srqVur3S>fvDz2>#VGr$HeWhWx2SW z$?NF7X`d2r%8^j+WVeabN8L&~UJD70#8Og8d7CxV>CcNHY>-b_+YQaWsUI)~La~iU zPzq$e78I4Z%J7xCFZ8_(w-4gR!8tUgltRV=5Bs3R!&4_Eoy-D00j&TVMRy37+zRw} zS%d8^g=sINjbE4^7thJ<;NSSc%N@_G!wieGRM%sPsGNeb1h?ebT6HKyP{zR`C@1-7 zqKf5O#z29VTgGyyjlW&KZUs^%vUUD-JJHRqE)cr&7jSMk0^g)6L(!1MutoI|a5yb4 z2EOHi$4z}KltVcOMLW9iagjAlbq!EO;@8t@d%tc?DX-&1Y7CE!i4ux-Mhk z*7)MEEq=8%(f&aT`x3X2S|MT`{3hu5a3iesfQ za%FfK+S6jqHvVDxuC>5RNdw`(f~nWfSkN2z+I`-1`W3}Lg|+t6gSW%x?RN_>Q8ndhA(*quj=*i zu~^g%tsXB7g>w~)XW+&_+<@1RZkB8EZHKtou)H@y>|qpQ_=+EL$n5p-yNUS9C%)?0 z5?(IsPVtq(3gnoHr7MCLjqo+2N$`qMR=5MaUW(&Y*^QM@ZUYN#go|uoi482QfkibA z3eTYc%?NERd&pMXSnejhKh_Uk_+2G$l8>AC!tY4Ghp5`}_-gn; zVT88Yel9*_>EUI6jh3_moT+pzLygAzmbDIMzX& zTf!@~=+nT%=-JYKdm3yht_52X+Q3$X=GJI`yz%e7g#7QnPWO6T|0heV{JF%B!x9$1LJ^Bh zP{tyoSHlZRS)ir3g>DjTwYvtdWfz8xbai1X-7sMsyn3BkY%C55y{J7JUezuw#Y!h( zi(MxfU&$_@6oS{X$M6@kacf>a*qj#!+w(fZ2EE{QZpq2+l!UE#xLvO@Y}k8!L*7}~ zl6MDQ_s$0|eE+kp)_-r4=D%M`;qSlp`R^A^`#-bA4L#Nh3YSz*R=Q*`gTr_T&mLd~(sv z1{s5!-`o0sf4dZXu_0uu7}2?Nd{TVxgqWU9)VaxPJXA7p++2FQ65x_a2z=SGa!(f62Fg@6PdE;-Qi)Xi&Lk`~VOh z-z%3C*YGPA`7d1nUk_=yD^?lQt7p~$ zaf$sv&q~>g6)u{+O67_*<6@HfC&u;5nh-ZAsd-G#_|CPf!(h?uFNd-w#>K`ZBq6w5 z_N)mpz2YimZ&WR7sXSS0M>olyHED1k0Q8FQ7u%srz$<&!n4Ud*4{RFOwHMGe1c^Ok z`t+-my=eBVvAq-e#l`kd0w+Z$#w7Fu-+(d=`uFM-ml&VW6$EF`+9xr-cOryy2(T>% zYJ~V?1^li6Dp{&nu|g%w7B7L^diGANls!k$qGgK~1@3_={gPr5I>#h-&R(Q)#Udf| z{>qrLVPl$B%Ua?;WlYhMB?^@+S*+wMW4gfcuNzY)Y)p-W;!PvMuK1seN{LdX$`mS5 zu59V#hCopM<){FIAjO(i{|{JE0>V+ERB@iEL2h;FQo8iNZbhlE74ZMprjXISd-YC= zOZ=ZVs933DWeb%kUIKDGJXuP^Q4S{ZzcyJQK|^6_Ub1M1KL3FXD^|RC$wI|SmQ0pm z<;s;P@vp}Q7LNb0Ddox(EmW*nIauq$LHXaaqJzj521kk2k$Dy$`mS76jsr&DgS%RFf1H_DR6`QPu)|B6)Ii=)}OE;f9K{H zzD@S4Y~X*8-uvrE1^!r*_@^fYd{C%0xKm<$=lCJ~X5J_!F{W3)ti9qBB6}w!#V7Rd z-M?RbsOR6UUK~DkfQR_gLH+oCy%5T8`G8zMt|uUOjgIRD&u#pX1a9?3`8_-&x3Yo% zwfd-H16WEy(TdUV;8D2&qQ>;BScLbCijnZ32aj8oix(|cs&LuD#o$3N3^FR#i|LmX zJT|?K@f!Rkk7|P}H>g{?P(<^3FL?mSfaj6m5i_!93_M9TYYRu!}1UR6J|Mv?H?EOD< F{4c4JAwmEE literal 0 HcmV?d00001 diff --git a/data/sb.barcodes.fa.fai b/data/sb.barcodes.fa.fai new file mode 100644 index 0000000..ca405f7 --- /dev/null +++ b/data/sb.barcodes.fa.fai @@ -0,0 +1,201 @@ +SB001 20 7 20 21 +SB002 20 35 20 21 +SB003 20 63 20 21 +SB004 20 91 20 21 +SB005 20 119 20 21 +SB006 20 147 20 21 +SB007 20 175 20 21 +SB008 20 203 20 21 +SB009 20 231 20 21 +SB010 20 259 20 21 +SB011 20 287 20 21 +SB012 20 315 20 21 +SB013 20 343 20 21 +SB014 20 371 20 21 +SB015 20 399 20 21 +SB016 20 427 20 21 +SB017 20 455 20 21 +SB018 20 483 20 21 +SB019 20 511 20 21 +SB020 20 539 20 21 +SB021 20 567 20 21 +SB022 20 595 20 21 +SB023 20 623 20 21 +SB024 20 651 20 21 +SB025 20 679 20 21 +SB026 20 707 20 21 +SB027 20 735 20 21 +SB028 20 763 20 21 +SB029 20 791 20 21 +SB030 20 819 20 21 +SB031 20 847 20 21 +SB032 20 875 20 21 +SB033 20 903 20 21 +SB034 20 931 20 21 +SB035 20 959 20 21 +SB036 20 987 20 21 +SB037 20 1015 20 21 +SB038 20 1043 20 21 +SB039 20 1071 20 21 +SB040 20 1099 20 21 +SB041 20 1127 20 21 +SB042 20 1155 20 21 +SB043 20 1183 20 21 +SB044 20 1211 20 21 +SB045 20 1239 20 21 +SB046 20 1267 20 21 +SB047 20 1295 20 21 +SB048 20 1323 20 21 +SB049 20 1351 20 21 +SB050 20 1379 20 21 +SB051 20 1407 20 21 +SB052 20 1435 20 21 +SB053 20 1463 20 21 +SB054 20 1491 20 21 +SB055 20 1519 20 21 
+SB056 20 1547 20 21 +SB057 20 1575 20 21 +SB058 20 1603 20 21 +SB059 20 1631 20 21 +SB060 20 1659 20 21 +SB061 20 1687 20 21 +SB062 20 1715 20 21 +SB063 20 1743 20 21 +SB064 20 1771 20 21 +SB065 20 1799 20 21 +SB066 20 1827 20 21 +SB067 20 1855 20 21 +SB068 20 1883 20 21 +SB069 20 1911 20 21 +SB070 20 1939 20 21 +SB071 20 1967 20 21 +SB072 20 1995 20 21 +SB073 20 2023 20 21 +SB074 20 2051 20 21 +SB075 20 2079 20 21 +SB076 20 2107 20 21 +SB077 20 2135 20 21 +SB078 20 2163 20 21 +SB079 20 2191 20 21 +SB080 20 2219 20 21 +SB081 20 2247 20 21 +SB082 20 2275 20 21 +SB083 20 2303 20 21 +SB084 20 2331 20 21 +SB085 20 2359 20 21 +SB086 20 2387 20 21 +SB087 20 2415 20 21 +SB088 20 2443 20 21 +SB089 20 2471 20 21 +SB090 20 2499 20 21 +SB091 20 2527 20 21 +SB092 20 2555 20 21 +SB093 20 2583 20 21 +SB094 20 2611 20 21 +SB095 20 2639 20 21 +SB096 20 2667 20 21 +SB097 20 2695 20 21 +SB098 20 2723 20 21 +SB099 20 2751 20 21 +SB100 20 2779 20 21 +SB101 20 2807 20 21 +SB102 20 2835 20 21 +SB103 20 2863 20 21 +SB104 20 2891 20 21 +SB105 20 2919 20 21 +SB106 20 2947 20 21 +SB107 20 2975 20 21 +SB108 20 3003 20 21 +SB109 20 3031 20 21 +SB110 20 3059 20 21 +SB111 20 3087 20 21 +SB112 20 3115 20 21 +SB113 20 3143 20 21 +SB114 20 3171 20 21 +SB115 20 3199 20 21 +SB116 20 3227 20 21 +SB117 20 3255 20 21 +SB118 20 3283 20 21 +SB119 20 3311 20 21 +SB120 20 3339 20 21 +SB121 20 3367 20 21 +SB122 20 3395 20 21 +SB123 20 3423 20 21 +SB124 20 3451 20 21 +SB125 20 3479 20 21 +SB126 20 3507 20 21 +SB127 20 3535 20 21 +SB128 20 3563 20 21 +SB129 20 3591 20 21 +SB130 20 3619 20 21 +SB131 20 3647 20 21 +SB132 20 3675 20 21 +SB133 20 3703 20 21 +SB134 20 3731 20 21 +SB135 20 3759 20 21 +SB136 20 3787 20 21 +SB137 20 3815 20 21 +SB138 20 3843 20 21 +SB139 20 3871 20 21 +SB140 20 3899 20 21 +SB141 20 3927 20 21 +SB142 20 3955 20 21 +SB143 20 3983 20 21 +SB144 20 4011 20 21 +SB145 20 4039 20 21 +SB146 20 4067 20 21 +SB147 20 4095 20 21 +SB148 20 4123 20 21 +SB149 20 4151 20 21 +SB150 20 4179 20 21 +SB151 20 4207 20 21 +SB152 20 4235 20 21 +SB153 20 4263 20 21 +SB154 20 4291 20 21 +SB155 20 4319 20 21 +SB156 20 4347 20 21 +SB157 20 4375 20 21 +SB158 20 4403 20 21 +SB159 20 4431 20 21 +SB160 20 4459 20 21 +SB161 20 4487 20 21 +SB162 20 4515 20 21 +SB163 20 4543 20 21 +SB164 20 4571 20 21 +SB165 20 4599 20 21 +SB166 20 4627 20 21 +SB167 20 4655 20 21 +SB168 20 4683 20 21 +SB169 20 4711 20 21 +SB170 20 4739 20 21 +SB171 20 4767 20 21 +SB172 20 4795 20 21 +SB173 20 4823 20 21 +SB174 20 4851 20 21 +SB175 20 4879 20 21 +SB176 20 4907 20 21 +SB177 20 4935 20 21 +SB178 20 4963 20 21 +SB179 20 4991 20 21 +SB180 20 5019 20 21 +SB181 20 5047 20 21 +SB182 20 5075 20 21 +SB183 20 5103 20 21 +SB184 20 5131 20 21 +SB185 20 5159 20 21 +SB186 20 5187 20 21 +SB187 20 5215 20 21 +SB188 20 5243 20 21 +SB189 20 5271 20 21 +SB190 20 5299 20 21 +SB191 20 5327 20 21 +SB192 20 5355 20 21 +SB193 20 5383 20 21 +SB194 20 5411 20 21 +SB195 20 5439 20 21 +SB196 20 5467 20 21 +SB197 20 5495 20 21 +SB198 20 5523 20 21 +SB199 20 5551 20 21 +SB200 20 5579 20 21 +SB201 20 5607 20 21 diff --git a/docs/Makefile b/docs/Makefile index 6bf6b79..ccfa1a1 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -19,8 +19,7 @@ ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
-.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext - +.PHONY: help help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @@ -30,6 +29,7 @@ help: @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @@ -45,64 +45,85 @@ help: @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" +.PHONY: clean clean: rm -rf $(BUILDDIR)/* +.PHONY: html html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." +.PHONY: dirhtml dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." +.PHONY: singlehtml singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." +.PHONY: pickle pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." +.PHONY: json json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." +.PHONY: htmlhelp htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." +.PHONY: qthelp qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PyIM.qhcp" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pyim.qhcp" @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PyIM.qhc" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pyim.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." +.PHONY: devhelp devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/PyIM" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PyIM" + @echo "# mkdir -p $$HOME/.local/share/devhelp/pyim" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pyim" @echo "# devhelp" +.PHONY: epub epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
+.PHONY: latex latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @@ -110,28 +131,33 @@ latex: @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." +.PHONY: latexpdf latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." +.PHONY: latexpdfja latexpdfja: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." +.PHONY: text text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." +.PHONY: man man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." +.PHONY: texinfo texinfo: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @@ -139,38 +165,51 @@ texinfo: @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." +.PHONY: info info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." +.PHONY: gettext gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." +.PHONY: changes changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." +.PHONY: linkcheck linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." +.PHONY: doctest doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +.PHONY: xml xml: $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." +.PHONY: pseudoxml pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..794c525 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,53 @@ +API +============ + +Alignment +--------------- + + +Vector +~~~~~~~~~~~~ + +.. autoclass:: pyim.alignment.vector.Alignment + :members: +.. autofunction:: pyim.alignment.vector.align_exact(target, query, query_strand=1) +.. autofunction:: pyim.alignment.vector.align_ssw(target, query, query_strand=1) +.. autofunction:: pyim.alignment.vector.align_with_reverse(target, query, align_func, query_strand=1, **kwargs) +.. autofunction:: pyim.alignment.vector.align_multiple(target, queries, align_func, raise_error=False, **kwargs) +.. autofunction:: pyim.alignment.vector.align_chained(align_chained(target, query, align_funcs, **kwargs) +.. autofunction:: pyim.alignment.vector.compose +.. 
autofunction:: pyim.alignment.vector.filter_and(target, query, align_func, filters, **kwargs) +.. autofunction:: pyim.alignment.vector.filter_or(target, query, align_func, filters, **kwargs) +.. autofunction:: pyim.alignment.vector.filter_score(alignment, min_score) +.. autofunction:: pyim.alignment.vector.filter_coverage(alignment, min_coverage, min_identity) +.. autofunction:: pyim.alignment.vector.filter_end_match(alignment) + +Genome +~~~~~~~~~~~~ + +.. autofunction:: pyim.alignment.bowtie2.align + +Annotation +--------------- + +Annotators +~~~~~~~~~~~~ + +.. autofunction:: pyim.annotation.annotator.annotate_windows +.. autoclass:: pyim.annotation.annotator.Window + :members: +.. autofunction:: pyim.annotation.annotator.annotate_rbm +.. autofunction:: pyim.annotation.annotator.annotate_rbm_cis + +Metadata +~~~~~~~~~~~~ + +.. autofunction:: pyim.annotation.metadata.add_metadata +.. autofunction:: pyim.annotation.metadata.feature_distance +.. autofunction:: pyim.annotation.metadata.feature_orientation + +Filtering +~~~~~~~~~~~~ + +.. autofunction:: pyim.annotation.filtering.filter_blacklist +.. autofunction:: pyim.annotation.filtering.select_closest diff --git a/docs/conf.py b/docs/conf.py index a425ef8..9e2f37a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # -# PyIM documentation build configuration file, created by -# sphinx-quickstart on Sat May 2 20:42:48 2015. +# pyim documentation build configuration file, created by +# sphinx-quickstart on Mon Mar 21 15:43:09 2016. # # This file is execfile()d with the current directory set to its # containing dir. @@ -16,6 +16,9 @@ import sys import os +sys.path.insert(0, os.path.abspath('..')) +from version import get_git_version + import sphinx_rtd_theme # If extensions (or modules to document with autodoc) are in another directory, @@ -34,15 +37,17 @@ extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.doctest', - 'sphinx.ext.todo', - 'sphinx.ext.mathjax', + 'sphinx.ext.coverage', 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon' ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] -# The suffix of source filenames. +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] source_suffix = '.rst' # The encoding of source files. @@ -52,21 +57,25 @@ master_doc = 'index' # General information about the project. -project = 'PyIM' -copyright = '2015, Julian de Ruiter' +project = 'pyim' +copyright = '2016, Julian de Ruiter' +author = 'Julian de Ruiter' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '0.4' +version = get_git_version().split('-')[0] # The full version, including alpha/beta/rc tags. -release = '0.4.2' +release = get_git_version() # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: @@ -102,12 +111,18 @@ # If true, keep warnings as "system message" paragraphs in the built documents. 
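Aside on the version handling introduced in conf.py above: the documentation version is now derived from git via a top-level version module rather than hard-coded. That module is not part of the patches shown here; purely as a sketch (the helper name, tag format and implementation below are assumptions, not the project's actual code), a git-describe based helper could look roughly like this:

    import subprocess

    def get_git_version():
        """Return a version string derived from the most recent git tag."""
        described = subprocess.check_output(
            ['git', 'describe', '--tags', '--always'],
            universal_newlines=True).strip()
        # E.g. 'v0.4.2-14-gabc1234' becomes '0.4.2-14-gabc1234'.
        return described.lstrip('v')

conf.py then keeps only the part before the first '-' as the short X.Y.Z version and uses the full string as the release.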
#keep_warnings = False +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'sphinx_rtd_theme' +# html_theme = 'alabaster' +html_theme = "sphinx_rtd_theme" + +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -115,7 +130,7 @@ #html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] +#html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". @@ -184,9 +199,22 @@ # This is the file name suffix for HTML files (e.g. ".xhtml"). #html_file_suffix = None -# Output file base name for HTML help builder. -htmlhelp_basename = 'PyIMdoc' +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pyimdoc' # -- Options for LaTeX output --------------------------------------------- @@ -199,14 +227,17 @@ # Additional stuff for the LaTeX preamble. #'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'PyIM.tex', 'PyIM Documentation', - 'Julian de Ruiter', 'manual'), + (master_doc, 'pyim.tex', 'pyim Documentation', + 'Julian de Ruiter', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -235,8 +266,8 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'pyim', 'PyIM Documentation', - ['Julian de Ruiter'], 1) + (master_doc, 'pyim', 'pyim Documentation', + [author], 1) ] # If true, show URL addresses after external links. @@ -249,9 +280,9 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'PyIM', 'PyIM Documentation', - 'Julian de Ruiter', 'PyIM', 'One line description of project.', - 'Miscellaneous'), + (master_doc, 'pyim', 'pyim Documentation', + author, 'pyim', 'One line description of project.', + 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. diff --git a/docs/index.rst b/docs/index.rst index 620f233..59c72c3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,9 +1,9 @@ -.. PyIM documentation master file, created by - sphinx-quickstart on Sat May 2 20:42:48 2015. +.. pyim documentation master file, created by + sphinx-quickstart on Mon Mar 21 15:43:09 2016. 
You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to PyIM's documentation! +Welcome to pyim's documentation! ================================ Contents: @@ -11,6 +11,9 @@ Contents: .. toctree:: :maxdepth: 2 + introduction + api + Indices and tables @@ -19,4 +22,3 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` * :ref:`search` - diff --git a/docs/introduction.rst b/docs/introduction.rst new file mode 100644 index 0000000..dbc68da --- /dev/null +++ b/docs/introduction.rst @@ -0,0 +1,14 @@ +Introduction +============ + +Alignment +---------------------- + +Merging sets +---------------------- + +CIS selection +---------------------- + +Annotating insertions +---------------------- diff --git a/docs/make.bat b/docs/make.bat index 787e48d..edcd341 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -37,6 +37,7 @@ if "%1" == "help" ( echo. pseudoxml to make pseudoxml-XML files for display purposes echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled goto end ) @@ -47,6 +48,14 @@ if "%1" == "clean" ( ) +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 1>NUL 2>NUL +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ %SPHINXBUILD% 2> nul if errorlevel 9009 ( echo. @@ -60,6 +69,9 @@ if errorlevel 9009 ( exit /b 1 ) +:sphinx_ok + + if "%1" == "html" ( %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html if errorlevel 1 exit /b 1 @@ -115,9 +127,9 @@ if "%1" == "qthelp" ( echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: - echo.^> qcollectiongenerator %BUILDDIR%\qthelp\PyIM.qhcp + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pyim.qhcp echo.To view the help file: - echo.^> assistant -collectionFile %BUILDDIR%\qthelp\PyIM.ghc + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pyim.ghc goto end ) @@ -149,7 +161,7 @@ if "%1" == "latexpdf" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf - cd %BUILDDIR%/.. + cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end @@ -159,7 +171,7 @@ if "%1" == "latexpdfja" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf-ja - cd %BUILDDIR%/.. + cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end @@ -223,6 +235,15 @@ results in %BUILDDIR%/doctest/output.txt. goto end ) +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. 
+ goto end +) + if "%1" == "xml" ( %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml if errorlevel 1 exit /b 1 diff --git a/setup.py b/setup.py index ad4cc45..9775910 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ entry_points={'console_scripts': [ 'pyim-align = pyim.main.align:main', 'pyim-merge = pyim.main.merge:main', + 'pyim-merge-sets = pyim.main.merge_sets:main', 'pyim-annotate = pyim.main.annotate:main', 'pyim-cis = pyim.main.cis:main', 'pyim-plot = pyim.main.plot:main', diff --git a/shear_splink_cutadapt.py b/shear_splink_cutadapt.py new file mode 100644 index 0000000..b3442ac --- /dev/null +++ b/shear_splink_cutadapt.py @@ -0,0 +1,38 @@ + + +from ..common.cutadapt import cutadapt + +def shear_splink(reads, barcodes): + + # De-multiplex + sample_files = _demultiplex(reads, barcodes) + + # Filter for contaminants + + + # Select for SB and T7 + + + +def _demultiplex(reads, output, barcodes): + options = { + '-g': ('file:' + str(barcodes), ) + '--discard-untrimmed': () + } + + cutadapt(reads_path, output_path, options=options) + + +def _extract_genomic(reads, transposon, linker, contaminants): + + # Filter for contaminants. + options = { + '-g': ('file:' + str(contaminants), ) + '--discard-trimmed': () + } + + cutadapt(reads_path, tmp_path, options) + + # Select for and remove transposon sequence. + + # Select for and remove linker sequence. diff --git a/src/pyim/alignment/__init__.py b/src/pyim/align/__init__.py similarity index 100% rename from src/pyim/alignment/__init__.py rename to src/pyim/align/__init__.py diff --git a/src/pyim/alignment/bowtie2.py b/src/pyim/align/bowtie2.py similarity index 100% rename from src/pyim/alignment/bowtie2.py rename to src/pyim/align/bowtie2.py diff --git a/src/pyim/alignment/pipelines/__init__.py b/src/pyim/align/common/__init__.py similarity index 100% rename from src/pyim/alignment/pipelines/__init__.py rename to src/pyim/align/common/__init__.py diff --git a/src/pyim/align/common/cutadapt.py b/src/pyim/align/common/cutadapt.py new file mode 100644 index 0000000..d063a2e --- /dev/null +++ b/src/pyim/align/common/cutadapt.py @@ -0,0 +1,107 @@ +import subprocess + + +def cutadapt(input_path, output_path, options): + cmdline_args = _build_arguments(input_path, output_path, options) + print(cmdline_args) + #check_call(cmdline_args) + + +def cutadapt_piped(input_path, output_path, options_list): + raise NotImplementedError() + + +def _build_arguments(input_path, output_path, options): + """Builds argument list for cutadapt.""" + + cmdline_opts = flatten_options(options) + return (['cutadapt'] + cmdline_opts + + ['-o', str(output_path), str(input_path)]) # yapf: disable + + +def _run(arguments, stdout=None, stderr=None, *args, **kwargs): + stdout_ = _open_output(stdout) + stderr_ = _open_output(stderr) + + try: + process = subprocess.Popen( + arguments, stdout=stdout, stderr=stderr, *args, **kwargs) + finally: + _close_output(stdout_) + _close_output(stderr_) + + return process.returncode + + +def _run_piped(arguments_list, stdout=None, stderrs=None): + if len(arguments_list) < 2: + raise ValueError('At least two sets of arguments should be given') + + if stderrs is None: + stderrs = [None] * len(arguments_list) + + # Handle processes 1-(n-1). + processes = [] + file_handles = [] + + try: + prev_out = None + for arg_list, stderr in list(zip(arguments_list, stderrs))[:-1]: + # Setup processes. 
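To make the intent of the cutadapt wrapper above concrete: options are passed as a dict mapping each flag to a tuple of values, with flag-only options mapping to an empty tuple, and _build_arguments flattens that into an argument list. A standalone sketch of the same flattening (file names are placeholders; flag order may vary with dict ordering):

    # Options as {flag: tuple_of_values}; flag-only options use an empty tuple.
    options = {
        '-g': ('file:barcodes.fa',),
        '--discard-untrimmed': (),
    }

    args = ['cutadapt']
    for flag, values in options.items():
        args += [flag] + list(values)
    args += ['-o', 'output.fastq', 'input.fastq']

    # Roughly: ['cutadapt', '-g', 'file:barcodes.fa', '--discard-untrimmed',
    #           '-o', 'output.fastq', 'input.fastq']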
+ stderr_fh = _open_output(stderr) + process = subprocess.Popen( + arg_list, + stdin=prev_out, + stdout=subprocess.PIPE, + stderr=stderr_fh) + + prev_out = process.stdout + + processes.append(process) + file_handles.append(stderr_fh) + + # Handle final process. + stdout_fh = _open_output(stdout) + stderr_fh = _open_output(stderrs[-1]) + process = subprocess.Popen( + arguments_list[-1], + stdout=stdout_fh, + stderr=stderr_fh, + stdin=prev_out) + + processes.append(process) + file_handles += [stderr_fh, stdout_fh] + + # Allow pi to receive a SIGPIPE. + for p in processes[:-1]: + p.stdout.close() + + process.wait() + + finally: + # Close all file handles. + for fh in file_handles: + _close_output(fh) + + return process.returncode + + +def _open_output(file_path, mode='w'): + if file_path is None: + return None + else: + return file_path.open(mode) + + +def _close_output(file_path): + if file_path is not None: + file_path.close() + + +def flatten_options(option_dict): + """Flattens a dict of options into an argument list.""" + + options = [] + for opt_name, opt_values in option_dict.items(): + options += [opt_name] + list(opt_values) + return options diff --git a/src/pyim/alignment/pipelines/_helpers/__init__.py b/src/pyim/align/pipelines/__init__.py similarity index 100% rename from src/pyim/alignment/pipelines/_helpers/__init__.py rename to src/pyim/align/pipelines/__init__.py diff --git a/src/pyim/align/pipelines/_helpers/__init__.py b/src/pyim/align/pipelines/_helpers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pyim/alignment/pipelines/_helpers/clustering.py b/src/pyim/align/pipelines/_helpers/clustering.py similarity index 100% rename from src/pyim/alignment/pipelines/_helpers/clustering.py rename to src/pyim/align/pipelines/_helpers/clustering.py diff --git a/src/pyim/alignment/pipelines/_helpers/grouping.py b/src/pyim/align/pipelines/_helpers/grouping.py similarity index 52% rename from src/pyim/alignment/pipelines/_helpers/grouping.py rename to src/pyim/align/pipelines/_helpers/grouping.py index abf91e3..294bf6c 100644 --- a/src/pyim/alignment/pipelines/_helpers/grouping.py +++ b/src/pyim/align/pipelines/_helpers/grouping.py @@ -5,6 +5,8 @@ import heapq import toolz +from collections import namedtuple + class PrioritySet(object): @@ -101,6 +103,100 @@ def groupby_position(alignments): yield (rev_grp[0].reference_end, -1), rev_grp +GenomicPosition = namedtuple('GenomicPosition', + ['chromosome', 'position', 'strand']) + + +def groupby_position_mate(alignments): + """ Groups alignments by their positions, grouping forward strand + alignments with the same start position and reverse strand + alignments with the same end position. Assumes alignments + are all on a single reference sequence. + """ + # Setup our collections for tracking reads and positions. + # + # The priority set is used to track positions with alignment groups, + # ensuring that no position is listed twice (the set part) and + # always giving the lowest position first (the priority part). + # + # The alignment dict contains two lists for each position with at + # least one alignment, one for forward reads and one for reverse. + # Any alignments encountered as position x in orientation o are added + # to the corresponding entry dict[x][o] in the list, in which + # o is encoded as {0,1}, with 1 being for reverse strand alignments. + position_set = PrioritySet() + aln_dict = collections.defaultdict(lambda: ([], [])) + + # Only use proper pairs. 
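The PrioritySet referred to in the comments above is defined earlier in this module (outside the hunk) and essentially combines a heap with a set: positions come back lowest first and are never listed twice. A minimal standalone sketch of that behaviour, not the module's actual implementation:

    import heapq

    class PrioritySet(object):
        """Queue that yields the lowest priority first and ignores duplicates."""

        def __init__(self):
            self._heap = []
            self._set = set()

        def push(self, item, priority):
            if item not in self._set:
                heapq.heappush(self._heap, (priority, item))
                self._set.add(item)

        def pop(self):
            _, item = heapq.heappop(self._heap)
            self._set.remove(item)
            return item

        def first(self):
            return self._heap[0][1]

        def __len__(self):
            return len(self._heap)

Pushing the same position twice leaves a single entry, which is what lets groupby_position_mate flush alignment groups in genomic order without emitting a position more than once.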
+ alignments = (aln for aln in alignments if aln.is_proper_pair) + + # Limit ourselves to alignments from one chromosome (the first + # encountered), as sort is only valid with the same chromosome. + aln, alignments = toolz.peek(alignments) + ref_name = aln.reference_name + + alignments = itertools.takewhile( + lambda aln: aln.reference_name == ref_name, alignments) + + # We match position on the first pair. The second is stored until + # needed and then returned together with the corresponding first pair. + second_pairs = {} + + curr_pos = 0 + for aln in alignments: + if aln.is_read2: + second_pairs[aln.query_name] = aln + else: + # Check our ordering. + if aln.reference_start < curr_pos: + raise ValueError('Alignments not ordered by position') + + curr_pos = aln.reference_start + + # Add current read to collections. + is_reverse = aln.is_reverse + ref_pos = aln.reference_end if is_reverse else curr_pos + aln_dict[ref_pos][bool(is_reverse)].append(aln) + position_set.push(ref_pos, ref_pos) + + # Return any alignment groups before our current position. + try: + while position_set.first() < curr_pos: + first_pos = position_set.pop() + fwd_grp, rev_grp = aln_dict.pop(first_pos) + + if len(fwd_grp) > 0: + fwd_mates = [second_pairs.pop(aln.query_name) + for aln in fwd_grp] + fwd_pos = fwd_grp[0].reference_start + yield (GenomicPosition(ref_name, fwd_pos, 1), + fwd_grp, fwd_mates) + + if len(rev_grp) > 0: + rev_mates = [second_pairs.pop(aln.query_name) + for aln in rev_grp] + rev_pos = rev_grp[0].reference_start + yield (GenomicPosition(ref_name, rev_pos, 1), + rev_grp, rev_mates) + + except ValueError: + pass + + # We're done, yield any remaining alignment groups. + for _ in range(len(position_set)): + fwd_grp, rev_grp = aln_dict.pop(position_set.pop()) + + if len(fwd_grp) > 0: + fwd_mates = [second_pairs.pop(aln.query_name) for aln in fwd_grp] + fwd_pos = fwd_grp[0].reference_start + yield (GenomicPosition(ref_name, fwd_pos, 1), fwd_grp, fwd_mates) + + if len(rev_grp) > 0: + rev_mates = [second_pairs.pop(aln.query_name) for aln in rev_grp] + rev_pos = rev_grp[0].reference_start + yield (GenomicPosition(ref_name, rev_pos, 1), rev_grp, rev_mates) + + @toolz.curry def groupby_reference_position(alignments, alignment_file=None): chained = chain_groupby( diff --git a/src/pyim/alignment/pipelines/_helpers/pipeline.py b/src/pyim/align/pipelines/_helpers/pipeline.py similarity index 100% rename from src/pyim/alignment/pipelines/_helpers/pipeline.py rename to src/pyim/align/pipelines/_helpers/pipeline.py diff --git a/src/pyim/alignment/pipelines/_model.py b/src/pyim/align/pipelines/_model.py similarity index 100% rename from src/pyim/alignment/pipelines/_model.py rename to src/pyim/align/pipelines/_model.py diff --git a/src/pyim/alignment/pipelines/lam_pcr.py b/src/pyim/align/pipelines/lam_pcr.py similarity index 100% rename from src/pyim/alignment/pipelines/lam_pcr.py rename to src/pyim/align/pipelines/lam_pcr.py diff --git a/src/pyim/alignment/pipelines/shear_splink.py b/src/pyim/align/pipelines/shear_splink.py similarity index 100% rename from src/pyim/alignment/pipelines/shear_splink.py rename to src/pyim/align/pipelines/shear_splink.py diff --git a/src/pyim/alignment/pipelines/shear_splink_sb.py b/src/pyim/align/pipelines/shear_splink_sb.py similarity index 100% rename from src/pyim/alignment/pipelines/shear_splink_sb.py rename to src/pyim/align/pipelines/shear_splink_sb.py diff --git a/src/pyim/alignment/vector.py b/src/pyim/align/vector.py similarity index 100% rename from 
src/pyim/alignment/vector.py rename to src/pyim/align/vector.py diff --git a/src/pyim/annotation/__init__.py b/src/pyim/annotation/__init__.py index e69de29..d72f89e 100644 --- a/src/pyim/annotation/__init__.py +++ b/src/pyim/annotation/__init__.py @@ -0,0 +1,2 @@ +from ._registry import register_annotator, get_annotators +from .annotator.window import WindowAnnotator \ No newline at end of file diff --git a/src/pyim/annotation/_registry.py b/src/pyim/annotation/_registry.py new file mode 100644 index 0000000..c1948e7 --- /dev/null +++ b/src/pyim/annotation/_registry.py @@ -0,0 +1,10 @@ + +_registry = {} + + +def register_annotator(name, aligner): + _registry[name] = aligner + + +def get_annotators(): + return dict(_registry) diff --git a/src/pyim/annotation/annotator/__init__.py b/src/pyim/annotation/annotator/__init__.py index 1cb1e9e..ef588c3 100644 --- a/src/pyim/annotation/annotator/__init__.py +++ b/src/pyim/annotation/annotator/__init__.py @@ -1,3 +1,3 @@ -from .rbm import annotate_rbm -from .rbm_cis import annotate_rbm_cis -from .window import annotate_windows, Window +#from .rbm import annotate_rbm +#from .rbm_cis import annotate_rbm_cis +#from .window import annotate_windows, Window diff --git a/src/pyim/annotation/annotator/window.py b/src/pyim/annotation/annotator/window.py index 0122bfe..176454e 100644 --- a/src/pyim/annotation/annotator/window.py +++ b/src/pyim/annotation/annotator/window.py @@ -1,131 +1,103 @@ +# pylint: disable=W0622,W0614,W0401 from __future__ import absolute_import, division, print_function - -#pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin from builtins import * -#pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin +# pylint: enable=W0622,W0614,W0401 +import collections import itertools -import logging -import pandas as pd -from intervaltree import IntervalTree -from tqdm import tqdm +import toolz +from pyim.annotation import register_annotator from pyim.util.tabix import GtfFile -# pylint: disable=import-error -from ..filtering import filter_blacklist, select_closest +# from ..filtering import filter_blacklist, select_closest from ..util import build_interval_trees, numeric_strand -# pylint: enable=import-error - -def annotate_windows(insertions, gtf, windows): - """Assigns insertions to genes that fall within the given windows. - - Args: - insertions (pandas.DataFrame): Insertions to annotate in DataFrame - format. The frame is expected to contain at least the - following columns: id, position, strand. - gtf (str or GtfFile): Path to gtf file containing gene features. - Alternatively, a GtfFile object may also be given instead of a path. - windows (list[Window]): List of windows to inspect for genes. - Returns: - pandas.DataFrame: Dataframe containing annotated insertions. Annotations - are added as columns 'gene_id' and 'gene_name', which respectively contain the id and name of the annotated gene. An extra column - 'window' indicates which of the RBM windows was used for - the annotation. - """ +class WindowAnnotator(object): - if isinstance(gtf, str): - gtf = GtfFile(gtf) + def __init__(self, reference_gtf, windows): + if not isinstance(reference_gtf, GtfFile): + reference_gtf = GtfFile(reference_gtf) - # Build lookup trees. - trees = build_interval_trees(gtf) + self._windows = windows + self._gtf = reference_gtf - # Generate queries (insertion/window combinations). 
- ins_gen = (row for _, row in insertions.iterrows()) - queries = itertools.product(ins_gen, windows) + self._trees = None - queries = tqdm(queries, unit='query', - total=len(insertions) * len(windows)) + @classmethod + def from_args(cls, args): + window_size = args.window_size // 2 + windows = [Window(-window_size, window_size, strand=None, + name=None, strict_left=False, strict_right=False)] + return cls(reference_gtf=args.reference_gtf, windows=windows) - # Generate annotation for queries and merge into frame. - annotations = (_annotate_window(ins, window, trees) - for ins, window in queries) - annotation = pd.concat(annotations, ignore_index=True) + @classmethod + def setup_args(cls, parser): + # Required arguments. + parser.add_argument('--reference_gtf', required=True) - # Merge annotation with insertions. - annotated = pd.merge(insertions, annotation, on='id', how='left') + # Optional arguments. + # parser.add_argument('--closest', default=False, action='store_true') + parser.add_argument('--window_size', default=20000, type=int) - return annotated + def annotate(self, insertions): + if self._trees is None: + self._trees = build_interval_trees(self._gtf) + queries = itertools.product(insertions, self._windows) + annotated = itertools.chain.from_iterable( + (self._annotate(ins, window, self._trees) + for ins, window in queries)) -class Window(object): - """Class representing a (relative) window to inspect for genes. + return annotated - The window may be an actual window corresponding to a real chromosome - location, in which case start and end represent the actual window - boundaries, and reference and strand represent the actual chromosome - and strand of the window. + def _annotate(self, ins, window, interval_trees): + # Identify overlapping features. + applied_window = window.apply(ins.chromosome, ins.position, ins.strand) + features = list(applied_window.get_overlap(interval_trees)) - Alternatively, the window may also represent a relative window. In this - case start is typically negative and end is typically positive, whilst - reference is typically omitted and strand is optional. This relative window - can be applied to an actual position using the apply method, which - effectively calculates the given window around that position. + if len(features) > 0: + for feature in features: + feat_metadata = {'gene_id': feature['gene_id'], + 'gene_name': feature['gene_name']} - Args: - start (int): Start of the window. - end (int): End of the window. - reference (str): Chromosome of the window (optional). - strand (int): Relative strand of window (optional). - incl_left (bool): Whether to include partially (left) - overlapping features. - incl_right (bool): Whether to include partially (right) - overlapping features. + if window.name is not None: + feat_metadata['window'] = window.name - """ + new_metadata = toolz.merge(ins.metadata, feat_metadata) - def __init__(self, start, end, reference=None, strand=None, - incl_left=True, incl_right=True, name=None): - self.reference = reference - self.start = start - self.end = end - self.strand = strand + yield ins._replace(metadata=new_metadata) + else: + yield ins - self.incl_left = incl_left - self.incl_right = incl_right - self.name = name +register_annotator('window', WindowAnnotator) - def apply(self, reference, location, strand): - """Applies a relative window to specific location and strand. 
- For example, a relative window of Window(start=-1000, end=1000, - strand=-1) applied to position (2, 3000, -1) will become - Window(ref=2, start=2000, end=4000, strand=1). +_Window =collections.namedtuple( + 'Window', ['start', 'end', 'strand', 'name', + 'strict_left', 'strict_right']) - Args: - reference (str): Chromosome name of the reference position. - location (int): Reference genomic position. - strand (int): Reference genomic strand. - """ +class Window(_Window): + __slots__ = () + def apply(self, chromosome, position, strand): # Determine start/end position. if strand == 1: - start = location + self.start - end = location + self.end + start = position + self.start + end = position + self.end - incl_left = self.incl_left - incl_right = self.incl_right + strict_left = self.strict_left + strict_right = self.strict_right elif strand == -1: - start = location - self.end - end = location - self.start + start = position - self.end + end = position - self.start - incl_right = self.incl_left - incl_left = self.incl_right + strict_right = self.strict_left + strict_left = self.strict_right else: raise ValueError('Unknown value for strand ({})' .format(strand)) @@ -136,101 +108,39 @@ def apply(self, reference, location, strand): else: new_strand = None - return Window(start, end, reference, new_strand, - incl_left, incl_right, name=self.name) - - -def _annotate_window(insertion, window, feature_trees): - """Annotates insertion for features in trees using given window.""" - - # Apply window for insertion. - applied_window = window.apply( - insertion['chrom'], insertion['position'], insertion['strand']) - - # Fetch features within window. - features = _fetch_in_window(feature_trees, applied_window) - - # Extract feature values. - frame = pd.DataFrame.from_records( - ({'id': insertion['id'], - 'gene_id': feature['gene_id'], - 'gene_name': feature['gene_name']} - for feature in features)) - - # Include window name if known. - if window.name is not None: - frame['window'] = window.name - - return frame - + return AppliedWindow(chromosome, start, end, new_strand, + self.name, strict_left, strict_right) -def _fetch_in_window(trees, window): - """Fetches features within given window in the interval trees.""" - # Find overlapping features. - try: - tree = trees[window.reference] - overlap = tree[window.start:window.end] - except KeyError: - overlap = [] +_AppliedWindow = collections.namedtuple( + 'AppliedWindow', ['chromosome', 'start', 'end', 'strand', + 'name', 'strict_left', 'strict_right']) - # Extract features. - features = (interval[2] for interval in overlap) - # Filter inclusive/exclusive if needed. - if not window.incl_left: - features = (f for f in features if f['start'] > window.start) +class AppliedWindow(_AppliedWindow): + __slots__ = () - if not window.incl_right: - features = (f for f in features if f['end'] < window.end) + def get_overlap(self, interval_trees): + # Find overlapping features. + try: + tree = interval_trees[self.chromosome] + overlap = tree[self.start:self.end] + except KeyError: + overlap = [] - # Filter for strand if needed. - if window.strand is not None: - features = (f for f in features - if numeric_strand(f['strand']) == window.strand) + # Extract features. + features = (interval[2] for interval in overlap) - return list(features) + # Filter inclusive/exclusive if needed. 
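The strand handling in apply() above is the heart of the window annotation: a relative window is mirrored around the insertion site when the insertion lies on the reverse strand. A small standalone illustration of that arithmetic, without the class itself:

    def apply_window(position, strand, rel_start, rel_end):
        """Project a relative window onto an absolute genomic position."""
        if strand == 1:
            return position + rel_start, position + rel_end
        elif strand == -1:
            # Mirror the window for reverse-strand insertions.
            return position - rel_end, position - rel_start
        raise ValueError('Unknown value for strand ({})'.format(strand))

    # A window of (-1000, +2000) around position 10000:
    assert apply_window(10000, 1, -1000, 2000) == (9000, 12000)
    assert apply_window(10000, -1, -1000, 2000) == (8000, 11000)

The strict_left/strict_right flags are swapped in the same way, so that "strict" always refers to the same side of the window relative to the insertion.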
+ if self.strict_left: + features = (f for f in features if f['start'] > self.start) + if self.strict_right: + features = (f for f in features if f['end'] < self.end) + # Filter for strand if needed. + if self.strand is not None: + features = (f for f in features + if numeric_strand(f['strand']) == self.strand) -def register(subparsers, name='window'): - parser = subparsers.add_parser(name, help=name + ' help') - - # Required arguments. - parser.add_argument('input') - parser.add_argument('output') - parser.add_argument('--gtf', required=True) - - # Optional arguments. - parser.add_argument('--closest', default=False, action='store_true') - parser.add_argument('--window_size', default=20000, type=int) - - # Set main for dispatch. - parser.set_defaults(main=main) - - return parser - - -def main(args): - # Read annotation. - insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) - logging.info('Read %d insertions', insertions['id'].unique()) - - # Define windows. - logging.info('Annotating insertions') - half_size = args.window_size // 2 - window = Window(start=-half_size, end=half_size) - - # Annotate insertions. - annotated = annotate_windows(insertions, args.gtf, [window]) - - if args.blacklist is not None: - logging.info('Filtering blacklisted genes') - annotated = filter_blacklist(annotated, args.blacklist) - - if args.closest: - logging.info('Selecting closest genes') - annotated = select_closest(annotated) - - # Merge annotation. - annotated.to_csv(args.output, sep='\t', index=False) + return features diff --git a/src/pyim/annotation/metadata.py b/src/pyim/annotation/metadata.py index 04f804a..477f144 100644 --- a/src/pyim/annotation/metadata.py +++ b/src/pyim/annotation/metadata.py @@ -44,14 +44,14 @@ def add_metadata(insertions, gtf): return pd.merge(insertions, metadata, on=['id', 'gene_id'], how='left') -def _annotate_insertion(insertion, feature): - """Annotates a given insertion/feature combination.""" +def _annotate_insertion(insertion, gene): + """Annotates a given insertion/gene combination.""" return { 'id': insertion['id'], 'gene_id': feature['gene_id'], - 'distance': feature_distance(insertion, feature), - 'orientation': feature_orientation(insertion, feature) + 'gene_distance': feature_distance(insertion, gene), + 'gene_orientation': feature_orientation(insertion, gene) } diff --git a/src/pyim/cis/cimpl.py b/src/pyim/cis/cimpl.py index f76d006..3cba07a 100644 --- a/src/pyim/cis/cimpl.py +++ b/src/pyim/cis/cimpl.py @@ -6,10 +6,7 @@ from pyim.util.rpy2 import pandas_to_dataframe, dataframe_to_pandas - -R_GENOMES = { - 'mm10': 'BSgenome.Mmusculus.UCSC.mm10' -} +R_GENOMES = {'mm10': 'BSgenome.Mmusculus.UCSC.mm10'} def map_insertions(insertions, scales, genome, alpha=0.05, **kwargs): @@ -28,9 +25,16 @@ def map_insertions(insertions, scales, genome, alpha=0.05, **kwargs): return cis, mapping -def cimpl(insertions, scales, genome, system=None, pattern=None, - lhc_method='none', iterations=1000, chromosomes=None, - verbose=False, threads=1): +def cimpl(insertions, + scales, + genome, + system=None, + pattern=None, + lhc_method='none', + iterations=1000, + chromosomes=None, + verbose=False, + threads=1): """Runs CIMPL on insertions (in CIMPL format).""" # Fill in chromosomes from data if not specified. @@ -49,7 +53,7 @@ def cimpl(insertions, scales, genome, system=None, pattern=None, # Prepare chromosomes argument, adding 'chr' prefix and # converting to StrVector to pass to R. 
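For readers unfamiliar with rpy2: the chromosome and scale arguments in the cimpl() call above must be converted to R vectors before they can be passed to doCimplAnalysis. In isolation that conversion looks roughly like this (the chromosome and scale values are placeholders):

    from rpy2.robjects.vectors import IntVector, StrVector

    chromosomes = [str(c) for c in range(1, 20)] + ['X', 'Y']
    chromosomes = ['chr' + c for c in chromosomes]   # CIMPL expects 'chr' prefixes.

    scales_r = IntVector([10000, 30000])
    chromosomes_r = StrVector(chromosomes)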
if not chromosomes[0].startswith('chr'): - chromosomes = ['chr' + c for c in chromosomes] + chromosomes = ['chr' + c for c in chromosomes] # Convert scales to IntVector if supplied as list. if type(scales) == list: @@ -66,17 +70,21 @@ def cimpl(insertions, scales, genome, system=None, pattern=None, cimpl_r = importr('cimpl') cimpl_obj = cimpl_r.doCimplAnalysis( pandas_to_dataframe(insertions), - scales=scales, n_iterations=iterations, - lhc_method=lhc_method, threads=threads, BSgenome=genome_obj, + scales=scales, + n_iterations=iterations, + lhc_method=lhc_method, + threads=threads, + BSgenome=genome_obj, chromosomes=robjects.vectors.StrVector(chromosomes), - verbose=verbose, **extra_args) + verbose=verbose, + **extra_args) return cimpl_obj def convert_to_cimpl(insertions): # Extract and rename required columns. - cimpl_ins = insertions.ix[:, ['id', 'chrom', 'position', 'sample']] + cimpl_ins = insertions.ix[:, ['id', 'chromosome', 'position', 'sample']] cimpl_ins.columns = ['id', 'chr', 'location', 'sampleID'] if 'depth_unique' in insertions: @@ -115,8 +123,9 @@ def extract_cis(cimpl_obj, alpha=0.05, mul_test=True): # Convert cis to pandas and rename index. cis_frame = dataframe_to_pandas(cis_obj).reset_index() - cis_frame.rename(columns={'index': 'cis_id', - 'chromosome': 'seqname'}, inplace=True) + cis_frame.rename( + columns={'index': 'cis_id', + 'chromosome': 'seqname'}, inplace=True) # Convert columns to int types. for col in ['peak_location', 'start', 'end', 'width', 'n_insertions']: @@ -126,19 +135,18 @@ def extract_cis(cimpl_obj, alpha=0.05, mul_test=True): cis_frame['seqname'] = cis_frame['seqname'].str.replace('chr', '') # Reorder columns. - cis_frame = cis_frame[['cis_id', 'seqname', 'start', 'end', - 'scale', 'p_value', 'n_insertions', - 'peak_location', 'peak_height', 'width']] + cis_frame = cis_frame[['cis_id', 'seqname', 'start', 'end', 'scale', + 'p_value', 'n_insertions', 'peak_location', + 'peak_height', 'width']] # Rename and reshuffle cis columns. 
- cis_frame = cis_frame.rename( - columns={'seqname': 'chrom', - 'peak_location': 'position', - 'peak_height': 'height'}) + cis_frame = cis_frame.rename(columns={'seqname': 'chrom', + 'peak_location': 'position', + 'peak_height': 'height'}) cis_frame = cis_frame[['cis_id', 'chrom', 'position', 'scale', - 'n_insertions', 'p_value', 'start', 'end', - 'height', 'width']] + 'n_insertions', 'p_value', 'start', 'end', 'height', + 'width']] return cis_frame @@ -187,8 +195,10 @@ def merge_cis(cis_frame): def _expand_column(frame, col, delimiter): - exp = pd.concat((_expand_row(row, col=col, delimiter=delimiter) - for _, row in frame.iterrows()), ignore_index=True) + exp = pd.concat( + (_expand_row( + row, col=col, delimiter=delimiter) for _, row in frame.iterrows()), + ignore_index=True) return exp[frame.columns] diff --git a/src/pyim/main/annotate.py b/src/pyim/main/annotate.py index 5db915d..47c055a 100644 --- a/src/pyim/main/annotate.py +++ b/src/pyim/main/annotate.py @@ -5,37 +5,60 @@ #pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin import argparse -import logging -from pyim.annotation.annotator import window, rbm, rbm_cis +import pandas as pd + +from pyim.annotation import get_annotators +from pyim.model import Insertion # pylint: disable=import-error -from ._logging import print_header, print_footer +# from ._logging import print_header, print_footer # pylint: enable=import-error def main(): - logger = logging.getLogger() + args = parse_args() + + ins_frame = pd.read_csv(args.input, sep='\t') + insertions = Insertion.from_frame(ins_frame) + + annotator = args.class_.from_args(args) + annotated = list(annotator.annotate(insertions)) + + annotated_frame = Insertion.to_frame(annotated) + annotated_frame.to_csv(args.output, sep='\t', index=False) + + # Dispatch to pipeline. + #cmd_str = '{} {}'.format('annotate', args.annotator) + #print_header(logger, command=cmd_str) + #args.main(args) + #print_footer(logger) + +def parse_args(): # Setup main parser. parser = argparse.ArgumentParser(prog='pyim-annotate') subparsers = parser.add_subparsers(dest='annotator') subparsers.required = True # Register pipelines. - window.register(subparsers) - rbm.register(subparsers) - rbm_cis.register(subparsers) - # kcrbm.register(subparsers) - - # Parse args. + for name, class_ in get_annotators().items(): + annot_parser = subparsers.add_parser(name) + + _add_default_arguments(annot_parser) + class_.setup_args(annot_parser) + + annot_parser.set_defaults(class_=class_) + + # Actually parse args. args = parser.parse_args() - # Dispatch to pipeline. 
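The reworked pyim-annotate entry point above is driven entirely by the annotator registry: every class registered via register_annotator becomes a subcommand, and each class only needs to provide setup_args, from_args and annotate. A hypothetical minimal annotator (not part of this changeset) showing the expected interface:

    from pyim.annotation import register_annotator, get_annotators

    class NullAnnotator(object):
        """Hypothetical annotator that returns insertions unchanged."""

        @classmethod
        def setup_args(cls, parser):
            # Annotator-specific options would be added here.
            pass

        @classmethod
        def from_args(cls, args):
            return cls()

        def annotate(self, insertions):
            return iter(insertions)

    register_annotator('null', NullAnnotator)
    assert 'null' in get_annotators()

With the input/output positionals added by _add_default_arguments, such an annotator would then be invoked as 'pyim-annotate null INPUT OUTPUT'.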
- cmd_str = '{} {}'.format('annotate', args.annotator) - print_header(logger, command=cmd_str) - args.main(args) - print_footer(logger) + return args + + +def _add_default_arguments(parser): + parser.add_argument('input') + parser.add_argument('output') if __name__ == '__main__': diff --git a/src/pyim/main/merge.py b/src/pyim/main/merge.py index 18b8bbf..2ae2027 100644 --- a/src/pyim/main/merge.py +++ b/src/pyim/main/merge.py @@ -1,29 +1,25 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) -from future.utils import native_str +# pylint: disable=W0622,W0614,W0401 +from __future__ import absolute_import, division, print_function +from builtins import * +# pylint: enable=W0622,W0614,W0401 import logging from argparse import ArgumentParser from pathlib import Path +from collections import Counter import pandas as pd -from pyim.util.insertions import subset_samples +from pyim.model import Insertion from ._logging import print_header, print_footer def setup_parser(): parser = ArgumentParser(prog='pyim-merge') - parser.add_argument('insertions', nargs='+', type=Path) - parser.add_argument('output', type=Path) - - parser.add_argument('--names', nargs='+', default=None) - parser.add_argument('--samples', nargs='+', default=None) - # parser.add_argument('--complement', default=False, action='store_true') + parser.add_argument('--insertions', nargs='+', type=Path, required=True) + parser.add_argument('--output', type=Path, required=True) + parser.add_argument('--sample_names', nargs='+', default=None) return parser @@ -36,50 +32,48 @@ def main(): logger = logging.getLogger() print_header(logger, command='merge') - # Generate default names if none given. - if args.names is None: - names = ['Set{}'.format(i) for i in range(1, len(args.insertions) + 1)] - else: - names = args.names + # Read and merge frames. + merge_files(args.insertions, args.output, sample_names=args.sample_names) + + print_footer(logger) + + +def merge_files(file_paths, output_path, sample_names=None): + if sample_names is None: + sample_names = [fp.stem for fp in file_paths] - # Read frames. - ins_frames, samples = [], set() - for (ins_path, name) in zip(args.insertions, names): - frame = pd.read_csv(str(ins_path), sep=native_str('\t')) + ins_frames = (pd.read_csv(fp, sep='\t') for fp in file_paths) - # Check for overlapping samples. - frame_samples = set(filter(bool, frame['sample'])) - overlap = samples.intersection(frame_samples) + merged = merge_frames(ins_frames, sample_names) + merged.to_csv(str(output_path), sep='\t', index=False) - if len(overlap) > 0: - raise ValueError('Overlapping samples between frames ({})' - .format(', '.join(overlap))) - samples = samples.union(frame_samples) +def merge_frames(insertion_frames, sample_names): + # Check sample names for duplicates. + duplicate_samples = [s for s, count in Counter(sample_names).items() + if count > 1] - # Augment ids to avoid duplicates in merged frame. - if name != '': - frame['id'] = ['{}.{}'.format(name, id_) - for id_ in frame['id']] - ins_frames.append(frame) + if len(duplicate_samples) > 1: + raise ValueError('Duplicate sample names given ({})' + .format(', '.join(duplicate_samples))) # Merge frames. - merged = pd.concat(ins_frames, ignore_index=True) + frames = [] + for (frame, sample_name) in zip(insertion_frames, sample_names): + # Check if frame is valid. 
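The rewritten merge above derives uniqueness from the sample names: each frame gets its sample name written into a 'sample' column and prefixed onto its insertion ids before concatenation, so duplicate sample names are the only thing that needs to be checked. On a toy frame (column names as in the Insertion model, values made up):

    import pandas as pd

    frame = pd.DataFrame({
        'id': ['INS_1', 'INS_2'],
        'chromosome': ['1', '4'],
        'position': [12000, 56000],
        'strand': [1, -1],
    })

    sample_name = 'sample_a'
    frame = frame.copy()
    frame['sample'] = sample_name
    frame['id'] = (sample_name + '.') + frame['id']

    # id is now ['sample_a.INS_1', 'sample_a.INS_2'], so concatenating frames
    # from different samples cannot produce duplicate insertion ids.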
+ Insertion.check_frame(frame) - logger.info('Merging insertions for {} datasets, containing {} samples' - .format(len(args.insertions), merged['sample'].nunique())) + # Augment frame with sample name. + frame = frame.copy() + frame['sample'] = sample_name + frame['id'] = (sample_name + '.') + frame['id'] - # Filter samples if needed. - if args.samples is not None: - logger.info('Subsetting dataset to {} samples' - .format(len(args.samples))) - merged = subset_samples(merged, args.samples, logger=logger) + frames.append(frame) - # Write output. - logging.info('Writing merged output') - merged.to_csv(str(args.output), sep=native_str('\t'), index=False) + merged = pd.concat(frames, axis=0) + merged = Insertion.format_frame(merged) - print_footer(logger) + return merged if __name__ == '__main__': diff --git a/src/pyim/main/merge_sets.py b/src/pyim/main/merge_sets.py new file mode 100644 index 0000000..43b9a1e --- /dev/null +++ b/src/pyim/main/merge_sets.py @@ -0,0 +1,86 @@ +from __future__ import (absolute_import, division, + print_function, unicode_literals) +from builtins import (ascii, bytes, chr, dict, filter, hex, input, + int, map, next, oct, open, pow, range, round, + str, super, zip) +from future.utils import native_str + +import logging +from argparse import ArgumentParser +from pathlib import Path + +import pandas as pd + +from pyim.util.insertions import subset_samples +from ._logging import print_header, print_footer + + +def setup_parser(): + parser = ArgumentParser(prog='pyim-merge-sets') + + parser.add_argument('insertions', nargs='+', type=Path) + parser.add_argument('output', type=Path) + + parser.add_argument('--names', nargs='+', default=None) + parser.add_argument('--samples', nargs='+', default=None) + # parser.add_argument('--complement', default=False, action='store_true') + + return parser + + +def main(): + parser = setup_parser() + args = parser.parse_args() + + # Get logger and print header. + logger = logging.getLogger() + print_header(logger, command='merge') + + # Generate default names if none given. + if args.names is None: + names = ['Set{}'.format(i) for i in range(1, len(args.insertions) + 1)] + else: + names = args.names + + # Read frames. + ins_frames, samples = [], set() + for (ins_path, name) in zip(args.insertions, names): + frame = pd.read_csv(str(ins_path), sep=native_str('\t')) + + # Check for overlapping samples. + frame_samples = set(filter(bool, frame['sample'])) + overlap = samples.intersection(frame_samples) + + if len(overlap) > 0: + raise ValueError('Overlapping samples between frames ({})' + .format(', '.join(overlap))) + + samples = samples.union(frame_samples) + + # Augment ids to avoid duplicates in merged frame. + if name != '': + frame['id'] = ['{}.{}'.format(name, id_) + for id_ in frame['id']] + ins_frames.append(frame) + + # Merge frames. + merged = pd.concat(ins_frames, ignore_index=True) + + logger.info('Merging insertions for {} datasets, containing {} samples' + .format(len(args.insertions), merged['sample'].nunique())) + + # Filter samples if needed. + if args.samples is not None: + logger.info('Subsetting dataset to {} samples' + .format(len(args.samples))) + merged = subset_samples(merged, args.samples, logger=logger) + + # Write output. 
+ logging.info('Writing merged output') + merged.to_csv(str(args.output), sep=native_str('\t'), index=False) + + print_footer(logger) + + +if __name__ == '__main__': + main() diff --git a/src/pyim/main/plot.py b/src/pyim/main/plot.py deleted file mode 100644 index 4a9a49f..0000000 --- a/src/pyim/main/plot.py +++ /dev/null @@ -1,96 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) -from future.utils import native_str - -import re -from argparse import ArgumentParser -from pathlib import Path - -import pandas as pd - -from tkgeno.io import GtfFile -from tkgeno.viz.tracks import FeatureTrack, GeneRegionTrack, plot_tracks - -try: - import seaborn as sns - sns.set_style('whitegrid') -except ImportError: - sns = None - - -def setup_parser(): - parser = ArgumentParser(prog='pyim-merge') - - parser.add_argument('input', type=Path) - parser.add_argument('output', type=Path) - parser.add_argument('reference_gtf', type=Path) - - parser.add_argument('--region', required=True) - parser.add_argument('--figure_width', default=None) - parser.add_argument('--transcript_ids', nargs='+', default=None) - - parser.add_argument('--insertion_width', type=int, default=2000) - parser.add_argument('--gene_line_height', type=float, default=0.5) - parser.add_argument('--insertion_line_height', type=float, default=0.5) - - parser.add_argument('--flip_x', default=False, action='store_true') - parser.add_argument('--flip_y', default=False, action='store_true') - - parser.add_argument('--dpi', default=72, type=int) - - return parser - - -def main(): - parser = setup_parser() - args = parser.parse_args() - - # Setup insertion track. - if sns is not None: - palette = sns.color_palette() - color_map = {1: palette[0], -1: palette[2]} - else: - color_map = {1: 'red', -1: 'blue'} - - # Parse region. - seqname, start, end = re.split('[:-]', args.region) - start, end = int(start), int(end) - - # Setup insertion track. - ins_frame = pd.read_csv(str(args.input), sep=native_str('\t'), - dtype={'seqname': str}) - ins_track = FeatureTrack.from_location( - ins_frame, width=args.insertion_width, - line_height=args.insertion_line_height, - color='strand', color_map=color_map, flip_y=args.flip_y) - - # Setup gene region track. - gtf = GtfFile(args.reference_gtf) - - if args.transcript_ids is not None: - gtf = gtf.get_region(seqname, start, end, expand=True) - gtf = gtf.ix[gtf.transcript_id.isin(args.transcript_ids)] - - gene_track = GeneRegionTrack(gtf, line_height=args.gene_line_height) - - # Setup plot. 
- tracks = [gene_track, ins_track] \ - if args.flip_y else [ins_track, gene_track] - - fig, axes = plot_tracks(tracks, - seqname=seqname, start=start, end=end, - figsize=(int(args.figure_width), None), - tick_top=args.flip_y, reverse_x=args.flip_x) - - if args.output.suffix == '.png': - save_kwargs = {'dpi': args.dpi} - else: - save_kwargs = {} - - fig.savefig(str(args.output), bbox_inches='tight', **save_kwargs) - -if __name__ == '__main__': - main() diff --git a/src/pyim/model.py b/src/pyim/model.py new file mode 100644 index 0000000..e30036d --- /dev/null +++ b/src/pyim/model.py @@ -0,0 +1,92 @@ +# pylint: disable=W0622,W0614,W0401 +from __future__ import absolute_import, division, print_function +from builtins import * +# pylint: enable=W0622,W0614,W0401 + +import collections + +import pandas as pd +import toolz + + +class MetadataFrameMixin(object): + """Mixin class adding namedtuple/frame conversion support.""" + + @classmethod + def _non_metadata_fields(cls): + fields = list(cls._fields) + del fields[fields.index('metadata')] + return fields + + @classmethod + def to_frame(cls, insertions): + """Converts list of objects to a dataframe representation.""" + + rows = (cls._to_dict(ins) for ins in insertions) + + df = pd.DataFrame.from_records(rows) + df = cls.format_frame(df) + + return df + + @classmethod + def format_frame(cls, df): + """Ensures frame is properly formatted (column order etc.)""" + cls.check_frame(df) + return cls._reorder_columns(df, order=cls._non_metadata_fields()) + + @classmethod + def check_frame(cls, df): + basic_fields = cls._non_metadata_fields() + missing_columns = set(basic_fields) - set(df.columns) + + if len(missing_columns) > 0: + raise ValueError('Missing required columns {}', + ', '.join(missing_columns)) + + @classmethod + def _to_dict(cls, obj): + obj_data = obj._asdict() + metadata = obj_data.pop('metadata') + return toolz.merge(metadata, obj_data) + + @classmethod + def _reorder_columns(cls, df, order): + extra_cols = set(df.columns) - set(order) + col_order = list(order) + sorted(extra_cols) + return df[col_order] + + @classmethod + def from_frame(cls, df): + """Converts dataframe into a list of objects.""" + + cls.check_frame(df) + + basic_fields = cls._non_metadata_fields() + metadata_fields = list(set(df.columns) - set(basic_fields)) + + for row in df.itertuples(): + row_dict = row._asdict() + + metadata = {k: row_dict.pop(k) for k in metadata_fields} + row_dict.pop('Index', None) + + yield cls(**row_dict, metadata=metadata) + + +_Insertion = collections.namedtuple( + 'Insertion', ['id', 'chromosome', 'position', + 'strand', 'metadata']) + + +class Insertion(MetadataFrameMixin, _Insertion): + """Model class representing an insertion.""" + + __slots__ = () + + @classmethod + def format_frame(cls, df): + df = super().format_frame(df) + df['position'] = df['position'].astype(int) + df['strand'] = df['strand'].astype(int) + return df From 18e2678a6d13231bdfafe0258e90dcfe51971e22 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 1 Nov 2016 13:59:41 +0100 Subject: [PATCH 077/100] Initial version of PyIM2. commit 174abdf2baaf4c5f82533a08d6562724bdf26d45 Author: Julian de Ruiter Date: Tue Nov 1 12:49:10 2016 +0100 Removed backup file. commit 9470850db39f57b86cbc8ab4d4af695f23db3247 Author: Julian de Ruiter Date: Tue Nov 1 12:45:50 2016 +0100 Bump version to 0.2.0.dev0 commit 8deb4faa953f6852eec542e278c3c8f239e0108f Author: Julian de Ruiter Date: Tue Nov 1 12:40:59 2016 +0100 Point cimpl submodule to new Github. 
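The pyim.model.Insertion class added above (src/pyim/model.py) is the data structure the new pipelines pass around: a namedtuple with fixed id/chromosome/position/strand fields plus a free-form metadata dict, convertible to and from a flat DataFrame in which metadata keys become extra columns. A small usage sketch (field values are placeholders):

    from pyim.model import Insertion

    insertions = [
        Insertion(id='sample_a.INS_1', chromosome='1', position=12000,
                  strand=1, metadata={'depth': 10, 'gene_name': 'Trp53bp2'}),
        Insertion(id='sample_a.INS_2', chromosome='4', position=56000,
                  strand=-1, metadata={'depth': 4, 'gene_name': 'En2'}),
    ]

    # Fixed columns come first; metadata keys ('depth', 'gene_name') follow.
    frame = Insertion.to_frame(insertions)

    # The conversion is reversible; from_frame yields Insertion tuples again.
    roundtrip = list(Insertion.from_frame(frame))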
commit f22645b71f86e72cdbf99500477858d0b6f3f4d4 Author: Julian de Ruiter Date: Tue Nov 1 12:35:00 2016 +0100 Initial version of PyIM 2 with single-end pipelines. --- .gitmodules | 5 +- ATP1_Cassette.dna | Bin 29957 -> 0 bytes AUTHORS.rst | 13 + CONTRIBUTING.rst | 114 ++ HISTORY.rst | 8 + LICENSE | 11 + MANIFEST.IN | 15 +- Makefile | 88 + README.rst | 40 + conda/build.sh | 1 + conda/meta.yaml | 77 + data/sb.barcodes.fa.fai | 201 -- data/sb.linker.fa | 2 +- data/sb.transposon.fa | 2 +- docs/Makefile | 43 +- docs/api.rst | 53 - docs/authors.rst | 1 + docs/conf.py | 193 +- docs/contributing.rst | 1 + docs/history.rst | 1 + docs/index.rst | 15 +- docs/installation.rst | 51 + docs/introduction.rst | 14 - docs/make.bat | 505 +++-- docs/readme.rst | 1 + docs/usage.rst | 7 + environment.yml | 11 - envs/dev.yml | 29 + external/cimpl | 2 +- external/kcrbm | 1 - readme.md | 7 - setup.cfg | 25 +- setup.py | 84 +- shear_splink_cutadapt.py | 38 - src/pyim/__init__.py | 7 +- src/pyim/_version.py | 484 ----- src/pyim/align/bowtie2.py | 92 - src/pyim/align/common/cutadapt.py | 107 - src/pyim/align/common/genomic.py | 147 ++ src/pyim/align/common/insertions.py | 226 +++ src/pyim/align/pipelines/__init__.py | 2 + .../align/pipelines/_helpers/clustering.py | 64 - src/pyim/align/pipelines/_helpers/grouping.py | 235 --- src/pyim/align/pipelines/_helpers/pipeline.py | 62 - src/pyim/align/pipelines/_model.py | 10 - src/pyim/align/pipelines/base.py | 34 + src/pyim/align/pipelines/lam_pcr.py | 244 --- src/pyim/align/pipelines/paired.py | 26 + src/pyim/align/pipelines/shear_splink.py | 378 ---- src/pyim/align/pipelines/shear_splink_sb.py | 99 - src/pyim/align/pipelines/single.py | 283 +++ src/pyim/align/vector.py | 207 -- src/pyim/annotate/__init__.py | 1 + src/pyim/annotate/annotators/__init__.py | 3 + src/pyim/annotate/annotators/base.py | 146 ++ src/pyim/annotate/annotators/rbm.py | 97 + src/pyim/annotate/annotators/window.py | 200 ++ src/pyim/annotate/filter_.py | 57 + src/pyim/{annotation => annotate}/metadata.py | 64 +- src/pyim/annotate/util.py | 45 + src/pyim/annotation/__init__.py | 2 - src/pyim/annotation/_registry.py | 10 - src/pyim/annotation/annotator/__init__.py | 3 - src/pyim/annotation/annotator/kcrbm.py | 164 -- src/pyim/annotation/annotator/rbm.py | 163 -- src/pyim/annotation/annotator/rbm_cis.py | 155 -- src/pyim/annotation/annotator/window.py | 146 -- src/pyim/annotation/filtering.py | 45 - src/pyim/annotation/util.py | 34 - src/pyim/cis/__init__.py | 1 + src/pyim/cis/_util.py | 32 - src/pyim/cis/callers/__init__.py | 2 + src/pyim/cis/callers/base.py | 42 + src/pyim/cis/callers/cimpl.py | 267 +++ src/pyim/cis/cimpl.py | 214 -- src/pyim/cis/poisson.py | 96 - src/pyim/cis/util.py | 54 + .../_helpers => external}/__init__.py | 0 src/pyim/external/bowtie2.py | 57 + src/pyim/external/cutadapt.py | 176 ++ src/pyim/external/util.py | 106 + src/pyim/main/_logging.py | 23 - src/pyim/main/align.py | 33 - src/pyim/main/annotate.py | 65 - src/pyim/main/cis.py | 94 - src/pyim/main/gff.py | 70 - src/pyim/main/merge_sets.py | 86 - src/pyim/main/pyim_align.py | 50 + src/pyim/main/pyim_annotate.py | 39 + src/pyim/main/pyim_bed.py | 64 + src/pyim/main/pyim_cis.py | 64 + src/pyim/main/pyim_demultiplex.py | 46 + src/pyim/main/{merge.py => pyim_merge.py} | 31 +- src/pyim/main/pyim_split.py | 48 + src/pyim/main/split.py | 61 - src/pyim/model.py | 123 +- src/pyim/util/__init__.py | 6 + src/pyim/util/file.py | 29 - src/pyim/util/insertions.py | 13 - src/pyim/util/pandas.py | 13 - src/pyim/util/path.py | 14 + 
src/pyim/util/rpy2.py | 1 - src/pyim/util/tabix.py | 211 +- tests/pyim/util/test_shell.py | 2 + versioneer.py | 1774 ----------------- 105 files changed, 3434 insertions(+), 6274 deletions(-) delete mode 100644 ATP1_Cassette.dna create mode 100644 AUTHORS.rst create mode 100644 CONTRIBUTING.rst create mode 100644 HISTORY.rst create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README.rst create mode 100644 conda/build.sh create mode 100644 conda/meta.yaml delete mode 100644 data/sb.barcodes.fa.fai delete mode 100644 docs/api.rst create mode 100644 docs/authors.rst mode change 100644 => 100755 docs/conf.py create mode 100644 docs/contributing.rst create mode 100644 docs/history.rst create mode 100644 docs/installation.rst delete mode 100644 docs/introduction.rst create mode 100644 docs/readme.rst create mode 100644 docs/usage.rst delete mode 100644 environment.yml create mode 100644 envs/dev.yml delete mode 160000 external/kcrbm delete mode 100644 readme.md delete mode 100644 shear_splink_cutadapt.py delete mode 100644 src/pyim/_version.py delete mode 100644 src/pyim/align/bowtie2.py delete mode 100644 src/pyim/align/common/cutadapt.py create mode 100644 src/pyim/align/common/genomic.py create mode 100644 src/pyim/align/common/insertions.py delete mode 100644 src/pyim/align/pipelines/_helpers/clustering.py delete mode 100644 src/pyim/align/pipelines/_helpers/grouping.py delete mode 100644 src/pyim/align/pipelines/_helpers/pipeline.py delete mode 100644 src/pyim/align/pipelines/_model.py create mode 100644 src/pyim/align/pipelines/base.py delete mode 100644 src/pyim/align/pipelines/lam_pcr.py create mode 100644 src/pyim/align/pipelines/paired.py delete mode 100644 src/pyim/align/pipelines/shear_splink.py delete mode 100644 src/pyim/align/pipelines/shear_splink_sb.py create mode 100644 src/pyim/align/pipelines/single.py delete mode 100644 src/pyim/align/vector.py create mode 100644 src/pyim/annotate/__init__.py create mode 100644 src/pyim/annotate/annotators/__init__.py create mode 100644 src/pyim/annotate/annotators/base.py create mode 100644 src/pyim/annotate/annotators/rbm.py create mode 100644 src/pyim/annotate/annotators/window.py create mode 100644 src/pyim/annotate/filter_.py rename src/pyim/{annotation => annotate}/metadata.py (58%) create mode 100644 src/pyim/annotate/util.py delete mode 100644 src/pyim/annotation/__init__.py delete mode 100644 src/pyim/annotation/_registry.py delete mode 100644 src/pyim/annotation/annotator/__init__.py delete mode 100644 src/pyim/annotation/annotator/kcrbm.py delete mode 100644 src/pyim/annotation/annotator/rbm.py delete mode 100644 src/pyim/annotation/annotator/rbm_cis.py delete mode 100644 src/pyim/annotation/annotator/window.py delete mode 100644 src/pyim/annotation/filtering.py delete mode 100644 src/pyim/annotation/util.py delete mode 100644 src/pyim/cis/_util.py create mode 100644 src/pyim/cis/callers/__init__.py create mode 100644 src/pyim/cis/callers/base.py create mode 100644 src/pyim/cis/callers/cimpl.py delete mode 100644 src/pyim/cis/cimpl.py delete mode 100644 src/pyim/cis/poisson.py create mode 100644 src/pyim/cis/util.py rename src/pyim/{align/pipelines/_helpers => external}/__init__.py (100%) create mode 100644 src/pyim/external/bowtie2.py create mode 100644 src/pyim/external/cutadapt.py create mode 100644 src/pyim/external/util.py delete mode 100644 src/pyim/main/_logging.py delete mode 100644 src/pyim/main/align.py delete mode 100644 src/pyim/main/annotate.py delete mode 100644 src/pyim/main/cis.py delete mode 
100644 src/pyim/main/gff.py delete mode 100644 src/pyim/main/merge_sets.py create mode 100644 src/pyim/main/pyim_align.py create mode 100644 src/pyim/main/pyim_annotate.py create mode 100644 src/pyim/main/pyim_bed.py create mode 100644 src/pyim/main/pyim_cis.py create mode 100644 src/pyim/main/pyim_demultiplex.py rename src/pyim/main/{merge.py => pyim_merge.py} (65%) create mode 100644 src/pyim/main/pyim_split.py delete mode 100644 src/pyim/main/split.py delete mode 100644 src/pyim/util/file.py delete mode 100644 src/pyim/util/insertions.py delete mode 100644 src/pyim/util/pandas.py create mode 100644 src/pyim/util/path.py create mode 100644 tests/pyim/util/test_shell.py delete mode 100644 versioneer.py diff --git a/.gitmodules b/.gitmodules index c3a38c0..48c12c4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "external/kcrbm"] - path = external/kcrbm - url = git@bitbucket.org:jrderuiter/kcrbm.git [submodule "external/cimpl"] path = external/cimpl - url = git@bitbucket.org:jrderuiter/cimpl.git + url = https://github.com/jrderuiter/cimpl.git diff --git a/ATP1_Cassette.dna b/ATP1_Cassette.dna deleted file mode 100644 index 5fbe90d117a968f764c236ba36753f6e7d123516..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 29957 zcmdsf1$-9A_UP=+`tGdZ?(Xgq0tDCK8bW{|0YcD1u|lC}u_A@y6ewOO?ydz21qxJX zu>z$=fkJub%zht)-roPe_ucpY_xIi=`|Zq`xZJkMTi3 zL>+++Bm)}SfNYUEDHg?`HfSzqj$JO4BcdLh9>+ieFyeHG%)^V40*rt!bdVZ(aqXZ7 zK}8clCb|Rl=S@rnP7GZ%qbnu|`X@vh7{R+Bm&-%XaP`o!XfNl*iJ%2?xTT;o;vpv# zh<<|>cnk)D0}~K|+zG%3m;n<|23bNYAVnPR9}vYgFM%+?L-SBozy(Xta=_#uZWOl(#sLvj zK?BiT^Z-v4AO{tKah@NjKK!DJzzrmV!gvY%!X3iJ!T{i*RVXn~CDcGL0Fw=JCWy&> zk9Gv{LNPEF{zV$J1UYj1xKjWJeUHY0*=QPXAP#atZaA+{22Vn+7+`T2cgsJs9!UTH zH=oBD5|u|Dyb2BQ3fhM3NHfgg+*3c=d|W;;-Ud?bMu4x<>HFrXGlz&0iYHiHpN3FskzBo5keL8e?3 zas(v;7iFM4e8MrmyJIz~E$I zerN;-y;KXhy(SR3BV&*QGC>mWp!(dSLE3X~;!*^L@u`Js!w(!l!>?RH4!{re1st3+ zGUO*-xPp8HZN`9u7_gszzWe|+@CPu05k4IHKo`FT4Uk|mp_2g#e^FtO1y>XTzwiTU z@+AwTqf-L604ShH{>fd9lbUOYANYg6{6tq_uAnv`N*uV^g0XiBPeBf9}AU7!BjpL{e z7ta}D2Z=aOP9AXLU~C5HxN<{_iwPkFx`y-wbgl@;=0=3{K-O>~C3pq&<_E9{wZYk7 z9|Rdd0SB%YeqgA%Vu3Dz7Lv?ma$OM%8hnIHK~$tclY$@zeL&%`@COnFdf@;!GUx|w zAV&u9i`oU=3IV{5P#SXp#1U}u6DAung#)J<=1QP2pDV}^eS_;kC>4M+_!RR81#v#% zKI(%S2E0)RP!3mFZbb0Or%T`wj>q|9jF1V(3){sViLMJ=6=uvmkNJzL^G{SCnPVgZ ztFRL!JdOo}ArN4pK^⁢5fGh69eZPjs<$7aX1JzzyPQRsPM^IA{8Fc!D)$#aP@JO zM+$Hy;sz!^|D0%{BTqcMCD2(=J|xPXZb>>@hq zib`|qU;vax8QcPFVD_SDl!sE$?Wh~BfjsnFGp-I-1eHaWKpBJ@P=N?V1T7dxQozF1 zicb!-5H$fR5CI+Rf^J+c8jK#~V9*N(@dK5ICeQ&VY(oz<{0B>b6_=00(1s7R4w-X} zxawSI)GuTfM*=Yj0s~Psq()0P6K)zRg>dv%$aVmshfohd<~CvcAULQta0~q+YdFy9 z;9s;SxNss&YQRUkxuZ~fWPx~qfESD~1_yx{OK>A-iI>2m=zc)ubf_Q%oJ#=)AnVlu zqL2&}#)!cM5ut@D0~V$qM}|+p1WDW(oH6GI2r$3}b55u;XvaAqlb1gLLRr8AhLI^( z3K9ryKzv|=hVa!FO@o62Q72T8YaZADUdPy>ThK@t;w z1a9SOqDE*rp9$Q5fj?1itFgEt236hyfx_ zh{+Vv2@DIwpg0&rdQOB5R2x~riBUj1xCYSXT^Ft$+cvf`ffOI$re7IKuA4WMxydWD874Qd4rkXun++dd7ET+7 zD(8s^;Fch8Vb29300UVgBIpH)g%?yF{e@Et8~{LM1)msTL_~MPD1b3ffdIQ`DD(n4 zuog|_Q1B$OLnnaKc$82Sh6e|DR8SJQ0=pR!**DIq({3@F3LySkv;>Qh2YE$ zC&Ex4#6^E0Z{&$MXd~){w8#f(@rrtfeTzOtT{6O%3r@sA9gt60KWs~(e%R(Pt_*O* zN4}^(%0u~xhbPKLUBY^xPG}#VIEHfb!HF`^PNYGb!ulc%`5`XyKtCaEemK!)q{9>M?pJ z{SAGKeqR4d|62c5f29AWKh{MfwUNWfY2-3;8%2#`##_cz<9%bDvEJBV>^1for;KyP zH^zPAf$_7k-TcvfZvJ7~mSee=XQi>SSuZTY#uXuhfU?uj*4(&=k$ohG<`EkF;`nb$zw|iT;iLL@#U9 zFvb|~8uN|S#xdixao)IQ+%kSKYM7JF8RiyqkNK0C*2-k{x4N*NY^Lox#hm3%dU8PM zFTE)x$;0HLN=j|G7NIxShv^seZ;girHM^Ke=1#LQD`-!!C)=y+Db78oweOIRg!jcq 
zOZ=ZVs933DWeb%kUIKDGJXuP^Q4S{ZzcyJQK|^6_Ub1M1KL3FXD^|RC$wI|SmQ0pm z<;s;P@vp}Q7LNb0Ddox(EmW*nIauq$LHXaaqJzj521kk2k$Dy$`mS76jsr&DgS%RFf1H_DR6`QPu)|B6)Ii=)}OE;f9K{H zzD@S4Y~X*8-uvrE1^!r*_@^fYd{C%0xKm<$=lCJ~X5J_!F{W3)ti9qBB6}w!#V7Rd z-M?RbsOR6UUK~DkfQR_gLH+oCy%5T8`G8zMt|uUOjgIRD&u#pX1a9?3`8_-&x3Yo% zwfd-H16WEy(TdUV;8D2&qQ>;BScLbCijnZ32aj8oix(|cs&LuD#o$3N3^FR#i|LmX zJT|?K@f!Rkk7|P}H>g{?P(<^3FL?mSfaj6m5i_!93_M9TYYRu!}1UR6J|Mv?H?EOD< F{4c4JAwmEE diff --git a/AUTHORS.rst b/AUTHORS.rst new file mode 100644 index 0000000..5ee29cd --- /dev/null +++ b/AUTHORS.rst @@ -0,0 +1,13 @@ +======= +Credits +======= + +Development Lead +---------------- + +* Julian de Ruiter + +Contributors +------------ + +None yet. Why not be the first? diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst new file mode 100644 index 0000000..a1e9422 --- /dev/null +++ b/CONTRIBUTING.rst @@ -0,0 +1,114 @@ +.. highlight:: shell + +============ +Contributing +============ + +Contributions are welcome, and they are greatly appreciated! Every +little bit helps, and credit will always be given. + +You can contribute in many ways: + +Types of Contributions +---------------------- + +Report Bugs +~~~~~~~~~~~ + +Report bugs at https://github.com/jrderuiter/pyim/issues. + +If you are reporting a bug, please include: + +* Your operating system name and version. +* Any details about your local setup that might be helpful in troubleshooting. +* Detailed steps to reproduce the bug. + +Fix Bugs +~~~~~~~~ + +Look through the GitHub issues for bugs. Anything tagged with "bug" +and "help wanted" is open to whoever wants to implement it. + +Implement Features +~~~~~~~~~~~~~~~~~~ + +Look through the GitHub issues for features. Anything tagged with "enhancement" +and "help wanted" is open to whoever wants to implement it. + +Write Documentation +~~~~~~~~~~~~~~~~~~~ + +PyIM could always use more documentation, whether as part of the +official PyIM docs, in docstrings, or even on the web in blog posts, +articles, and such. + +Submit Feedback +~~~~~~~~~~~~~~~ + +The best way to send feedback is to file an issue at https://github.com/jrderuiter/pyim/issues. + +If you are proposing a feature: + +* Explain in detail how it would work. +* Keep the scope as narrow as possible, to make it easier to implement. +* Remember that this is a volunteer-driven project, and that contributions + are welcome :) + +Get Started! +------------ + +Ready to contribute? Here's how to set up `pyim` for local development. + +1. Fork the `pyim` repo on GitHub. +2. Clone your fork locally:: + + $ git clone git@github.com:your_name_here/pyim.git + +3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: + + $ mkvirtualenv pyim + $ cd pyim/ + $ python setup.py develop + +4. Create a branch for local development:: + + $ git checkout -b name-of-your-bugfix-or-feature + + Now you can make your changes locally. + +5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: + + $ flake8 pyim tests + $ python setup.py test or py.test + $ tox + + To get flake8 and tox, just pip install them into your virtualenv. + +6. Commit your changes and push your branch to GitHub:: + + $ git add . + $ git commit -m "Your detailed description of your changes." + $ git push origin name-of-your-bugfix-or-feature + +7. Submit a pull request through the GitHub website. 
+ +Pull Request Guidelines +----------------------- + +Before you submit a pull request, check that it meets these guidelines: + +1. The pull request should include tests. +2. If the pull request adds functionality, the docs should be updated. Put + your new functionality into a function with a docstring, and add the + feature to the list in README.rst. +3. The pull request should work for Python 2.6, 2.7, 3.3, 3.4 and 3.5, and for PyPy. Check + https://travis-ci.org/jrderuiter/pyim/pull_requests + and make sure that the tests pass for all supported Python versions. + +Tips +---- + +To run a subset of tests:: + +$ py.test tests.test_pyim + diff --git a/HISTORY.rst b/HISTORY.rst new file mode 100644 index 0000000..08d07c2 --- /dev/null +++ b/HISTORY.rst @@ -0,0 +1,8 @@ +======= +History +======= + +0.1.0 (2016-09-01) +------------------ + +* First release on PyPI. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..1587bd6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,11 @@ + +MIT License + +Copyright (c) 2016, Julian de Ruiter + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/MANIFEST.IN b/MANIFEST.IN index 076fc55..63895e7 100644 --- a/MANIFEST.IN +++ b/MANIFEST.IN @@ -1,2 +1,13 @@ -include versioneer.py -include src/pyim/_version.py + +include AUTHORS.rst + +include CONTRIBUTING.rst +include HISTORY.rst +include LICENSE +include README.rst + +recursive-include tests * +recursive-exclude * __pycache__ +recursive-exclude * *.py[co] + +recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4756a26 --- /dev/null +++ b/Makefile @@ -0,0 +1,88 @@ +.PHONY: clean clean-test clean-pyc clean-build docs help +.DEFAULT_GOAL := help +define BROWSER_PYSCRIPT +import os, webbrowser, sys +try: + from urllib import pathname2url +except: + from urllib.request import pathname2url + +webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) +endef +export BROWSER_PYSCRIPT + +define PRINT_HELP_PYSCRIPT +import re, sys + +for line in sys.stdin: + match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) + if match: + target, help = match.groups() + print("%-20s %s" % (target, help)) +endef +export PRINT_HELP_PYSCRIPT +BROWSER := python -c "$$BROWSER_PYSCRIPT" + +help: + @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) + +clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts + + +clean-build: ## remove build artifacts + rm -fr build/ + rm -fr dist/ + rm -fr .eggs/ + find . 
-name '*.egg-info' -exec rm -fr {} + + find . -name '*.egg' -exec rm -f {} + + +clean-pyc: ## remove Python file artifacts + find . -name '*.pyc' -exec rm -f {} + + find . -name '*.pyo' -exec rm -f {} + + find . -name '*~' -exec rm -f {} + + find . -name '__pycache__' -exec rm -fr {} + + +clean-test: ## remove test and coverage artifacts + rm -fr .tox/ + rm -f .coverage + rm -fr htmlcov/ + +lint: ## check style with flake8 + flake8 pyim tests + +test: ## run tests quickly with the default Python + py.test + + +test-all: ## run tests on every Python version with tox + tox + +coverage: ## check code coverage quickly with the default Python + coverage run --source pyim py.test + + coverage report -m + coverage html + $(BROWSER) htmlcov/index.html + +docs: ## generate Sphinx HTML documentation, including API docs + rm -f docs/pyim.rst + rm -f docs/modules.rst + sphinx-apidoc -o docs/ pyim + $(MAKE) -C docs clean + $(MAKE) -C docs html + $(BROWSER) docs/_build/html/index.html + +servedocs: docs ## compile the docs watching for changes + watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . + +release: clean ## package and upload a release + python setup.py sdist upload + python setup.py bdist_wheel upload + +dist: clean ## builds source and wheel package + python setup.py sdist + python setup.py bdist_wheel + ls -l dist + +install: clean ## install the package to the active Python's site-packages + python setup.py install diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..02b9a1b --- /dev/null +++ b/README.rst @@ -0,0 +1,40 @@ +=============================== +PyIM +=============================== + + +.. image:: https://img.shields.io/pypi/v/pyim.svg + :target: https://pypi.python.org/pypi/pyim + +.. image:: https://img.shields.io/travis/jrderuiter/pyim.svg + :target: https://travis-ci.org/jrderuiter/pyim + +.. image:: https://readthedocs.org/projects/pyim/badge/?version=latest + :target: https://pyim.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status + +.. image:: https://pyup.io/repos/github/jrderuiter/pyim/shield.svg + :target: https://pyup.io/repos/github/jrderuiter/pyim/ + :alt: Updates + + +Tools for analyzing insertional mutagenesis data. + + +* Free software: MIT license +* Documentation: https://pyim.readthedocs.io. + + +Features +-------- + +* TODO + +Credits +--------- + +This package was created with Cookiecutter_ and the `audreyr/cookiecutter-pypackage`_ project template. + +.. _Cookiecutter: https://github.com/audreyr/cookiecutter +.. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage + diff --git a/conda/build.sh b/conda/build.sh new file mode 100644 index 0000000..a40f109 --- /dev/null +++ b/conda/build.sh @@ -0,0 +1 @@ +$PYTHON setup.py install # Python command to install the script.
diff --git a/conda/meta.yaml b/conda/meta.yaml new file mode 100644 index 0000000..2b1d201 --- /dev/null +++ b/conda/meta.yaml @@ -0,0 +1,77 @@ +{% set version = "0.2.0.dev0" %} + +package: + name: pyim + version: {{ version }} + +source: + path: ../ + +build: + number: 0 + skip: True # [py27] + rpaths: + - lib/R/lib/ + - lib/ + +requirements: + build: + - python + - setuptools + + # Basic dependencies + - pandas ==0.18.1 + - pyfaidx ==0.4.7.1 + - pysam ==0.9.1 + - toolz ==0.8.0 + - tqdm + - intervaltree + - frozendict + - cutadapt + + # R dependencies + - r + - r-cimpl + - rpy2 >=2.7.4 + + run: + - python + + # Basic dependencies + - pandas ==0.18.1 + - pyfaidx ==0.4.7.1 + - pysam ==0.9.1 + - toolz ==0.8.0 + - tqdm + - intervaltree + - frozendict + - cutadapt + + # R dependencies + - r + - r-cimpl + - rpy2 >=2.7.4 + + # External dependencies + - bowtie2 + +test: + imports: + - pyim + commands: + - pyim-align --help > /dev/null + - pyim-annotate --help > /dev/null + - pyim-bed --help > /dev/null + - pyim-cis --help > /dev/null + - pyim-demultiplex --help > /dev/null + - pyim-merge --help > /dev/null + - pyim-split --help > /dev/null + +about: + home: http://www.github.com/jrderuiter/pyim + license: MIT License + summary: 'PyIM is a package for that implements a number of pipelines for + identifying transposon integration sites from targeted DNA-sequencing + of transposon insertions. The package implements a number of standard + pipelines used in our group, but also provides the basic build blocks + for custom pipelines.' diff --git a/data/sb.barcodes.fa.fai b/data/sb.barcodes.fa.fai deleted file mode 100644 index ca405f7..0000000 --- a/data/sb.barcodes.fa.fai +++ /dev/null @@ -1,201 +0,0 @@ -SB001 20 7 20 21 -SB002 20 35 20 21 -SB003 20 63 20 21 -SB004 20 91 20 21 -SB005 20 119 20 21 -SB006 20 147 20 21 -SB007 20 175 20 21 -SB008 20 203 20 21 -SB009 20 231 20 21 -SB010 20 259 20 21 -SB011 20 287 20 21 -SB012 20 315 20 21 -SB013 20 343 20 21 -SB014 20 371 20 21 -SB015 20 399 20 21 -SB016 20 427 20 21 -SB017 20 455 20 21 -SB018 20 483 20 21 -SB019 20 511 20 21 -SB020 20 539 20 21 -SB021 20 567 20 21 -SB022 20 595 20 21 -SB023 20 623 20 21 -SB024 20 651 20 21 -SB025 20 679 20 21 -SB026 20 707 20 21 -SB027 20 735 20 21 -SB028 20 763 20 21 -SB029 20 791 20 21 -SB030 20 819 20 21 -SB031 20 847 20 21 -SB032 20 875 20 21 -SB033 20 903 20 21 -SB034 20 931 20 21 -SB035 20 959 20 21 -SB036 20 987 20 21 -SB037 20 1015 20 21 -SB038 20 1043 20 21 -SB039 20 1071 20 21 -SB040 20 1099 20 21 -SB041 20 1127 20 21 -SB042 20 1155 20 21 -SB043 20 1183 20 21 -SB044 20 1211 20 21 -SB045 20 1239 20 21 -SB046 20 1267 20 21 -SB047 20 1295 20 21 -SB048 20 1323 20 21 -SB049 20 1351 20 21 -SB050 20 1379 20 21 -SB051 20 1407 20 21 -SB052 20 1435 20 21 -SB053 20 1463 20 21 -SB054 20 1491 20 21 -SB055 20 1519 20 21 -SB056 20 1547 20 21 -SB057 20 1575 20 21 -SB058 20 1603 20 21 -SB059 20 1631 20 21 -SB060 20 1659 20 21 -SB061 20 1687 20 21 -SB062 20 1715 20 21 -SB063 20 1743 20 21 -SB064 20 1771 20 21 -SB065 20 1799 20 21 -SB066 20 1827 20 21 -SB067 20 1855 20 21 -SB068 20 1883 20 21 -SB069 20 1911 20 21 -SB070 20 1939 20 21 -SB071 20 1967 20 21 -SB072 20 1995 20 21 -SB073 20 2023 20 21 -SB074 20 2051 20 21 -SB075 20 2079 20 21 -SB076 20 2107 20 21 -SB077 20 2135 20 21 -SB078 20 2163 20 21 -SB079 20 2191 20 21 -SB080 20 2219 20 21 -SB081 20 2247 20 21 -SB082 20 2275 20 21 -SB083 20 2303 20 21 -SB084 20 2331 20 21 -SB085 20 2359 20 21 -SB086 20 2387 20 21 -SB087 20 2415 20 21 -SB088 20 2443 20 21 -SB089 20 2471 20 21 -SB090 
20 2499 20 21 -SB091 20 2527 20 21 -SB092 20 2555 20 21 -SB093 20 2583 20 21 -SB094 20 2611 20 21 -SB095 20 2639 20 21 -SB096 20 2667 20 21 -SB097 20 2695 20 21 -SB098 20 2723 20 21 -SB099 20 2751 20 21 -SB100 20 2779 20 21 -SB101 20 2807 20 21 -SB102 20 2835 20 21 -SB103 20 2863 20 21 -SB104 20 2891 20 21 -SB105 20 2919 20 21 -SB106 20 2947 20 21 -SB107 20 2975 20 21 -SB108 20 3003 20 21 -SB109 20 3031 20 21 -SB110 20 3059 20 21 -SB111 20 3087 20 21 -SB112 20 3115 20 21 -SB113 20 3143 20 21 -SB114 20 3171 20 21 -SB115 20 3199 20 21 -SB116 20 3227 20 21 -SB117 20 3255 20 21 -SB118 20 3283 20 21 -SB119 20 3311 20 21 -SB120 20 3339 20 21 -SB121 20 3367 20 21 -SB122 20 3395 20 21 -SB123 20 3423 20 21 -SB124 20 3451 20 21 -SB125 20 3479 20 21 -SB126 20 3507 20 21 -SB127 20 3535 20 21 -SB128 20 3563 20 21 -SB129 20 3591 20 21 -SB130 20 3619 20 21 -SB131 20 3647 20 21 -SB132 20 3675 20 21 -SB133 20 3703 20 21 -SB134 20 3731 20 21 -SB135 20 3759 20 21 -SB136 20 3787 20 21 -SB137 20 3815 20 21 -SB138 20 3843 20 21 -SB139 20 3871 20 21 -SB140 20 3899 20 21 -SB141 20 3927 20 21 -SB142 20 3955 20 21 -SB143 20 3983 20 21 -SB144 20 4011 20 21 -SB145 20 4039 20 21 -SB146 20 4067 20 21 -SB147 20 4095 20 21 -SB148 20 4123 20 21 -SB149 20 4151 20 21 -SB150 20 4179 20 21 -SB151 20 4207 20 21 -SB152 20 4235 20 21 -SB153 20 4263 20 21 -SB154 20 4291 20 21 -SB155 20 4319 20 21 -SB156 20 4347 20 21 -SB157 20 4375 20 21 -SB158 20 4403 20 21 -SB159 20 4431 20 21 -SB160 20 4459 20 21 -SB161 20 4487 20 21 -SB162 20 4515 20 21 -SB163 20 4543 20 21 -SB164 20 4571 20 21 -SB165 20 4599 20 21 -SB166 20 4627 20 21 -SB167 20 4655 20 21 -SB168 20 4683 20 21 -SB169 20 4711 20 21 -SB170 20 4739 20 21 -SB171 20 4767 20 21 -SB172 20 4795 20 21 -SB173 20 4823 20 21 -SB174 20 4851 20 21 -SB175 20 4879 20 21 -SB176 20 4907 20 21 -SB177 20 4935 20 21 -SB178 20 4963 20 21 -SB179 20 4991 20 21 -SB180 20 5019 20 21 -SB181 20 5047 20 21 -SB182 20 5075 20 21 -SB183 20 5103 20 21 -SB184 20 5131 20 21 -SB185 20 5159 20 21 -SB186 20 5187 20 21 -SB187 20 5215 20 21 -SB188 20 5243 20 21 -SB189 20 5271 20 21 -SB190 20 5299 20 21 -SB191 20 5327 20 21 -SB192 20 5355 20 21 -SB193 20 5383 20 21 -SB194 20 5411 20 21 -SB195 20 5439 20 21 -SB196 20 5467 20 21 -SB197 20 5495 20 21 -SB198 20 5523 20 21 -SB199 20 5551 20 21 -SB200 20 5579 20 21 -SB201 20 5607 20 21 diff --git a/data/sb.linker.fa b/data/sb.linker.fa index 4a438ed..2a87ab8 100644 --- a/data/sb.linker.fa +++ b/data/sb.linker.fa @@ -1,2 +1,2 @@ >T7 -CCTATAGTGAGTCGTATTA \ No newline at end of file +CCTATAGTGAGTCGTATTA diff --git a/data/sb.transposon.fa b/data/sb.transposon.fa index c54dc5c..e612da2 100644 --- a/data/sb.transposon.fa +++ b/data/sb.transposon.fa @@ -1,2 +1,2 @@ >SB -GTGTATGTAAACTTCCGACTTCAACTG \ No newline at end of file +GTGTATGTAAACTTCCGACTTCAACTG diff --git a/docs/Makefile b/docs/Makefile index ccfa1a1..e46022c 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -19,7 +19,8 @@ ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
-.PHONY: help +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @@ -29,7 +30,6 @@ help: @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" - @echo " applehelp to make an Apple Help Book" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @@ -45,50 +45,41 @@ help: @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" - @echo " coverage to run coverage check of the documentation (if enabled)" -.PHONY: clean clean: rm -rf $(BUILDDIR)/* -.PHONY: html html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." -.PHONY: dirhtml dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." -.PHONY: singlehtml singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." -.PHONY: pickle pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." -.PHONY: json json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." -.PHONY: htmlhelp htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." -.PHONY: qthelp qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @@ -98,16 +89,6 @@ qthelp: @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pyim.qhc" -.PHONY: applehelp -applehelp: - $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp - @echo - @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." - @echo "N.B. You won't be able to view it unless you put it in" \ - "~/Library/Documentation/Help or install it in your application" \ - "bundle." - -.PHONY: devhelp devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @@ -117,13 +98,11 @@ devhelp: @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pyim" @echo "# devhelp" -.PHONY: epub epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." -.PHONY: latex latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @@ -131,33 +110,28 @@ latex: @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." -.PHONY: latexpdf latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." -.PHONY: latexpdfja latexpdfja: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." 
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." -.PHONY: text text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." -.PHONY: man man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." -.PHONY: texinfo texinfo: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @@ -165,51 +139,38 @@ texinfo: @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." -.PHONY: info info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." -.PHONY: gettext gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." -.PHONY: changes changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." -.PHONY: linkcheck linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." -.PHONY: doctest doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." -.PHONY: coverage -coverage: - $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage - @echo "Testing of coverage in the sources finished, look at the " \ - "results in $(BUILDDIR)/coverage/python.txt." - -.PHONY: xml xml: $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." -.PHONY: pseudoxml pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo diff --git a/docs/api.rst b/docs/api.rst deleted file mode 100644 index 794c525..0000000 --- a/docs/api.rst +++ /dev/null @@ -1,53 +0,0 @@ -API -============ - -Alignment ---------------- - - -Vector -~~~~~~~~~~~~ - -.. autoclass:: pyim.alignment.vector.Alignment - :members: -.. autofunction:: pyim.alignment.vector.align_exact(target, query, query_strand=1) -.. autofunction:: pyim.alignment.vector.align_ssw(target, query, query_strand=1) -.. autofunction:: pyim.alignment.vector.align_with_reverse(target, query, align_func, query_strand=1, **kwargs) -.. autofunction:: pyim.alignment.vector.align_multiple(target, queries, align_func, raise_error=False, **kwargs) -.. autofunction:: pyim.alignment.vector.align_chained(align_chained(target, query, align_funcs, **kwargs) -.. autofunction:: pyim.alignment.vector.compose -.. autofunction:: pyim.alignment.vector.filter_and(target, query, align_func, filters, **kwargs) -.. autofunction:: pyim.alignment.vector.filter_or(target, query, align_func, filters, **kwargs) -.. autofunction:: pyim.alignment.vector.filter_score(alignment, min_score) -.. autofunction:: pyim.alignment.vector.filter_coverage(alignment, min_coverage, min_identity) -.. autofunction:: pyim.alignment.vector.filter_end_match(alignment) - -Genome -~~~~~~~~~~~~ - -.. autofunction:: pyim.alignment.bowtie2.align - -Annotation ---------------- - -Annotators -~~~~~~~~~~~~ - -.. autofunction:: pyim.annotation.annotator.annotate_windows -.. 
autoclass:: pyim.annotation.annotator.Window - :members: -.. autofunction:: pyim.annotation.annotator.annotate_rbm -.. autofunction:: pyim.annotation.annotator.annotate_rbm_cis - -Metadata -~~~~~~~~~~~~ - -.. autofunction:: pyim.annotation.metadata.add_metadata -.. autofunction:: pyim.annotation.metadata.feature_distance -.. autofunction:: pyim.annotation.metadata.feature_orientation - -Filtering -~~~~~~~~~~~~ - -.. autofunction:: pyim.annotation.filtering.filter_blacklist -.. autofunction:: pyim.annotation.filtering.select_closest diff --git a/docs/authors.rst b/docs/authors.rst new file mode 100644 index 0000000..e122f91 --- /dev/null +++ b/docs/authors.rst @@ -0,0 +1 @@ +.. include:: ../AUTHORS.rst diff --git a/docs/conf.py b/docs/conf.py old mode 100644 new mode 100755 index 9e2f37a..44fe434 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,8 +1,8 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # -*- coding: utf-8 -*- # # pyim documentation build configuration file, created by -# sphinx-quickstart on Mon Mar 21 15:43:09 2016. +# sphinx-quickstart on Tue Jul 9 22:26:36 2013. # # This file is execfile()d with the current directory set to its # containing dir. @@ -16,38 +16,36 @@ import sys import os -sys.path.insert(0, os.path.abspath('..')) -from version import get_git_version +# If extensions (or modules to document with autodoc) are in another +# directory, add these directories to sys.path here. If the directory is +# relative to the documentation root, use os.path.abspath to make it +# absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) -import sphinx_rtd_theme +# Get the project root dir, which is the parent dir of this +cwd = os.getcwd() +project_root = os.path.dirname(cwd) -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) +# Insert the project root dir as the first element in the PYTHONPATH. +# This lets us ensure that the source package is imported, and that its +# version is used. +sys.path.insert(0, project_root) -# -- General configuration ------------------------------------------------ +import pyim + +# -- General configuration --------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. #needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.coverage', - 'sphinx.ext.viewcode', - 'sphinx.ext.napoleon' -] +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# source_suffix = ['.rst', '.md'] +# The suffix of source filenames. source_suffix = '.rst' # The encoding of source files. @@ -57,28 +55,24 @@ master_doc = 'index' # General information about the project. 
-project = 'pyim' -copyright = '2016, Julian de Ruiter' -author = 'Julian de Ruiter' +project = u'PyIM' +copyright = u"2016, Julian de Ruiter" -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. +# The version info for the project you're documenting, acts as replacement +# for |version| and |release|, also used in various other places throughout +# the built documents. # # The short X.Y version. -version = get_git_version().split('-')[0] +version = pyim.__version__ # The full version, including alpha/beta/rc tags. -release = get_git_version() +release = pyim.__version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None +#language = None -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: +# There are two options for replacing |today|: either, you set today to +# some non-false value, then it is used: #today = '' # Else, today_fmt is used as the format for a strftime call. #today_fmt = '%B %d, %Y' @@ -108,24 +102,19 @@ # A list of ignored prefixes for module index sorting. #modindex_common_prefix = [] -# If true, keep warnings as "system message" paragraphs in the built documents. +# If true, keep warnings as "system message" paragraphs in the built +# documents. #keep_warnings = False -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - -# -- Options for HTML output ---------------------------------------------- +# -- Options for HTML output ------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -# html_theme = 'alabaster' -html_theme = "sphinx_rtd_theme" +html_theme = 'default' -html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the +# Theme options are theme-specific and customize the look and feel of a +# theme further. For a list of options available for each theme, see the # documentation. #html_theme_options = {} @@ -136,30 +125,27 @@ # " v documentation". #html_title = None -# A shorter title for the navigation bar. Default is the same as html_title. +# A shorter title for the navigation bar. Default is the same as +# html_title. #html_short_title = None -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. +# The name of an image file (relative to this directory) to place at the +# top of the sidebar. #html_logo = None -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. +# The name of an image file (within the static path) to use as favicon +# of the docs. This file should be a Windows icon file (.ico) being +# 16x16 or 32x32 pixels large. #html_favicon = None -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". 
+# Add any paths that contain custom static files (such as style sheets) +# here, relative to this directory. They are copied after the builtin +# static files, so a file named "default.css" will overwrite the builtin +# "default.css". html_static_path = ['_static'] -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -#html_extra_path = [] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. +# If not '', a 'Last updated on:' timestamp is inserted at every page +# bottom, using the given strftime format. #html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to @@ -169,8 +155,8 @@ # Custom sidebar templates, maps document names to template names. #html_sidebars = {} -# Additional templates that should be rendered to pages, maps page names to -# template names. +# Additional templates that should be rendered to pages, maps page names +# to template names. #html_additional_pages = {} # If false, no module index is generated. @@ -185,67 +171,54 @@ # If true, links to the reST sources are added to the pages. #html_show_sourcelink = True -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# If true, "Created using Sphinx" is shown in the HTML footer. +# Default is True. #html_show_sphinx = True -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# If true, "(C) Copyright ..." is shown in the HTML footer. +# Default is True. #html_show_copyright = True -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. +# If true, an OpenSearch description file will be output, and all pages +# will contain a tag referring to it. The value of this option +# must be the base URL from which the finished HTML is served. #html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). #html_file_suffix = None -# Language to be used for generating the HTML full-text search index. -# Sphinx supports the following languages: -# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' -# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' -#html_search_language = 'en' - -# A dictionary with options for the search language support, empty by default. -# Now only 'ja' uses this config value -#html_search_options = {'type': 'default'} - -# The name of a javascript file (relative to the configuration directory) that -# implements a search results scorer. If empty, the default will be used. -#html_search_scorer = 'scorer.js' - # Output file base name for HTML help builder. htmlhelp_basename = 'pyimdoc' -# -- Options for LaTeX output --------------------------------------------- -latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', +# -- Options for LaTeX output ------------------------------------------ -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', -# Latex figure (float) alignment -#'figure_align': 'htbp', + # Additional stuff for the LaTeX preamble. 
+ #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). +# (source start file, target name, title, author, documentclass +# [howto/manual]). latex_documents = [ - (master_doc, 'pyim.tex', 'pyim Documentation', - 'Julian de Ruiter', 'manual'), + ('index', 'pyim.tex', + u'PyIM Documentation', + u'Julian de Ruiter', 'manual'), ] -# The name of an image file (relative to this directory) to place at the top of -# the title page. +# The name of an image file (relative to this directory) to place at +# the top of the title page. #latex_logo = None -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. +# For "manual" documents, if this is true, then toplevel headings +# are parts, not chapters. #latex_use_parts = False # If true, show page references after internal links. @@ -261,27 +234,31 @@ #latex_domain_indices = True -# -- Options for manual page output --------------------------------------- +# -- Options for manual page output ------------------------------------ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, 'pyim', 'pyim Documentation', - [author], 1) + ('index', 'pyim', + u'PyIM Documentation', + [u'Julian de Ruiter'], 1) ] # If true, show URL addresses after external links. #man_show_urls = False -# -- Options for Texinfo output ------------------------------------------- +# -- Options for Texinfo output ---------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'pyim', 'pyim Documentation', - author, 'pyim', 'One line description of project.', + ('index', 'pyim', + u'PyIM Documentation', + u'Julian de Ruiter', + 'pyim', + 'One line description of project.', 'Miscellaneous'), ] diff --git a/docs/contributing.rst b/docs/contributing.rst new file mode 100644 index 0000000..e582053 --- /dev/null +++ b/docs/contributing.rst @@ -0,0 +1 @@ +.. include:: ../CONTRIBUTING.rst diff --git a/docs/history.rst b/docs/history.rst new file mode 100644 index 0000000..2506499 --- /dev/null +++ b/docs/history.rst @@ -0,0 +1 @@ +.. include:: ../HISTORY.rst diff --git a/docs/index.rst b/docs/index.rst index 59c72c3..a3fdc28 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,20 +1,21 @@ .. pyim documentation master file, created by - sphinx-quickstart on Mon Mar 21 15:43:09 2016. + sphinx-quickstart on Tue Jul 9 22:26:36 2013. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to pyim's documentation! -================================ +Welcome to PyIM's documentation! +====================================== Contents: .. toctree:: :maxdepth: 2 - introduction - api - - + readme + installation + usage + contributing + authorshistory Indices and tables ================== diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 0000000..88153b3 --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,51 @@ +.. highlight:: shell + +============ +Installation +============ + + +Stable release +-------------- + +To install PyIM, run this command in your terminal: + +.. 
code-block:: console + + $ pip install pyim + +This is the preferred method to install PyIM, as it will always install the most recent stable release. + +If you don't have `pip`_ installed, this `Python installation guide`_ can guide +you through the process. + +.. _pip: https://pip.pypa.io +.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ + + +From sources +------------ + +The sources for PyIM can be downloaded from the `Github repo`_. + +You can either clone the public repository: + +.. code-block:: console + + $ git clone git://github.com/jrderuiter/pyim + +Or download the `tarball`_: + +.. code-block:: console + + $ curl -OL https://github.com/jrderuiter/pyim/tarball/master + +Once you have a copy of the source, you can install it with: + +.. code-block:: console + + $ python setup.py install + + +.. _Github repo: https://github.com/jrderuiter/pyim +.. _tarball: https://github.com/jrderuiter/pyim/tarball/master diff --git a/docs/introduction.rst b/docs/introduction.rst deleted file mode 100644 index dbc68da..0000000 --- a/docs/introduction.rst +++ /dev/null @@ -1,14 +0,0 @@ -Introduction -============ - -Alignment ----------------------- - -Merging sets ----------------------- - -CIS selection ----------------------- - -Annotating insertions ----------------------- diff --git a/docs/make.bat b/docs/make.bat index edcd341..7001ef4 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -1,263 +1,242 @@ -@ECHO OFF - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set BUILDDIR=_build -set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . -set I18NSPHINXOPTS=%SPHINXOPTS% . -if NOT "%PAPER%" == "" ( - set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% - set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% -) - -if "%1" == "" goto help - -if "%1" == "help" ( - :help - echo.Please use `make ^` where ^ is one of - echo. html to make standalone HTML files - echo. dirhtml to make HTML files named index.html in directories - echo. singlehtml to make a single large HTML file - echo. pickle to make pickle files - echo. json to make JSON files - echo. htmlhelp to make HTML files and a HTML help project - echo. qthelp to make HTML files and a qthelp project - echo. devhelp to make HTML files and a Devhelp project - echo. epub to make an epub - echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter - echo. text to make text files - echo. man to make manual pages - echo. texinfo to make Texinfo files - echo. gettext to make PO message catalogs - echo. changes to make an overview over all changed/added/deprecated items - echo. xml to make Docutils-native XML files - echo. pseudoxml to make pseudoxml-XML files for display purposes - echo. linkcheck to check all external links for integrity - echo. doctest to run all doctests embedded in the documentation if enabled - echo. coverage to run coverage check of the documentation if enabled - goto end -) - -if "%1" == "clean" ( - for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i - del /q /s %BUILDDIR%\* - goto end -) - - -REM Check if sphinx-build is available and fallback to Python version if any -%SPHINXBUILD% 1>NUL 2>NUL -if errorlevel 9009 goto sphinx_python -goto sphinx_ok - -:sphinx_python - -set SPHINXBUILD=python -m sphinx.__init__ -%SPHINXBUILD% 2> nul -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -:sphinx_ok - - -if "%1" == "html" ( - %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/html. - goto end -) - -if "%1" == "dirhtml" ( - %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. - goto end -) - -if "%1" == "singlehtml" ( - %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. - goto end -) - -if "%1" == "pickle" ( - %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can process the pickle files. - goto end -) - -if "%1" == "json" ( - %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can process the JSON files. - goto end -) - -if "%1" == "htmlhelp" ( - %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can run HTML Help Workshop with the ^ -.hhp project file in %BUILDDIR%/htmlhelp. - goto end -) - -if "%1" == "qthelp" ( - %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; now you can run "qcollectiongenerator" with the ^ -.qhcp project file in %BUILDDIR%/qthelp, like this: - echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pyim.qhcp - echo.To view the help file: - echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pyim.ghc - goto end -) - -if "%1" == "devhelp" ( - %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. - goto end -) - -if "%1" == "epub" ( - %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The epub file is in %BUILDDIR%/epub. - goto end -) - -if "%1" == "latex" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - if errorlevel 1 exit /b 1 - echo. - echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "latexpdf" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - cd %BUILDDIR%/latex - make all-pdf - cd %~dp0 - echo. - echo.Build finished; the PDF files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "latexpdfja" ( - %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex - cd %BUILDDIR%/latex - make all-pdf-ja - cd %~dp0 - echo. - echo.Build finished; the PDF files are in %BUILDDIR%/latex. - goto end -) - -if "%1" == "text" ( - %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The text files are in %BUILDDIR%/text. - goto end -) - -if "%1" == "man" ( - %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The manual pages are in %BUILDDIR%/man. - goto end -) - -if "%1" == "texinfo" ( - %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. 
The Texinfo files are in %BUILDDIR%/texinfo. - goto end -) - -if "%1" == "gettext" ( - %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The message catalogs are in %BUILDDIR%/locale. - goto end -) - -if "%1" == "changes" ( - %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes - if errorlevel 1 exit /b 1 - echo. - echo.The overview file is in %BUILDDIR%/changes. - goto end -) - -if "%1" == "linkcheck" ( - %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck - if errorlevel 1 exit /b 1 - echo. - echo.Link check complete; look for any errors in the above output ^ -or in %BUILDDIR%/linkcheck/output.txt. - goto end -) - -if "%1" == "doctest" ( - %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest - if errorlevel 1 exit /b 1 - echo. - echo.Testing of doctests in the sources finished, look at the ^ -results in %BUILDDIR%/doctest/output.txt. - goto end -) - -if "%1" == "coverage" ( - %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage - if errorlevel 1 exit /b 1 - echo. - echo.Testing of coverage in the sources finished, look at the ^ -results in %BUILDDIR%/coverage/python.txt. - goto end -) - -if "%1" == "xml" ( - %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The XML files are in %BUILDDIR%/xml. - goto end -) - -if "%1" == "pseudoxml" ( - %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml - if errorlevel 1 exit /b 1 - echo. - echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. - goto end -) - -:end +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pyim.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pyim.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %BUILDDIR%/.. + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %BUILDDIR%/.. + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 
+ goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +:end diff --git a/docs/readme.rst b/docs/readme.rst new file mode 100644 index 0000000..72a3355 --- /dev/null +++ b/docs/readme.rst @@ -0,0 +1 @@ +.. include:: ../README.rst diff --git a/docs/usage.rst b/docs/usage.rst new file mode 100644 index 0000000..4c10dfa --- /dev/null +++ b/docs/usage.rst @@ -0,0 +1,7 @@ +===== +Usage +===== + +To use PyIM in a project:: + + import pyim diff --git a/environment.yml b/environment.yml deleted file mode 100644 index d892052..0000000 --- a/environment.yml +++ /dev/null @@ -1,11 +0,0 @@ -name: pyim-dev -dependencies: -- future -- numpy -- scipy -- pandas -- scikit-bio -- toolz -- pip: - - pysam - - rpy2 diff --git a/envs/dev.yml b/envs/dev.yml new file mode 100644 index 0000000..0e748f2 --- /dev/null +++ b/envs/dev.yml @@ -0,0 +1,29 @@ +name: pyim-dev2 +channels: + - conda-forge + - bioconda + - jrderuiter +dependencies: + - pip ==8.1.2 + + # Basic dependencies + - pandas ==0.18.1 + - pyfaidx ==0.4.7.1 + - pysam ==0.9.1 + - toolz ==0.8.0 + + # Development + - wheel ==0.29.0 + - coverage ==4.1 + - sphinx ==1.4.1 + - pytest ==2.9.2 + - flake8 ==3.0.4 + - pylint ==1.5.4 + - pytest-helpers-namespace ==2016.4.15 + + - pip: + # Development + - bumpversion ==0.5.3 + - watchdog ==0.8.3 + - yapf ==0.11.1 + - pytest-cov ==2.3.1 diff --git a/external/cimpl b/external/cimpl index 858f11b..c4a6f8f 160000 --- a/external/cimpl +++ b/external/cimpl @@ -1 +1 @@ -Subproject commit 858f11b99a3c7153278bb16ab5dff668ea96cf63 +Subproject commit c4a6f8fa3eec85956ea72724abfb9405fe7d8d51 diff --git a/external/kcrbm b/external/kcrbm deleted file mode 160000 index 8e47247..0000000 --- a/external/kcrbm +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 8e47247fca0c8627313ba7ba1dd47639d82d3386 diff --git a/readme.md b/readme.md deleted file mode 100644 index a953cc1..0000000 --- a/readme.md +++ /dev/null @@ -1,7 +0,0 @@ -PyIM -======================= - -PyIM is a software package for implementing pipelines that identify transposon -integration sites from targeted DNA-sequencing of transposon insertions. The -package implements a number of standard pipelines used in our group, but also -provides the basic build blocks for custom pipelines. 
diff --git a/setup.cfg b/setup.cfg index a2e923b..7b5af4c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,18 @@ -[versioneer] -VCS = git -style = pep440 -versionfile_source = src/pyim/_version.py -versionfile_build = pyim/_version.py -tag_prefix = v -parentdir_prefix = pyim- +[bumpversion] +current_version = 0.1.0 +commit = True +tag = True + +[bumpversion:file:setup.py] +search = version='{current_version}' +replace = version='{new_version}' + +[bumpversion:file:pyim/__init__.py] +search = __version__ = '{current_version}' +replace = __version__ = '{new_version}' + +[bdist_wheel] +universal = 1 + +[flake8] +exclude = docs diff --git a/setup.py b/setup.py index 9775910..69c4ec3 100644 --- a/setup.py +++ b/setup.py @@ -1,54 +1,52 @@ -import sys +#!/usr/bin/env python +# -*- coding: utf-8 -*- -import setuptools -import versioneer +from setuptools import setup, find_packages -INSTALL_REQUIRES = ['future', 'numpy', 'scipy', 'pandas', 'pysam', - 'rpy2', 'scikit-bio', 'toolz', 'tqdm', 'intervaltree'] +with open('README.rst') as readme_file: + readme = readme_file.read() -EXTRAS_REQUIRE = { - 'dev': ['sphinx', 'pytest', 'pytest-mock', - 'pytest-datafiles', 'pytest-cov', - 'pytest-helpers-namespace'] -} +with open('HISTORY.rst') as history_file: + history = history_file.read() +requirements = ['pyfaidx', 'intervaltree', 'tqdm', 'toolz', 'frozendict', + 'rpy2'] -# Check setuptools version, as recommended by: -# https://hynek.me/articles/conditional-python-dependencies/. -if int(setuptools.__version__.split('.', 1)[0]) < 18: - assert 'bdist_wheel' not in sys.argv +test_requirements = [] - # Add pathlib for Pythons before 3.4. - if sys.version_info[0:2] < (3, 4): - INSTALL_REQUIRES.append('pathlib2') -else: - EXTRAS_REQUIRE[":python_version<'3.4'"] = ['pathlib2'] - - -setuptools.setup( +setup( name='pyim', - version=versioneer.get_version(), - cmdclass=versioneer.get_cmdclass(), - url='https://bitbucket.org/jrderuiter/pyim', - author='Julian de Ruiter', + version='0.2.0.dev0', + description="Tools for analyzing insertional mutagenesis data.", + long_description=readme + '\n\n' + history, + author="Julian de Ruiter", author_email='julianderuiter@gmail.com', - description='Predicts transposon insertion sites from DNA-seq data.', - license='BSD', - packages=setuptools.find_packages('src'), + url='https://github.com/jrderuiter/pyim', + packages=find_packages('src'), package_dir={'': 'src'}, include_package_data=True, + install_requires=requirements, + license="MIT license", + zip_safe=False, + keywords='pyim', + classifiers=[ + 'Development Status :: 2 - Pre-Alpha', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'Natural Language :: English', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + ], + test_suite='tests', + tests_require=test_requirements, entry_points={'console_scripts': [ - 'pyim-align = pyim.main.align:main', - 'pyim-merge = pyim.main.merge:main', - 'pyim-merge-sets = pyim.main.merge_sets:main', - 'pyim-annotate = pyim.main.annotate:main', - 'pyim-cis = pyim.main.cis:main', - 'pyim-plot = pyim.main.plot:main', - 'pyim-gff = pyim.main.gff:main', - 'pyim-split = pyim.main.split:main' - ]}, - install_requires=INSTALL_REQUIRES, - extras_require=EXTRAS_REQUIRE, - zip_safe=True, - classifiers=[] -) + 'pyim-align = pyim.main.pyim_align:main', + 'pyim-demultiplex = pyim.main.pyim_demultiplex:main', + 'pyim-merge = pyim.main.pyim_merge:main', 
+ 'pyim-cis = pyim.main.pyim_cis:main', + 'pyim-annotate = pyim.main.pyim_annotate:main', + 'pyim-bed = pyim.main.pyim_bed:main', + 'pyim-split = pyim.main.pyim_split:main' + ]}) diff --git a/shear_splink_cutadapt.py b/shear_splink_cutadapt.py deleted file mode 100644 index b3442ac..0000000 --- a/shear_splink_cutadapt.py +++ /dev/null @@ -1,38 +0,0 @@ - - -from ..common.cutadapt import cutadapt - -def shear_splink(reads, barcodes): - - # De-multiplex - sample_files = _demultiplex(reads, barcodes) - - # Filter for contaminants - - - # Select for SB and T7 - - - -def _demultiplex(reads, output, barcodes): - options = { - '-g': ('file:' + str(barcodes), ) - '--discard-untrimmed': () - } - - cutadapt(reads_path, output_path, options=options) - - -def _extract_genomic(reads, transposon, linker, contaminants): - - # Filter for contaminants. - options = { - '-g': ('file:' + str(contaminants), ) - '--discard-trimmed': () - } - - cutadapt(reads_path, tmp_path, options) - - # Select for and remove transposon sequence. - - # Select for and remove linker sequence. diff --git a/src/pyim/__init__.py b/src/pyim/__init__.py index 74f4e66..0a4ff68 100644 --- a/src/pyim/__init__.py +++ b/src/pyim/__init__.py @@ -1,4 +1,5 @@ +# -*- coding: utf-8 -*- -from ._version import get_versions -__version__ = get_versions()['version'] -del get_versions +__author__ = 'Julian de Ruiter' +__email__ = 'julianderuiter@gmail.com' +__version__ = '0.1.0' diff --git a/src/pyim/_version.py b/src/pyim/_version.py deleted file mode 100644 index 25319d8..0000000 --- a/src/pyim/_version.py +++ /dev/null @@ -1,484 +0,0 @@ - -# This file helps to compute a version number in source trees obtained from -# git-archive tarball (such as those provided by githubs download-from-tag -# feature). Distribution tarballs (built by setup.py sdist) and build -# directories (produced by setup.py build) will contain a much shorter file -# that just contains the computed version number. - -# This file is released into the public domain. Generated by -# versioneer-0.16 (https://github.com/warner/python-versioneer) - -"""Git implementation of _version.py.""" - -import errno -import os -import re -import subprocess -import sys - - -def get_keywords(): - """Get the keywords needed to look up the version information.""" - # these strings will be replaced by git during git-archive. - # setup.py/versioneer.py will grep for the variable names, so they must - # each be defined on a line of their own. _version.py will just call - # get_keywords(). 
- git_refnames = "$Format:%d$" - git_full = "$Format:%H$" - keywords = {"refnames": git_refnames, "full": git_full} - return keywords - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_config(): - """Create, populate and return the VersioneerConfig() object.""" - # these strings are filled in when 'setup.py versioneer' creates - # _version.py - cfg = VersioneerConfig() - cfg.VCS = "git" - cfg.style = "pep440" - cfg.tag_prefix = "v" - cfg.parentdir_prefix = "pyim-" - cfg.versionfile_source = "src/pyim/_version.py" - cfg.verbose = False - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -LONG_VERSION_PY = {} -HANDLERS = {} - - -def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f - return f - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): - """Call the given command(s).""" - assert isinstance(commands, list) - p = None - for c in commands: - try: - dispcmd = str([c] + args) - # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) - break - except EnvironmentError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %s" % dispcmd) - print(e) - return None - else: - if verbose: - print("unable to find command, tried %s" % (commands,)) - return None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = stdout.decode() - if p.returncode != 0: - if verbose: - print("unable to run %s (error)" % dispcmd) - return None - return stdout - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes - both the project name and a version string. - """ - dirname = os.path.basename(root) - if not dirname.startswith(parentdir_prefix): - if verbose: - print("guessing rootdir is '%s', but '%s' doesn't start with " - "prefix '%s'" % (root, dirname, parentdir_prefix)) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None} - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. 
- keywords = {} - try: - f = open(versionfile_abs, "r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - f.close() - except EnvironmentError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if not keywords: - raise NotThisMethod("no keywords at all, weird") - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set([r.strip() for r in refnames.strip("()").split(",")]) - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. The old git %d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) - if verbose: - print("discarding '%s', no digits" % ",".join(refs-tags)) - if verbose: - print("likely tags: %s" % ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. "2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] - if verbose: - print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None - } - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags"} - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. 
- """ - if not os.path.exists(os.path.join(root, ".git")): - if verbose: - print("no .git in %s" % root) - raise NotThisMethod("no .git directory") - - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) - if not mo: - # unparseable. Maybe git-describe is misbehaving? - pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%s' doesn't start with prefix '%s'" - print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) - pieces["distance"] = int(count_out) # total number of commits - - return pieces - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_pre(pieces): - """TAG[.post.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 
0.post.devDISTANCE - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += ".post.dev%d" % pieces["distance"] - else: - # exception #1 - rendered = "0.post.dev%d" % pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Eexceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"]} - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%s'" % style) - - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None} - - -def get_versions(): - """Get version information or return default if unable to do so.""" - # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. 
If we have - # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which - # case we can only use expanded keywords. - - cfg = get_config() - verbose = cfg.verbose - - try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) - except NotThisMethod: - pass - - try: - root = os.path.realpath(__file__) - # versionfile_source is the relative path from the top of the source - # tree (where the .git directory might live) to this file. Invert - # this to find the root from __file__. - for i in cfg.versionfile_source.split('/'): - root = os.path.dirname(root) - except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree"} - - try: - pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) - return render(pieces, cfg.style) - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - except NotThisMethod: - pass - - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version"} diff --git a/src/pyim/align/bowtie2.py b/src/pyim/align/bowtie2.py deleted file mode 100644 index cd7139d..0000000 --- a/src/pyim/align/bowtie2.py +++ /dev/null @@ -1,92 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) - -import os -import subprocess -from os import path - - -def align(m1, index, output, m2=None, options=None, - log=None, bam_output=False): - """Alignment with bowtie2.""" - options = {} or options - - # Inject inputs into options. - if m2 is None: - options['-U'] = m1 - else: - options['-1'] = m1 - options['-2'] = m2 - - # Inject index and output. - if not output.endswith('.sam'): - if output.endswith('.bam'): - output = output.replace('.bam', '.sam') - else: - output = output + '.sam' - - options['-x'] = index - options['-S'] = output - - # Format into arguments. - args = ['bowtie2'] + dict_to_args(options) - - if log is not None: - with open(log, 'w') as log_file: - subprocess.check_call(args, stderr=log_file) - else: - subprocess.check_call(args) - - # Convert to bam if needed. - if bam_output: - output = sam_to_bam(output, sort=True, - index=True, delete_sam=True) - - return output - - -def dict_to_args(arg_dict): - args = [] - - for key, value in arg_dict.items(): - if type(value) == bool: - if value: - args.append(key) - else: - args.append(key) - args.append(str(value)) - - return args - - -def sam_to_bam(sam_path, bam_path=None, sort=False, - index=False, delete_sam=False): - if bam_path is None: - # Default output name replaces .sam with .bam. - bam_path = path.splitext(sam_path)[0] + '.bam' - - if sort: - # Pipe bam into samtools sort for sorting. - p1 = subprocess.Popen(['samtools', 'view', '-b', sam_path], - stdout=subprocess.PIPE) - p2 = subprocess.Popen(['samtools', 'sort', '-o', bam_path, '-'], - stdin=p1.stdout) - p1.stdout.close() - p2.communicate() - - if index: - # Indexing bam file if needed. - subprocess.check_call(['samtools', 'index', bam_path]) - else: - # Only convert to bam. - subprocess.check_call(['samtools', 'view', '-b', - '-o', bam_path, sam_path]) - - if delete_sam: - # Delete original sam if requested. 
- os.unlink(sam_path) - - return bam_path diff --git a/src/pyim/align/common/cutadapt.py b/src/pyim/align/common/cutadapt.py deleted file mode 100644 index d063a2e..0000000 --- a/src/pyim/align/common/cutadapt.py +++ /dev/null @@ -1,107 +0,0 @@ -import subprocess - - -def cutadapt(input_path, output_path, options): - cmdline_args = _build_arguments(input_path, output_path, options) - print(cmdline_args) - #check_call(cmdline_args) - - -def cutadapt_piped(input_path, output_path, options_list): - raise NotImplementedError() - - -def _build_arguments(input_path, output_path, options): - """Builds argument list for cutadapt.""" - - cmdline_opts = flatten_options(options) - return (['cutadapt'] + cmdline_opts + - ['-o', str(output_path), str(input_path)]) # yapf: disable - - -def _run(arguments, stdout=None, stderr=None, *args, **kwargs): - stdout_ = _open_output(stdout) - stderr_ = _open_output(stderr) - - try: - process = subprocess.Popen( - arguments, stdout=stdout, stderr=stderr, *args, **kwargs) - finally: - _close_output(stdout_) - _close_output(stderr_) - - return process.returncode - - -def _run_piped(arguments_list, stdout=None, stderrs=None): - if len(arguments_list) < 2: - raise ValueError('At least two sets of arguments should be given') - - if stderrs is None: - stderrs = [None] * len(arguments_list) - - # Handle processes 1-(n-1). - processes = [] - file_handles = [] - - try: - prev_out = None - for arg_list, stderr in list(zip(arguments_list, stderrs))[:-1]: - # Setup processes. - stderr_fh = _open_output(stderr) - process = subprocess.Popen( - arg_list, - stdin=prev_out, - stdout=subprocess.PIPE, - stderr=stderr_fh) - - prev_out = process.stdout - - processes.append(process) - file_handles.append(stderr_fh) - - # Handle final process. - stdout_fh = _open_output(stdout) - stderr_fh = _open_output(stderrs[-1]) - process = subprocess.Popen( - arguments_list[-1], - stdout=stdout_fh, - stderr=stderr_fh, - stdin=prev_out) - - processes.append(process) - file_handles += [stderr_fh, stdout_fh] - - # Allow pi to receive a SIGPIPE. - for p in processes[:-1]: - p.stdout.close() - - process.wait() - - finally: - # Close all file handles. - for fh in file_handles: - _close_output(fh) - - return process.returncode - - -def _open_output(file_path, mode='w'): - if file_path is None: - return None - else: - return file_path.open(mode) - - -def _close_output(file_path): - if file_path is not None: - file_path.close() - - -def flatten_options(option_dict): - """Flattens a dict of options into an argument list.""" - - options = [] - for opt_name, opt_values in option_dict.items(): - options += [opt_name] + list(opt_values) - return options diff --git a/src/pyim/align/common/genomic.py b/src/pyim/align/common/genomic.py new file mode 100644 index 0000000..c116a11 --- /dev/null +++ b/src/pyim/align/common/genomic.py @@ -0,0 +1,147 @@ +import logging + +from pyim.util.path import build_path + +from pyim.external.cutadapt import cutadapt, cutadapt_summary + +DEFAULT_OVERLAP = 3 +DEFAULT_ERROR_RATE = 0.1 + + +def extract_genomic(reads_path, + output_path, + transposon_path, + linker_path=None, + contaminant_path=None, + min_length=None, + min_overlaps=None, + error_rates=None): + """Extracts genomic sequences from single-read data.""" + + logger = logging.getLogger() + + min_overlaps = min_overlaps or {} + error_rates = error_rates or {} + + # Ensure output dir exists. + output_path.parent.mkdir(exist_ok=True) + + # Track interim files for cleaning. 
+ interim_files = [] + + if contaminant_path is not None: + # Remove contaminants. + contaminant_out_path = build_path(output_path, suffix='.contaminant') + contaminant_opts = { + '-g': 'file:' + str(contaminant_path), + '--discard-trimmed': True, + '-O': min_overlaps.get('contaminant', DEFAULT_OVERLAP), + '-e': error_rates.get('contaminant', DEFAULT_ERROR_RATE) + } + + p = cutadapt(reads_path, contaminant_out_path, contaminant_opts) + logger.info('Trimmed contaminant sequences' + + cutadapt_summary(p.stdout)) # yapf: disable + + interim_files.append(contaminant_out_path) + else: + contaminant_out_path = reads_path + + if linker_path is not None: + # Remove linker. + linker_out_path = build_path(output_path, suffix='.linker') + linker_opts = { + '-a': 'file:' + str(linker_path), + '--discard-untrimmed': True, + '-O': min_overlaps.get('linker', DEFAULT_OVERLAP), + '-e': error_rates.get('linker', DEFAULT_ERROR_RATE) + } + + p = cutadapt(contaminant_out_path, linker_out_path, linker_opts) + logger.info('Trimmed linker sequence' + + cutadapt_summary(p.stdout)) # yapf: disable + + interim_files.append(linker_out_path) + else: + linker_out_path = contaminant_out_path + + # Trim transposon and check minimum length. + transposon_opts = { + '-g': 'file:' + str(transposon_path), + '--discard-untrimmed': True, + '-O': min_overlaps.get('transposon', DEFAULT_OVERLAP), + '-e': error_rates.get('transposon', DEFAULT_ERROR_RATE) + } + + if min_length is not None: + transposon_opts['--minimum-length'] = min_length + + p = cutadapt(linker_out_path, output_path, transposon_opts) + logger.info('Trimmed transposon sequence and filtered for length' + + cutadapt_summary(p.stdout)) # yapf: disable + + # Clean-up interim files. + for fp in interim_files: + fp.unlink() + + +def extract_genomic_paired(reads_paths, + output_paths, + transposon_path, + linker_path=None, + contaminant_path=None, + min_length=None): + """Extracts genomic sequences from paired-end data.""" + + # Extract file paths. + in1_path, in2_path = reads_paths + out1_path, out2_path = output_paths + + # Ensure output dirs exists. + out1_path.parent.mkdir(exist_ok=True) + out2_path.parent.mkdir(exist_ok=True) + + # Track interim files. + interim_files = [] + + if contaminant_path is not None: + # Remove contaminants. + cont1_path = build_path(out1_path, suffix='.contaminant') + cont2_path = build_path(out2_path, suffix='.contaminant') + + contaminant_opts = {'-g': 'file:' + str(contaminant_path), + '--discard-trimmed': True} + cutadapt(in1_path, cont1_path, contaminant_opts, + in2_path=in2_path, out2_path=out2_path) # yapf: disable + + interim_files += [cont1_path, cont2_path] + else: + cont1_path, cont2_path = in1_path, in2_path + + if linker_path is not None: + # Remove linker. + link1_path = build_path(out1_path, suffix='.linker') + link2_path = build_path(out2_path, suffix='.linker') + + linker_opts = {'-A': 'file:' + str(linker_path), + '--discard-untrimmed': True} + cutadapt(cont1_path, link1_path, linker_opts, + in2_path=cont2_path, out2_path=link2_path) # yapf: disable + + interim_files += [link1_path, link2_path] + else: + link1_path, link2_path = cont1_path, cont2_path + + # Trim transposon and check minimum length. + transposon_opts = {'-g': 'file:' + str(transposon_path), + '--discard-untrimmed': True} + + if min_length is not None: + transposon_opts['--minimum-length'] = min_length + + cutadapt(link1_path, out1_path, transposon_opts, + in2_path=link2_path, out2_path=out2_path) # yapf: disable + + # Clean-up intermediary files. 
+ for fp in interim_files: + fp.unlink() diff --git a/src/pyim/align/common/insertions.py b/src/pyim/align/common/insertions.py new file mode 100644 index 0000000..43c6f7a --- /dev/null +++ b/src/pyim/align/common/insertions.py @@ -0,0 +1,226 @@ +from collections import defaultdict +import itertools +import logging +import operator + +from frozendict import frozendict +import numpy as np +import pysam +import toolz + +from pyim.model import Insertion + + +def fetch_alignments(bam_path, only_primary=True, min_mapq=None): + bam_file = pysam.AlignmentFile(str(bam_path)) + + try: + alignments = iter(bam_file) + + if only_primary: + alignments = (aln for aln in alignments if not aln.is_secondary) + + if min_mapq is not None: + alignments = (aln for aln in alignments + if aln.mapping_quality >= min_mapq) + + yield from alignments + finally: + bam_file.close() + + +def summarize_alignments(alignments): + """Summarizes alignments into a dict of chromosomal positions. + + This function summarizes an iterable of alignments into a dict that + tracks the unique ends (ligation points) of the alignments for + different genomic positions. The genomic positions are encoded as a tuple + of (chromosome, position, strand) and are used as keys, whilst the + ligation points are tracked as a list of positions. + + This dict is an intermediate used by other functions to derive insertions. + + Parameters + ---------- + alignments : iterable[pysam.AlignedSegment] + Alignments to summarize. May be prefiltered (on mapping quality + for example), as this function does not perform any filtering itself. + + Returns + ------- + dict[(str, int, int), list[int]] + Returns a dictionary mapping genomic positions, encoded as a + (chromosome, position, strand) tuple to ligation points. + + """ + alignment_map = defaultdict(list) + + for aln in alignments: + tup = _process_alignment(aln) + if tup is not None: + alignment_map[tup[0]].append(tup[1]) + + return dict(alignment_map) + + +def summarize_alignments_by_group(alignments, group_func): + # Take subgroups of alignments into account. This allows us to make + # arbitrary subgroups of alignment summaries, for example by grouping + # reads by sample barcodes. + alignment_map = defaultdict(lambda: defaultdict(list)) + + for aln in alignments: + tup = _process_alignment(aln) + if tup is not None: + grp = group_func(aln) + if grp is not None: + alignment_map[grp][tup[0]].append(tup[1]) + + return {k: dict(v) for k, v in alignment_map.items()} + + +def _process_alignment(aln): + if aln.reference_id != -1: + ref = aln.reference_name + + if aln.is_reverse: + transposon_pos = aln.reference_end + linker_pos = aln.reference_start + strand = -1 + else: + transposon_pos = aln.reference_start + linker_pos = aln.reference_end + strand = 1 + + key = (ref, transposon_pos, strand) + + return key, linker_pos + else: + return None + + +def extract_barcode_mapping(reads, barcodes, barcode_mapping=None): + + # Create barcode/sample dict. + barcode_dict = {bc.name: bc.sequence for bc in barcodes} + + if barcode_mapping is not None: + barcode_dict = {sample: barcode_dict[barcode] + for barcode, sample in barcode_mapping.items()} + + # Build mapping. + mapping = {} + + for read in reads: + # Check each barcode for match in read. + matched = [k for k, v in barcode_dict.items() if v in read.sequence] + + if len(matched) == 1: + # Record single matches. 
+ name = read.name.split()[0] + mapping[name] = matched[0] + elif len(matched) > 1: + logging.warning('Skipping %s due to multiple matching barcodes', + read.name.split()[0]) + + return mapping + + +def merge_summary_within_distance(aln_summary, max_distance=10): + """Merges alignment map entries that are within max_dist of each other.""" + + grouped_keys = _groupby_position(aln_summary.keys(), max_distance) + + merged = dict( + _merge_entries(aln_summary, key_grp) for key_grp in grouped_keys) + + return merged + + +def _groupby_position(alignment_keys, max_distance=10): + """Groups alignment keys that are in close proximity for merging.""" + + # First we sort by position and group by reference/strand. + sorted_keys = sorted(alignment_keys, key=lambda t: (t[2], t[0], t[1])) + grouped_keys = itertools.groupby(sorted_keys, lambda t: (t[2], t[0])) + + # Then we group the (position sorted) groups that are close together. + grouped_pos = itertools.chain.from_iterable( + _groupby_position_gen( + v, max_distance=max_distance) for _, v in grouped_keys) + + return grouped_pos + + +def _groupby_position_gen(key_group, max_distance): + key_iter = iter(key_group) + + prev = next(key_iter) + curr_group = [prev] + + for key in key_iter: + if (key[1] - prev[1]) <= max_distance: + # Continue group. + curr_group.append(key) + else: + # Start new group. + yield curr_group + curr_group = [key] + + yield curr_group + + +def _merge_entries(alignment_map, keys): + # Calculate (weighted) average position. + grp_pos, grp_size = zip(*((k[1], len(alignment_map[k])) for k in keys)) + pos = int(round(np.average(grp_pos, weights=grp_size))) + + # Generate new key/value. + ref = keys[0][0] + strand = keys[0][2] + + new_key = (ref, pos, strand) + new_values = list( + itertools.chain.from_iterable(alignment_map[k] for k in keys)) + + return new_key, new_values + + +def convert_summary_to_insertions(aln_summary, + min_support=1, + merge_distance=None, + id_fmt='INS_{}', + **kwargs): + """Converts an alignment map to a list of Insertions.""" + + # Optionally merge insertions within x distance. + if merge_distance is not None: + aln_summary = merge_summary_within_distance( + aln_summary, max_distance=merge_distance) + + # Convert to insertions. + insertions = (_to_insertion(ref, pos, strand, ends, id_=None, **kwargs) + for i, ((ref, pos, strand), ends) + in enumerate(aln_summary.items())) # yapf: disable + + # Filter for support. + insertions = (ins for ins in insertions if ins.support >= min_support) + + # Sort by depth and add IDs. 
+ insertions = sorted(insertions, key=operator.attrgetter('support'))[::-1] + insertions = [ins._replace(id=id_fmt.format(i + 1)) + for i, ins in enumerate(insertions)] + + return insertions + + +def _to_insertion(ref, pos, strand, ends, id_=None, **kwargs): + metadata = toolz.merge({'depth': len(ends), + 'depth_unique': len(set(ends))}, kwargs) + return Insertion( + id=id_, + chromosome=ref, + position=pos, + strand=strand, + support=metadata['depth_unique'], + metadata=frozendict(metadata)) diff --git a/src/pyim/align/pipelines/__init__.py b/src/pyim/align/pipelines/__init__.py index e69de29..64e4aed 100644 --- a/src/pyim/align/pipelines/__init__.py +++ b/src/pyim/align/pipelines/__init__.py @@ -0,0 +1,2 @@ +from .base import Pipeline, get_pipelines, register_pipeline +from .single import SinglePipeline, SingleMultiplexedPipeline diff --git a/src/pyim/align/pipelines/_helpers/clustering.py b/src/pyim/align/pipelines/_helpers/clustering.py deleted file mode 100644 index a05a019..0000000 --- a/src/pyim/align/pipelines/_helpers/clustering.py +++ /dev/null @@ -1,64 +0,0 @@ -import toolz -import numpy as np -import pandas as pd -import scipy.cluster.hierarchy as sch -import scipy.spatial.distance as ssd - - -def merge_within_distance(insertions, max_dist=2000, agg_funcs=None): - clustered = cluster_insertions(insertions, max_dist=max_dist) - return merge_insertions(clustered, by='cluster', agg_funcs=agg_funcs) - - -def cluster_insertions(insertions, max_dist=2000, method='single'): - prev_n_clusters = 0 - - clustered_grps = [] - for _, group in insertions.groupby(['chrom', 'barcode', 'strand']): - clusters = _cluster_group(group, max_dist, method) - clusters += prev_n_clusters - - group = group.copy() - group['cluster'] = clusters - clustered_grps.append(group) - - prev_n_clusters = np.max(clusters) - - return pd.concat(clustered_grps, ignore_index=True) - - -def _cluster_group(insertions, max_dist, method): - if len(insertions) == 1: - clusters = np.array([1], dtype=np.int32) - else: - dists = genomic_distance(insertions) - z = sch.linkage(dists, method=method) - clusters = sch.fcluster(z, criterion='distance', t=max_dist) - - return clusters - - -def genomic_distance(insertions): - # Sanity check insertions (for debugging). - assert(insertions['chrom'].nunique() == 1) - assert(insertions['barcode'].nunique() == 1) - assert(insertions['strand'].nunique() == 1) - - # Calculate 1d distances. - loc = insertions['position'] - loc_2d = np.vstack([loc, np.zeros_like(loc)]).T - dist = ssd.pdist(loc_2d, lambda u, v: np.abs(u-v).sum()) - - return dist - - -def merge_insertions(insertions, by='cluster', agg_funcs=None): - # TODO: use weighted median. 
- default_agg = {'id': 'first', 'chrom': 'first', 'position': 'median', - 'strand': 'first', 'barcode': 'first'} - agg_funcs = toolz.merge(default_agg, agg_funcs or {}) - - col_order = [c for c in insertions.columns if c in agg_funcs] - merged = insertions.groupby(by).agg(agg_funcs)[col_order] - - return merged diff --git a/src/pyim/align/pipelines/_helpers/grouping.py b/src/pyim/align/pipelines/_helpers/grouping.py deleted file mode 100644 index 294bf6c..0000000 --- a/src/pyim/align/pipelines/_helpers/grouping.py +++ /dev/null @@ -1,235 +0,0 @@ -import collections -import itertools -import operator - -import heapq -import toolz - -from collections import namedtuple - - -class PrioritySet(object): - - def __init__(self): - self._heap = [] - self._set = set() - - def push(self, item, priority): - if item not in self._set: - heapq.heappush(self._heap, (priority, item)) - self._set.add(item) - - def pop(self): - priority, item = heapq.heappop(self._heap) - self._set.remove(item) - return item - - def first(self): - _, item = min(self._heap) - return item - - def __len__(self): - return len(self._heap) - - def __str__(self): - return 'PrioritySet(heap={}, set={})'\ - .format(str(self._heap), str(self._set)) - - def __repr__(self): - return str(self) - - -@toolz.curry -def groupby_reference(alignments, alignment_file=None): - for reference, group in itertools.groupby( - alignments, operator.attrgetter('reference_id')): - if alignment_file is not None: - reference = alignment_file.getrname(reference) - yield reference, group - - -def groupby_position(alignments): - """ Groups alignments by their positions, grouping forward strand - alignments with the same start position and reverse strand - alignments with the same end position. Assumes alignments - are all on a single reference sequence. - """ - # Setup our collections for tracking reads and positions. - # - # The priority set is used to track positions with alignment groups, - # ensuring that no position is listed twice (the set part) and - # always giving the lowest position first (the priority part). - # - # The alignment dict contains two lists for each position with at - # least one alignment, one for forward reads and one for reverse. - # Any alignments encountered as position x in orientation o are added - # to the corresponding entry dict[x][o] in the list, in which - # o is encoded as {0,1}, with 1 being for reverse strand alignments. - position_set = PrioritySet() - aln_dict = collections.defaultdict(lambda: ([], [])) - - curr_pos = 0 - for aln in alignments: - # Check our ordering. - if aln.reference_start < curr_pos: - raise ValueError('Alignments not ordered by position') - - curr_pos = aln.reference_start - - # Add current read to collections. - is_reverse = aln.is_reverse - ref_pos = aln.reference_end if is_reverse else curr_pos - aln_dict[ref_pos][bool(is_reverse)].append(aln) - position_set.push(ref_pos, ref_pos) - - # Return any alignment groups before our current position. - try: - while position_set.first() < curr_pos: - first_pos = position_set.pop() - fwd_grp, rev_grp = aln_dict.pop(first_pos) - if len(fwd_grp) > 0: - yield (fwd_grp[0].reference_start, 1), fwd_grp - if len(rev_grp) > 0: - yield (rev_grp[0].reference_end, -1), rev_grp - except ValueError: - pass - - # We're done, yield any remaining alignment groups. 
- for _ in range(len(position_set)): - fwd_grp, rev_grp = aln_dict.pop(position_set.pop()) - if len(fwd_grp) > 0: - yield (fwd_grp[0].reference_start, 1), fwd_grp - if len(rev_grp) > 0: - yield (rev_grp[0].reference_end, -1), rev_grp - - -GenomicPosition = namedtuple('GenomicPosition', - ['chromosome', 'position', 'strand']) - - -def groupby_position_mate(alignments): - """ Groups alignments by their positions, grouping forward strand - alignments with the same start position and reverse strand - alignments with the same end position. Assumes alignments - are all on a single reference sequence. - """ - # Setup our collections for tracking reads and positions. - # - # The priority set is used to track positions with alignment groups, - # ensuring that no position is listed twice (the set part) and - # always giving the lowest position first (the priority part). - # - # The alignment dict contains two lists for each position with at - # least one alignment, one for forward reads and one for reverse. - # Any alignments encountered as position x in orientation o are added - # to the corresponding entry dict[x][o] in the list, in which - # o is encoded as {0,1}, with 1 being for reverse strand alignments. - position_set = PrioritySet() - aln_dict = collections.defaultdict(lambda: ([], [])) - - # Only use proper pairs. - alignments = (aln for aln in alignments if aln.is_proper_pair) - - # Limit ourselves to alignments from one chromosome (the first - # encountered), as sort is only valid with the same chromosome. - aln, alignments = toolz.peek(alignments) - ref_name = aln.reference_name - - alignments = itertools.takewhile( - lambda aln: aln.reference_name == ref_name, alignments) - - # We match position on the first pair. The second is stored until - # needed and then returned together with the corresponding first pair. - second_pairs = {} - - curr_pos = 0 - for aln in alignments: - if aln.is_read2: - second_pairs[aln.query_name] = aln - else: - # Check our ordering. - if aln.reference_start < curr_pos: - raise ValueError('Alignments not ordered by position') - - curr_pos = aln.reference_start - - # Add current read to collections. - is_reverse = aln.is_reverse - ref_pos = aln.reference_end if is_reverse else curr_pos - aln_dict[ref_pos][bool(is_reverse)].append(aln) - position_set.push(ref_pos, ref_pos) - - # Return any alignment groups before our current position. - try: - while position_set.first() < curr_pos: - first_pos = position_set.pop() - fwd_grp, rev_grp = aln_dict.pop(first_pos) - - if len(fwd_grp) > 0: - fwd_mates = [second_pairs.pop(aln.query_name) - for aln in fwd_grp] - fwd_pos = fwd_grp[0].reference_start - yield (GenomicPosition(ref_name, fwd_pos, 1), - fwd_grp, fwd_mates) - - if len(rev_grp) > 0: - rev_mates = [second_pairs.pop(aln.query_name) - for aln in rev_grp] - rev_pos = rev_grp[0].reference_start - yield (GenomicPosition(ref_name, rev_pos, 1), - rev_grp, rev_mates) - - except ValueError: - pass - - # We're done, yield any remaining alignment groups. 
- for _ in range(len(position_set)): - fwd_grp, rev_grp = aln_dict.pop(position_set.pop()) - - if len(fwd_grp) > 0: - fwd_mates = [second_pairs.pop(aln.query_name) for aln in fwd_grp] - fwd_pos = fwd_grp[0].reference_start - yield (GenomicPosition(ref_name, fwd_pos, 1), fwd_grp, fwd_mates) - - if len(rev_grp) > 0: - rev_mates = [second_pairs.pop(aln.query_name) for aln in rev_grp] - rev_pos = rev_grp[0].reference_start - yield (GenomicPosition(ref_name, rev_pos, 1), rev_grp, rev_mates) - - -@toolz.curry -def groupby_reference_position(alignments, alignment_file=None): - chained = chain_groupby( - alignments, [groupby_reference(alignment_file=alignment_file), - groupby_position]) - for res in chained: - yield res - - -@toolz.curry -def groupby_barcode(alignments, barcode_map): - # Group alignments by barcodes. - groups = collections.defaultdict(list) - for aln in alignments: - barcode = barcode_map[aln.query_name] - groups[barcode].append(aln) - - # Yield group together with barcode. - for barcode, group in groups.items(): - yield barcode, group - - -def chain_groupby(iterable, groupby_funcs): - grouped = groupby_funcs[0](iterable) - - if len(groupby_funcs) == 1: - for key, group in grouped: - if not isinstance(key, tuple): - key = (key,) - yield key, group - else: - for key, group in grouped: - if not isinstance(key, tuple): - key = (key,) - for sub_key, sub_group in chain_groupby(group, groupby_funcs[1:]): - yield key + sub_key, sub_group diff --git a/src/pyim/align/pipelines/_helpers/pipeline.py b/src/pyim/align/pipelines/_helpers/pipeline.py deleted file mode 100644 index bca38ec..0000000 --- a/src/pyim/align/pipelines/_helpers/pipeline.py +++ /dev/null @@ -1,62 +0,0 @@ -import collections -import itertools - -import skbio -import toolz - - -@toolz.curry -def print_stats(results, logger=None, header=False): - print_ = print if logger is None else logger.info - - # Iterate over results, counting statuses. - status_counts = collections.defaultdict(int) - - for result in results: - status_counts[result.status.name] += 1 - yield result - - # We're done, so print frequencies! - if header: - print_('Extraction stats:') - - total = sum(status_counts.values()) - - for status in sorted(status_counts.keys()): - count = status_counts[status] - percentage = (count / total) * 100 - print_('{:>18}: {:>8} ({:5.2f}%)' - .format(status, count, percentage)) - - -@toolz.curry -def write_genomic_sequences(results, file_path, format='fastq', - mode='w', **io_kwargs): - """ Test docstring """ - with skbio.io.open(file_path, mode, **io_kwargs) as file_: - for result in results: - skbio.io.write(result.genomic_sequence, into=file_, format=format) - yield result - - -@toolz.curry -def build_barcode_map(results, sample_map=None): - if sample_map is None: - return {result.genomic_sequence.metadata['id']: - result.barcode - for result in results} - else: - return {result.genomic_sequence.metadata['id']: - sample_map[result.barcode] - for result in results} - - -def consume(iterator, n=None): - "Advance the iterator n-steps ahead. If n is none, consume entirely." - # Use functions that consume iterators at C speed. 
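-    # This mirrors the 'consume' recipe from the itertools documentation.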
- if n is None: - # Feed the entire iterator into a zero-length deque - collections.deque(iterator, maxlen=0) - else: - # Advance to the empty slice starting at position n - next(itertools.islice(iterator, n, n), None) diff --git a/src/pyim/align/pipelines/_model.py b/src/pyim/align/pipelines/_model.py deleted file mode 100644 index 2728c60..0000000 --- a/src/pyim/align/pipelines/_model.py +++ /dev/null @@ -1,10 +0,0 @@ - -class ExtractResult(object): - - __slots__ = ('genomic_sequence', 'barcode', 'status') - - def __init__(self, genomic_sequence, barcode, status): - super().__init__() - self.genomic_sequence = genomic_sequence - self.barcode = barcode - self.status = status diff --git a/src/pyim/align/pipelines/base.py b/src/pyim/align/pipelines/base.py new file mode 100644 index 0000000..5f53c18 --- /dev/null +++ b/src/pyim/align/pipelines/base.py @@ -0,0 +1,34 @@ +import abc +from pathlib import Path + +_registry = {} + + +def register_pipeline(name, pipeline): + _registry[name] = pipeline + + +def get_pipelines(): + return dict(_registry) + + +class Pipeline(abc.ABC): + def __init__(self): + pass + + @abc.abstractclassmethod + def configure_args(cls, parser): + parser.add_argument('--reads', type=Path, required=True) + parser.add_argument('--output', type=Path, required=True) + + @abc.abstractclassmethod + def extract_args(cls, args): + raise NotImplementedError() + + @classmethod + def from_args(cls, args): + return cls(**cls.extract_args(args)) + + @abc.abstractclassmethod + def run(self, reads_path, work_dir): + raise NotImplementedError() diff --git a/src/pyim/align/pipelines/lam_pcr.py b/src/pyim/align/pipelines/lam_pcr.py deleted file mode 100644 index 0376fba..0000000 --- a/src/pyim/align/pipelines/lam_pcr.py +++ /dev/null @@ -1,244 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -# noinspection PyUnresolvedReferences -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) - -from enum import Enum -from pathlib import Path - -import numpy as np -import pandas as pd -from skbio import DNA, SequenceCollection - -from pyim.alignment.genome import Bowtie2Aligner -from pyim.alignment.vector import ExactAligner -from pyim.cluster import cluster_frame_merged - -from ._base import (Pipeline, GenomicExtractor, - InsertionIdentifier, genomic_distance) - - -class LamPcrPipeline(Pipeline): - - @classmethod - def configure_argparser(cls, subparsers, name='lampcr'): - parser = subparsers.add_parser(name, help=name + ' help') - - parser.add_argument('input', type=Path) - parser.add_argument('output_dir', type=Path) - parser.add_argument('reference', type=Path) - - parser.add_argument('--contaminants', type=Path, default=None) - parser.add_argument('--transposon', type=Path, default=None) - parser.add_argument('--barcodes', type=Path, default=None) - parser.add_argument('--barcode_map', type=Path, default=None) - parser.add_argument('--min_genomic_length', type=int, default=15) - - parser.add_argument('--min_depth', type=int, default=2) - parser.add_argument('--min_mapq', type=int, default=37) - - parser.add_argument('--threads', type=int, default=1) - - return parser - - @classmethod - def from_args(cls, args): - - # Read transposon sequence. - transposon_seq = DNA.read(str(args['transposon'])) \ - if args['transposon'] is not None else None - - # Read contaminant sequences. 
- contaminant_seqs = SequenceCollection.read( - str(args['contaminants']), constructor=DNA) \ - if args['contaminants'] is not None else None - - # Read barcode sequences if supplied. - barcode_seqs = SequenceCollection.read( - str(args['barcodes']), constructor=DNA) \ - if args['barcodes'] is not None else None - - # Read barcode map if supplied. - if barcode_seqs is not None and args['barcode_map'] is not None: - barcode_map = pd.read_csv(str(args['barcode_map']), sep='\t') - barcode_map = dict(zip(barcode_map['barcode'], - barcode_map['sample'])) - else: - barcode_map = None - - # Setup extractor and identifier for pipeline. - extractor = LamPcrExtractor( - transposon_sequence=transposon_seq, - barcode_sequences=barcode_seqs, - barcode_map=barcode_map, - contaminant_sequences=contaminant_seqs, - min_length=args['min_genomic_length']) - - aligner = Bowtie2Aligner(reference=args['reference'], bam_output=True, - local=True, threads=args['threads']) - - identifier = LamPcrIdentifier(min_mapq=args['min_mapq'], - min_depth=args['min_depth']) - - return cls(extractor=extractor, - aligner=aligner, - identifier=identifier) - - -class LamPcrStatus(Enum): - contaminant = 1 - no_barcode = 2 - duplicate_barcode = 3 - no_transposon = 4 - too_short = 5 - proper_read = 6 - - -class LamPcrExtractor(GenomicExtractor): - - DEFAULT_IN_FORMAT = 'fastq' - DEFAULT_OUT_FORMAT = 'fastq' - - STATUS = LamPcrStatus - - def __init__(self, transposon_sequence=None, - barcode_sequences=None, barcode_map=None, - contaminant_sequences=None, min_length=1, - threads=1, chunk_size=1000): - super().__init__(min_length=min_length, threads=threads, - chunk_size=chunk_size) - - self._transposon_sequence = transposon_sequence - self._transposon_aligner = ExactAligner(try_reverse=False) - - self._barcode_aligner = ExactAligner(try_reverse=False) - self._barcodes = barcode_sequences - self._barcode_map = barcode_map - - self._contaminant_aligner = ExactAligner(try_reverse=True) - self._contaminants = contaminant_sequences - - def extract_read(self, read): - # Check for contaminants. - if self._contaminants is not None: - contaminant_aln = self._contaminant_aligner.\ - align_multiple(self._contaminants, read, how='any') - - if contaminant_aln is not None: - return None, self.STATUS.contaminant - - # Check for a transposon sequence if specified. - tr_aln = None - - if self._transposon_sequence is not None: - tr_aln = self._transposon_aligner.align( - self._transposon_sequence, read) - - if tr_aln is None: - return None, self.STATUS.no_transposon - - # Check for barcode sequences if specified. - bc_aln, barcode = None, None - - if self._barcodes is not None: - try: - bc_aln = self._barcode_aligner.align_multiple( - self._barcodes, read) - except ValueError: - return None, self.STATUS.duplicate_barcode - - if bc_aln is None: - return None, self.STATUS.no_barcode - - # Lookup barcode. - barcode = bc_aln.query_id - if self._barcode_map is not None: - barcode = self._barcode_map[barcode] - - # Extract the genomic sequence. - if tr_aln is not None: - genomic = read[tr_aln.target_end:] - elif bc_aln is not None: - genomic = read[bc_aln.target_end:] - else: - genomic = read - - # Check for minimum length. - if len(genomic) < self._min_length: - return None, self.STATUS.too_short - - # Return read, barcode and alignment status. 
- return (read, barcode), self.STATUS.proper_read - - -class LamPcrIdentifier(InsertionIdentifier): - - def __init__(self, min_depth=0, min_mapq=37, merge_distance=10): - super().__init__() - - self._min_depth = min_depth - self._min_mapq = min_mapq - self._merge_distance = merge_distance - - def identify(self, alignment_path, barcode_map=None): - insertions = [] - - groups = self._group_by_position_bam( - alignment_path, min_mapq=self._min_mapq, barcode_map=barcode_map) - for (ref_id, pos, strand, bc), alns in groups: - # Determine depth as the number of reads at this position. - depth = len(alns) - - # Determine depth_unique by looking at differences in the - # other position (end for fwd strand, start for rev strand). - other_pos = (a.reference_end for a in alns) if strand == 1 \ - else (a.reference_start for a in alns) - depth_unique = len(set(other_pos)) - - insertions.append( - {'insertion_id': np.nan, 'seqname': ref_id, - 'location': pos, 'strand': strand, 'sample': bc, - 'depth': depth, 'depth_unique': depth_unique}) - - # Create insertion frame. - insertions = pd.DataFrame.from_records( - insertions, columns=['insertion_id', 'seqname', 'location', - 'strand', 'sample', 'depth', 'depth_unique']) - - # Merge insertions in close proximity to account for sequencing errors. - if self._merge_distance > 0: - insertions = cluster_frame_merged( - insertions, groupby=['seqname', 'sample', 'strand'], - dist_func=genomic_distance, merge_func=self._merge_insertions, - linkage='complete', criterion='distance', - t=self._merge_distance) - - # Filter by min_depth. - insertions = insertions.ix[ - insertions['depth_unique'] > self._min_depth] - - # Sort by coordinate and add identifiers. - insertions = insertions.sort(['seqname', 'location']) - - insertions['insertion_id'] = ['INS_{}'.format(i+1) - for i in range(len(insertions))] - - return insertions - - @classmethod - def _merge_insertions(cls, frame): - if len(frame) == 0: - return frame.iloc[0] - else: - ref = frame.iloc[0] - return pd.Series( - {'insertion_id': np.nan, - 'seqname': ref['seqname'], - 'location': int(frame['location'].mean()), - 'strand': ref['strand'], - 'sample': ref['sample'], - 'depth': ref['depth'].sum(), - 'depth_unique': ref['depth_unique'].sum()}, - index=ref.index) diff --git a/src/pyim/align/pipelines/paired.py b/src/pyim/align/pipelines/paired.py new file mode 100644 index 0000000..5e528f4 --- /dev/null +++ b/src/pyim/align/pipelines/paired.py @@ -0,0 +1,26 @@ +import abc +from pathlib import Path + +from pyim.util.path import build_path + +from ..external.cutadapt import cutadapt +from .base import Pipeline + + +class PairedPipeline(Pipeline): + @abc.abstractclassmethod + def configure_args(cls, parser): + parser.add_argument('--reads', type=Path, required=True) + parser.add_argument('--output', type=Path, required=True) + + @abc.abstractclassmethod + def from_args(cls, args): + raise NotImplementedError() + + @abc.abstractclassmethod + def run(self, reads_path, work_dir): + raise NotImplementedError() + + def extract_genomic(self, reads_path, output_base): + # Ensure output dir exists. 
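+        # exist_ok makes this call safe to repeat; intermediate
+        # directories are assumed to already exist.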
+ output_base.parent.mkdir(exist_ok=True) diff --git a/src/pyim/align/pipelines/shear_splink.py b/src/pyim/align/pipelines/shear_splink.py deleted file mode 100644 index 70e5e87..0000000 --- a/src/pyim/align/pipelines/shear_splink.py +++ /dev/null @@ -1,378 +0,0 @@ -import os -import operator -import logging -from enum import Enum -from os import path - -import pysam -import pandas as pd -import skbio -import toolz -import tqdm -from toolz.curried import (filter as curried_filter, - map as curried_map) - -from pyim.alignment.bowtie2 import align as bowtie_align -from pyim.alignment.vector import (align_exact, align_multiple, - align_with_reverse) -from pyim.util.file import count_fasta_entries - -from ._model import ExtractResult -from ._helpers.pipeline import (print_stats, build_barcode_map, - write_genomic_sequences) -from ._helpers.grouping import (chain_groupby, groupby_barcode, - groupby_reference_position) -from ._helpers.clustering import merge_within_distance - - -# --- Pipeline register hook + main --- # - -def register(subparsers, name='shear_splink'): - parser = subparsers.add_parser(name, help=name + ' help') - - # Required arguments. - parser.add_argument('input') - parser.add_argument('output_dir') - parser.add_argument('--bowtie_index', required=True) - parser.add_argument('--transposon', required=True) - parser.add_argument('--barcodes', required=True) - parser.add_argument('--linker', required=True) - - # Optional arguments. - parser.add_argument('--contaminants', default=None) - parser.add_argument('--sample_map', default=None) - parser.add_argument('--min_genomic_length', type=int, default=15) - parser.add_argument('--min_depth', type=int, default=2) - parser.add_argument('--min_mapq', type=int, default=37) - - # Set main for dispatch. - parser.set_defaults(main=main) - - return parser - - -def main(args): - # Prepare reads, counting total for progress bar. - reads = skbio.read(args.input, format='fasta', constructor=skbio.DNA) - total_reads = count_fasta_entries(args.input) - - # Read transposon, linker and barcode sequences. - transposon = skbio.io.read(args.transposon, format='fasta', into=skbio.DNA) - linker = skbio.io.read(args.linker, format='fasta', into=skbio.DNA) - - barcodes = list(skbio.io.read(args.barcodes, format='fasta', - constructor=skbio.DNA)) - - if args.contaminants is not None: - contaminants = list(skbio.io.read(args.contaminants, format='fasta', - constructor=skbio.DNA)) - else: - contaminants = None - - # Read barcode --> sample map if given. - if args.sample_map is not None: - sample_map = pd.read_csv(args.sample_map, sep='\t') - sample_map = dict(zip(sample_map['barcode'], - sample_map['sample'])) - else: - sample_map = None - - # Create output_dir if it does not exist. - if not path.exists(args.output_dir): - os.makedirs(args.output_dir, exist_ok=True) - - # Run pipeline! - insertions = shear_splink( - reads, transposon, linker, barcodes, - args.bowtie_index, args.output_dir, - contaminants=contaminants, sample_map=sample_map, - min_genomic_length=args.min_genomic_length, - min_mapq=args.min_mapq, min_depth=args.min_depth, - total_reads=total_reads) - - # Write insertion output. 
- insertions.to_csv(path.join(args.output_dir, 'insertions.txt'), - sep='\t', index=False) - - -# --- Overall pipeline --- # - -def shear_splink(reads, transposon, linker, barcodes, - bowtie_index, output_dir, contaminants=None, - sample_map=None, min_genomic_length=15, - min_mapq=37, min_depth=None, - extract_kws=None, total_reads=None): - - logger = logging.getLogger() - - # Subset barcodes to sample map (if given). - if sample_map is not None: - barcodes = [bc for bc in barcodes - if bc.metadata['id'] in sample_map] - - if len(barcodes) != len(sample_map): - raise ValueError('Missing or duplicate barcodes') - - # Determine paths for intermediates/outputs. - genomic_path = path.join(output_dir, 'genomic.fna') - barcode_path = path.join(output_dir, 'genomic.barcodes.txt') - alignment_base = path.join(output_dir, 'alignment') - - # Log progress with progressbar. - logger.info('Extracting genomic sequences') - reads = tqdm.tqdm(reads, total=total_reads, - unit='read', leave=False, ncols=60) - - # Extract genomic sequences and barcodes - _, barcode_frame = extract_genomic( - reads, transposon=transposon, barcodes=barcodes, linker=linker, - output_path=genomic_path, contaminants=contaminants, - min_length=min_genomic_length, logger=logger, extract_kws=extract_kws) - - barcode_frame.to_csv(barcode_path, sep='\t', index=False) - - # Align to reference with Bowtie2. - logger.info('Aligning to reference genome') - - aln_path = bowtie_align(genomic_path, bowtie_index, alignment_base, - bam_output=True, options={'-f': True}, - log=alignment_base + '.log') - - # Identify insertions from alignment. - logger.info('Identifying insertions') - - barcode_map = dict(zip(barcode_frame['read_id'], - barcode_frame['barcode'])) - insertions = identify_insertions(aln_path, barcode_map=barcode_map) - - # Cluster and merge close insertions - logger.info('Merging close insertions') - - agg_funcs = {'depth': 'sum', 'depth_unique': 'sum'} - insertions = merge_within_distance( - insertions, max_dist=2000, agg_funcs=agg_funcs) - - # Map barcodes to samples. - if sample_map is not None: - logger.info('Mapping insertions to samples') - insertions['sample'] = insertions['barcode'].map(sample_map) - - # Filter on (unique) depth. - if min_depth is not None: - logger.info('Filtering insertions with depth < {}'.format(min_depth)) - insertions = insertions.ix[insertions['depth_unique'] >= min_depth] - - # Annotate with clonality. - logger.info('Annotating insertions with (relative) clonality') - insertions = annotate_with_clonality(insertions) - - # Sort and assign ids to insertions. - insertions.sort_values(by=['chrom', 'position'], inplace=True) - insertions['id'] = ['INS_{}'.format(i) - for i in range(1, len(insertions) + 1)] - - return insertions - - -# --- Genomic sequence extraction --- # - -class ShearSplinkStatus(Enum): - contaminant = 1 - no_transposon = 2 - no_linker = 3 - no_barcode = 4 - multiple_barcodes = 5 - too_short = 6 - proper_read = 7 - - -def extract_genomic(reads, transposon, barcodes, linker, - output_path, contaminants=None, min_length=15, - logger=None, extract_kws=None): - - extract_kws = extract_kws or {} - - # Extract and write genomic sequences. 
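-    # The pipe below streams reads lazily through each stage: sequence
-    # extraction, minimum-length check, status reporting, filtering to
-    # proper reads, FASTA writing and building the read -> barcode map.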
- barcode_map = toolz.pipe( - reads, - _extract_from_reads(transposon=transposon, - barcodes=barcodes, - linker=linker, - contaminants=contaminants, - **extract_kws), - curried_map(_check_minimum_length(min_length=min_length)), - print_stats(logger=logger), - curried_filter(_proper_filter), - write_genomic_sequences(file_path=output_path, format='fasta'), - build_barcode_map) - - # Build frame mapping reads to barcodes. - barcode_frame = pd.DataFrame.from_records( - iter(barcode_map.items()), columns=['read_id', 'barcode']) - - return output_path, barcode_frame - - -@toolz.curry -def _extract_from_reads( - reads, transposon, barcodes, linker, contaminants=None, - transposon_func=None, barcode_func=None, linker_func=None): - - # Specify defaults for not provided aligners. - if transposon_func is None: - transposon_func = align_with_reverse(align_func=align_exact) - - if barcode_func is None: - barcode_func = align_multiple(align_func=align_exact) - - if linker_func is None: - linker_func = align_exact - - # Setup contaminant aligner if sequences are provided. - if contaminants is not None: - contaminant_func = align_multiple(queries=contaminants, - align_func=align_exact, - raise_error=False) - else: - contaminant_func = None - - # Prime aligners with their respective sequences. - transposon_func = transposon_func(query=transposon) - barcode_func = barcode_func(queries=barcodes) - linker_func = linker_func(query=linker) - - # Extract and return results. - extract_func = toolz.curry( - _extract_from_read, - transposon_func=transposon_func, - barcode_func=barcode_func, - linker_func=linker_func, - contaminant_func=contaminant_func) - - for result in map(extract_func, reads): - yield result - - -def _extract_from_read(read, transposon_func, barcode_func, - linker_func, contaminant_func=None): - """ Extracts the genomic sequence and barcode from the passed - read. Reads containing contaminants are dropped. Reads are - expected to look as follows: - - [barcode][transposon][genomic-sequence][linker] - - Each of these sequences is recognized by their corresponding - alignment function. The barcode alignment identifies the - barcode (and thus the sample) of the read, whilst the transposon - and linker alignments are used to delineate the genomic sequence. - - The function returns an ExactResult tuple that contains the - genomic sequence, barcode and a status flag. If any errors - occur during the extraction, the genomic sequence and barcode - values are None and the status flag indicates the underlying reason. - """ - - # Drop read if it contains a contaminant. - if contaminant_func is not None and contaminant_func(read) is not None: - return ExtractResult(None, None, ShearSplinkStatus.contaminant) - - # Identify location of the transposon. - transposon_aln = transposon_func(read) - if transposon_aln is None: - return ExtractResult(None, None, ShearSplinkStatus.no_transposon) - - # If transposon is on the reverse strand, flip the read and the - # alignment to bring everything into the same (fwd) orientation. - if transposon_aln.strand == -1: - read = read.reverse_complement() - transposon_aln = transposon_aln.reverse() - - # Identify location of linker. - linker_aln = linker_func(read) - if linker_aln is None: - return ExtractResult(None, None, ShearSplinkStatus.no_linker) - - # Identify barcode of the read. 
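-    # If the aligner reports more than one matching barcode (ValueError),
-    # the read is flagged with the multiple_barcodes status.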
- try: - barcode_aln = barcode_func(read) - if barcode_aln is None: - return ExtractResult(None, None, ShearSplinkStatus.no_barcode) - except ValueError: - return ExtractResult(None, None, ShearSplinkStatus.multiple_barcodes) - - barcode = barcode_aln.query_id - - # Extract genomic sequence using previous alignments. - genomic = read[transposon_aln.target_end:linker_aln.target_start] - - return ExtractResult(genomic, barcode, ShearSplinkStatus.proper_read) - - -@toolz.curry -def _check_minimum_length(result, min_length): - """Flags proper reads if shorter than min_length.""" - if (result.status == ShearSplinkStatus.proper_read and - len(result.genomic_sequence) < min_length): - result.status = ShearSplinkStatus.too_short - return result - - -def _proper_filter(result): - """Filters extraction results for proper reads.""" - return result.status == ShearSplinkStatus.proper_read - - -# --- Insertion identification --- # - -def identify_insertions(alignment_path, barcode_map, min_mapq=37): - # Get alignments from bowtie. - bam = pysam.AlignmentFile(alignment_path) - alignments = bam.fetch(multiple_iterators=True) - - # Filter by mapq. - alignments = filter(lambda a: a.mapq >= min_mapq, alignments) - - # Group alignments by barcode and position. - aln_groups = chain_groupby( - alignments, - [groupby_reference_position(alignment_file=bam), - groupby_barcode(barcode_map=barcode_map)]) - - # Convert groups into insertion frame. - insertions = pd.DataFrame.from_records( - (_alignments_to_insertion(info, alns) - for info, alns in aln_groups) , - columns=['id', 'chrom', 'position', 'strand', - 'barcode', 'depth', 'depth_unique']) - - return insertions - - -def _alignments_to_insertion(info, alignments, id_=None): - # Extract group info. - ref, pos, strand, bc = info - - # Get positions of the non-transposon ends of the alignment. - end_field = 'reference_end' if strand == 1 else 'reference_start' - end_positions = map(operator.attrgetter(end_field), alignments) - - # Calculate overall depth and unique end depth. - depth = len(alignments) - depth_unique = len(set(end_positions)) - - return id_, ref, pos, strand, bc, depth, depth_unique - - -# --- Further annotation --- # - -def annotate_with_clonality(insertions): - def _clonality(grp): - clonality = grp['depth_unique'] / grp['depth_unique'].max() - return grp.assign(clonality=clonality) - - if 'sample' in insertions.columns: - per_sample = insertions.groupby('sample') - else: - per_sample = insertions.groupby('barcode') - - return pd.concat(_clonality(grp) for _, grp in per_sample) diff --git a/src/pyim/align/pipelines/shear_splink_sb.py b/src/pyim/align/pipelines/shear_splink_sb.py deleted file mode 100644 index 2921be0..0000000 --- a/src/pyim/align/pipelines/shear_splink_sb.py +++ /dev/null @@ -1,99 +0,0 @@ -import os -from os import path - -import pandas as pd -import skbio -from .shear_splink import shear_splink - -from pyim.alignment import vector as vec -from pyim.util.file import count_fasta_entries - - -# --- Pipeline register hook + main --- # - -def register(subparsers, name='shear_splink_sb'): - parser = subparsers.add_parser(name, help=name + ' help') - - # Required arguments. - parser.add_argument('input') - parser.add_argument('output_dir') - parser.add_argument('--bowtie_index', required=True) - parser.add_argument('--transposon', required=True) - parser.add_argument('--barcodes', required=True) - parser.add_argument('--linker', required=True) - - # Optional arguments. 
- parser.add_argument('--contaminants', default=None) - parser.add_argument('--sample_map', default=None) - parser.add_argument('--min_genomic_length', type=int, default=15) - parser.add_argument('--min_depth', type=int, default=2) - parser.add_argument('--min_mapq', type=int, default=37) - - # Set main for dispatch. - parser.set_defaults(main=main) - - return parser - - -def main(args): - # Prepare reads, counting total for progress bar. - reads = skbio.read(args.input, format='fasta', constructor=skbio.DNA) - total_reads = count_fasta_entries(args.input) - - # Read transposon, linker and barcode sequences. - transposon = skbio.io.read(args.transposon, format='fasta', into=skbio.DNA) - linker = skbio.io.read(args.linker, format='fasta', into=skbio.DNA) - - barcodes = list(skbio.io.read(args.barcodes, format='fasta', - constructor=skbio.DNA)) - - if args.contaminants is not None: - contaminants = list(skbio.io.read(args.contaminants, format='fasta', - constructor=skbio.DNA)) - else: - contaminants = None - - # Read barcode --> sample map if given. - if args.sample_map is not None: - sample_map = pd.read_csv(args.sample_map, sep='\t') - sample_map = dict(zip(sample_map['barcode'], - sample_map['sample'])) - else: - sample_map = None - - # Create output_dir if it does not exist. - if not path.exists(args.output_dir): - os.makedirs(args.output_dir, exist_ok=True) - - # Setup custom aligners. - transposon_aligner = vec.align_chained( - align_funcs=[vec.compose(vec.align_exact, try_reverse=True), - vec.compose(vec.align_ssw, try_reverse=True, - filters=[vec.filter_score(min_score=90)])]) - - linker_ssw_filters = [ - vec.filter_score(min_score=90), - vec.filter_and(filters=[ - vec.filter_end_match(), - vec.filter_coverage(min_coverage=0.5, min_identity=0.9)])] - - linker_aligner = vec.align_chained( - align_funcs=[vec.compose(vec.align_exact, try_reverse=True), - vec.compose(vec.align_ssw, try_reverse=True, - filters=linker_ssw_filters)]) - - extract_kws = {'linker_func': linker_aligner, - 'transposon_func': transposon_aligner} - - # Run pipeline! - insertions = shear_splink( - reads, transposon, linker, barcodes, - args.bowtie_index, args.output_dir, - contaminants=contaminants, sample_map=sample_map, - min_genomic_length=args.min_genomic_length, - min_depth=args.min_depth, extract_kws=extract_kws, - total_reads=total_reads) - - # Write insertion output. 
- insertions.to_csv(path.join(args.output_dir, 'insertions.txt'), - sep='\t', index=False) diff --git a/src/pyim/align/pipelines/single.py b/src/pyim/align/pipelines/single.py new file mode 100644 index 0000000..e6a1acb --- /dev/null +++ b/src/pyim/align/pipelines/single.py @@ -0,0 +1,283 @@ +import itertools +import logging +import os +from pathlib import Path + +from cutadapt import seqio +import pandas as pd + +from pyim.external import bowtie2 +from pyim.external.util import flatten_options +from pyim.util.path import build_path + +from ..common import genomic as cm_gen, insertions as cm_ins +from .base import Pipeline, register_pipeline + + +class SinglePipeline(Pipeline): + def __init__(self, + transposon_path, + bowtie_index_path, + linker_path=None, + contaminant_path=None, + min_length=15, + min_support=2, + min_mapq=23, + merge_distance=0, + bowtie_options=None, + min_overlaps=None, + error_rates=None): + super().__init__() + + self._transposon_path = transposon_path + self._linker_path = linker_path + self._contaminant_path = contaminant_path + + self._index_path = bowtie_index_path + + self._min_length = min_length + self._min_support = min_support + self._min_mapq = min_mapq + + self._merge_distance = merge_distance + self._bowtie_options = bowtie_options or {} + + self._min_overlaps = min_overlaps or {} + self._error_rates = error_rates or {} + + @classmethod + def configure_args(cls, parser): + super().configure_args(parser) + + parser.add_argument('--transposon', type=Path, required=True) + parser.add_argument('--bowtie_index', type=Path, required=True) + + parser.add_argument('--contaminants', type=Path, default=None) + parser.add_argument('--linker', type=Path, default=None) + + parser.add_argument('--min_length', type=int, default=15) + parser.add_argument('--min_support', type=int, default=2) + parser.add_argument('--min_mapq', type=int, default=23) + parser.add_argument('--merge_distance', type=int, default=None) + + parser.add_argument('--local', default=False, action='store_true') + + parser.add_argument('--contaminant_error', default=0.1, type=float) + parser.add_argument('--transposon_error', default=0.1, type=float) + parser.add_argument('--linker_error', default=0.1, type=float) + + parser.add_argument('--contaminant_overlap', default=3, type=int) + parser.add_argument('--transposon_overlap', default=3, type=int) + parser.add_argument('--linker_overlap', default=3, type=int) + + @classmethod + def extract_args(cls, args): + bowtie_options = {'--local': args.local} + + min_overlaps = { + 'contaminant': args.contaminant_overlap, + 'transposon': args.transposon_overlap, + 'linker': args.linker_overlap + } + + error_rates = { + 'contaminant': args.contaminant_error, + 'transposon': args.transposon_error, + 'linker': args.linker_error + } + + return dict( + transposon_path=args.transposon, + bowtie_index_path=args.bowtie_index, + linker_path=args.linker, + contaminant_path=args.contaminants, + min_length=args.min_length, + min_support=args.min_support, + min_mapq=args.min_mapq, + merge_distance=args.merge_distance, + bowtie_options=bowtie_options, + min_overlaps=min_overlaps, + error_rates=error_rates) + + def run(self, reads_path, work_dir): + logger = logging.getLogger() + + # Extract genomic sequences and align to reference. + alignment_path = self._extract_and_align(reads_path, work_dir, logger) + + # Extract alignment groups (grouped by position) from bam file. 
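+        # Alignments are fetched with a mapq filter and summarized per
+        # genomic position before being converted into insertion calls.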
+ logger.info('Summarizing alignments') + logger.info(' %-18s: %s', 'Minimum mapq', self._min_mapq) + + alignments = cm_ins.fetch_alignments( + alignment_path, min_mapq=self._min_mapq) + aln_summary = cm_ins.summarize_alignments(alignments) + + # Convert groups to insertions and return. + logger.info('Converting to insertions') + logger.info(' %-18s: %d', 'Minimum support', self._min_support) + logger.info(' %-18s: %d', 'Merge distance', self._merge_distance) + + yield from cm_ins.convert_groups_to_insertions( + aln_summary, + min_support=self._min_support, + merge_distance=self._merge_distance) + + def _extract_and_align(self, reads_path, work_dir, logger): + # Extract genomic sequences. + logger.info('Extracting genomic sequences') + logger.info(' %-18s: %s', 'Transposon', + shorten_path(self._transposon_path)) + logger.info(' %-18s: %s', 'Linker', shorten_path(self._linker_path)) + logger.info(' %-18s: %s', 'Contaminants', + shorten_path(self._contaminant_path)) + logger.info(' %-18s: %s', 'Minimum length', self._min_length) + + genomic_path = build_path(reads_path, dir_=work_dir, suffix='.genomic') + genomic_path.parent.mkdir(exist_ok=True, parents=True) + + cm_gen.extract_genomic( + reads_path, + genomic_path, + transposon_path=self._transposon_path, + linker_path=self._linker_path, + contaminant_path=self._contaminant_path, + min_length=self._min_length, + min_overlaps=self._min_overlaps, + error_rates=self._error_rates) + + # Align reads to genome. + logger.info('Aligning to reference') + logger.info(' %-18s: %s', 'Reference', shorten_path(self._index_path)) + logger.info(' %-18s: %s', 'Bowtie options', + flatten_options(self._bowtie_options)) + + alignment_path = build_path(reads_path, dir_=work_dir, ext='.bam') + alignment_path.parent.mkdir(exist_ok=True, parents=True) + + bowtie2.bowtie2( + [genomic_path], + self._index_path, + alignment_path, + options=self._bowtie_options, + verbose=True) + + return alignment_path + + +register_pipeline(name='single', pipeline=SinglePipeline) + + +class SingleMultiplexedPipeline(SinglePipeline): + def __init__(self, + transposon_path, + bowtie_index_path, + barcode_path, + barcode_mapping=None, + linker_path=None, + contaminant_path=None, + min_length=15, + min_support=2, + min_mapq=23, + merge_distance=0, + bowtie_options=None, + min_overlaps=None, + error_rates=None): + super().__init__( + transposon_path=transposon_path, + bowtie_index_path=bowtie_index_path, + linker_path=linker_path, + contaminant_path=contaminant_path, + min_length=min_length, + min_support=min_support, + min_mapq=min_mapq, + merge_distance=merge_distance, + bowtie_options=bowtie_options, + min_overlaps=min_overlaps, + error_rates=error_rates) + + self._barcode_path = barcode_path + self._barcode_mapping = barcode_mapping + + @classmethod + def configure_args(cls, parser): + super().configure_args(parser) + + parser.add_argument('--barcodes', required=True, type=Path) + parser.add_argument( + '--barcode_mapping', required=False, type=Path, default=None) + + @classmethod + def extract_args(cls, args): + arg_dict = super().extract_args(args) + + if args.barcode_mapping is not None: + map_df = pd.read_csv(args.barcode_mapping, sep='\t') + arg_dict['barcode_mapping'] = dict( + zip(map_df['barcode'], map_df['sample'])) + else: + arg_dict['barcode_mapping'] = None + + arg_dict['barcode_path'] = args.barcodes + + return arg_dict + + def run(self, reads_path, work_dir): + logger = logging.getLogger() + + # Extract genomic sequences and align to reference. 
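+        # The multiplexed variant reuses the parent extraction/alignment
+        # step and only adds barcode demultiplexing afterwards.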
+ alignment_path = self._extract_and_align(reads_path, work_dir, logger) + + # Map reads to specific barcodes/samples. + logger.info('Extracting barcode/sample mapping') + logger.info(' %-18s: %s', 'Barcodes', + shorten_path(self._barcode_path)) + read_map = self._get_barcode_mapping(reads_path) + + # Extract alignment groups (grouped by position) from bam file. + logger.info('Summarizing alignments') + logger.info(' %-18s: %s', 'Minimum mapq', self._min_mapq) + + alignments = cm_ins.fetch_alignments( + alignment_path, min_mapq=self._min_mapq) + + aln_summaries = cm_ins.summarize_alignments_by_group( + alignments, + group_func=lambda aln: read_map.get(aln.query_name, None)) + + # Convert groups from each sample into insertions, + # adding sample name and sample prefix to the ID. + logger.info('Converting to insertions') + logger.info(' %-18s: %d', 'Minimum support', self._min_support) + logger.info(' %-18s: %d', 'Merge distance', self._merge_distance) + + insertion_grps = ( + cm_ins.convert_summary_to_insertions( + aln_summ, + min_support=self._min_support, + merge_distance=self._merge_distance, + sample=barcode, + id_fmt=barcode + '.INS_{}') + for barcode, aln_summ in aln_summaries.items()) # yapf: disable + + # Return concatenated list of insertions. + yield from itertools.chain.from_iterable(insertion_grps) + + def _get_barcode_mapping(self, reads_path): + # Read barcode sequences. + with seqio.open(str(self._barcode_path)) as barcode_file: + barcodes = list(barcode_file) + + # Extract read --> barcode mapping. + with seqio.open(str(reads_path)) as reads: + return cm_ins.extract_barcode_mapping(reads, barcodes, + self._barcode_mapping) + + +register_pipeline( + name='single-multiplexed', pipeline=SingleMultiplexedPipeline) + + +def shorten_path(file_name, limit=40): + f = os.path.split(str(file_name))[1] + return "%s~%s" % (f[:3], f[-(limit - 3):]) if len(f) > limit else f diff --git a/src/pyim/align/vector.py b/src/pyim/align/vector.py deleted file mode 100644 index 57d3810..0000000 --- a/src/pyim/align/vector.py +++ /dev/null @@ -1,207 +0,0 @@ -import collections - -from skbio.alignment import local_pairwise_align_ssw -from toolz import curry - - -class Alignment(object): - - __slots__ = ('query_id', 'query_start', 'query_end', 'query_len', - 'target_id', 'target_start', 'target_end', 'target_len', - 'strand', 'identity', 'coverage', 'score', 'type') - - def __init__(self, query_id, query_start, query_end, query_len, - target_id, target_start, target_end, target_len, - strand, identity, coverage, type): - self.query_id = query_id - self.query_start = query_start - self.query_end = query_end - self.query_len = query_len - - self.target_id = target_id - self.target_start = target_start - self.target_end = target_end - self.target_len = target_len - - self.strand = strand - self.identity = identity - self.coverage = coverage - self.type = type - - self.score = int(identity * coverage * 100) - - def reverse(self): - return Alignment(query_id=self.query_id, - query_start=self.query_start, - query_end=self.query_end, - query_len=self.query_len, - target_id=self.target_id, - target_start=self.target_len - self.target_end, - target_end=self.target_len - self.target_start, - target_len=self.target_len, - strand=self.strand * -1, - identity=self.identity, - coverage=self.coverage, - type=self.type) - - -@curry -def align_exact(target, query, query_strand=1): - """Aligns query to target using exact matching.""" - - # Note that this alignment returns the first occurrence it finds, - # later 
occurrences will not be found and are not checked for. - try: - index = str(target).index(str(query)) - except ValueError: - return None - else: - q_len = len(query) - - return Alignment( - query_id=query.metadata.get('id', None), query_start=0, - query_end=q_len, query_len=q_len, - target_id=target.metadata.get('id', None), target_start=index, - target_end=index + q_len, target_len=len(target), - strand=query_strand, identity=1.0, coverage=1.0, type='exact') - - -@curry -def align_ssw(target, query, query_strand=1): - """Aligns query to target using ssw aligner.""" - - # Perform actual alignment. - ssw_aln = local_pairwise_align_ssw(str(target), str(query)) - - # Extract positions. - pos = ssw_aln.start_end_positions() - q_start, q_end = pos[1] - t_start, t_end = pos[0] - - # Offset ends by one, making them exclusive - # to match python conventions. - q_end += 1 - t_end += 1 - - # Calculate basic metrics. - coverage = (q_end - q_start) / float(len(query)) - identity = ssw_aln[0].match_frequency(ssw_aln[1], relative=True) - - aln = Alignment( - query_id=query.metadata.get('id', None), query_start=q_start, - query_end=q_end, query_len=len(query), - target_id=target.metadata.get('id', None), target_start=t_start, - target_end=t_end, target_len=len(target), strand=query_strand, - identity=identity, coverage=coverage, type='ssw') - - return aln - - -@curry -def align_with_reverse(target, query, align_func, query_strand=1, **kwargs): - """Aligns query in both orientations to target sequence.""" - - aln_fwd = align_func(target, query, query_strand=query_strand, **kwargs) - aln_rev = align_func(target, query.reverse_complement(), - query_strand=query_strand * -1, **kwargs) - return _pick_best(list(filter(bool, [aln_fwd, aln_rev]))) - - -@curry -def align_multiple(target, queries, align_func, raise_error=False, **kwargs): - """Aligns multiple queries to target sequence.""" - - alignments = (align_func(target, query, **kwargs) - for query in queries) - alignments = list(filter(bool, alignments)) - - if len(alignments) > 1 and raise_error: - raise ValueError('Multiple alignments') - - return _pick_best(alignments) - - -def _pick_best(alignments): - """Picks best alignment from list (based on score).""" - - if len(alignments) == 0: - return None - if len(alignments) == 1: - return alignments[0] - else: - best = alignments[0] - for aln in alignments: - if aln.score > best.score: - best = aln - return best - - -@curry -def align_chained(target, query, align_funcs, **kwargs): - """Chains multiple vector alignment functions.""" - - for func in align_funcs: - aln = func(target, query, **kwargs) - if aln is not None: - return aln - return None - - -def compose(align_func, try_reverse=False, - filter='and', filters=None, **kwargs): - """Helper function to build an aligner.""" - - if try_reverse: - align_func = align_with_reverse(align_func=align_func) - - if filters is not None: - if filter == 'and': - align_func = filter_and(align_func=align_func, filters=filters) - elif filter == 'or': - align_func = filter_or(align_func=align_func, filters=filters) - else: - raise ValueError('Filter should be either "or" or "and" (not {})' - .format(filter)) - - return align_func(**kwargs) - - -# --- Filtering --- # - -@curry -def filter_and(target, query, align_func, filters, **kwargs): - """Performs AND of filters on resulting alignments.""" - - alignment = align_func(target, query, **kwargs) - for filter_ in filters: - if not filter_(alignment): - return None - return alignment - - -@curry -def filter_or(target, 
query, align_func, filters, **kwargs): - """Performs OR of filters on resulting alignments.""" - - return not filter_and(target, query, align_func, filters, **kwargs) - - -@curry -def filter_score(alignment, min_score): - """Checks if alignment has minimum score.""" - - return alignment.score >= min_score - - -@curry -def filter_coverage(alignment, min_coverage, min_identity): - """Checks if alignment is at end of read.""" - - return ((alignment.coverage >= min_coverage) and - (alignment.identity >= min_identity)) - -@curry -def filter_end_match(alignment): - """Checks if alignment is at end of read.""" - - return alignment.target_end == alignment.target_len diff --git a/src/pyim/annotate/__init__.py b/src/pyim/annotate/__init__.py new file mode 100644 index 0000000..062361e --- /dev/null +++ b/src/pyim/annotate/__init__.py @@ -0,0 +1 @@ +from .annotators import get_annotators, WindowAnnotator, RbmAnnotator diff --git a/src/pyim/annotate/annotators/__init__.py b/src/pyim/annotate/annotators/__init__.py new file mode 100644 index 0000000..ff40cc6 --- /dev/null +++ b/src/pyim/annotate/annotators/__init__.py @@ -0,0 +1,3 @@ +from .base import get_annotators +from .window import WindowAnnotator +from .rbm import RbmAnnotator diff --git a/src/pyim/annotate/annotators/base.py b/src/pyim/annotate/annotators/base.py new file mode 100644 index 0000000..63e86b2 --- /dev/null +++ b/src/pyim/annotate/annotators/base.py @@ -0,0 +1,146 @@ +from abc import ABC, abstractclassmethod, abstractmethod, abstractproperty +import functools +import itertools +import operator +from pathlib import Path + +from frozendict import frozendict +import numpy as np +import toolz + +from pyim.model import Insertion, CisSite +from ..metadata import add_metadata + +_registry = {} + + +def register_annotator(name, annotator): + _registry[name] = annotator + + +def get_annotators(): + return dict(_registry) + + +class Annotator(ABC): + def __init__(self): + pass + + @classmethod + def configure_args(cls, parser): + parser.add_argument('--insertions', type=Path, required=True) + parser.add_argument('--output', type=Path, required=True) + + @classmethod + def from_args(cls, args): + return cls(**cls.parse_args(args)) + + @abstractclassmethod + def parse_args(cls, args): + raise NotImplementedError() + + @abstractmethod + def annotate(self, insertions): + raise NotImplementedError() + + @abstractproperty + def gtf(self): + raise NotImplementedError() + + +class CisAnnotator(Annotator): + def __init__(self, *args, cis_sites=None, drop_cis_id=False, **kwargs): + super().__init__(*args, **kwargs) + + self._cis_sites = self._preprocess_sites(cis_sites) + self._drop_cis_id = drop_cis_id + + def _preprocess_sites(self, cis_sites): + """Pre-process cis sites, fixing unstrandedness etc.""" + + # Copy CISs that are unstranded to both strands. 
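+        # Downstream annotation expects a concrete strand, so unstranded
+        # sites are duplicated on both strands before being annotated.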
+ return list(self._expand_unstranded_sites(cis_sites)) + + @staticmethod + def _expand_unstranded_sites(cis_sites): + for cis in cis_sites: + if np.isnan(cis.strand) or cis.strand is None: + yield cis._replace(strand=1) + yield cis._replace(strand=-1) + else: + yield cis + + @classmethod + def configure_args(cls, parser): + super().configure_args(parser) + parser.add_argument('--cis_sites', default=None, type=Path) + parser.add_argument( + '--drop_cis_id', default=False, action='store_true') + + @classmethod + def parse_args(cls, args): + parsed = super().parse_args(args) + + if args.cis_sites is not None: + cis_cols = ['id', 'chromosome', 'position', 'strand'] + cis_sites = list( + CisSite.from_csv( + args.cis_sites, usecols=cis_cols, sep='\t')) + else: + cis_sites = None + + cis_args = {'cis_sites': cis_sites, 'drop_cis_id': args.drop_cis_id} + + return toolz.merge(parsed, cis_args) + + def annotate(self, insertions): + if self._cis_sites is None: + yield from super().annotate(insertions) + else: + yield from self._annotate_cis(insertions) + + def _annotate_cis(self, insertions): + # Annotate CIS sites. + annotated_sites = super().annotate(self._cis_sites) + + # Create CIS --> gene map using annotations. + id_getter = operator.attrgetter('id') + annotated_sites = sorted(annotated_sites, key=id_getter) + grouped_sites = itertools.groupby(annotated_sites, key=id_getter) + + cis_map = { + cis_id: {(item.metadata['gene_name'], item.metadata['gene_id']) + for item in group if 'gene_name' in item.metadata} + for cis_id, group in grouped_sites + } + + # Annotate insertions, drop any duplicates and add metadata. + annotated_ins = set(self._annotate_insertions(insertions, cis_map)) + annotated_ins = add_metadata(annotated_ins, reference_gtf=self.gtf) + + yield from annotated_ins + + def _annotate_insertions(self, insertions, cis_map): + for insertion in insertions: + genes = cis_map.get(insertion.metadata['cis_id'], set()) + + if len(genes) > 0: + for gene_name, gene_id in genes: + metadata = {'gene_id': gene_id, 'gene_name': gene_name} + metadata = toolz.merge(insertion.metadata, metadata) + + if self._drop_cis_id: + metadata.pop('cis_id') + + yield insertion._replace(metadata=frozendict(metadata)) + else: + if self._drop_cis_id: + metadata = dict(insertion.metadata) + metadata.pop('cis_id') + yield insertion._replace(metadata=frozendict(metadata)) + else: + yield insertion + + @property + def gtf(self): + return super().gtf diff --git a/src/pyim/annotate/annotators/rbm.py b/src/pyim/annotate/annotators/rbm.py new file mode 100644 index 0000000..2f1f1e6 --- /dev/null +++ b/src/pyim/annotate/annotators/rbm.py @@ -0,0 +1,97 @@ +from pathlib import Path + +from .base import Annotator, CisAnnotator, register_annotator +from .window import WindowAnnotator, Window + +# Window format: (us, ua, ds, da) +WINDOW_PRESETS = { + 'SB': (20000, 10000, 25000, 5000), + 'MULV': (20000, 120000, 40000, 5000), + 'MMTV': (20000, 120000, 40000, 5000) +} + + +class RbmAnnotator(Annotator): + def __init__(self, + reference_gtf, + window_sizes=None, + preset=None, + closest=False, + blacklist=None, + cis_sites=None): + super().__init__() + + if window_sizes is None: + if preset is None: + raise ValueError('Either window_sizes or ' + 'preset must be defined') + else: + window_sizes = WINDOW_PRESETS[preset] + + windows = self._build_windows(window_sizes) + self._annotator = WindowAnnotator( + reference_gtf, + windows=windows, + closest=closest, + blacklist=blacklist) + self._cis_sites = cis_sites + + @classmethod + def 
configure_args(cls, parser): + super().configure_args(parser) + + # Required arguments. + parser.add_argument('--reference_gtf', required=True, type=Path) + + # Optional arguments. + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('--preset', choices=WINDOW_PRESETS.keys()) + group.add_argument('--window_sizes', nargs=4, type=int) + + parser.add_argument('--closest', default=False, action='store_true') + parser.add_argument('--blacklist', nargs='+', default=None) + + @classmethod + def parse_args(cls, args): + return { + 'reference_gtf': args.reference_gtf, + 'window_sizes': args.window_sizes, + 'preset': args.preset, + 'closest': args.closest, + 'blacklist': args.blacklist + } + + def annotate(self, insertions): + return self._annotator.annotate(insertions) + + def _build_windows(self, window_sizes): + us, ua, ds, da = window_sizes + + windows = [ + Window(0, 1, strand=1, strict_left=False, + strict_right=False, name='is'), + Window(0, 1, strand=-1, strict_left=False, + strict_right=False, name='ia'), + Window(-us, 0, strand=1, strict_left=False, + strict_right=True, name='us'), + Window(-ua, 0, strand=-1, strict_left=False, + strict_right=True, name='ua'), + Window(1, ds, strand=1, strict_left=True, + strict_right=False, name='ds'), + Window(1, da, strand=-1, strict_left=True, + strict_right=False, name='da')] # yapf: disable + + return windows + + @property + def gtf(self): + return self._annotator.gtf + + +class RbmCisAnnotator(CisAnnotator, RbmAnnotator): + pass + + +register_annotator('rbm', RbmCisAnnotator) + +# register_annotator('rbm', RbmAnnotator) diff --git a/src/pyim/annotate/annotators/window.py b/src/pyim/annotate/annotators/window.py new file mode 100644 index 0000000..802f97f --- /dev/null +++ b/src/pyim/annotate/annotators/window.py @@ -0,0 +1,200 @@ +import collections +import itertools +from pathlib import Path + +from frozendict import frozendict +from tqdm import tqdm +import toolz + +from pyim.util.tabix import GtfFile, GtfFrame + +from .base import Annotator, CisAnnotator, register_annotator +from ..filter_ import select_closest, filter_blacklist +from ..metadata import add_metadata +from ..util import build_interval_trees, numeric_strand + + +class WindowAnnotator(Annotator): + def __init__(self, + reference_gtf, + windows, + closest=False, + blacklist=None, + verbose=True): + super().__init__() + + self._windows = windows + self._gtf = GtfFile(reference_gtf) + self._closest = closest + self._blacklist = blacklist + self._verbose = verbose + + self._gtf_frame = None + self._gtf_trees = None + + @classmethod + def configure_args(cls, parser): + super().configure_args(parser) + + # Required arguments. + parser.add_argument('--reference_gtf', required=True, type=Path) + + # Optional arguments. + parser.add_argument('--window_size', default=20000, type=int) + parser.add_argument('--closest', default=False, action='store_true') + parser.add_argument('--blacklist', nargs='+', default=None) + + @classmethod + def parse_args(cls, args): + window_size = args.window_size // 2 + windows = [Window( + -window_size, + window_size, + strand=None, + name=None, + strict_left=False, + strict_right=False)] + + return { + 'reference_gtf': args.reference_gtf, + 'windows': windows, + 'closest': args.closest, + 'blacklist': args.blacklist + } + + def annotate(self, insertions): + # Annotate insertions. 
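+        # Each insertion is checked against every configured window; any
+        # overlapping gene features are attached as metadata, after which
+        # the optional closest/blacklist filters are applied.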
+ if self._verbose: + insertions = tqdm(list(insertions), ncols=80) + + annotated = itertools.chain.from_iterable( + (self._annotate_insertion(ins) for ins in insertions)) + + # Add metadata. + annotated = add_metadata(annotated, reference_gtf=self.gtf) + + # Filter to closest if needed. + if self._closest: + annotated = list(select_closest(annotated)) + + # Filter blacklist. + if self._blacklist is not None: + annotated = filter_blacklist(annotated, self._blacklist) + + return annotated + + def _annotate_insertion(self, insertion): + trees = self._trees + + # Identify overlapping features. + hits = set() + for window in self._windows: + applied_window = window.apply(insertion.chromosome, + insertion.position, insertion.strand) + + hits |= {(feature['gene_id'], feature['gene_name'], window.name) + for feature in applied_window.get_overlap(trees)} + + if len(hits) > 0: + # Annotate insertion with overlapping genes. + for gene_id, gene_name, window_name in hits: + metadata = {'gene_id': gene_id, 'gene_name': gene_name} + + if window_name is not None: + metadata['window'] = window_name + + metadata = toolz.merge(insertion.metadata, metadata) + yield insertion._replace(metadata=frozendict(metadata)) + else: + # In case of no overlap, return original insertion. + yield insertion + + @property + def gtf(self): + if self._gtf_frame is None: + if isinstance(self._gtf, GtfFrame): + self._gtf_frame = self._gtf + else: + self._gtf_frame = GtfFrame.from_records( + self._gtf.fetch(filters={'feature': 'gene'})) + return self._gtf_frame + + @property + def _trees(self): + if self._gtf_trees is None: + self._gtf_trees = build_interval_trees(self.gtf) + return self._gtf_trees + + +class WindowCisAnnotator(CisAnnotator, WindowAnnotator): + pass + + +register_annotator('window', WindowCisAnnotator) + +_Window = collections.namedtuple('Window', ['start', 'end', 'strand', 'name', + 'strict_left', 'strict_right']) + + +class Window(_Window): + __slots__ = () + + def apply(self, chromosome, position, strand): + # Determine start/end position. + if strand == 1: + start = position + self.start + end = position + self.end + + strict_left = self.strict_left + strict_right = self.strict_right + elif strand == -1: + start = position - self.end + end = position - self.start + + strict_right = self.strict_left + strict_left = self.strict_right + else: + raise ValueError('Unknown value for strand ({})'.format(strand)) + + # Determine new strand. + if self.strand is not None: + new_strand = self.strand * strand + else: + new_strand = None + + return AppliedWindow(chromosome, start, end, new_strand, self.name, + strict_left, strict_right) + + +_AppliedWindow = collections.namedtuple( + 'AppliedWindow', ['chromosome', 'start', 'end', 'strand', 'name', + 'strict_left', 'strict_right']) + + +class AppliedWindow(_AppliedWindow): + __slots__ = () + + def get_overlap(self, interval_trees): + # Find overlapping features. + try: + tree = interval_trees[self.chromosome] + overlap = tree[self.start:self.end] + except KeyError: + overlap = [] + + # Extract features. + features = (interval[2] for interval in overlap) + + # Filter inclusive/exclusive if needed. + if self.strict_left: + features = (f for f in features if f['start'] > self.start) + + if self.strict_right: + features = (f for f in features if f['end'] < self.end) + + # Filter for strand if needed. 
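+        # The applied window's strand is relative to the insertion (window
+        # strand multiplied by insertion strand), so this keeps only
+        # features in the requested relative orientation.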
+ if self.strand is not None: + features = (f for f in features + if numeric_strand(f['strand']) == self.strand) + + return features diff --git a/src/pyim/annotate/filter_.py b/src/pyim/annotate/filter_.py new file mode 100644 index 0000000..f98d425 --- /dev/null +++ b/src/pyim/annotate/filter_.py @@ -0,0 +1,57 @@ +import itertools +import operator +import sys + +import numpy as np + + +def select_closest(insertions, field='gene_distance'): + """Selects genes that are closest to the annotated insertions. + + Parameters: + insertions (iterable[Insertion]): Annotated insertions that are to + be filtered. The frame is expected to contain at least the + following columns: id, position, strand, *dist_col*. + field (str): Name of the column containing the distance to + the gene or feature. Can be added using the add_metadata function. + + Returns: + iterable[Insertion]: Filtered annotated insertions, which have been + reduced to only include the genes closest to the insertions. + + """ + + # Group insertions by id. + id_getter = operator.attrgetter('id') + insertions = sorted(insertions, key=id_getter) + grouped = itertools.groupby(insertions, key=id_getter) + + for _, group in grouped: + group = list(group) + + # Yield closest insertions (with minimum distance). + dists = np.abs([ins.metadata.get(field, sys.maxsize) for ins in group]) + yield from itertools.compress(group, dists == dists.min()) + + +def filter_blacklist(insertions, blacklist, field='gene_name'): + """Filters annotations that assign insertions to blacklisted genes. + + Args: + insertions (iterable[Insertion]): + blacklist (iterable[str]): List of blacklisted gene ids to filter. + field (str): Name of the column containing the id of the genes. + + Returns: + iterable[Insertion]: Filtered annotated insertions, which have been + reduced to remove blacklisted genes. + + """ + + # Ensure blacklist is a set. + blacklist = set(blacklist) + + # Drop any genes with a gene id in the blacklist. + for insertion in insertions: + if not insertion.metadata.get(field, None) in blacklist: + yield insertion diff --git a/src/pyim/annotation/metadata.py b/src/pyim/annotate/metadata.py similarity index 58% rename from src/pyim/annotation/metadata.py rename to src/pyim/annotate/metadata.py index 477f144..b9eb13e 100644 --- a/src/pyim/annotation/metadata.py +++ b/src/pyim/annotate/metadata.py @@ -1,10 +1,12 @@ -import pandas as pd -from pyim.util.tabix import GtfFile, GtfFrame +from pathlib import Path + +import toolz +from pyim.util.tabix import GtfFile, GtfFrame from .util import numeric_strand -def add_metadata(insertions, gtf): +def add_metadata(insertions, reference_gtf): """Adds metadata to annotated insertions. Adds extra metadata to already annotated insertions. This metadata @@ -12,47 +14,43 @@ def add_metadata(insertions, gtf): ('distance' column) and relative orientation ('orientation' column). Args: - insertions (pandas.DataFrame): Annotated insertions for which metadata + insertions (iterable[Insertion]): Annotated insertions for which metadata should be added. The frame is expected to contain at least the following columns: id, position, strand, gene_id. - gtf (str or GtfFile): Path to gtf file containing gene features. - Alternatively, a GtfFile object may also be given instead of a path. - Used to annotate insertion gene assignments. + gtf (str, Path, GtfFile of GtfFrame): Gtf containing gene features. Returns: - pandas.DataFrame: Annotated insertions with extra metadata. + iterable[Insertion]: Annotated insertions with extra metadata. 
""" - if isinstance(gtf, str): - gtf = GtfFile(gtf) + if isinstance(reference_gtf, (str, Path)): + reference_gtf = GtfFile(reference_gtf) - # Look-up genes in GTF file. - genes = GtfFrame.from_records(gtf.fetch(filters={'feature': 'gene'})) - genes.set_index('gene_id', drop=False, inplace=True) - - # Generate metadata. - metadata = pd.DataFrame.from_records( - (_annotate_insertion(ins, genes.ix[ins['gene_id']]) - for _, ins in insertions.iterrows() - if ins['gene_id'] in genes.index)) + if isinstance(reference_gtf, GtfFile): + reference_gtf = GtfFrame.from_records( + reference_gtf.fetch(filters={'feature': 'gene'})) - # Re-order columns. - extra_cols = set(metadata.columns) - {'id', 'gene_id'} - metadata = metadata[['id', 'gene_id'] + sorted(extra_cols)] + # Look-up genes in GTF frame. + genes = reference_gtf.get_region(filters={'feature': 'gene'}) + genes.set_index('gene_id', drop=False, inplace=True) - return pd.merge(insertions, metadata, on=['id', 'gene_id'], how='left') + for insertion in insertions: + if 'gene_id' in insertion.metadata: + # Add metadata for gene. + gene = genes.ix[insertion.metadata['gene_id']] -def _annotate_insertion(insertion, gene): - """Annotates a given insertion/gene combination.""" + gene_metadata = { + 'gene_distance': feature_distance(insertion, gene), + 'gene_orientation': feature_orientation(insertion, gene) + } - return { - 'id': insertion['id'], - 'gene_id': feature['gene_id'], - 'gene_distance': feature_distance(insertion, gene), - 'gene_orientation': feature_orientation(insertion, gene) - } + new_metadata = toolz.merge(insertion.metadata, gene_metadata) + yield insertion._replace(metadata=new_metadata) + else: + # Return original insertion. + yield insertion def feature_distance(insertion, feature): @@ -70,7 +68,7 @@ def feature_distance(insertion, feature): """ feat_start, feat_end = feature['start'], feature['end'] - ins_location = insertion['position'] + ins_location = insertion.position if feat_start <= ins_location <= feat_end: dist = 0 @@ -99,7 +97,7 @@ def feature_orientation(insertion, feature): """ - ins_strand = numeric_strand(insertion['strand']) + ins_strand = numeric_strand(insertion.strand) feat_strand = numeric_strand(feature['strand']) return 'sense' if ins_strand == feat_strand else 'antisense' diff --git a/src/pyim/annotate/util.py b/src/pyim/annotate/util.py new file mode 100644 index 0000000..1c9ed8c --- /dev/null +++ b/src/pyim/annotate/util.py @@ -0,0 +1,45 @@ +import itertools +from pathlib import Path + +from intervaltree import IntervalTree +import numpy as np + +from pyim.util.tabix import GtfFile + + +def build_interval_trees(reference_gtf): + """Builds an interval tree of genes for each chromosome in gtf.""" + + if isinstance(reference_gtf, (str, Path)): + reference_gtf = GtfFile(reference_gtf) + + # Only select gene features for now. + genes = reference_gtf.fetch(filters={'feature': 'gene'}) + + # Note, below code assumes that genes are ordered by contig. + + trees = {} + for contig, grp in itertools.groupby(genes, lambda r: r['contig']): + # Build a tree for each individual chromosome. + intervals = ((g['start'], g['end'], dict(g)) for g in grp + if g['end'] > g['start']) # Avoid null intervals. 
+ trees[contig] = IntervalTree.from_tuples(intervals) + + return trees + + +def numeric_strand(strand): + """Converts strand to its numeric (integer) representation.""" + + if isinstance(strand, int): + return strand + elif isinstance(strand, (float, np.generic)): + return int(strand) + elif isinstance(strand, str): + if strand == '+': + return 1 + elif strand == '-': + return -1 + + raise ValueError('Unknown value {} for strand (type: {})' + .format(strand, type(strand))) diff --git a/src/pyim/annotation/__init__.py b/src/pyim/annotation/__init__.py deleted file mode 100644 index d72f89e..0000000 --- a/src/pyim/annotation/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from ._registry import register_annotator, get_annotators -from .annotator.window import WindowAnnotator \ No newline at end of file diff --git a/src/pyim/annotation/_registry.py b/src/pyim/annotation/_registry.py deleted file mode 100644 index c1948e7..0000000 --- a/src/pyim/annotation/_registry.py +++ /dev/null @@ -1,10 +0,0 @@ - -_registry = {} - - -def register_annotator(name, aligner): - _registry[name] = aligner - - -def get_annotators(): - return dict(_registry) diff --git a/src/pyim/annotation/annotator/__init__.py b/src/pyim/annotation/annotator/__init__.py deleted file mode 100644 index ef588c3..0000000 --- a/src/pyim/annotation/annotator/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -#from .rbm import annotate_rbm -#from .rbm_cis import annotate_rbm_cis -#from .window import annotate_windows, Window diff --git a/src/pyim/annotation/annotator/kcrbm.py b/src/pyim/annotation/annotator/kcrbm.py deleted file mode 100644 index fda503f..0000000 --- a/src/pyim/annotation/annotator/kcrbm.py +++ /dev/null @@ -1,164 +0,0 @@ -from __future__ import absolute_import, division, print_function - -#pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin -from builtins import * -#pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin - -import logging -from itertools import chain, repeat - -import pandas as pd -from rpy2 import robjects -from rpy2.robjects.packages import importr - -from pyim.util.rpy2 import dataframe_to_pandas - -from ..filtering import select_closest - - -CHROM_MAP = dict(zip( - list(map(str, range(1, 19+1))) + ['X', 'Y'], - range(1, 21+1) -)) - - -def register(subparsers, name='kcrbm'): - parser = subparsers.add_parser(name, help=name + ' help') - - # Required arguments. - parser.add_argument('input') - parser.add_argument('output') - - # Optional arguments. - parser.add_argument('--reference', default='mm10', choices={'mm10'}) - parser.add_argument('--method', default='genes', - choices={'genes', 'transcripts'}) - parser.add_argument('--system', default='SB', - choices={'MMTV', 'MuLV', 'SB'}) - parser.add_argument('--closest', default=False, action='store_true') - - # Set main for dispatch. - parser.set_defaults(main=main) - - return parser - - -def main(args): - logger = logging.getLogger() - - # Read insertions. - logger.info('Annotation insertions') - insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) - logger.info('Read {} insertions'.format(len(insertions))) - - # Annotate with kcrbm. - annotation = annotate(insertions, args.reference, - args.system, args.method) - - if args.closest: - # Sub-select for closest features. - logger.info('Reducing to closest features') - annotation = select_closest(annotation, col='gene_distance') - - # Merge annotation. 
- logger.info('Merging annotation') - merged = pd.merge(insertions, annotation, on='id', how='left') - merged.to_csv(args.output, sep='\t', index=False) - - -def annotate(insertions, reference, system, method): - # Convert to kcrbm format. - ins_kcrbm = _convert_to_kcrbm(insertions) - - # Run KCRBM. - kcrbm = importr('kcrbm') - genome = _load_genome(reference) - - result = kcrbm.kcrbm(edata=genome, idata=ins_kcrbm, rules=system, - reference=reference, map_to=method) - result = dataframe_to_pandas(result) - - # Convert to gene/transcript frame. - if method == 'gene': - result = _convert_gene_result(result) - elif method == 'transcript': - result = _convert_transcript_result(result) - else: - raise ValueError('Unknown method {}'.format(method)) - - return result - - -def _convert_to_kcrbm(insertion): - # Extract and rename required columns. - kcrbm_frame = insertion.ix[:, ['id', 'seqname', 'location', 'strand']] - kcrbm_frame.columns = ['id', 'chr', 'base', 'ori'] - - # Remove any eccentric chromosomes from frame. - seq_mask = kcrbm_frame.chr.isin(CHROM_MAP.keys()) - if any(~seq_mask): - dropped_chr = set(kcrbm_frame.ix[~seq_mask].chr) - print('Warning: dropped insertions not in regular ' - 'chromosomes ({})'.format(', '.join(dropped_chr))) - - kcrbm_frame = kcrbm_frame.ix[seq_mask] - - # Convert chr to numeric representation. - kcrbm_frame['chr'] = kcrbm_frame['chr'].map(CHROM_MAP).astype(int) - - # Copy insertion id to extra column. - kcrbm_frame['ins_id'] = kcrbm_frame['id'] - - return kcrbm_frame - - -def _load_genome(genome): - utils = importr("utils") - - if genome == 'mm10': - utils.data('edata.mm10', package='kcrbm') - genome_obj = robjects.r['edata.mm10'] - else: - raise ValueError('Unknown genome version {}'.format(genome)) - - return genome_obj - - -def _convert_gene_result(result): - result = result.ix[result['ensid'].astype(str) != 'NA'] - - gene_distance = result[['d2gss', 'd2gts']]\ - .abs().min(axis=1).astype(int) - gene_distance.ix[result.mechanism.str.startswith('u')] *= -1 - - return pd.DataFrame({ - 'insertion_id': result['ins_id'], - 'gene_id': result['ensid'], - 'distance': gene_distance, - 'mechanism': result['mechanism']}, - columns=['insertion_id', 'gene_id', 'distance', 'mechanism']) - - -def _convert_transcript_result(result): - result = result.ix[result['ensid'].astype(str) != 'NA'] - - transcripts = result['transid'].str.split('|') - mechanisms = result['mechanism'].str.split('|') - - counts = list(map(len, transcripts)) - - ins_id = list(_repeat_list(result['ins_id'], counts)) - ens_id = list(_repeat_list(result['ensid'], counts)) - - return pd.DataFrame({'id': ins_id, 'gene': ens_id, - 'transcript': _flatten_list(transcripts), - 'mechanism': _flatten_list(mechanisms)}, - columns=['id', 'gene', 'transcript', 'mechanism']) - - -def _repeat_list(l, n): - return chain(*[repeat(el, num) for el, num in zip(l, n)]) - - -def _flatten_list(l): - return [item for sub_list in l for item in sub_list] diff --git a/src/pyim/annotation/annotator/rbm.py b/src/pyim/annotation/annotator/rbm.py deleted file mode 100644 index 0e789ad..0000000 --- a/src/pyim/annotation/annotator/rbm.py +++ /dev/null @@ -1,163 +0,0 @@ -from __future__ import absolute_import, division, print_function - -#pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin -from builtins import * -#pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin - -import itertools -import logging - -import pandas as pd - -#pylint: disable=import-error -from ..metadata import add_metadata 
-from ..filtering import filter_blacklist, select_closest -from .window import Window, annotate_windows -#pylint: enable=import-error - -# Window format: (us, ua, ds, da) -WINDOW_PRESETS = { - 'SB': (20000, 10000, 25000, 5000), - 'MULV': (20000, 120000, 40000, 5000), - 'MMTV': (20000, 120000, 40000, 5000) -} - - -def annotate_rbm(insertions, gtf, window_preset=None, window_sizes=None): - """Assigns insertions to genes using the rule-based-method (RBM) approach. - - Args: - insertions (pandas.DataFrame): Insertions to annotate in DataFrame - format. The frame is expected to contain at least the - following columns: id, position, strand. - gtf (str or GtfFile): Path to gtf file containing gene features. - Alternatively, a GtfFile object may also be given instead of a path. - window_preset (str): Preset to use for the RBM window sizes. - Alternatively custom window sizes can be given using the - *window_sizes* argument. Note that either *window_preset* or - *window_sizes* must be provided. - window_sizes (tuple[int]): Tuple of window sizes to use in the - RBM mapping. Should specify four window sizes, for the following - categories of insertions: upstream-sense, upstream-antisense, - downstream-sense, downstream-antisense. - - Returns: - pandas.DataFrame: Dataframe containing annotated insertions. Annotations - are added as columns 'gene_id' and 'gene_name', which respectively contain the id and name of the annotated gene. An extra column - 'window' indicates which of the RBM windows was used for - the annotation. - - """ - - # Lookup windows. - if window_preset is not None: - window_sizes = WINDOW_PRESETS[window_preset] - elif window_sizes is None: - raise ValueError('Either window_sizes or window_preset must be given') - - # Replace unstranded insertions with two stranded insertions. - if (~insertions['strand'].isin({-1, 1})).any(): - logging.warning('Replacing unstranded insertions') - converted = _replace_unstranded(insertions) - else: - converted = insertions - - # Define windows. - windows = _build_windows(window_sizes) - - # Annotate insertions. - annotated = annotate_windows(converted, gtf, windows) - - return annotated - - -def _build_windows(window_sizes): - us, ua, ds, da = window_sizes - - windows = [ - Window(0, 1, strand=1, incl_left=True, incl_right=True, name='is'), - Window(0, 1, strand=-1, incl_left=True, incl_right=True, name='ia'), - Window(-us, 0, strand=1, incl_left=True, incl_right=False, name='us'), - Window(-ua, 0, strand=-1, incl_left=True, incl_right=False, name='ua'), - Window(1, ds, strand=1, incl_left=False, incl_right=True, name='ds'), - Window(1, da, strand=-1, incl_left=False, incl_right=True, name='da')] - - return windows - - -def _replace_unstranded(insertions): - """Replaces unstranded insertions with two stranded insertions.""" - - # Split stranded and unstranded. - mask = insertions['strand'].isin({-1, 1}) - stranded = insertions.ix[mask] - unstranded = insertions.ix[~mask] - - # Convert unstranded into two stranded. 
- converted = (_to_stranded(ins) for _, ins in unstranded.iterrows()) - converted = pd.DataFrame.from_records( - itertools.chain.from_iterable(converted)) - - return pd.concat((stranded, converted), ignore_index=True) - - -def _to_stranded(insertion): - fwd = insertion.copy() - fwd['strand'] = 1 - - rev = insertion.copy() - rev['strand'] = -1 - - return (fwd, rev) - - -def register(subparsers, name='rbm'): - """Registers the RBM annotator as a subparser.""" - - parser = subparsers.add_parser(name, help=name + ' help') - - # Required arguments. - parser.add_argument('input') - parser.add_argument('output') - parser.add_argument('--gtf', required=True) - - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('--preset', choices=WINDOW_PRESETS.keys()) - group.add_argument('--window_sizes', nargs=4, type=int) - - # Optional arguments. - parser.add_argument('--closest', default=False, action='store_true') - parser.add_argument('--blacklist', default=None, nargs='+') - - # Set main for dispatch. - parser.set_defaults(main=main) - - return parser - - -def main(args): - """Main function for the RBM annotator command-line tool.""" - - # Read insertions. - insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) - logging.info('Read %d insertions', insertions['id'].nunique()) - - # Annotate insertions. - logging.info('Annotating insertions') - annotated = annotate_rbm(insertions, args.gtf, window_preset=args.preset, - window_sizes=args.window_sizes) - - # Add metadata. - logging.info('Adding annotation metadata') - annotated = add_metadata(annotated, args.gtf) - - if args.blacklist is not None: - logging.info('Filtering blacklisted genes') - annotated = filter_blacklist(annotated, args.blacklist) - - if args.closest: - logging.info('Selecting closest genes') - annotated = select_closest(annotated) - - - annotated.to_csv(args.output, sep='\t', index=False) diff --git a/src/pyim/annotation/annotator/rbm_cis.py b/src/pyim/annotation/annotator/rbm_cis.py deleted file mode 100644 index 7b621c3..0000000 --- a/src/pyim/annotation/annotator/rbm_cis.py +++ /dev/null @@ -1,155 +0,0 @@ -from __future__ import absolute_import, division, print_function - -#pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin -from builtins import * -#pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin - -import logging -from os import path - -import numpy as np -import pandas as pd - -#pylint: disable=import-error -from .rbm import annotate_rbm, WINDOW_PRESETS as RBM_WINDOW_PRESETS -from ..metadata import add_metadata -from ..filtering import filter_blacklist, select_closest -#pylint: enable=import-error - - -def annotate_rbm_cis(insertions, cis_sites, gtf, window_preset=None, - window_sizes=None, blacklist=None, closest=False, - collapse=False): - """Assigns insertions to genes using the RBM approach via called CIS sites. - - Args: - insertions (pandas.DataFrame): Insertions to annotate in DataFrame - format. The frame is expected to contain at least the - following columns: id, position, strand. - cis_sites(pandas.DataFrame): Dataframe containing the CIS sites - for the given insertions. - gtf (str or GtfFile): Path to gtf file containing gene features. - Alternatively, a GtfFile object may also be given instead of a path. - window_preset (str): Preset to use for the RBM window sizes. - Alternatively custom window sizes can be given using the - *window_sizes* argument. Note that either *window_preset* or - *window_sizes* must be provided. 
- window_sizes (tuple[int]): Tuple of window sizes to use in the - RBM mapping. Should specify four window sizes, for the following - categories of insertions: upstream-sense, upstream-antisense, - downstream-sense, downstream-antisense. - - Returns: - tuple[pandas.DataFrame]: Returns two dataframes, the first - containing the annotated insertion sites, the second containing - the annotated CIS sites, which were used to annotate the insertions. - Annotations are added as columns 'gene_id' and 'gene_name', which - respectively contain the id and name of the annotated gene. An - extra column 'window' indicates which of the RBM windows was - used for the annotation. - - """ - - if 'strand' not in cis_sites: - # Add strand to cis sites if not present. - cis_sites = _determine_cis_strand(cis_sites, insertions) - - # Annotate cis sites. - cis_sites = cis_sites.rename(columns={'cis_id': 'id'}) - annotated_sites = annotate_rbm(cis_sites, gtf, - window_preset=window_preset, - window_sizes=window_sizes) - - if blacklist: - annotated_sites = filter_blacklist(annotated_sites, blacklist) - - if closest: - annotated_sites = add_metadata(annotated_sites, gtf) - annotated_sites = select_closest(annotated_sites) - - # Extract and merge annotation with insertions. - annotation = annotated_sites[['id', 'gene_id', 'gene_name']] - annotation = annotation.rename(columns={'id': 'cis_id'}) - annotated_ins = pd.merge(insertions, annotation, on='cis_id', how='left') - - # Add metadata to insertions. - annotated_ins = add_metadata(annotated_ins, gtf) - - if collapse: - # Collapse multiple insertion entries resulting from CIS annotation. - annotated_ins.drop(['cis_id'], axis=1, inplace=True) - annotated_ins.drop_duplicates(inplace=True) - - return annotated_ins, annotated_sites - - -def _determine_cis_strand(cis, cis_insertions, min_homogeneity=0.5): - """Determines the strand for CIS sites with homogeneous insertions.""" - - # Extract and clip strands at zero. - ins_strands = cis_insertions[['cis_id', 'strand']].copy() - ins_strands['strand'] = ins_strands['strand'].map({1: 1, -1: 0}) - - # Calculate fwd/rev ratio for each cis. - ratio = ins_strands.groupby('cis_id')['strand'].mean() - - # Determine closest strand and homogeneity. - cis_strands = pd.DataFrame( - {'strand': ratio.round().astype(int).map({1: 1, 0: -1}), - 'strand_homogeneity': np.maximum((1 - ratio), ratio)}, - columns=['strand', 'strand_homogeneity']) - - # Don't assign strand if low homogeneity. - homogeneity_mask = cis_strands['strand_homogeneity'] < min_homogeneity - cis_strands.ix[homogeneity_mask, 'strand'] = None - - return pd.merge(cis, cis_strands.reset_index()) - - -def register(subparsers, name='rbm-cis'): - """Registers the RBM-CIS annotator as a subparser.""" - - parser = subparsers.add_parser(name, help=name + ' help') - - # Required arguments. - parser.add_argument('input') - parser.add_argument('output') - parser.add_argument('--gtf', required=True) - parser.add_argument('--cis_sites', required=True) - - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('--preset', choices=RBM_WINDOW_PRESETS.keys()) - group.add_argument('--window_sizes', nargs=4, type=int) - - # Optional arguments. - parser.add_argument('--closest', default=False, action='store_true') - parser.add_argument('--collapse', default=False, action='store_true') - parser.add_argument('--blacklist', default=None, nargs='+') - - # Set main for dispatch. 
- parser.set_defaults(main=main) - - return parser - - -def main(args): - """Main function for the RBM-CIS annotator command-line tool.""" - - # Read insertions and cis sites. - insertions = pd.read_csv(args.input, sep='\t', dtype={'chrom': str}) - cis_sites = pd.read_csv(args.cis_sites, sep='\t', dtype={'chrom': str}) - - logging.info('Read %d insertions and %d cis sites', - insertions['id'].nunique(), len(cis_sites)) - - # Annotate insertions. - logging.info('Annotating insertions') - annotated_ins, annotated_sites = annotate_rbm_cis( - insertions, cis_sites, args.gtf, window_preset=args.preset, - window_sizes=args.window_sizes, collapse=args.collapse, - blacklist=args.blacklist, closest=args.closest) - - # Write outputs. - annotated_ins.to_csv(args.output, sep='\t', index=False) - annotated_sites.to_csv(path.splitext(args.output)[0] + '.sites.txt', - sep='\t', index=False) diff --git a/src/pyim/annotation/annotator/window.py b/src/pyim/annotation/annotator/window.py deleted file mode 100644 index 176454e..0000000 --- a/src/pyim/annotation/annotator/window.py +++ /dev/null @@ -1,146 +0,0 @@ -# pylint: disable=W0622,W0614,W0401 -from __future__ import absolute_import, division, print_function -from builtins import * -# pylint: enable=W0622,W0614,W0401 - -import collections -import itertools - -import toolz - -from pyim.annotation import register_annotator -from pyim.util.tabix import GtfFile - -# from ..filtering import filter_blacklist, select_closest -from ..util import build_interval_trees, numeric_strand - - -class WindowAnnotator(object): - - def __init__(self, reference_gtf, windows): - if not isinstance(reference_gtf, GtfFile): - reference_gtf = GtfFile(reference_gtf) - - self._windows = windows - self._gtf = reference_gtf - - self._trees = None - - @classmethod - def from_args(cls, args): - window_size = args.window_size // 2 - windows = [Window(-window_size, window_size, strand=None, - name=None, strict_left=False, strict_right=False)] - return cls(reference_gtf=args.reference_gtf, windows=windows) - - @classmethod - def setup_args(cls, parser): - # Required arguments. - parser.add_argument('--reference_gtf', required=True) - - # Optional arguments. - # parser.add_argument('--closest', default=False, action='store_true') - parser.add_argument('--window_size', default=20000, type=int) - - def annotate(self, insertions): - if self._trees is None: - self._trees = build_interval_trees(self._gtf) - - queries = itertools.product(insertions, self._windows) - annotated = itertools.chain.from_iterable( - (self._annotate(ins, window, self._trees) - for ins, window in queries)) - - return annotated - - def _annotate(self, ins, window, interval_trees): - # Identify overlapping features. - applied_window = window.apply(ins.chromosome, ins.position, ins.strand) - features = list(applied_window.get_overlap(interval_trees)) - - if len(features) > 0: - for feature in features: - feat_metadata = {'gene_id': feature['gene_id'], - 'gene_name': feature['gene_name']} - - if window.name is not None: - feat_metadata['window'] = window.name - - new_metadata = toolz.merge(ins.metadata, feat_metadata) - - yield ins._replace(metadata=new_metadata) - else: - yield ins - - -register_annotator('window', WindowAnnotator) - - -_Window =collections.namedtuple( - 'Window', ['start', 'end', 'strand', 'name', - 'strict_left', 'strict_right']) - - -class Window(_Window): - __slots__ = () - - def apply(self, chromosome, position, strand): - # Determine start/end position. 
- if strand == 1: - start = position + self.start - end = position + self.end - - strict_left = self.strict_left - strict_right = self.strict_right - elif strand == -1: - start = position - self.end - end = position - self.start - - strict_right = self.strict_left - strict_left = self.strict_right - else: - raise ValueError('Unknown value for strand ({})' - .format(strand)) - - # Determine new strand. - if self.strand is not None: - new_strand = self.strand * strand - else: - new_strand = None - - return AppliedWindow(chromosome, start, end, new_strand, - self.name, strict_left, strict_right) - - -_AppliedWindow = collections.namedtuple( - 'AppliedWindow', ['chromosome', 'start', 'end', 'strand', - 'name', 'strict_left', 'strict_right']) - - -class AppliedWindow(_AppliedWindow): - __slots__ = () - - def get_overlap(self, interval_trees): - # Find overlapping features. - try: - tree = interval_trees[self.chromosome] - overlap = tree[self.start:self.end] - except KeyError: - overlap = [] - - # Extract features. - features = (interval[2] for interval in overlap) - - # Filter inclusive/exclusive if needed. - if self.strict_left: - features = (f for f in features if f['start'] > self.start) - - if self.strict_right: - features = (f for f in features if f['end'] < self.end) - - # Filter for strand if needed. - if self.strand is not None: - features = (f for f in features - if numeric_strand(f['strand']) == self.strand) - - return features diff --git a/src/pyim/annotation/filtering.py b/src/pyim/annotation/filtering.py deleted file mode 100644 index 324b439..0000000 --- a/src/pyim/annotation/filtering.py +++ /dev/null @@ -1,45 +0,0 @@ - -def select_closest(insertions, id_col='id', dist_col='distance'): - """Selects genes that are closest to the annotated insertions. - - Args: - insertions (pandas.DataFrame): Annotated insertions that are to - be filtered. The frame is expected to contain at least the - following columns: id, position, strand, *dist_col*. - id_col (str): Name of the column containing the id of the insertion. - dist_col (str): Name of the column containing the distance to - the gene or feature. Can be added using the add_metadata function. - - Returns: - pandas.DataFrame: Filtered annotated insertions, which have been - reduced to only include the genes closest to the insertions. - - """ - - def _is_closest(x): - abs_dist = x[dist_col].abs() - return x.ix[abs_dist == abs_dist.min()] - - return (insertions.groupby(id_col) - .apply(_is_closest) - .reset_index(drop=True)) - - -def filter_blacklist(insertions, blacklist, gene_col='gene_name'): - """Filters annotations that assign insertions to blacklisted genes. - - Args: - insertions (pandas.DataFrame): Annotated insertions that are to - be filtered. The frame is expected to contain at least the - following columns: id, position, strand, *gene_id_col*. - blacklist (list[str]): List of blacklisted gene ids to filter. - gene_col (str): Name of the column containing the id of the genes. - - Returns: - pandas.DataFrame: Filtered annotated insertions, which have been - reduced remove blacklisted genes. 
- - """ - - mask = insertions[gene_col].isin(set(blacklist)) - return insertions.ix[~mask] diff --git a/src/pyim/annotation/util.py b/src/pyim/annotation/util.py deleted file mode 100644 index bc83abd..0000000 --- a/src/pyim/annotation/util.py +++ /dev/null @@ -1,34 +0,0 @@ -import itertools -from intervaltree import IntervalTree - - -def build_interval_trees(gtf): - """Builds an interval tree of genes for each chromosome in gtf.""" - - # Only select gene features for now. - genes = gtf.fetch(filters={'feature': 'gene'}) - - trees = {} - for contig, grp in itertools.groupby(genes, lambda r: r.contig): - # Build a tree for each individual chromosome. - intervals = ((g.start, g.end, dict(g)) for g in grp - if g.end > g.start) # Avoid null intervals. - trees[contig] = IntervalTree.from_tuples(intervals) - - return trees - - -def numeric_strand(strand): - """Converts strand to its numeric (integer) representation.""" - - if isinstance(strand, int): - return strand - elif isinstance(strand, float): - return int(strand) - else: - if strand == '+': - return 1 - elif strand == '-': - return -1 - else: - raise ValueError('Unknown value {} for strand'.format(strand)) diff --git a/src/pyim/cis/__init__.py b/src/pyim/cis/__init__.py index e69de29..681e4d6 100644 --- a/src/pyim/cis/__init__.py +++ b/src/pyim/cis/__init__.py @@ -0,0 +1 @@ +from .callers import get_callers, CisCaller, CimplCisCaller diff --git a/src/pyim/cis/_util.py b/src/pyim/cis/_util.py deleted file mode 100644 index 4574f97..0000000 --- a/src/pyim/cis/_util.py +++ /dev/null @@ -1,32 +0,0 @@ -import toolz - -import numpy as np -import pandas as pd - - -def annotate_cis_strand(cis, insertions, min_homogeneity): - # Determine strand of cis sites. - func = toolz.curry(_cis_strand, min_homogeneity=min_homogeneity) - cis_strand = insertions.groupby('cis_id').apply(func) - - # Merge with cis annotation - cis = pd.merge(cis, cis_strand.reset_index(), on='cis_id') - - return cis - - -def _cis_strand(insertions, min_homogeneity): - strand_mean = insertions.strand.mean() - strand = int(np.sign(strand_mean)) - - if strand != 0: - homogeneity = (insertions.strand == strand).sum() / len(insertions) - else: - homogeneity = 0.5 - - if homogeneity < min_homogeneity: - strand = 0 - - return pd.Series(dict(strand=strand, - strand_mean=strand_mean, - strand_homogeneity=homogeneity)) \ No newline at end of file diff --git a/src/pyim/cis/callers/__init__.py b/src/pyim/cis/callers/__init__.py new file mode 100644 index 0000000..790a056 --- /dev/null +++ b/src/pyim/cis/callers/__init__.py @@ -0,0 +1,2 @@ +from .base import get_callers, register_caller, CisCaller +from .cimpl import CimplCisCaller diff --git a/src/pyim/cis/callers/base.py b/src/pyim/cis/callers/base.py new file mode 100644 index 0000000..7b87126 --- /dev/null +++ b/src/pyim/cis/callers/base.py @@ -0,0 +1,42 @@ +from abc import ABC, abstractclassmethod, abstractmethod +from pathlib import Path + +_registry = {} + + +def register_caller(name, cis_caller): + _registry[name] = cis_caller + + +def get_callers(): + return dict(_registry) + + +class CisCaller(ABC): + def __init__(self): + pass + + @abstractclassmethod + def configure_args(cls, parser): + parser.add_argument('--insertions', type=Path, required=True) + parser.add_argument('--output', type=Path, required=True) + parser.add_argument( + '--output_sites', type=Path, required=False, default=None) + + @abstractclassmethod + def from_args(cls, args): + raise NotImplementedError() + + @abstractmethod + def call(self, insertions): + """Calls 
CIS sites for insertions. + + Parameters: + insertions (iterable[Insertion]) + + Returns: + iterable[Insertions], iterable[CisSites] + + """ + + raise NotImplementedError() \ No newline at end of file diff --git a/src/pyim/cis/callers/cimpl.py b/src/pyim/cis/callers/cimpl.py new file mode 100644 index 0000000..14de900 --- /dev/null +++ b/src/pyim/cis/callers/cimpl.py @@ -0,0 +1,267 @@ +import pandas as pd + +import readline +from rpy2 import robjects +from rpy2.robjects.packages import importr +from rpy2.robjects.vectors import DataFrame as RDataFrame, StrVector, IntVector +from pyim.util.rpy2 import pandas_to_dataframe, dataframe_to_pandas + +from pyim.model import Insertion, CisSite +from pyim.util import add_prefix, remove_prefix + +from .base import CisCaller, register_caller +from ..util import assign_strand, invert_otm_mapping + +R_GENOMES = {'mm10': 'BSgenome.Mmusculus.UCSC.mm10'} + + +class CimplCisCaller(CisCaller): + def __init__(self, + genome='mm10', + scales=(10000, 30000), + chromosomes=None, + alpha=0.05, + pattern=None, + lhc_method='none', + iterations=1000, + threads=1, + min_strand_homogeneity=0.75): + super().__init__() + + # Default to numbered mouse chromosomes + X. + if chromosomes is None: + chromosomes = [str(i) for i in range(1, 20)] + ['X'] + + # Add 'chr' prefix to chromosomes if missing. + chromosomes = add_prefix(chromosomes, prefix='chr') + + self._genome = genome + self._scales = scales + self._chromosomes = chromosomes + self._alpha = alpha + self._pattern = pattern + self._lhc_method = lhc_method + self._iterations = iterations + self._threads = threads + self._min_strand_homogeneity = min_strand_homogeneity + + self.__cimpl = None + + @property + def _cimpl(self): + if self.__cimpl is None: + self.__cimpl = importr('cimpl') + return self.__cimpl + + @classmethod + def configure_args(cls, parser): + super().configure_args(parser) + + parser.add_argument('--pattern', required=True) + + parser.add_argument('--genome', default='mm10') + parser.add_argument('--scales', default=(10000, 30000), + nargs='+', type=int) # yapf: disable + parser.add_argument('--chromosomes', default=None, nargs='+') + parser.add_argument('--alpha', default=0.05, type=float) + parser.add_argument('--lhc_method', default='exclude') + parser.add_argument('--iterations', default=1000, type=int) + parser.add_argument('--threads', default=1, type=int) + parser.add_argument( + '--min_strand_homogeneity', default=0.75, type=float) + + @classmethod + def from_args(cls, args): + return cls(pattern=args.pattern, + genome=args.genome, + chromosomes=args.chromosomes, + alpha=args.alpha, + lhc_method=args.lhc_method, + iterations=args.iterations, + threads=args.threads, + min_strand_homogeneity=args.min_strand_homogeneity) + + def call(self, insertions): + """Runs CIMPL on insertions.""" + + # Convert insertions to cimpl frame. + ins_frame = self._insertions_to_cimpl(insertions) + + # Load genome object from R. + genome_obj = self._load_genome(self._genome) + + # Check if contig_depth is present (if doing hop exclusion). + if self._lhc_method == 'exclude' and 'contig_depth' not in ins_frame: + raise ValueError('Insertion depth is needed for lhc exclusion') + + # Run CIMPL! 
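+        # The call below hands the converted insertion frame to CIMPL's
+        # doCimplAnalysis, which evaluates the requested scales over the
+        # configured number of permutation iterations; lhc_method, the
+        # chromosome list and the thread count are forwarded unchanged
+        # from the options collected in __init__.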
+ cimpl_result = self._cimpl.doCimplAnalysis( + pandas_to_dataframe(ins_frame), + scales=robjects.vectors.IntVector(self._scales), + n_iterations=self._iterations, + lhc_method=self._lhc_method, + threads=self._threads, + BSgenome=genome_obj, + chromosomes=robjects.vectors.StrVector(self._chromosomes), + verbose=1) + + # Extract cis sites and mapping. + cis_sites = self._extract_cis(cimpl_result, alpha=self._alpha) + cis_mapping = self._extract_mapping(cimpl_result, cis_sites) + + # Determine strandedness of cis_sites using inseriions. + ins_mapping = invert_otm_mapping(cis_mapping) + cis_sites = list( + assign_strand( + cis_sites, + insertions, + ins_mapping, + min_homogeneity=self._min_strand_homogeneity)) + + return cis_sites, cis_mapping + + def _insertions_to_cimpl(self, insertions): + # Convert insertions to frame representation. + ins_frame = Insertion.to_frame(insertions) + + # Extract and rename required columns. + column_map = { + 'id': 'id', + 'chromosome': 'chr', + 'position': 'location', + 'sample': 'sampleID' + } + + cimpl_ins = (ins_frame[list(column_map.keys())] + .rename(columns=column_map)) + + # Add chr prefix. + cimpl_ins['chr'] = add_prefix(cimpl_ins['chr'], prefix='chr') + + # Add depth if present. + if 'support' in ins_frame: + cimpl_ins['contig_depth'] = ins_frame['support'] + + elif 'depth_unique' in ins_frame: + cimpl_ins['contig_depth'] = ins_frame['depth_unique'] + + return cimpl_ins + + def _load_genome(self, genome): + # Lookup R package for genome. + try: + genome_pkg = R_GENOMES[genome] + except KeyError: + raise ValueError('Unsupported genome {}'.format(genome)) + + # Import package and extract genome object. + bs_genome = importr(genome_pkg) + genome_obj = bs_genome.Mmusculus + + return genome_obj + + def _extract_cis(self, cimpl_obj, alpha=0.05): + # Extract CIS frame from R into a pandas dataframe. + cis_obj = self._cimpl.getCISs(cimpl_obj, alpha=alpha, mul_test=True) + + cis_frame = dataframe_to_pandas(cis_obj).reset_index() + cis_frame.rename( + columns={'index': 'cis_id', + 'chromosome': 'seqname'}, inplace=True) + + # Clean-up converted dataframe (datatypes, prefixes + column names). + for col in ['peak_location', 'start', 'end', 'width', 'n_insertions']: + cis_frame[col] = cis_frame[col].astype(int) + + cis_frame['seqname'] = remove_prefix( + cis_frame['seqname'], prefix='chr') + + cis_frame = cis_frame.rename(columns={ + 'cis_id': 'id', + 'seqname': 'chromosome', + 'peak_location': 'position', + 'peak_height': 'height', + 'p_value': 'pvalue' + }) + + # Merge cis sites that are in fact the same, but appear multiple + # times with different height locations. + cols = ['chromosome', 'start', 'end', 'width', 'n_insertions', 'scale'] + cis_frame = pd.DataFrame((grp.ix[grp['height'].argmax()] + for _, grp in cis_frame.groupby(cols))) + + # For now, set strand to None. + cis_frame['strand'] = None + + # Convert to CisSite objects using a subset of the columns. + cis_frame_subset = cis_frame[['id', 'chromosome', 'position', 'start', + 'end', 'scale', 'pvalue', 'n_insertions', + 'height', 'width', 'strand']] + cis_sites = list(CisSite.from_frame(cis_frame_subset)) + + return cis_sites + + def _extract_mapping(self, cimpl_obj, cis_sites): + # Convert CIS sites to frame format. + cis_frame = CisSite.to_frame(cis_sites) + + # Convert to R representation for cimpl. 
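+        # The data.frame is assembled by hand so that the id and chromosome
+        # columns can be wrapped with base::I ("as-is"), keeping them as
+        # character vectors rather than letting R coerce them to factors.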
+ chr_with_prefix = add_prefix(cis_frame['chromosome'], prefix='chr') + + r_base = importr('base') + cis_frame_r = RDataFrame({ + 'id': r_base.I(StrVector(cis_frame['id'])), + 'chromosome': r_base.I(StrVector(chr_with_prefix)), + 'scale': StrVector(cis_frame['scale']), + 'start': IntVector(cis_frame['start']), + 'end': IntVector(cis_frame['end']) + }) + cis_frame_r.rownames = StrVector(cis_frame['id']) + + # Retrieve cis matrix from cimpl. + cis_matrix_r = self._cimpl.getCISMatrix(cimpl_obj, cis_frame_r) + cis_matrix = dataframe_to_pandas(cis_matrix_r) + + # Extract scale information from cis matrix. + scale_cols = [c for c in cis_matrix.columns if c.startswith('X')] + cis_matrix_scales = cis_matrix[['id'] + scale_cols] + + # Melt matrix into long format. + mapping = pd.melt(cis_matrix_scales, id_vars=['id']) + mapping = mapping[['id', 'value']] + mapping = mapping.rename(columns={'id': 'insertion_id', + 'value': 'cis_id'}) + + # Split cis_id column into individual entries (for entries + # with multiple ids). Then drop any empty rows, as these + # entries are empty cells in the matrix. + mapping = mapping.ix[mapping['cis_id'] != ''] + mapping = expand_column(mapping, col='cis_id', delimiter='|') + + mapping_dict = {ins_id: set(grp['cis_id']) + for ins_id, grp in mapping.groupby('insertion_id')} + + return mapping_dict + + +register_caller('cimpl', CimplCisCaller) + + +def expand_column(frame, col, delimiter): + exp = pd.concat( + (_expand_row(row, col=col, delimiter=delimiter) + for _, row in frame.iterrows()), + ignore_index=True) # yapf: disable + return exp[frame.columns] + + +def _expand_row(row, col, delimiter): + row_dict = dict(row) + + if type(row[col]) == str: + col_split = row[col].split(delimiter) + row_dict[col] = col_split + else: + row_dict[col] = [row[col]] + + return pd.DataFrame(row_dict) diff --git a/src/pyim/cis/cimpl.py b/src/pyim/cis/cimpl.py deleted file mode 100644 index 3cba07a..0000000 --- a/src/pyim/cis/cimpl.py +++ /dev/null @@ -1,214 +0,0 @@ -import pandas as pd - -import readline -from rpy2 import robjects -from rpy2.robjects.packages import importr - -from pyim.util.rpy2 import pandas_to_dataframe, dataframe_to_pandas - -R_GENOMES = {'mm10': 'BSgenome.Mmusculus.UCSC.mm10'} - - -def map_insertions(insertions, scales, genome, alpha=0.05, **kwargs): - """Maps given insertions to CISs using CIMPL.""" - - # Convert insertion to cimpl format. - cimpl_ins = convert_to_cimpl(insertions) - - # Run cimpl. - cimpl_result = cimpl(cimpl_ins, scales, genome, **kwargs) - - # Extract cis sites and mapping. - cis = extract_cis(cimpl_result, alpha=alpha) - mapping = extract_mapping(cimpl_result, cis) - - return cis, mapping - - -def cimpl(insertions, - scales, - genome, - system=None, - pattern=None, - lhc_method='none', - iterations=1000, - chromosomes=None, - verbose=False, - threads=1): - """Runs CIMPL on insertions (in CIMPL format).""" - - # Fill in chromosomes from data if not specified. - if chromosomes is None: - chromosomes = list(insertions['chr'].unique()) - - # Determine if system or specific pattern was specified. - if pattern is not None: - extra_args = {'specificity_pattern': pattern} - elif system is not None: - extra_args = {'system': system} - else: - raise ValueError('Either system or specificity pattern ' - 'should be specified.') - - # Prepare chromosomes argument, adding 'chr' prefix and - # converting to StrVector to pass to R. 
- if not chromosomes[0].startswith('chr'): - chromosomes = ['chr' + c for c in chromosomes] - - # Convert scales to IntVector if supplied as list. - if type(scales) == list: - scales = robjects.vectors.IntVector(scales) - - # Load genome object from R. - genome_obj = _load_genome(genome) - - # Check if contig_depth is present (if doing hop exclusion). - if lhc_method == 'exclude' and 'contig_depth' not in insertions: - raise ValueError('Insertion depth is needed for lhc exclusion') - - # Run CIMPL! - cimpl_r = importr('cimpl') - cimpl_obj = cimpl_r.doCimplAnalysis( - pandas_to_dataframe(insertions), - scales=scales, - n_iterations=iterations, - lhc_method=lhc_method, - threads=threads, - BSgenome=genome_obj, - chromosomes=robjects.vectors.StrVector(chromosomes), - verbose=verbose, - **extra_args) - - return cimpl_obj - - -def convert_to_cimpl(insertions): - # Extract and rename required columns. - cimpl_ins = insertions.ix[:, ['id', 'chromosome', 'position', 'sample']] - cimpl_ins.columns = ['id', 'chr', 'location', 'sampleID'] - - if 'depth_unique' in insertions: - cimpl_ins['contig_depth'] = insertions['depth_unique'] - - # Add 'chr' prefix to the chromosome names if needed. - cimpl_ins['chr'] = _prefix_chromosomes(cimpl_ins['chr']) - - return cimpl_ins - - -def _prefix_chromosomes(series, prefix='chr'): - # Add 'chr' prefix to the chromosome names if needed. - if len(series) > 0 and not series.iloc[0].startswith('chr'): - series = series.map(lambda c: prefix + c) - return series - - -def _load_genome(genome): - # Lookup R package for genome. - try: - genome_pkg = R_GENOMES[genome] - except KeyError: - raise ValueError('Unsupported genome {}'.format(genome)) - - # Import package and extract genome object. - bs_genome = importr(genome_pkg) - genome_obj = bs_genome.Mmusculus - - return genome_obj - - -def extract_cis(cimpl_obj, alpha=0.05, mul_test=True): - cimpl_r = importr('cimpl') - cis_obj = cimpl_r.getCISs(cimpl_obj, alpha=alpha, mul_test=mul_test) - - # Convert cis to pandas and rename index. - cis_frame = dataframe_to_pandas(cis_obj).reset_index() - cis_frame.rename( - columns={'index': 'cis_id', - 'chromosome': 'seqname'}, inplace=True) - - # Convert columns to int types. - for col in ['peak_location', 'start', 'end', 'width', 'n_insertions']: - cis_frame[col] = cis_frame[col].astype(int) - - # Remove chr prefix from chromosomes. - cis_frame['seqname'] = cis_frame['seqname'].str.replace('chr', '') - - # Reorder columns. - cis_frame = cis_frame[['cis_id', 'seqname', 'start', 'end', 'scale', - 'p_value', 'n_insertions', 'peak_location', - 'peak_height', 'width']] - - # Rename and reshuffle cis columns. - cis_frame = cis_frame.rename(columns={'seqname': 'chrom', - 'peak_location': 'position', - 'peak_height': 'height'}) - - cis_frame = cis_frame[['cis_id', 'chrom', 'position', 'scale', - 'n_insertions', 'p_value', 'start', 'end', 'height', - 'width']] - - return cis_frame - - -def extract_mapping(cimpl_obj, cis_frame): - # Add cis_id as index to cis frame before passing to R, - # ensures CIMPL uses cis id's instead of row indices. - cis_frame = cis_frame.copy() - cis_frame.set_index('cis_id', drop=False, inplace=True) - cis_frame['chromosomes'] = _prefix_chromosomes(cis_frame['chrom']) - cis_frame_r = pandas_to_dataframe(cis_frame) - - # Retrieve cis matrix from cimpl. - cimpl_r = importr('cimpl') - cis_matrix_r = cimpl_r.getCISMatrix(cimpl_obj, cis_frame_r) - cis_matrix = dataframe_to_pandas(cis_matrix_r) - - # Extract scale information from cis matrix. 
- scale_cols = [c for c in cis_matrix.columns if c.startswith('X')] - cis_matrix_scales = cis_matrix[['id'] + scale_cols] - - # Melt matrix into long format. - mapping = pd.melt(cis_matrix_scales, id_vars=['id']) - mapping = mapping[['id', 'value']] - mapping = mapping.rename(columns={'id': 'insertion_id', 'value': 'cis_id'}) - - # Split cis_id column into individual entries (for entries - # with multiple ids). Then drop any empty rows, as these - # entries are empty cells in the matrix. - mapping = _expand_column(mapping, col='cis_id', delimiter='|') - mapping = mapping.ix[mapping['cis_id'] != ''] - - return mapping - - -def merge_cis(cis_frame): - """ Merge cis sites that are in fact the same, but appear multiple times - with different peak_height locations. - :param cis_frame: - :return: - """ - - cols = ['chromosome', 'start', 'end', 'width', 'n_insertions', 'scale'] - return pd.DataFrame((grp.ix[grp['peak_height'].argmax()] - for _, grp in cis_frame.groupby(cols))) - - -def _expand_column(frame, col, delimiter): - exp = pd.concat( - (_expand_row( - row, col=col, delimiter=delimiter) for _, row in frame.iterrows()), - ignore_index=True) - return exp[frame.columns] - - -def _expand_row(row, col, delimiter): - row_dict = dict(row) - - if type(row[col]) == str: - col_split = row[col].split(delimiter) - row_dict[col] = col_split - else: - row_dict[col] = [row[col]] - - return pd.DataFrame(row_dict) diff --git a/src/pyim/cis/poisson.py b/src/pyim/cis/poisson.py deleted file mode 100644 index e40ae6e..0000000 --- a/src/pyim/cis/poisson.py +++ /dev/null @@ -1,96 +0,0 @@ -import itertools -import re - -import toolz -import pandas as pd -from intervaltree import IntervalTree -from scipy.stats import poisson -from statsmodels.stats.multitest import multipletests - - -def build_trees(insertions): - trees = {} - - for chrom, grp in insertions.groupby('chrom'): - intervals = zip(grp['position'], grp['position'] + 1, grp['id']) - trees[chrom] = IntervalTree.from_tuples(intervals) - - return trees - - -def count_pattern(record, pattern=None): - regex = re.compile(pattern) - return sum((1 for match in regex.finditer(record.seq))) - - -def count_matches(seq, regex): - return sum((1 for match in regex.finditer(seq))) - - -def generate_windows(insertions, window_size): - half_size = window_size // 2 - - # Generate list of windows for all insertions. - windows = (zip((chrom for _ in range(len(grp))), - grp['position'] - half_size, - grp['position'] + half_size) - for chrom, grp in insertions.groupby('chrom')) - windows = itertools.chain.from_iterable(windows) - - # Yield from windows. - for window in windows: - yield window - - -def calc_significance(insertions, reference, window_size, - pattern=None, chromosomes=None, total=None): - if chromosomes is None: - chromosomes = reference.keys() - - if pattern is not None: - regex = re.compile(pattern) - func = toolz.curry(count_matches, regex=regex) - else: - func = len - - if total is None: - total = sum((func(reference[c][0:len(reference[c])].seq) - for c in chromosomes)) - - # Subset insertions to chromosomes: - insertions = insertions.ix[ - insertions['chrom'].isin(chromosomes)] - - # Build lookup trees for insertions. - trees = build_trees(insertions) - - def _calc_for_window(window): - chrom, start, end = window - - # Calculate occurrence for region. - n_region = func(reference[chrom][int(start):int(end)].seq) - - # Calculate p-value. 
- x = len(trees[chrom][start:end]) - mu = len(insertions) * (n_region / total) - - p_val = poisson.sf(x, mu=mu, loc=1) - - return chrom, start, end, p_val - - # Generate windows. - windows = generate_windows(insertions, window_size=window_size) - - # Generate result. - res = pd.DataFrame.from_records( - (_calc_for_window(w) for w in windows), - columns=['chrom', 'start', 'end', 'p_val']) - res['p_val_corr'] = multipletests(res['p_val'], method='bonferroni')[1] - - return res - - -# result = calc_significance(insertions, ref, window_size=10000, -# pattern='(AT|TA)', chromosomes=chroms, -# total=genome_ta) -# result.query('p_val_corr < 0.05') diff --git a/src/pyim/cis/util.py b/src/pyim/cis/util.py new file mode 100644 index 0000000..d79b363 --- /dev/null +++ b/src/pyim/cis/util.py @@ -0,0 +1,54 @@ +import itertools +import operator + +from frozendict import frozendict +import numpy as np +import toolz + + +def assign_strand(cis_sites, insertions, mapping, min_homogeneity=0.75): + """Assigns CIS sites the average strand of their insertions.""" + + ins_lookup = {insertion.id: insertion for insertion in insertions} + + for cis_site in cis_sites: + # Lookup strands of CIS insertions. + cis_strands = np.array([ins_lookup[ins_id].strand + for ins_id in mapping[cis_site.id]]) + + # Calculate mean strand, strand and homogeneity. + mean_strand = np.mean(cis_strands) + strand = np.sign(mean_strand) + homogeneity = np.sum(cis_strands == strand) / len(cis_strands) + + # If homogeneity is below the given threshold, then we don't + # assign a specific strand (signified by a nan). + if homogeneity < min_homogeneity: + strand = np.nan + + # Merge strand metadata with existing metadata. + strand_metadata = {'strand_mean': mean_strand, + 'strand_homogeneity': homogeneity} + metadata = toolz.merge(cis_site.metadata, strand_metadata) + + yield cis_site._replace(strand=strand, metadata=frozendict(metadata)) + + +def invert_otm_mapping(mapping): + """Inverts a one-to-many mapping.""" + + # Create a list of inverted (v, k) tuples. + tuples = (itertools.zip_longest(v, [k], fillvalue=k) + for k, v in mapping.items() if len(v) > 0) # yapf: disable + tuples = itertools.chain.from_iterable(tuples) + + # Sort tuples by first element. + id_attr = operator.itemgetter(0) + sorted_tuples = sorted(tuples, key=id_attr) + + # Create inverted dictionary using groupby. + inverted = {k: set(list(zip(*grp))[1]) + for k, grp in itertools.groupby( + sorted_tuples, key=id_attr)} + + return inverted diff --git a/src/pyim/align/pipelines/_helpers/__init__.py b/src/pyim/external/__init__.py similarity index 100% rename from src/pyim/align/pipelines/_helpers/__init__.py rename to src/pyim/external/__init__.py diff --git a/src/pyim/external/bowtie2.py b/src/pyim/external/bowtie2.py new file mode 100644 index 0000000..8be223c --- /dev/null +++ b/src/pyim/external/bowtie2.py @@ -0,0 +1,57 @@ +import sys + +from . import util as shell + + +def bowtie2(in1_paths, + index_path, + output_path, + options=None, + in2_paths=None, + verbose=False): + """ + Aligns reads to a reference genome using Bowtie2. + + Parameters + ---------- + in1_paths : List[Path] + Path to input files containings reads. For single read data, + a list of Paths is expected. For paired-end sequencing data, + Paths should be passed as a tuple of lists, in which the first + element is taken as #1 mates and the second as #2 mates. + output_path : Path + Output path for the aligned (and sorted) bam file. + options : dict + Dict of extra options to pass to Bowtie2. 
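+    index_path : Path
+        Path to the bowtie2 index prefix of the reference (passed as -x).
+    in2_paths : List[Path], optional
+        Paths to files containing #2 mates; when given, in1_paths is
+        treated as the corresponding #1 mates of a paired-end run.
+    verbose : bool
+        If True, bowtie2's stderr output is echoed after alignment.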
+ """ + + # Ensure we have a copy of options to work on. + options = dict(options) if options is not None else {} + + # Inject inputs + index into options. + if in2_paths is not None: + options['-1'] = ','.join(str(fp) for fp in in1_paths) + options['-2'] = ','.join(str(fp) for fp in in2_paths) + else: + options['-U'] = ','.join(str(fp) for fp in in1_paths) + + if any(ext in in1_paths[0].suffixes for ext in {'.fa', '.fna'}): + options['-f'] = True + + options['-x'] = str(index_path) + + # Build bowtie2 arguments. + bowtie_args = ['bowtie2'] + shell.flatten_options(options) + + # Sort arguments for samtools. + sort_args = ['samtools', 'sort', '-o', str(output_path), '-'] + + # Run in piped fashion to avoid extra IO. + processes = shell.run_piped([bowtie_args, sort_args]) + + if verbose: + # Print bowtie output to stderr for now. + # TODO: Rewrite to use logging. + print('', file=sys.stderr) + stderr = processes[0].stderr.read().decode() + print(stderr, file=sys.stderr) diff --git a/src/pyim/external/cutadapt.py b/src/pyim/external/cutadapt.py new file mode 100644 index 0000000..1b873b2 --- /dev/null +++ b/src/pyim/external/cutadapt.py @@ -0,0 +1,176 @@ +import itertools +from pathlib import Path +import shutil + +import pyfaidx + +from . import util as shell + + +def cutadapt(in1_path, out1_path, options, in2_path=None, out2_path=None): + """Runs cutadapt using the given options.""" + + cmdline_args = _build_arguments( + in1_path, out1_path, options, in2_path=in2_path, out2_path=out2_path) + + return shell.run(cmdline_args) + #process = subprocess.run(cmdline_args, + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE) + #process.check_returncode() + + # return process + + +def _build_arguments(in1_path=None, + out1_path=None, + options=None, + in2_path=None, + out2_path=None): + """Builds argument list for cutadapt.""" + + in1_path = in1_path or '-' + options = dict(options) if options is not None else {} + + if out1_path is not None: + options['-o'] = str(out1_path) + + if out2_path is not None: + options['-p'] = str(out2_path) + + cmdline_opts = shell.flatten_options(options) + cmdline_opts = ['cutadapt'] + cmdline_opts + [str(in1_path)] + + if in2_path is not None: + cmdline_opts += [str(in2_path)] + + return cmdline_opts + +# def cutadapt_piped(input_path, output_path, options_list, log_paths=None): +# """Runs multiple cutadapt commands in a piped fashion.""" + +# arg_list = [] +# for i, opts in enumerate(options_list): +# in_ = input_path if i == 0 else None +# out_ = output_path if i == (len(options_list) - 1) else None +# arg_list.append(_build_arguments(in_, out_, opts)) + +# if '-o' in arg_list[-1]: +# stdout = log_paths[-1] +# log_paths = log_paths[:-1] + [None] +# else: +# stdout = None + +# shell.run_piped(arg_list, stdout=stdout, stderrs=log_paths) + + +def demultiplex_samples(reads_path, + output_dir, + barcode_path, + error_rate=0.0, + sample_mapping=None): + """ + De-multiplexes reads into separate sample/barcode files. + + Parameters + ---------- + reads_path : Path + Path to the input reads file (in fasta/fastq format). + output_dir : Path + Output directory to which the de-multiplexed files will be written. + barcode_path : Path + Path to fasta file containing barcode sequences. + sample_mapping : dict + Dict mapping barcodes to samples. + + Returns + ------- + dict[Path] + Returns dict mapping samples to the respective demultiplexed file. + """ + + if sample_mapping is None: + # Directly de-multiplex using barcodes. 
+ sample_paths = _demultiplex( + reads_path, output_dir, barcode_path, error_rate=error_rate) + else: + # First demultiplex to barcodes in temp dir. + tmp_dir = output_dir / '_barcodes' + barcode_paths = _demultiplex( + reads_path, tmp_dir, barcode_path, error_rate=error_rate) + + # Then rename files using mapping and delete files for unused barcodes. + sample_paths = {} + + for barcode, sample in sample_mapping.items(): + barcode_path = barcode_paths[barcode] + sample_path = output_dir / (sample + barcode_path.suffixes[-1]) + + if barcode_path.exists(): + shutil.move(str(barcode_path), str(sample_path)) + else: + # Create empty output if nothing was extracted for barcode. + sample_path.touch() + + sample_paths[sample] = sample_path + + shutil.rmtree(str(tmp_dir)) + + return sample_paths + + +def _demultiplex(reads_path, output_dir, barcode_path, error_rate): + """Runs cutadapt to de-multiplex reads into seperate files per barcode.""" + + output_dir.mkdir(parents=True) + + # De-multiplex using cutadapt. + options = {'-g': 'file:' + str(barcode_path), + '--discard-untrimmed': True, + '-e': error_rate} + output_base = output_dir / ('{name}' + reads_path.suffixes[-1]) + cutadapt(reads_path, output_base, options=options) + + # Identify output files. + barcode_keys = pyfaidx.Fasta(str(barcode_path)).keys() + output_paths = {bc: Path(str(output_base).format(name=bc)) + for bc in barcode_keys} + + return output_paths + + +def cutadapt_summary(stdstream): + sections = _split_log_sections(stdstream.read().decode()) + return '\n'.join([' '] + sections['=== Summary ===']) + + +def _split_log_sections(log_str): + return dict(_iter_log_sections(log_str.split('\n'))) + + +def _iter_log_sections(lines): + grouped = itertools.groupby(lines, lambda line: line.startswith('===')) + group_iter = (x[1] for x in grouped) + + yield 'Header', list(next(group_iter)) + + for name in group_iter: + header = next(name).strip() + lines = list(next(group_iter)) + yield header, lines + + +def _parse_summary_section(lines): + lines = (line.strip() for line in lines) + + stats = {} + for line in lines: + if line: + key, value = line.split(':') + + value = value.strip().split()[0] + value = value.replace(',', '') + + stats[key] = int(value) + + return stats diff --git a/src/pyim/external/util.py b/src/pyim/external/util.py new file mode 100644 index 0000000..24b7341 --- /dev/null +++ b/src/pyim/external/util.py @@ -0,0 +1,106 @@ +import subprocess + + +def run(arguments, stdout=None, stderr=None, check=True): + stdout_ = _open_stdstream(stdout) + stderr_ = _open_stdstream(stderr) + + try: + process = subprocess.Popen(arguments, stdout=stdout_, stderr=stderr_) + process.wait() + finally: + for std in [stdout_, stderr_]: + _close_stdstream(std) + + # Check return code. + if check and process.returncode != 0: + raise ValueError('Process terminated with errorcode {}' + .format(process.returncode)) + + return process + + +def run_piped(arguments_list, stdout=None, stderrs=None, check=True): + if len(arguments_list) < 2: + raise ValueError('At least two sets of arguments should be given') + + if stderrs is None: + stderrs = [None] * len(arguments_list) + + # Handle processes 1 to n-1. + processes = [] + stream_handles = [] + + try: + prev_out = None + for arg_list, stderr in zip(arguments_list[:-1], stderrs[:-1]): + # Setup processes. 
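+            # Each intermediate command writes to a subprocess.PIPE and the
+            # next command reads that pipe as its stdin, so data streams
+            # between the tools without intermediate files.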
+ stderr_fh = _open_stdstream(stderr) + stream_handles.append(stderr_fh) + + process = subprocess.Popen( + arg_list, + stdin=prev_out, + stdout=subprocess.PIPE, + stderr=stderr_fh) + + prev_out = process.stdout + processes.append(process) + + # Handle final process. + stdout_fh = _open_stdstream(stdout) + stderr_fh = _open_stdstream(stderrs[-1]) + stream_handles += [stdout_fh, stderr_fh] + + process = subprocess.Popen( + arguments_list[-1], + stdout=stdout_fh, + stderr=stderr_fh, + stdin=prev_out) + + processes.append(process) + + # Allow pi to receive a SIGPIPE. + for p in processes[:-1]: + p.stdout.close() + + process.wait() + + # Check return codes. + if check: + if process.returncode != 0: + raise ValueError('Process terminated with errorcode {}' + .format(process.returncode)) + + finally: + # Close all file handles. + for fh in stream_handles: + _close_stdstream(fh) + + return processes + + +def _open_stdstream(file_path, mode='w'): + if file_path is None: + return subprocess.PIPE + else: + return file_path.open(mode) + + +def _close_stdstream(stdstream): + if stdstream != subprocess.PIPE: + stdstream.close() + + +def flatten_options(option_dict): + """Flattens a dict of options into an argument list.""" + + options = [] + for opt_name, opt_value in option_dict.items(): + if isinstance(opt_value, (tuple, list)): + options += [str(v) for v in opt_value] + elif opt_value is True: + options += [opt_name] + elif not (opt_value is False or opt_value is None): + options += [opt_name, str(opt_value)] + return options diff --git a/src/pyim/main/_logging.py b/src/pyim/main/_logging.py deleted file mode 100644 index 4ea49f8..0000000 --- a/src/pyim/main/_logging.py +++ /dev/null @@ -1,23 +0,0 @@ -import logging -import pkg_resources - - -logging.basicConfig( - format='%(asctime)-15s %(levelname)-10s %(message)s', - datefmt='[%Y-%m-%d %H:%M:%S]', - level=logging.INFO) - - -def print_header(logger, command=None): - version = pkg_resources.require('pyim')[0].version - - if command is None: - header_str = ' PyIM ({}) '.format(version) - else: - header_str = ' PyIM {} ({}) '.format(command, version) - - logger.info('{:-^60}'.format(header_str)) - - -def print_footer(logger): - logger.info('{:-^60}'.format(' Done! ')) diff --git a/src/pyim/main/align.py b/src/pyim/main/align.py deleted file mode 100644 index 567c9c2..0000000 --- a/src/pyim/main/align.py +++ /dev/null @@ -1,33 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) - -import argparse -import logging - -from pyim.alignment.pipelines import shear_splink, shear_splink_sb -from ._logging import print_header, print_footer - - -def main(): - logger = logging.getLogger() - - # Setup main parser. - parser = argparse.ArgumentParser(prog='pyim-align') - subparsers = parser.add_subparsers(dest='pipeline') - subparsers.required = True - - # Register pipelines. - shear_splink.register(subparsers) - shear_splink_sb.register(subparsers) - - # Parse args. - args = parser.parse_args() - - # Dispatch to pipeline. 
- print_header(logger, command='align') - args.main(args) - print_footer(logger) - - -if __name__ == '__main__': - main() diff --git a/src/pyim/main/annotate.py b/src/pyim/main/annotate.py deleted file mode 100644 index 47c055a..0000000 --- a/src/pyim/main/annotate.py +++ /dev/null @@ -1,65 +0,0 @@ -from __future__ import absolute_import, division, print_function - -#pylint: disable=wildcard-import,unused-wildcard-import,redefined-builtin -from builtins import * -#pylint: enable=wildcard-import,unused-wildcard-import,redefined-builtin - -import argparse - -import pandas as pd - -from pyim.annotation import get_annotators -from pyim.model import Insertion - -# pylint: disable=import-error -# from ._logging import print_header, print_footer -# pylint: enable=import-error - - -def main(): - args = parse_args() - - ins_frame = pd.read_csv(args.input, sep='\t') - insertions = Insertion.from_frame(ins_frame) - - annotator = args.class_.from_args(args) - annotated = list(annotator.annotate(insertions)) - - annotated_frame = Insertion.to_frame(annotated) - annotated_frame.to_csv(args.output, sep='\t', index=False) - - # Dispatch to pipeline. - #cmd_str = '{} {}'.format('annotate', args.annotator) - #print_header(logger, command=cmd_str) - #args.main(args) - #print_footer(logger) - - -def parse_args(): - # Setup main parser. - parser = argparse.ArgumentParser(prog='pyim-annotate') - subparsers = parser.add_subparsers(dest='annotator') - subparsers.required = True - - # Register pipelines. - for name, class_ in get_annotators().items(): - annot_parser = subparsers.add_parser(name) - - _add_default_arguments(annot_parser) - class_.setup_args(annot_parser) - - annot_parser.set_defaults(class_=class_) - - # Actually parse args. - args = parser.parse_args() - - return args - - -def _add_default_arguments(parser): - parser.add_argument('input') - parser.add_argument('output') - - -if __name__ == '__main__': - main() diff --git a/src/pyim/main/cis.py b/src/pyim/main/cis.py deleted file mode 100644 index 90eb4ce..0000000 --- a/src/pyim/main/cis.py +++ /dev/null @@ -1,94 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) -from future.utils import native_str - -from argparse import ArgumentParser -from os import path - -import logging -import pandas as pd - -from pyim.cis.cimpl import map_insertions -from pyim.util.insertions import subset_samples - -from ._logging import print_header, print_footer - - -def setup_parser(): - parser = ArgumentParser(prog='pyim-cis') - - parser.add_argument('input') - parser.add_argument('output') - - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('--pattern', default=None) - group.add_argument('--system', choices={'SB'}, default=None) - - parser.add_argument('--genome', choices={'mm10'}, default='mm10') - parser.add_argument('--chromosomes', nargs='+', default=None) - parser.add_argument('--scales', nargs='+', type=int, default=30000) - parser.add_argument('--samples', nargs='+', default=None) - - parser.add_argument('--iterations', type=int, default=1000) - parser.add_argument('--lhc_method', choices={'none', 'exclude'}, - default='exclude') - - parser.add_argument('--alpha', type=float, default=0.05) - - parser.add_argument('--threads', type=int, default=1) - parser.add_argument('--verbose', default=False, action='store_true') - - return parser - - -def main(): - 
logger = logging.getLogger() - - # Parse arguments. - parser = setup_parser() - args = parser.parse_args() - - # Print header. - print_header(logger, command='cis') - - # Read insertions. - insertions = pd.read_csv(args.input, sep=native_str('\t'), - dtype={'chrom': str}) - logger.info('Read {} insertions'.format(len(insertions))) - - # Subset to samples if needed. - if args.samples is not None: - logger.info('Subsetting to {} samples'.format(len(args.samples))) - insertions = subset_samples(insertions, args.samples, logger=logger) - - # Run cimpl on insertions. - logger.info('Running CIMPL in R') - - cis, mapping = map_insertions( - insertions, scales=args.scales, genome=args.genome, alpha=args.alpha, - system=args.system, pattern=args.pattern, lhc_method=args.lhc_method, - chromosomes=args.chromosomes, iterations=args.iterations, - threads=args.threads, verbose=args.verbose) - - # Annotate insertions with cis mapping. - logger.info('Merging CIMPL annotation') - - mapping_tmp = mapping.rename(columns={'insertion_id': 'id'}) - insertions = pd.merge(insertions, mapping_tmp, on='id') - - # Write out outputs. - logger.info('Writing outputs') - - cis_path = path.splitext(args.output)[0] + '.sites.txt' - cis.to_csv(cis_path, sep=native_str('\t'), index=False) - - insertions.to_csv(args.output, sep=native_str('\t'), index=False) - - print_footer(logger) - - -if __name__ == '__main__': - main() diff --git a/src/pyim/main/gff.py b/src/pyim/main/gff.py deleted file mode 100644 index 3459fae..0000000 --- a/src/pyim/main/gff.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) -from future.utils import native_str - -from argparse import ArgumentParser - -import pandas as pd - - -def setup_parser(): - parser = ArgumentParser(prog='pyim-gff') - - parser.add_argument('insertions') - parser.add_argument('output') - - return parser - - -def _ins_to_gff(ins, size=1000): - assert isinstance(ins.strand, int) - - attrs = [i for i in ins.index if i not in - {'id', 'seqname', 'location', 'strand'}] - - attr_dict = {attr: ins[attr] for attr in attrs} - - attr_dict['id'] = ins['id'] - attr_dict['name'] = ins['id'] - - attr_keys = sorted(attr_dict.keys()) - attr_str = ';'.join(('{} {}'.format(k, attr_dict[k]) for k in attr_keys)) - - return { - 'seqname': ins['chrom'], - 'source': '.', - 'feature': 'insertion', - 'start': int(ins['position'] - (size / 2)), - 'end': int(ins['position'] + (size / 2)), - 'score': '.', - 'strand': '+' if ins.strand == 1 else '-', - 'frame': '.', - 'attribute': attr_str - } - - -def main(): - parser = setup_parser() - args = parser.parse_args() - - # Read input. - ins_frame = pd.read_csv(args.insertions, sep=native_str('\t'), - dtype={'chrom': str, 'position': int}) - - # Transform to gff frame. - gff_frame = pd.DataFrame.from_records( - (_ins_to_gff(r) for _, r in ins_frame.iterrows()), - columns=['seqname', 'source', 'feature', 'start', 'end', - 'score', 'strand', 'frame', 'attribute']) - gff_frame = gff_frame.sort_values(by=['seqname', 'start', 'end']) - - # Write output. 
- gff_frame.to_csv(args.output, sep=native_str('\t'), - index=False, header=False) - - -if __name__ == '__main__': - main() diff --git a/src/pyim/main/merge_sets.py b/src/pyim/main/merge_sets.py deleted file mode 100644 index 43b9a1e..0000000 --- a/src/pyim/main/merge_sets.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) -from future.utils import native_str - -import logging -from argparse import ArgumentParser -from pathlib import Path - -import pandas as pd - -from pyim.util.insertions import subset_samples -from ._logging import print_header, print_footer - - -def setup_parser(): - parser = ArgumentParser(prog='pyim-merge-sets') - - parser.add_argument('insertions', nargs='+', type=Path) - parser.add_argument('output', type=Path) - - parser.add_argument('--names', nargs='+', default=None) - parser.add_argument('--samples', nargs='+', default=None) - # parser.add_argument('--complement', default=False, action='store_true') - - return parser - - -def main(): - parser = setup_parser() - args = parser.parse_args() - - # Get logger and print header. - logger = logging.getLogger() - print_header(logger, command='merge') - - # Generate default names if none given. - if args.names is None: - names = ['Set{}'.format(i) for i in range(1, len(args.insertions) + 1)] - else: - names = args.names - - # Read frames. - ins_frames, samples = [], set() - for (ins_path, name) in zip(args.insertions, names): - frame = pd.read_csv(str(ins_path), sep=native_str('\t')) - - # Check for overlapping samples. - frame_samples = set(filter(bool, frame['sample'])) - overlap = samples.intersection(frame_samples) - - if len(overlap) > 0: - raise ValueError('Overlapping samples between frames ({})' - .format(', '.join(overlap))) - - samples = samples.union(frame_samples) - - # Augment ids to avoid duplicates in merged frame. - if name != '': - frame['id'] = ['{}.{}'.format(name, id_) - for id_ in frame['id']] - ins_frames.append(frame) - - # Merge frames. - merged = pd.concat(ins_frames, ignore_index=True) - - logger.info('Merging insertions for {} datasets, containing {} samples' - .format(len(args.insertions), merged['sample'].nunique())) - - # Filter samples if needed. - if args.samples is not None: - logger.info('Subsetting dataset to {} samples' - .format(len(args.samples))) - merged = subset_samples(merged, args.samples, logger=logger) - - # Write output. - logging.info('Writing merged output') - merged.to_csv(str(args.output), sep=native_str('\t'), index=False) - - print_footer(logger) - - -if __name__ == '__main__': - main() diff --git a/src/pyim/main/pyim_align.py b/src/pyim/main/pyim_align.py new file mode 100644 index 0000000..b73b9b1 --- /dev/null +++ b/src/pyim/main/pyim_align.py @@ -0,0 +1,50 @@ +import argparse +import logging + +from pyim.align.pipelines import get_pipelines +from pyim.model import Insertion + +logging.basicConfig( + format='[%(asctime)-15s] %(message)s', + level=logging.INFO, + datefmt='%Y-%m-%d %H:%M:%S') + + +def main(): + """Main function for pyim-align.""" + + args = parse_args() + + # Run pipeline. + pipeline = args.pipeline.from_args(args) + insertions = pipeline.run(reads_path=args.reads, + work_dir=args.output.parent) + + # Write insertions to output file. 
+ ins_frame = Insertion.to_frame(insertions) + ins_frame.to_csv(str(args.output), sep='\t', index=False) + + +def parse_args(): + """Parses arguments for pyim-align.""" + + # Setup main parser. + parser = argparse.ArgumentParser(prog='pyim-align') + subparsers = parser.add_subparsers(dest='pipeline') + subparsers.required = True + + # Register pipelines. + pipelines = get_pipelines() + + for pipeline_name in sorted(pipelines.keys()): + pipeline_class = pipelines[pipeline_name] + + pipeline_parser = subparsers.add_parser(pipeline_name) + pipeline_class.configure_args(pipeline_parser) + pipeline_parser.set_defaults(pipeline=pipeline_class) + + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/src/pyim/main/pyim_annotate.py b/src/pyim/main/pyim_annotate.py new file mode 100644 index 0000000..7e83128 --- /dev/null +++ b/src/pyim/main/pyim_annotate.py @@ -0,0 +1,39 @@ +import argparse + +from pyim.annotate import get_annotators +from pyim.model import Insertion + + +def main(): + """Main function for pyim-annotate.""" + args = parse_args() + + insertions = Insertion.from_csv(args.insertions, sep='\t') + + annotator = args.caller.from_args(args) + annotated = list(annotator.annotate(insertions)) + + annotated_frame = Insertion.to_frame(annotated) + annotated_frame = annotated_frame.sort_values(by='id') + annotated_frame.to_csv(str(args.output), sep='\t', index=False) + + +def parse_args(): + """Parses arguments for pyim-annotate.""" + + # Setup main parser. + parser = argparse.ArgumentParser(prog='pyim-annotate') + subparsers = parser.add_subparsers(dest='annotator') + subparsers.required = True + + # Register pipelines. + for name, class_ in get_annotators().items(): + cis_parser = subparsers.add_parser(name) + class_.configure_args(cis_parser) + cis_parser.set_defaults(caller=class_) + + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/src/pyim/main/pyim_bed.py b/src/pyim/main/pyim_bed.py new file mode 100644 index 0000000..40f8de1 --- /dev/null +++ b/src/pyim/main/pyim_bed.py @@ -0,0 +1,64 @@ +import argparse +from collections import OrderedDict +from pathlib import Path + +import numpy as np +import pandas as pd + +from pyim.model import Insertion + + +def main(): + """Main function for pyim-cis.""" + + args = parse_args() + + # Read insertions. + ins_frame = Insertion.from_csv(args.insertions, sep='\t', as_frame=True) + + # Drop any columns if needed. + if args.drop is not None: + ins_frame = ins_frame.drop(args.drop, axis=1) + ins_frame = ins_frame.drop_duplicates() + + # Convert to BED frame. + start = (ins_frame['position'] - (args.width // 2)).astype(int) + end = (ins_frame['position'] + (args.width // 2)).astype(int) + strand = ins_frame['strand'].map({1: '+', -1: '-', np.nan: '.'}) + color = strand.map({'+': '0,0,255', '-': '255,0,0', '.': '60,60,60'}) + + bed_frame = pd.DataFrame( + OrderedDict([ + ('chrom', ins_frame['chromosome']), + ('chromStart', start), + ('chromEnd', end), + ('name', ins_frame['id']), + ('score', ins_frame['support']), + ('strand', strand), + ('thickStart', start), + ('thickEnd', end), + ('itemRgb', color) + ]) + ) # yapf: disable + + # Write output. 
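+    # BED has no header line, so column names are omitted on write.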
+ bed_frame.to_csv(str(args.output), sep='\t', index=False, header=False) + + +def parse_args(): + """Parses arguments for pyim-cis.""" + + parser = argparse.ArgumentParser(prog='pyim-bed') + + parser.add_argument('--insertions', required=True, type=Path) + parser.add_argument('--output', required=True, type=Path) + + parser.add_argument('--width', default=500, type=int) + + parser.add_argument('--drop', nargs='+', default=None) + + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/src/pyim/main/pyim_cis.py b/src/pyim/main/pyim_cis.py new file mode 100644 index 0000000..d71b0eb --- /dev/null +++ b/src/pyim/main/pyim_cis.py @@ -0,0 +1,64 @@ +import argparse + +from frozendict import frozendict +import toolz + +from pyim.cis import get_callers +from pyim.model import Insertion, CisSite + + +def main(): + """Main function for pyim-cis.""" + + args = parse_args() + caller = args.caller.from_args(args) + + # Identify CIS sites. + insertions = list(Insertion.from_csv(args.insertions, sep='\t')) + cis_sites, cis_mapping = caller.call(insertions=insertions) + + # Annotate insertions. + annotated_ins = _annotate_insertions(insertions, cis_mapping) + + # Write outputs. + Insertion.to_csv(args.output, annotated_ins, sep='\t', index=False) + + if args.output_sites is None: + cis_path = args.output.with_suffix('.sites.txt') + else: + cis_path = args.output_sites + + CisSite.to_csv(cis_path, cis_sites, sep='\t', index=False) + + +def _annotate_insertions(insertions, cis_map): + """Annotates insertions with CIS sites using given mapping.""" + + for insertion in insertions: + cis_ids = cis_map.get(insertion.id, set()) + + for cis_id in cis_ids: + cis_metadata = {'cis_id': cis_id} + new_metadata = toolz.merge(insertion.metadata, cis_metadata) + yield insertion._replace(metadata=frozendict(new_metadata)) + + +def parse_args(): + """Parses arguments for pyim-cis.""" + + # Setup main parser. + parser = argparse.ArgumentParser(prog='pyim-cis') + subparsers = parser.add_subparsers(dest='caller') + subparsers.required = True + + # Register pipelines. + for name, class_ in get_callers().items(): + cis_parser = subparsers.add_parser(name) + class_.configure_args(cis_parser) + cis_parser.set_defaults(caller=class_) + + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/src/pyim/main/pyim_demultiplex.py b/src/pyim/main/pyim_demultiplex.py new file mode 100644 index 0000000..db6115d --- /dev/null +++ b/src/pyim/main/pyim_demultiplex.py @@ -0,0 +1,46 @@ +import argparse +from pathlib import Path + +import pandas as pd + +from pyim.external.cutadapt import demultiplex_samples + + +def main(): + """Main function for pyim-demultiplex.""" + + args = parse_args() + + # Construct sample mapping if given. + if args.sample_mapping is not None: + map_df = pd.read_csv(str(args.sample_mapping), sep='\t') + sample_mapping = dict(zip(map_df['barcode'], map_df['sample'])) + else: + sample_mapping = None + + # Perform de-multiplexing. 
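+    # Writes one reads file per sample (or per barcode, if no mapping is
+    # given) into the output directory.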
+ demultiplex_samples( + reads_path=args.reads, + output_dir=args.output_dir, + barcode_path=args.barcodes, + error_rate=args.error_rate, + sample_mapping=sample_mapping) + + +def parse_args(): + """Parses arguments for pyim-demultiplex.""" + + parser = argparse.ArgumentParser(prog='pyim-demultiplex') + + parser.add_argument('--reads', required=True, type=Path) + parser.add_argument('--output_dir', required=True, type=Path) + parser.add_argument('--barcodes', required=True, type=Path) + + parser.add_argument('--sample_mapping', type=Path) + parser.add_argument('--error_rate', type=float, default=0.0) + + return parser.parse_args() + + +if __name__ == '__main__': + main() diff --git a/src/pyim/main/merge.py b/src/pyim/main/pyim_merge.py similarity index 65% rename from src/pyim/main/merge.py rename to src/pyim/main/pyim_merge.py index 2ae2027..aa5d426 100644 --- a/src/pyim/main/merge.py +++ b/src/pyim/main/pyim_merge.py @@ -1,8 +1,3 @@ -# pylint: disable=W0622,W0614,W0401 -from __future__ import absolute_import, division, print_function -from builtins import * -# pylint: enable=W0622,W0614,W0401 - import logging from argparse import ArgumentParser from pathlib import Path @@ -11,7 +6,7 @@ import pandas as pd from pyim.model import Insertion -from ._logging import print_header, print_footer +from pyim.util import add_prefix def setup_parser(): @@ -28,24 +23,18 @@ def main(): parser = setup_parser() args = parser.parse_args() - # Get logger and print header. - logger = logging.getLogger() - print_header(logger, command='merge') - # Read and merge frames. - merge_files(args.insertions, args.output, sample_names=args.sample_names) - - print_footer(logger) - - -def merge_files(file_paths, output_path, sample_names=None): - if sample_names is None: - sample_names = [fp.stem for fp in file_paths] + if args.sample_names is None: + sample_names = [fp.stem for fp in args.insertions] + else: + sample_names = args.sample_names - ins_frames = (pd.read_csv(fp, sep='\t') for fp in file_paths) + # Read frames. + ins_frames = (pd.read_csv(fp, sep='\t') for fp in args.insertions) + # Merge and write output. merged = merge_frames(ins_frames, sample_names) - merged.to_csv(str(output_path), sep='\t', index=False) + merged.to_csv(str(args.output), sep='\t', index=False) def merge_frames(insertion_frames, sample_names): @@ -66,7 +55,7 @@ def merge_frames(insertion_frames, sample_names): # Augment frame with sample name. frame = frame.copy() frame['sample'] = sample_name - frame['id'] = (sample_name + '.') + frame['id'] + frame['id'] = add_prefix(frame['id'], prefix=sample_name + '.') frames.append(frame) diff --git a/src/pyim/main/pyim_split.py b/src/pyim/main/pyim_split.py new file mode 100644 index 0000000..348f020 --- /dev/null +++ b/src/pyim/main/pyim_split.py @@ -0,0 +1,48 @@ +from argparse import ArgumentParser +from pathlib import Path + +import pandas as pd + +from pyim.model import Insertion +from pyim.util import remove_prefix + + +def setup_parser(): + parser = ArgumentParser(prog='pyim-split') + + parser.add_argument('--insertions', type=Path, required=True) + parser.add_argument('--output_dir', type=Path, required=True) + + parser.add_argument('--samples', nargs='+', required=False, default=None) + parser.add_argument('--remove_prefix', default=False, action='store_true') + + return parser + + +def main(): + parser = setup_parser() + args = parser.parse_args() + + # Read frame. 
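+    # as_frame=True returns the raw DataFrame rather than Insertion
+    # objects, which keeps the per-sample split a simple groupby.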
+ ins_frame = Insertion.from_csv(args.insertions, sep='\t', as_frame=True) + + # Create output directory if it doesn't exist. + args.output_dir.mkdir(exist_ok=True, parents=True) + + if args.samples is not None: + # Subset for samples and convert to categorical. + ins_frame = ins_frame.ix[ins_frame['sample'].isin(args.samples)] + ins_frame['sample'] = pd.Categorical( + ins_frame['sample'], categories=args.samples) + + # Split and write individual outputs. + for sample, sample_frame in ins_frame.groupby('sample'): + if args.remove_prefix: + sample_frame['id'] = remove_prefix( + sample_frame['id'], prefix=sample + '.') + + if len(sample_frame) == 0: + print('WARNING: no insertions found for sample {}'.format(sample)) + + sample_path = args.output_dir / '{}.txt'.format(sample) + sample_frame.to_csv(str(sample_path), sep='\t', index=False) diff --git a/src/pyim/main/split.py b/src/pyim/main/split.py deleted file mode 100644 index 9f2ce94..0000000 --- a/src/pyim/main/split.py +++ /dev/null @@ -1,61 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) - -from argparse import ArgumentParser -from pathlib import Path - -import pysam -import pandas as pd - - -def setup_parser(): - parser = ArgumentParser(prog='pyim-split') - - parser.add_argument('alignment_bam', type=Path) - parser.add_argument('read_barcode_map', type=Path) - - parser.add_argument('--output_dir', type=Path, default='.') - - return parser - - -def main(): - parser = setup_parser() - args = parser.parse_args() - - # Create output dir. - if not args.output_dir.exists(): - args.output_dir.mkdir() - - # Read barcodes. - barcode_map = pd.read_csv(str(args.read_barcode_map), sep='\t') - barcode_map = dict(zip(barcode_map['read_id'], barcode_map['barcode'])) - - # Split reads into separate files. - with pysam.AlignmentFile(str(args.alignment_bam), 'rb') as in_file: - - out_files = {} - try: - # Open output files. - for sample in set(barcode_map.values()): - out_name = args.alignment_bam.stem + '.{}.bam'.format(sample) - out_path = args.output_dir / out_name - - out_files[sample] = pysam.AlignmentFile( - str(out_path), 'wb', template=in_file) - - # Write reads to separate files. - for read in in_file: - sample = barcode_map[read.query_name] - out_files[sample].write(read) - - finally: - for out_path in out_files.values(): - out_path.close() - - -if __name__ == '__main__': - main() diff --git a/src/pyim/model.py b/src/pyim/model.py index e30036d..f4843ac 100644 --- a/src/pyim/model.py +++ b/src/pyim/model.py @@ -1,10 +1,9 @@ -# pylint: disable=W0622,W0614,W0401 -from __future__ import absolute_import, division, print_function -from builtins import * -# pylint: enable=W0622,W0614,W0401 +"""Module containing model classes for fusions and insertions.""" import collections +from frozendict import frozendict +import numpy as np import pandas as pd import toolz @@ -12,6 +11,8 @@ class MetadataFrameMixin(object): """Mixin class adding namedtuple/frame conversion support.""" + _dtypes = {} + @classmethod def _non_metadata_fields(cls): fields = list(cls._fields) @@ -22,27 +23,28 @@ def _non_metadata_fields(cls): def to_frame(cls, insertions): """Converts list of objects to a dataframe representation.""" - rows = (cls._to_dict(ins) for ins in insertions) + # Check if insertions is empty. 
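+        # _is_empty peeks at the iterable instead of consuming it, so
+        # generators of insertions are handled correctly.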
+ is_empty, insertions = cls._is_empty(insertions) - df = pd.DataFrame.from_records(rows) - df = cls.format_frame(df) + if is_empty: + df = pd.DataFrame.from_records( + [], columns=cls._non_metadata_fields()) + else: + rows = (cls._to_dict(ins) for ins in insertions) + df = pd.DataFrame.from_records(rows) + df = cls.format_frame(df) return df - @classmethod - def format_frame(cls, df): - """Ensures frame is properly formatted (column order etc.)""" - cls.check_frame(df) - return cls._reorder_columns(df, order=cls._non_metadata_fields()) - - @classmethod - def check_frame(cls, df): - basic_fields = cls._non_metadata_fields() - missing_columns = set(basic_fields) - set(df.columns) + @staticmethod + def _is_empty(iterable): + try: + _, iterable = toolz.peek(iterable) + empty = False + except StopIteration: + empty = True - if len(missing_columns) > 0: - raise ValueError('Missing required columns {}', - ', '.join(missing_columns)) + return empty, iterable @classmethod def _to_dict(cls, obj): @@ -56,6 +58,26 @@ def _reorder_columns(cls, df, order): col_order = list(order) + sorted(extra_cols) return df[col_order] + @classmethod + def check_frame(cls, df): + missing = set(cls._non_metadata_fields()) - set(df.columns) + if len(missing) > 0: + raise ValueError('Missing required columns: {}' + .format(', '.join(missing))) + + @classmethod + def format_frame(cls, df): + cls.check_frame(df) + + df2 = df.copy() + + for col, dtype in cls._dtypes.items(): + df2[col] = df[col].astype(dtype) + + df2 = cls._reorder_columns(df, order=cls._non_metadata_fields()) + + return df2 + @classmethod def from_frame(cls, df): """Converts dataframe into a list of objects.""" @@ -69,14 +91,36 @@ def from_frame(cls, df): row_dict = row._asdict() metadata = {k: row_dict.pop(k) for k in metadata_fields} + metadata = frozendict(toolz.valfilter(_not_nan, metadata)) + row_dict.pop('Index', None) - yield cls(**row_dict, metadata=metadata) + if not set(basic_fields) == set(row_dict.keys()): + missing_fields = set(basic_fields) - set(row_dict.keys()) + raise ValueError('Missing required fields ({})' + .format(', '.join(missing_fields))) + yield cls(metadata=metadata, **row_dict) -_Insertion = collections.namedtuple( - 'Insertion', ['id', 'chromosome', 'position', - 'strand', 'metadata']) + @classmethod + def from_csv(cls, file_path, as_frame=False, **kwargs): + df = pd.read_csv(file_path, dtype=cls._dtypes, **kwargs) + cls.check_frame(df) + + if as_frame: + return df + else: + return cls.from_frame(df) + + @classmethod + def to_csv(cls, file_path, insertions, index=False, **kwargs): + df = cls.to_frame(insertions) + df.to_csv(str(file_path), index=index, **kwargs) + + +_Insertion = collections.namedtuple('Insertion', + ['id', 'chromosome', 'position', 'strand', + 'support', 'metadata']) class Insertion(MetadataFrameMixin, _Insertion): @@ -84,9 +128,28 @@ class Insertion(MetadataFrameMixin, _Insertion): __slots__ = () - @classmethod - def format_frame(cls, df): - df = super().format_frame(df) - df['position'] = df['position'].astype(int) - df['strand'] = df['strand'].astype(int) - return df + _dtypes = {'chromosome': str} + + +_CisSite = collections.namedtuple( + 'CisSite', ['id', 'chromosome', 'position', 'strand', 'metadata']) + + +class CisSite(MetadataFrameMixin, _CisSite): + """Model class representing an Common Insertion Site (CIS).""" + + __slots__ = () + + _dtypes = {'chromosome': str} + + +def _not_nan(value): + if value is None: + return False + elif isinstance(value, str) and value == '': + return False + else: + try: 
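+            # np.isnan is only defined for numeric values; non-numeric
+            # values raise a TypeError and are kept (treated as not-NaN).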
+ return not np.isnan(value) + except TypeError: + return True \ No newline at end of file diff --git a/src/pyim/util/__init__.py b/src/pyim/util/__init__.py index e69de29..36c6b3a 100644 --- a/src/pyim/util/__init__.py +++ b/src/pyim/util/__init__.py @@ -0,0 +1,6 @@ +def add_prefix(values, prefix): + return [prefix + v if not v.startswith(prefix) else v for v in values] + + +def remove_prefix(values, prefix): + return [v[len(prefix):] if v.startswith(prefix) else v for v in values] diff --git a/src/pyim/util/file.py b/src/pyim/util/file.py deleted file mode 100644 index 82a568e..0000000 --- a/src/pyim/util/file.py +++ /dev/null @@ -1,29 +0,0 @@ -import pysam -from functools import reduce - - -def _make_gen(reader): - b = reader(1024 * 1024) - while b: - yield b - b = reader(1024*1024) - - -def count_lines(file_path): - f = open(file_path, 'rb') - f_gen = _make_gen(f.raw.read) - return sum(buf.count(b'\n') for buf in f_gen) - - -def count_fasta_entries(file_path): - f = open(file_path, 'rb') - f_gen = _make_gen(f.raw.read) - return sum(buf.count(b'>') for buf in f_gen) - - -def count_bam_entries(file_path): - # From Biostars at https://www.biostars.org/p/1890/. - # Could be faster for sorted/index bam files using idxstats. - reduce(lambda x, y: x + y, - [eval('+'.join(l.rstrip('\n').split('\t')[2:])) - for l in pysam.idxstats(file_path)]) diff --git a/src/pyim/util/insertions.py b/src/pyim/util/insertions.py deleted file mode 100644 index a9a9077..0000000 --- a/src/pyim/util/insertions.py +++ /dev/null @@ -1,13 +0,0 @@ - - -def subset_samples(insertions, samples, logger=None): - warn = print if logger is None else logger.warning - - # Check for missing samples. - ins_samples = set(insertions['sample']) - for sample in samples: - if sample not in ins_samples: - warn('- Missing insertions for sample {}'.format(sample)) - - # Actually subset insertions. 
- return insertions.ix[ insertions['sample'].isin(set(samples))] diff --git a/src/pyim/util/pandas.py b/src/pyim/util/pandas.py deleted file mode 100644 index bdee1c6..0000000 --- a/src/pyim/util/pandas.py +++ /dev/null @@ -1,13 +0,0 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) - - -def reorder_columns(frame, order, drop_extra=False): - if drop_extra: - return frame[order] - else: - extra_cols = [c for c in frame.columns if c not in set(order)] - return frame[list(order) + extra_cols] diff --git a/src/pyim/util/path.py b/src/pyim/util/path.py new file mode 100644 index 0000000..f074420 --- /dev/null +++ b/src/pyim/util/path.py @@ -0,0 +1,14 @@ +from pathlib import Path + + +def build_path(file_path, suffix='', dir_=None, ext=None): + file_path = Path(file_path) + + ext = ext or file_path.suffixes[-1] + suffix = suffix + ext + new_path = file_path.with_suffix(suffix) + + if dir_ is not None: + new_path = Path(dir_) / new_path.name + + return new_path diff --git a/src/pyim/util/rpy2.py b/src/pyim/util/rpy2.py index 1906b36..dd50510 100644 --- a/src/pyim/util/rpy2.py +++ b/src/pyim/util/rpy2.py @@ -2,7 +2,6 @@ from rpy2.robjects import pandas2ri from rpy2.rinterface import RNULLType - pandas2ri.activate() diff --git a/src/pyim/util/tabix.py b/src/pyim/util/tabix.py index cf12230..f3c2e1e 100644 --- a/src/pyim/util/tabix.py +++ b/src/pyim/util/tabix.py @@ -1,20 +1,12 @@ -from __future__ import (absolute_import, division, - print_function, unicode_literals) - -# noinspection PyUnresolvedReferences -from builtins import (ascii, bytes, chr, dict, filter, hex, input, - int, map, next, oct, open, pow, range, round, - str, super, zip) -from future.utils import native_str - import contextlib import itertools import os import subprocess -import pysam import numpy as np import pandas as pd +import pysam +import toolz def _parse_float(value): @@ -26,16 +18,26 @@ def _parse_float(value): def _reorder_columns(frame, order): columns = list(order) - extra_columns = sorted([c for c in frame.columns - if c not in set(columns)]) + extra_columns = sorted([c for c in frame.columns if c not in set(columns)]) return frame[columns + extra_columns] -def _get_region(frame, reference, start=None, end=None, - filters=None, incl_left=True, incl_right=True, - ref_col='contig', start_col='start', end_col='end'): +def _get_region(frame, + reference, + start=None, + end=None, + filters=None, + incl_left=True, + incl_right=True, + ref_col='contig', + start_col='start', + end_col='end'): + + mask = pd.Series(True, index=frame.index) + # Filter on passed range. 
- mask = frame[ref_col] == reference + if reference is not None: + mask = frame[ref_col] == reference if start is not None: mask &= frame[start_col] <= end @@ -73,41 +75,46 @@ def tabix(file_path, preset): class TabixIterator(object): - def __init__(self, file_path, parser=None): self._file_path = file_path self._parser = parser - def fetch(self, reference=None, start=None, end=None, - filters=None, incl_left=True, incl_right=True): - file_obj = pysam.TabixFile(native_str(self._file_path), - parser=self._parser) + def fetch(self, + reference=None, + start=None, + end=None, + filters=None, + incl_left=True, + incl_right=True): + file_obj = pysam.TabixFile(str(self._file_path), parser=self._parser) with contextlib.closing(file_obj) as tb_file: if reference is not None: - reference = native_str(reference) + reference = str(reference) - records = self._fetch(tb_file, reference=reference, - start=start, end=end) + records = self._fetch( + tb_file, reference=reference, start=start, end=end) # Filter records on additional filters. if filters is not None: for name, value in filters.items(): - records = (r for r in records - if hasattr(r, name) - and getattr(r, name) == value) + records = self._apply_filter(records, name, value) # Filter inclusive/exclusive if needed. if not incl_left: - records = filter(lambda r: r.start > start, records) + records = (r for r in records if r.start > start) if not incl_right: - records = filter(lambda r: r.end < end, records) + records = (r for r in records if r.end < end) # Yield records. for record in records: yield record + @staticmethod + def _apply_filter(records, name, value): + return (rec for rec in records + if hasattr(rec, name) and getattr(rec, name) == value) def _fetch(self, tb_file, reference=None, **kwargs): # For some reason pysam does not fetch all records if reference @@ -115,9 +122,8 @@ def _fetch(self, tb_file, reference=None, **kwargs): # the contig records into one iterable. 
if reference is None: contigs = tb_file.contigs - records = itertools.chain.from_iterable( - (tb_file.fetch(reference=ref, **kwargs) - for ref in contigs)) + records = itertools.chain.from_iterable((tb_file.fetch( + reference=ref, **kwargs) for ref in contigs)) else: records = tb_file.fetch(reference=reference, **kwargs) @@ -126,28 +132,47 @@ def _fetch(self, tb_file, reference=None, **kwargs): class TabixFile(object): - def __init__(self, file_path, parser): self._file_path = file_path self._iterator = TabixIterator(file_path, parser=parser) - def fetch(self, reference=None, start=None, end=None, - filters=None, incl_left=True, incl_right=True): + def fetch(self, + reference=None, + start=None, + end=None, + filters=None, + incl_left=True, + incl_right=True): + """Fetches records for the given region.""" records = self._iterator.fetch( - reference=reference, start=start, end=end, - filters=filters, incl_left=incl_left, incl_right=incl_right) - - for record in (self._to_series(r) for r in records): - yield record + reference=reference, + start=start, + end=end, + filters=filters, + incl_left=incl_left, + incl_right=incl_right) - def get_region(self, reference=None, start=None, end=None, - filters=None, incl_left=True, incl_right=True): - records = self.fetch(reference, start, end, filters=filters, - incl_left=incl_left, incl_right=incl_right) + for record in records: + yield self._to_dict(record) + + def get_region(self, + reference=None, + start=None, + end=None, + filters=None, + incl_left=True, + incl_right=True): + """Fetches DataFrame of features for the given region.""" + records = self.fetch( + reference, + start, + end, + filters=filters, + incl_left=incl_left, + incl_right=incl_right) return self._frame_constructor().from_records(records) - @classmethod - def _to_series(cls, record): + def _to_dict(self, record): raise NotImplementedError() @classmethod @@ -156,30 +181,60 @@ def _frame_constructor(cls): class TabixFrame(pd.DataFrame): - @property def _constructor(self): raise NotImplementedError() - def fetch(self, reference=None, start=None, end=None, - filters=None, incl_left=True, incl_right=True): - raise NotImplementedError() - - def get_region(self, reference=None, start=None, end=None, - filters=None, incl_left=True, incl_right=True, **kwargs): - return _get_region(self, reference, start, end, filters=filters, - incl_left=incl_left, incl_right=incl_right, **kwargs) + def fetch(self, + reference=None, + start=None, + end=None, + filters=None, + incl_left=True, + incl_right=True): + + subset = self.get_region( + reference, + start, + end, + filters=filters, + incl_left=incl_left, + incl_right=incl_right) + + for tup in subset.itertuples(): + yield tup._asdict() + + def get_region(self, + reference=None, + start=None, + end=None, + filters=None, + incl_left=True, + incl_right=True, + **kwargs): + return _get_region( + self, + reference, + start, + end, + filters=filters, + incl_left=incl_left, + incl_right=incl_right, + **kwargs) class GtfFile(TabixFile): - TYPE_MAP = {3: int, 4: int, 5: _parse_float} - - FIELDS = ('contig', 'source', 'feature', 'start', - 'end', 'score', 'strand', 'frame', 'attribute') + FIELDS = ('contig', 'source', 'feature', 'start', 'end', 'score', 'strand', + 'frame') + TYPES = {'start': int, 'end': int, 'score': _parse_float} def __init__(self, file_path): file_path = str(file_path) + + if not os.path.exists(file_path): + raise IOError('File does not exist ({})'.format(file_path)) + if not file_path.endswith('.gz'): if os.path.exists(file_path + '.gz'): 
file_path += '.gz' @@ -188,20 +243,25 @@ def __init__(self, file_path): super().__init__(file_path, parser=pysam.asGTF()) - @classmethod - def _to_series(cls, record): - rec_values = tuple((cls.TYPE_MAP.get(i, lambda x: x)(val) - for i, val in enumerate(record))) - attr_keys, attr_values = zip(*dict(record).items()) - return pd.Series(rec_values[:-1] + attr_values, - index=cls.FIELDS[:-1] + attr_keys) + def _to_dict(self, record): + basic_attr = dict(zip(self.FIELDS, record)) + + for key, func in self.TYPES.items(): + basic_attr[key] = func(basic_attr[key]) + + return toolz.merge(basic_attr, dict(record)) @classmethod def _frame_constructor(cls): return GtfFrame - def get_gene(self, gene_id, feature_type='gene', - field_name='gene_id', **kwargs): + def get_gene(self, + gene_id, + feature_type='gene', + field_name='gene_id', + **kwargs): + """Fetchs a given gene by id.""" + # Add feature filter to filters (if given). filters = kwargs.pop('filter', {}) filters['feature'] = feature_type @@ -209,7 +269,7 @@ def get_gene(self, gene_id, feature_type='gene', # Search for gene record. records = self._iterator.fetch(filters=filters, **kwargs) for record in records: - if record[native_str(field_name)] == gene_id: + if record[str(field_name)] == gene_id: return self._to_series(record) raise ValueError('Gene {} does not exist'.format(gene_id)) @@ -239,11 +299,11 @@ def compress(cls, file_path, out_path=None, sort=True, create_index=True): @classmethod def sort(cls, file_path, out_path): - """Sorts a gtf file by position, as required for tabix.""" + """Sorts a gtf file by position, required for indexing by tabix.""" with open(out_path, 'w') as out_file: - cmd = '(grep ^"#" {0}; grep -v ^"#" {0} ''| sort -k1,1 -k4,4n)' - subprocess.check_call(cmd.format(file_path), - stdout=out_file, shell=True) + cmd = '(grep ^"#" {0}; grep -v ^"#" {0} | sort -k1,1 -k4,4n)' + subprocess.check_call( + cmd.format(file_path), stdout=out_file, shell=True) return out_path def __repr__(self): @@ -251,7 +311,6 @@ def __repr__(self): class GtfFrame(TabixFrame): - @property def _constructor(self): return GtfFrame @@ -268,7 +327,7 @@ def from_records(cls, data, *args, **kwargs): # Handle empty case. if len(frame) == 0: - frame = pd.DataFrame([], columns=GtfFile.FIELDS[:-1]) + frame = cls([], columns=GtfFile.FIELDS[:-1]) return cls._format_frame(frame) @@ -287,7 +346,7 @@ def _format_frame(cls, frame): def get_gene(self, gene_id): result = self.ix[((self['feature'] == 'gene') & - (self['gene_id'] == gene_id))] + (self['gene_id'] == gene_id))] if len(result) == 0: raise ValueError('Gene {} does not exist'.format(gene_id)) diff --git a/tests/pyim/util/test_shell.py b/tests/pyim/util/test_shell.py new file mode 100644 index 0000000..00c96de --- /dev/null +++ b/tests/pyim/util/test_shell.py @@ -0,0 +1,2 @@ +# shell.run_piped([['ls'], ['grep', '-f']], +# stderrs=[None, Path('test.log')]) diff --git a/versioneer.py b/versioneer.py deleted file mode 100644 index 7ed2a21..0000000 --- a/versioneer.py +++ /dev/null @@ -1,1774 +0,0 @@ - -# Version: 0.16 - -"""The Versioneer - like a rocketeer, but for versions. - -The Versioneer -============== - -* like a rocketeer, but for versions! 
-* https://github.com/warner/python-versioneer -* Brian Warner -* License: Public Domain -* Compatible With: python2.6, 2.7, 3.3, 3.4, 3.5, and pypy -* [![Latest Version] -(https://pypip.in/version/versioneer/badge.svg?style=flat) -](https://pypi.python.org/pypi/versioneer/) -* [![Build Status] -(https://travis-ci.org/warner/python-versioneer.png?branch=master) -](https://travis-ci.org/warner/python-versioneer) - -This is a tool for managing a recorded version number in distutils-based -python projects. The goal is to remove the tedious and error-prone "update -the embedded version string" step from your release process. Making a new -release should be as easy as recording a new tag in your version-control -system, and maybe making new tarballs. - - -## Quick Install - -* `pip install versioneer` to somewhere to your $PATH -* add a `[versioneer]` section to your setup.cfg (see below) -* run `versioneer install` in your source tree, commit the results - -## Version Identifiers - -Source trees come from a variety of places: - -* a version-control system checkout (mostly used by developers) -* a nightly tarball, produced by build automation -* a snapshot tarball, produced by a web-based VCS browser, like github's - "tarball from tag" feature -* a release tarball, produced by "setup.py sdist", distributed through PyPI - -Within each source tree, the version identifier (either a string or a number, -this tool is format-agnostic) can come from a variety of places: - -* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows - about recent "tags" and an absolute revision-id -* the name of the directory into which the tarball was unpacked -* an expanded VCS keyword ($Id$, etc) -* a `_version.py` created by some earlier build step - -For released software, the version identifier is closely related to a VCS -tag. Some projects use tag names that include more than just the version -string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool -needs to strip the tag prefix to extract the version identifier. For -unreleased software (between tags), the version identifier should provide -enough information to help developers recreate the same tree, while also -giving them an idea of roughly how old the tree is (after version 1.2, before -version 1.3). Many VCS systems can report a description that captures this, -for example `git describe --tags --dirty --always` reports things like -"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the -0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has -uncommitted changes. - -The version identifier is used for multiple purposes: - -* to allow the module to self-identify its version: `myproject.__version__` -* to choose a name and prefix for a 'setup.py sdist' tarball - -## Theory of Operation - -Versioneer works by adding a special `_version.py` file into your source -tree, where your `__init__.py` can import it. This `_version.py` knows how to -dynamically ask the VCS tool for version information at import time. - -`_version.py` also contains `$Revision$` markers, and the installation -process marks `_version.py` to have this marker rewritten with a tag name -during the `git archive` command. As a result, generated tarballs will -contain enough information to get the proper version. - -To allow `setup.py` to compute a version too, a `versioneer.py` is added to -the top level of your source tree, next to `setup.py` and the `setup.cfg` -that configures it. 
This overrides several distutils/setuptools commands to -compute the version when invoked, and changes `setup.py build` and `setup.py -sdist` to replace `_version.py` with a small static file that contains just -the generated version data. - -## Installation - -First, decide on values for the following configuration variables: - -* `VCS`: the version control system you use. Currently accepts "git". - -* `style`: the style of version string to be produced. See "Styles" below for - details. Defaults to "pep440", which looks like - `TAG[+DISTANCE.gSHORTHASH[.dirty]]`. - -* `versionfile_source`: - - A project-relative pathname into which the generated version strings should - be written. This is usually a `_version.py` next to your project's main - `__init__.py` file, so it can be imported at runtime. If your project uses - `src/myproject/__init__.py`, this should be `src/myproject/_version.py`. - This file should be checked in to your VCS as usual: the copy created below - by `setup.py setup_versioneer` will include code that parses expanded VCS - keywords in generated tarballs. The 'build' and 'sdist' commands will - replace it with a copy that has just the calculated version string. - - This must be set even if your project does not have any modules (and will - therefore never import `_version.py`), since "setup.py sdist" -based trees - still need somewhere to record the pre-calculated version strings. Anywhere - in the source tree should do. If there is a `__init__.py` next to your - `_version.py`, the `setup.py setup_versioneer` command (described below) - will append some `__version__`-setting assignments, if they aren't already - present. - -* `versionfile_build`: - - Like `versionfile_source`, but relative to the build directory instead of - the source directory. These will differ when your setup.py uses - 'package_dir='. If you have `package_dir={'myproject': 'src/myproject'}`, - then you will probably have `versionfile_build='myproject/_version.py'` and - `versionfile_source='src/myproject/_version.py'`. - - If this is set to None, then `setup.py build` will not attempt to rewrite - any `_version.py` in the built tree. If your project does not have any - libraries (e.g. if it only builds a script), then you should use - `versionfile_build = None`. To actually use the computed version string, - your `setup.py` will need to override `distutils.command.build_scripts` - with a subclass that explicitly inserts a copy of - `versioneer.get_version()` into your script file. See - `test/demoapp-script-only/setup.py` for an example. - -* `tag_prefix`: - - a string, like 'PROJECTNAME-', which appears at the start of all VCS tags. - If your tags look like 'myproject-1.2.0', then you should use - tag_prefix='myproject-'. If you use unprefixed tags like '1.2.0', this - should be an empty string, using either `tag_prefix=` or `tag_prefix=''`. - -* `parentdir_prefix`: - - a optional string, frequently the same as tag_prefix, which appears at the - start of all unpacked tarball filenames. If your tarball unpacks into - 'myproject-1.2.0', this should be 'myproject-'. To disable this feature, - just omit the field from your `setup.cfg`. - -This tool provides one script, named `versioneer`. That script has one mode, -"install", which writes a copy of `versioneer.py` into the current directory -and runs `versioneer.py setup` to finish the installation. 
- -To versioneer-enable your project: - -* 1: Modify your `setup.cfg`, adding a section named `[versioneer]` and - populating it with the configuration values you decided earlier (note that - the option names are not case-sensitive): - - ```` - [versioneer] - VCS = git - style = pep440 - versionfile_source = src/myproject/_version.py - versionfile_build = myproject/_version.py - tag_prefix = - parentdir_prefix = myproject- - ```` - -* 2: Run `versioneer install`. This will do the following: - - * copy `versioneer.py` into the top of your source tree - * create `_version.py` in the right place (`versionfile_source`) - * modify your `__init__.py` (if one exists next to `_version.py`) to define - `__version__` (by calling a function from `_version.py`) - * modify your `MANIFEST.in` to include both `versioneer.py` and the - generated `_version.py` in sdist tarballs - - `versioneer install` will complain about any problems it finds with your - `setup.py` or `setup.cfg`. Run it multiple times until you have fixed all - the problems. - -* 3: add a `import versioneer` to your setup.py, and add the following - arguments to the setup() call: - - version=versioneer.get_version(), - cmdclass=versioneer.get_cmdclass(), - -* 4: commit these changes to your VCS. To make sure you won't forget, - `versioneer install` will mark everything it touched for addition using - `git add`. Don't forget to add `setup.py` and `setup.cfg` too. - -## Post-Installation Usage - -Once established, all uses of your tree from a VCS checkout should get the -current version string. All generated tarballs should include an embedded -version string (so users who unpack them will not need a VCS tool installed). - -If you distribute your project through PyPI, then the release process should -boil down to two steps: - -* 1: git tag 1.0 -* 2: python setup.py register sdist upload - -If you distribute it through github (i.e. users use github to generate -tarballs with `git archive`), the process is: - -* 1: git tag 1.0 -* 2: git push; git push --tags - -Versioneer will report "0+untagged.NUMCOMMITS.gHASH" until your tree has at -least one tag in its history. - -## Version-String Flavors - -Code which uses Versioneer can learn about its version string at runtime by -importing `_version` from your main `__init__.py` file and running the -`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can -import the top-level `versioneer.py` and run `get_versions()`. - -Both functions return a dictionary with different flavors of version -information: - -* `['version']`: A condensed version string, rendered using the selected - style. This is the most commonly used value for the project's version - string. The default "pep440" style yields strings like `0.11`, - `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section - below for alternative styles. - -* `['full-revisionid']`: detailed revision identifier. For Git, this is the - full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". - -* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that - this is only accurate if run in a VCS checkout, otherwise it is likely to - be False or None - -* `['error']`: if the version string could not be computed, this will be set - to a string describing the problem, otherwise it will be None. It may be - useful to throw an exception in setup.py if this is set, to avoid e.g. - creating tarballs with a version string of "unknown". - -Some variants are more useful than others. 
Including `full-revisionid` in a -bug report should allow developers to reconstruct the exact code being tested -(or indicate the presence of local changes that should be shared with the -developers). `version` is suitable for display in an "about" box or a CLI -`--version` output: it can be easily compared against release notes and lists -of bugs fixed in various releases. - -The installer adds the following text to your `__init__.py` to place a basic -version in `YOURPROJECT.__version__`: - - from ._version import get_versions - __version__ = get_versions()['version'] - del get_versions - -## Styles - -The setup.cfg `style=` configuration controls how the VCS information is -rendered into a version string. - -The default style, "pep440", produces a PEP440-compliant string, equal to the -un-prefixed tag name for actual releases, and containing an additional "local -version" section with more detail for in-between builds. For Git, this is -TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags ---dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the -tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and -that this commit is two revisions ("+2") beyond the "0.11" tag. For released -software (exactly equal to a known tag), the identifier will only contain the -stripped tag, e.g. "0.11". - -Other styles are available. See details.md in the Versioneer source tree for -descriptions. - -## Debugging - -Versioneer tries to avoid fatal errors: if something goes wrong, it will tend -to return a version of "0+unknown". To investigate the problem, run `setup.py -version`, which will run the version-lookup code in a verbose mode, and will -display the full contents of `get_versions()` (including the `error` string, -which may help identify what went wrong). - -## Updating Versioneer - -To upgrade your project to a new release of Versioneer, do the following: - -* install the new Versioneer (`pip install -U versioneer` or equivalent) -* edit `setup.cfg`, if necessary, to include any new configuration settings - indicated by the release notes -* re-run `versioneer install` in your source tree, to replace - `SRC/_version.py` -* commit any changed files - -### Upgrading to 0.16 - -Nothing special. - -### Upgrading to 0.15 - -Starting with this version, Versioneer is configured with a `[versioneer]` -section in your `setup.cfg` file. Earlier versions required the `setup.py` to -set attributes on the `versioneer` module immediately after import. The new -version will refuse to run (raising an exception during import) until you -have provided the necessary `setup.cfg` section. - -In addition, the Versioneer package provides an executable named -`versioneer`, and the installation process is driven by running `versioneer -install`. In 0.14 and earlier, the executable was named -`versioneer-installer` and was run without an argument. - -### Upgrading to 0.14 - -0.14 changes the format of the version string. 0.13 and earlier used -hyphen-separated strings like "0.11-2-g1076c97-dirty". 0.14 and beyond use a -plus-separated "local version" section strings, with dot-separated -components, like "0.11+2.g1076c97". PEP440-strict tools did not like the old -format, but should be ok with the new one. - -### Upgrading from 0.11 to 0.12 - -Nothing special. - -### Upgrading from 0.10 to 0.11 - -You must add a `versioneer.VCS = "git"` to your `setup.py` before re-running -`setup.py setup_versioneer`. 
This will enable the use of additional -version-control systems (SVN, etc) in the future. - -## Future Directions - -This tool is designed to make it easily extended to other version-control -systems: all VCS-specific components are in separate directories like -src/git/ . The top-level `versioneer.py` script is assembled from these -components by running make-versioneer.py . In the future, make-versioneer.py -will take a VCS name as an argument, and will construct a version of -`versioneer.py` that is specific to the given VCS. It might also take the -configuration arguments that are currently provided manually during -installation by editing setup.py . Alternatively, it might go the other -direction and include code from all supported VCS systems, reducing the -number of intermediate scripts. - - -## License - -To make Versioneer easier to embed, all its code is dedicated to the public -domain. The `_version.py` that it creates is also in the public domain. -Specifically, both are released under the Creative Commons "Public Domain -Dedication" license (CC0-1.0), as described in -https://creativecommons.org/publicdomain/zero/1.0/ . - -""" - -from __future__ import print_function -try: - import configparser -except ImportError: - import ConfigParser as configparser -import errno -import json -import os -import re -import subprocess -import sys - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_root(): - """Get the project root directory. - - We require that all commands are run from the project root, i.e. the - directory that contains setup.py, setup.cfg, and versioneer.py . - """ - root = os.path.realpath(os.path.abspath(os.getcwd())) - setup_py = os.path.join(root, "setup.py") - versioneer_py = os.path.join(root, "versioneer.py") - if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - # allow 'python path/to/setup.py COMMAND' - root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) - setup_py = os.path.join(root, "setup.py") - versioneer_py = os.path.join(root, "versioneer.py") - if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - err = ("Versioneer was unable to run the project root directory. " - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND').") - raise VersioneerBadRootError(err) - try: - # Certain runtime workflows (setup.py install/develop in a setuptools - # tree) execute all dependencies in a single python process, so - # "versioneer" may be imported multiple times, and python's shared - # module-import table will cache the first one. So we can't use - # os.path.dirname(__file__), as that will find whichever - # versioneer.py was first imported, even in later projects. - me = os.path.realpath(os.path.abspath(__file__)) - if os.path.splitext(me)[0] != os.path.splitext(versioneer_py)[0]: - print("Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(me), versioneer_py)) - except NameError: - pass - return root - - -def get_config_from_root(root): - """Read the project setup.cfg file to determine Versioneer config.""" - # This might raise EnvironmentError (if setup.cfg is missing), or - # configparser.NoSectionError (if it lacks a [versioneer] section), or - # configparser.NoOptionError (if it lacks "VCS="). 
See the docstring at - # the top of versioneer.py for instructions on writing your setup.cfg . - setup_cfg = os.path.join(root, "setup.cfg") - parser = configparser.SafeConfigParser() - with open(setup_cfg, "r") as f: - parser.readfp(f) - VCS = parser.get("versioneer", "VCS") # mandatory - - def get(parser, name): - if parser.has_option("versioneer", name): - return parser.get("versioneer", name) - return None - cfg = VersioneerConfig() - cfg.VCS = VCS - cfg.style = get(parser, "style") or "" - cfg.versionfile_source = get(parser, "versionfile_source") - cfg.versionfile_build = get(parser, "versionfile_build") - cfg.tag_prefix = get(parser, "tag_prefix") - if cfg.tag_prefix in ("''", '""'): - cfg.tag_prefix = "" - cfg.parentdir_prefix = get(parser, "parentdir_prefix") - cfg.verbose = get(parser, "verbose") - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - -# these dictionaries contain VCS-specific tools -LONG_VERSION_PY = {} -HANDLERS = {} - - -def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f - return f - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): - """Call the given command(s).""" - assert isinstance(commands, list) - p = None - for c in commands: - try: - dispcmd = str([c] + args) - # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) - break - except EnvironmentError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %s" % dispcmd) - print(e) - return None - else: - if verbose: - print("unable to find command, tried %s" % (commands,)) - return None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = stdout.decode() - if p.returncode != 0: - if verbose: - print("unable to run %s (error)" % dispcmd) - return None - return stdout -LONG_VERSION_PY['git'] = ''' -# This file helps to compute a version number in source trees obtained from -# git-archive tarball (such as those provided by githubs download-from-tag -# feature). Distribution tarballs (built by setup.py sdist) and build -# directories (produced by setup.py build) will contain a much shorter file -# that just contains the computed version number. - -# This file is released into the public domain. Generated by -# versioneer-0.16 (https://github.com/warner/python-versioneer) - -"""Git implementation of _version.py.""" - -import errno -import os -import re -import subprocess -import sys - - -def get_keywords(): - """Get the keywords needed to look up the version information.""" - # these strings will be replaced by git during git-archive. - # setup.py/versioneer.py will grep for the variable names, so they must - # each be defined on a line of their own. _version.py will just call - # get_keywords(). 
- git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" - git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" - keywords = {"refnames": git_refnames, "full": git_full} - return keywords - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_config(): - """Create, populate and return the VersioneerConfig() object.""" - # these strings are filled in when 'setup.py versioneer' creates - # _version.py - cfg = VersioneerConfig() - cfg.VCS = "git" - cfg.style = "%(STYLE)s" - cfg.tag_prefix = "%(TAG_PREFIX)s" - cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" - cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" - cfg.verbose = False - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -LONG_VERSION_PY = {} -HANDLERS = {} - - -def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f - return f - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): - """Call the given command(s).""" - assert isinstance(commands, list) - p = None - for c in commands: - try: - dispcmd = str([c] + args) - # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) - break - except EnvironmentError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %%s" %% dispcmd) - print(e) - return None - else: - if verbose: - print("unable to find command, tried %%s" %% (commands,)) - return None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = stdout.decode() - if p.returncode != 0: - if verbose: - print("unable to run %%s (error)" %% dispcmd) - return None - return stdout - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes - both the project name and a version string. - """ - dirname = os.path.basename(root) - if not dirname.startswith(parentdir_prefix): - if verbose: - print("guessing rootdir is '%%s', but '%%s' doesn't start with " - "prefix '%%s'" %% (root, dirname, parentdir_prefix)) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None} - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. 
- keywords = {} - try: - f = open(versionfile_abs, "r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - f.close() - except EnvironmentError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if not keywords: - raise NotThisMethod("no keywords at all, weird") - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set([r.strip() for r in refnames.strip("()").split(",")]) - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. The old git %%d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) - if verbose: - print("discarding '%%s', no digits" %% ",".join(refs-tags)) - if verbose: - print("likely tags: %%s" %% ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. "2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] - if verbose: - print("picking %%s" %% r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None - } - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags"} - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. 
- """ - if not os.path.exists(os.path.join(root, ".git")): - if verbose: - print("no .git in %%s" %% root) - raise NotThisMethod("no .git directory") - - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%%s*" %% tag_prefix], - cwd=root) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) - if not mo: - # unparseable. Maybe git-describe is misbehaving? - pieces["error"] = ("unable to parse git-describe output: '%%s'" - %% describe_out) - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%%s' doesn't start with prefix '%%s'" - print(fmt %% (full_tag, tag_prefix)) - pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" - %% (full_tag, tag_prefix)) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) - pieces["distance"] = int(count_out) # total number of commits - - return pieces - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], - pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_pre(pieces): - """TAG[.post.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 
0.post.devDISTANCE - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += ".post.dev%%d" %% pieces["distance"] - else: - # exception #1 - rendered = "0.post.dev%%d" %% pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%%s" %% pieces["short"] - else: - # exception #1 - rendered = "0.post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%%s" %% pieces["short"] - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Eexceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"]} - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%%s'" %% style) - - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None} - - -def get_versions(): - """Get version information or return default if unable to do so.""" - # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. 
If we have - # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which - # case we can only use expanded keywords. - - cfg = get_config() - verbose = cfg.verbose - - try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) - except NotThisMethod: - pass - - try: - root = os.path.realpath(__file__) - # versionfile_source is the relative path from the top of the source - # tree (where the .git directory might live) to this file. Invert - # this to find the root from __file__. - for i in cfg.versionfile_source.split('/'): - root = os.path.dirname(root) - except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree"} - - try: - pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) - return render(pieces, cfg.style) - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - except NotThisMethod: - pass - - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version"} -''' - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. - keywords = {} - try: - f = open(versionfile_abs, "r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - f.close() - except EnvironmentError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if not keywords: - raise NotThisMethod("no keywords at all, weird") - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set([r.strip() for r in refnames.strip("()").split(",")]) - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. The old git %d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) - if verbose: - print("discarding '%s', no digits" % ",".join(refs-tags)) - if verbose: - print("likely tags: %s" % ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. 
"2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] - if verbose: - print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None - } - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags"} - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. - """ - if not os.path.exists(os.path.join(root, ".git")): - if verbose: - print("no .git in %s" % root) - raise NotThisMethod("no .git directory") - - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) - if not mo: - # unparseable. Maybe git-describe is misbehaving? - pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%s' doesn't start with prefix '%s'" - print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) - pieces["distance"] = int(count_out) # total number of commits - - return pieces - - -def do_vcs_install(manifest_in, versionfile_source, ipy): - """Git-specific installation logic for Versioneer. - - For Git, this means creating/changing .gitattributes to mark _version.py - for export-time keyword substitution. 
- """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - files = [manifest_in, versionfile_source] - if ipy: - files.append(ipy) - try: - me = __file__ - if me.endswith(".pyc") or me.endswith(".pyo"): - me = os.path.splitext(me)[0] + ".py" - versioneer_file = os.path.relpath(me) - except NameError: - versioneer_file = "versioneer.py" - files.append(versioneer_file) - present = False - try: - f = open(".gitattributes", "r") - for line in f.readlines(): - if line.strip().startswith(versionfile_source): - if "export-subst" in line.strip().split()[1:]: - present = True - f.close() - except EnvironmentError: - pass - if not present: - f = open(".gitattributes", "a+") - f.write("%s export-subst\n" % versionfile_source) - f.close() - files.append(".gitattributes") - run_command(GITS, ["add", "--"] + files) - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes - both the project name and a version string. - """ - dirname = os.path.basename(root) - if not dirname.startswith(parentdir_prefix): - if verbose: - print("guessing rootdir is '%s', but '%s' doesn't start with " - "prefix '%s'" % (root, dirname, parentdir_prefix)) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None} - -SHORT_VERSION_PY = """ -# This file was generated by 'versioneer.py' (0.16) from -# revision-control system data, or from the parent directory name of an -# unpacked source archive. Distribution tarballs contain a pre-generated copy -# of this file. - -import json -import sys - -version_json = ''' -%s -''' # END VERSION_JSON - - -def get_versions(): - return json.loads(version_json) -""" - - -def versions_from_file(filename): - """Try to determine the version from _version.py if present.""" - try: - with open(filename) as f: - contents = f.read() - except EnvironmentError: - raise NotThisMethod("unable to read _version.py") - mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) - if not mo: - raise NotThisMethod("no version_json in _version.py") - return json.loads(mo.group(1)) - - -def write_to_version_file(filename, versions): - """Write the given version number to the given _version.py file.""" - os.unlink(filename) - contents = json.dumps(versions, sort_keys=True, - indent=1, separators=(",", ": ")) - with open(filename, "w") as f: - f.write(SHORT_VERSION_PY % contents) - - print("set %s to '%s'" % (filename, versions["version"])) - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_pre(pieces): - """TAG[.post.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 0.post.devDISTANCE - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += ".post.dev%d" % pieces["distance"] - else: - # exception #1 - rendered = "0.post.dev%d" % pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Eexceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"]} - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%s'" % style) - - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None} - - -class VersioneerBadRootError(Exception): - """The project root directory is unknown or missing key files.""" - - -def get_versions(verbose=False): - """Get the project version from whatever source is available. - - Returns dict with two keys: 'version' and 'full'. - """ - if "versioneer" in sys.modules: - # see the discussion in cmdclass.py:get_cmdclass() - del sys.modules["versioneer"] - - root = get_root() - cfg = get_config_from_root(root) - - assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" - handlers = HANDLERS.get(cfg.VCS) - assert handlers, "unrecognized VCS '%s'" % cfg.VCS - verbose = verbose or cfg.verbose - assert cfg.versionfile_source is not None, \ - "please set versioneer.versionfile_source" - assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" - - versionfile_abs = os.path.join(root, cfg.versionfile_source) - - # extract version from first of: _version.py, VCS command (e.g. 'git - # describe'), parentdir. This is meant to work for developers using a - # source checkout, for users of a tarball created by 'setup.py sdist', - # and for users of a tarball/zipball created by 'git archive' or github's - # download-from-tag feature or the equivalent in other VCSes. 
- - get_keywords_f = handlers.get("get_keywords") - from_keywords_f = handlers.get("keywords") - if get_keywords_f and from_keywords_f: - try: - keywords = get_keywords_f(versionfile_abs) - ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) - if verbose: - print("got version from expanded keyword %s" % ver) - return ver - except NotThisMethod: - pass - - try: - ver = versions_from_file(versionfile_abs) - if verbose: - print("got version from file %s %s" % (versionfile_abs, ver)) - return ver - except NotThisMethod: - pass - - from_vcs_f = handlers.get("pieces_from_vcs") - if from_vcs_f: - try: - pieces = from_vcs_f(cfg.tag_prefix, root, verbose) - ver = render(pieces, cfg.style) - if verbose: - print("got version from VCS %s" % ver) - return ver - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - if verbose: - print("got version from parentdir %s" % ver) - return ver - except NotThisMethod: - pass - - if verbose: - print("unable to compute version") - - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, "error": "unable to compute version"} - - -def get_version(): - """Get the short version string for this project.""" - return get_versions()["version"] - - -def get_cmdclass(): - """Get the custom setuptools/distutils subclasses used by Versioneer.""" - if "versioneer" in sys.modules: - del sys.modules["versioneer"] - # this fixes the "python setup.py develop" case (also 'install' and - # 'easy_install .'), in which subdependencies of the main project are - # built (using setup.py bdist_egg) in the same python process. Assume - # a main project A and a dependency B, which use different versions - # of Versioneer. A's setup.py imports A's Versioneer, leaving it in - # sys.modules by the time B's setup.py is executed, causing B to run - # with the wrong versioneer. Setuptools wraps the sub-dep builds in a - # sandbox that restores sys.modules to it's pre-build state, so the - # parent is protected against the child's "import versioneer". By - # removing ourselves from sys.modules here, before the child build - # happens, we protect the child from the parent's versioneer too. - # Also see https://github.com/warner/python-versioneer/issues/52 - - cmds = {} - - # we add "version" to both distutils and setuptools - from distutils.core import Command - - class cmd_version(Command): - description = "report generated version string" - user_options = [] - boolean_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - def run(self): - vers = get_versions(verbose=True) - print("Version: %s" % vers["version"]) - print(" full-revisionid: %s" % vers.get("full-revisionid")) - print(" dirty: %s" % vers.get("dirty")) - if vers["error"]: - print(" error: %s" % vers["error"]) - cmds["version"] = cmd_version - - # we override "build_py" in both distutils and setuptools - # - # most invocation pathways end up running build_py: - # distutils/build -> build_py - # distutils/install -> distutils/build ->.. - # setuptools/bdist_wheel -> distutils/install ->.. - # setuptools/bdist_egg -> distutils/install_lib -> build_py - # setuptools/install -> bdist_egg ->.. - # setuptools/develop -> ? 
- - # we override different "build_py" commands for both environments - if "setuptools" in sys.modules: - from setuptools.command.build_py import build_py as _build_py - else: - from distutils.command.build_py import build_py as _build_py - - class cmd_build_py(_build_py): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - _build_py.run(self) - # now locate _version.py in the new build/ directory and replace - # it with an updated value - if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_build) - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - cmds["build_py"] = cmd_build_py - - if "cx_Freeze" in sys.modules: # cx_freeze enabled? - from cx_Freeze.dist import build_exe as _build_exe - - class cmd_build_exe(_build_exe): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - target_versionfile = cfg.versionfile_source - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - - _build_exe.run(self) - os.unlink(target_versionfile) - with open(cfg.versionfile_source, "w") as f: - LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - cmds["build_exe"] = cmd_build_exe - del cmds["build_py"] - - # we override different "sdist" commands for both environments - if "setuptools" in sys.modules: - from setuptools.command.sdist import sdist as _sdist - else: - from distutils.command.sdist import sdist as _sdist - - class cmd_sdist(_sdist): - def run(self): - versions = get_versions() - self._versioneer_generated_versions = versions - # unless we update this, the command will keep using the old - # version - self.distribution.metadata.version = versions["version"] - return _sdist.run(self) - - def make_release_tree(self, base_dir, files): - root = get_root() - cfg = get_config_from_root(root) - _sdist.make_release_tree(self, base_dir, files) - # now locate _version.py in the new base_dir directory - # (remembering that it may be a hardlink) and replace it with an - # updated value - target_versionfile = os.path.join(base_dir, cfg.versionfile_source) - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, - self._versioneer_generated_versions) - cmds["sdist"] = cmd_sdist - - return cmds - - -CONFIG_ERROR = """ -setup.cfg is missing the necessary Versioneer configuration. You need -a section like: - - [versioneer] - VCS = git - style = pep440 - versionfile_source = src/myproject/_version.py - versionfile_build = myproject/_version.py - tag_prefix = - parentdir_prefix = myproject- - -You will also need to edit your setup.py to use the results: - - import versioneer - setup(version=versioneer.get_version(), - cmdclass=versioneer.get_cmdclass(), ...) - -Please read the docstring in ./versioneer.py for configuration instructions, -edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. -""" - -SAMPLE_CONFIG = """ -# See the docstring in versioneer.py for instructions. Note that you must -# re-run 'versioneer.py setup' after changing this section, and commit the -# resulting files. 
- -[versioneer] -#VCS = git -#style = pep440 -#versionfile_source = -#versionfile_build = -#tag_prefix = -#parentdir_prefix = - -""" - -INIT_PY_SNIPPET = """ -from ._version import get_versions -__version__ = get_versions()['version'] -del get_versions -""" - - -def do_setup(): - """Main VCS-independent setup function for installing Versioneer.""" - root = get_root() - try: - cfg = get_config_from_root(root) - except (EnvironmentError, configparser.NoSectionError, - configparser.NoOptionError) as e: - if isinstance(e, (EnvironmentError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", - file=sys.stderr) - with open(os.path.join(root, "setup.cfg"), "a") as f: - f.write(SAMPLE_CONFIG) - print(CONFIG_ERROR, file=sys.stderr) - return 1 - - print(" creating %s" % cfg.versionfile_source) - with open(cfg.versionfile_source, "w") as f: - LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), - "__init__.py") - if os.path.exists(ipy): - try: - with open(ipy, "r") as f: - old = f.read() - except EnvironmentError: - old = "" - if INIT_PY_SNIPPET not in old: - print(" appending to %s" % ipy) - with open(ipy, "a") as f: - f.write(INIT_PY_SNIPPET) - else: - print(" %s unmodified" % ipy) - else: - print(" %s doesn't exist, ok" % ipy) - ipy = None - - # Make sure both the top-level "versioneer.py" and versionfile_source - # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so - # they'll be copied into source distributions. Pip won't be able to - # install the package without this. - manifest_in = os.path.join(root, "MANIFEST.in") - simple_includes = set() - try: - with open(manifest_in, "r") as f: - for line in f: - if line.startswith("include "): - for include in line.split()[1:]: - simple_includes.add(include) - except EnvironmentError: - pass - # That doesn't cover everything MANIFEST.in can do - # (http://docs.python.org/2/distutils/sourcedist.html#commands), so - # it might give some false negatives. Appending redundant 'include' - # lines is safe, though. - if "versioneer.py" not in simple_includes: - print(" appending 'versioneer.py' to MANIFEST.in") - with open(manifest_in, "a") as f: - f.write("include versioneer.py\n") - else: - print(" 'versioneer.py' already in MANIFEST.in") - if cfg.versionfile_source not in simple_includes: - print(" appending versionfile_source ('%s') to MANIFEST.in" % - cfg.versionfile_source) - with open(manifest_in, "a") as f: - f.write("include %s\n" % cfg.versionfile_source) - else: - print(" versionfile_source already in MANIFEST.in") - - # Make VCS-specific changes. For git, this means creating/changing - # .gitattributes to mark _version.py for export-time keyword - # substitution. 
- do_vcs_install(manifest_in, cfg.versionfile_source, ipy) - return 0 - - -def scan_setup_py(): - """Validate the contents of setup.py against Versioneer's expectations.""" - found = set() - setters = False - errors = 0 - with open("setup.py", "r") as f: - for line in f.readlines(): - if "import versioneer" in line: - found.add("import") - if "versioneer.get_cmdclass()" in line: - found.add("cmdclass") - if "versioneer.get_version()" in line: - found.add("get_version") - if "versioneer.VCS" in line: - setters = True - if "versioneer.versionfile_source" in line: - setters = True - if len(found) != 3: - print("") - print("Your setup.py appears to be missing some important items") - print("(but I might be wrong). Please make sure it has something") - print("roughly like the following:") - print("") - print(" import versioneer") - print(" setup( version=versioneer.get_version(),") - print(" cmdclass=versioneer.get_cmdclass(), ...)") - print("") - errors += 1 - if setters: - print("You should remove lines like 'versioneer.VCS = ' and") - print("'versioneer.versionfile_source = ' . This configuration") - print("now lives in setup.cfg, and should be removed from setup.py") - print("") - errors += 1 - return errors - -if __name__ == "__main__": - cmd = sys.argv[1] - if cmd == "setup": - errors = do_setup() - errors += scan_setup_py() - if errors: - sys.exit(1) From 248c7d1351c326c2ea8e6f826e553cf7cd60013f Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 6 Dec 2016 23:03:09 +0100 Subject: [PATCH 078/100] Updated readme. --- .gitignore | 1 + README.rst | 69 ++++++++++++++++++++++++++++++++++++------------- conda/meta.yaml | 12 +++++---- 3 files changed, 59 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 6d26c7e..0635ab3 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ _legacy docs/_build RELEASE-VERSION /.vscode +/.cache diff --git a/README.rst b/README.rst index 02b9a1b..546845e 100644 --- a/README.rst +++ b/README.rst @@ -2,10 +2,6 @@ PyIM =============================== - -.. image:: https://img.shields.io/pypi/v/pyim.svg - :target: https://pypi.python.org/pypi/pyim - .. image:: https://img.shields.io/travis/jrderuiter/pyim.svg :target: https://travis-ci.org/jrderuiter/pyim @@ -13,28 +9,65 @@ PyIM :target: https://pyim.readthedocs.io/en/latest/?badge=latest :alt: Documentation Status -.. image:: https://pyup.io/repos/github/jrderuiter/pyim/shield.svg - :target: https://pyup.io/repos/github/jrderuiter/pyim/ - :alt: Updates +PyIM (Python Insertional Mutagenesis) is a python package for analyzing +insertional mutagenesis data from targeted sequencing of transposon insertion +sites. The package provides several command line tools for identifying +insertions, calling common insertion sites (CISs) and annotating +insertions/CISs directly from the command line. It also aims to provides +the basic building blocks for implementing new pipelines, CIS callers, etc. + +Documentation +------------- + +PyIM's documentation will be made available online soon. + +Requirements +------------ + +PyIM is written for Python 3 and requires Python 3.3 or newer to be installed. 
+Depending on the used functionality, PyIM also has the following external +dependencies: + +- cutadapt/bowtie2 (Needed for identifying insertions from sequencing data) +- cimpl (R package, needed for calling CIS sites using CIMPL) + +Installation +------------ + +Using conda +~~~~~~~~~~~ + +The recommended way to install PyIM is using conda, as with conda you can +install PyIM together with its external dependencies (cutadapt and bowtie2) +into an isolated environment using a single command: + +.. code:: bash + + conda create -n pyim -c jrderuiter -c bioconda -c r pyim + +Alternatively, PyIM can be installed in an existing environent using: +.. code:: bash -PTools for analyzing insertional mutagenesis data. + conda install -c jrderuiter -c bioconda -c r pyim +Conda packages are available for both OSX and Linux (64-bit). -* Free software: MIT license -* Documentation: https://pyim.readthedocs.io. +Using pip +~~~~~~~~~ +PyIM can be installed from Github using pip as follows: -Features --------- +.. code:: bash -* TODO + pip install git+git://github.com/jrderuiter/pyim.git#egg=pyim -Credits ---------- +Note that in this case, external dependencies must be installed manually. -This package was created with Cookiecutter_ and the `audreyr/cookiecutter-pypackage`_ project template. +Unfortunately, PyIM is not yet available on PyPI, though this may +change when the package is further developed. -.. _Cookiecutter: https://github.com/audreyr/cookiecutter -.. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage +License +------- +This software is released under the MIT license. diff --git a/conda/meta.yaml b/conda/meta.yaml index 2b1d201..da8a6e8 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -70,8 +70,10 @@ test: about: home: http://www.github.com/jrderuiter/pyim license: MIT License - summary: 'PyIM is a package for that implements a number of pipelines for - identifying transposon integration sites from targeted DNA-sequencing - of transposon insertions. The package implements a number of standard - pipelines used in our group, but also provides the basic build blocks - for custom pipelines.' + summary: 'PyIM (Python Insertional Mutagenesis) is a python package for + analyzing insertional mutagenesis data from targeted sequencing of + transposon insertion sites. The package provides several command line + tools for identifying insertions, calling common insertion sites (CISs) + and annotating insertions/CISs directly from the command line. It + also aims to provides the basic building blocks for implementing + new pipelines, CIS callers, etc.' From 50b9acb10b0dd261dac07f91382b4506a009dbf4 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 14 Dec 2016 10:11:29 +0100 Subject: [PATCH 079/100] Added some unit tests. 
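As a rough sketch of the style of test being added (hypothetical values; it
assumes the `flatten_options` behaviour introduced in the `pyim/external/util.py`
hunk further below):

    from pyim.external.util import flatten_options


    def test_flatten_options_order():
        # Options are emitted in lexical key order, so the result is reproducible.
        options = {'-o': 'out.txt', '--discard-untrimmed': True,
                   '-g': ['AAA', 'CCC']}
        assert flatten_options(options) == [
            '--discard-untrimmed', '-g', 'AAA', 'CCC', '-o', 'out.txt']
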
--- .travis.yml | 35 ++ setup.py | 10 +- src/pyim/annotate/annotators/base.py | 5 +- src/pyim/annotate/annotators/rbm.py | 8 +- src/pyim/annotate/annotators/window.py | 14 +- src/pyim/external/cutadapt.py | 44 +- src/pyim/external/util.py | 12 +- src/pyim/util/path.py | 6 +- tests/conftest.py | 23 + tests/data/reference.gtf | 408 ++++++++++++++++++ tests/data/reference.gtf.gz | Bin 0 -> 6235 bytes tests/data/reference.gtf.gz.tbi | Bin 0 -> 452 bytes tests/pyim/annotate/annotators/conftest.py | 49 +++ tests/pyim/annotate/annotators/test_rbm.py | 39 ++ tests/pyim/annotate/annotators/test_window.py | 90 ++++ tests/pyim/external/test_bowtie2.py | 71 +++ tests/pyim/external/test_cutadapt.py | 49 +++ tests/pyim/external/test_util.py | 89 ++++ tests/pyim/util/test_path.py | 54 +++ tests/pyim/util/test_shell.py | 2 - 20 files changed, 946 insertions(+), 62 deletions(-) create mode 100644 .travis.yml create mode 100644 tests/conftest.py create mode 100644 tests/data/reference.gtf create mode 100644 tests/data/reference.gtf.gz create mode 100644 tests/data/reference.gtf.gz.tbi create mode 100644 tests/pyim/annotate/annotators/conftest.py create mode 100644 tests/pyim/annotate/annotators/test_rbm.py create mode 100644 tests/pyim/annotate/annotators/test_window.py create mode 100644 tests/pyim/external/test_bowtie2.py create mode 100644 tests/pyim/external/test_cutadapt.py create mode 100644 tests/pyim/external/test_util.py create mode 100644 tests/pyim/util/test_path.py delete mode 100644 tests/pyim/util/test_shell.py diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..065135e --- /dev/null +++ b/.travis.yml @@ -0,0 +1,35 @@ +language: python + +python: + # We don't actually use the Travis Python, but this keeps it organized. + - '3.4' + - '3.5' + +install: + - sudo apt-get update + + # Install conda. + - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; + - bash miniconda.sh -b -p $HOME/miniconda + - export PATH="$HOME/miniconda/bin:$PATH" + - hash -r + - conda config --set always_yes yes --set changeps1 no + - conda update -q conda + + # Useful for debugging any issues with conda. + - conda info -a + + # Create conda environment. + - conda create -q -n test python=$TRAVIS_PYTHON_VERSION + - source activate test + + # Install dependencies. + - conda install -c r -c bioconda r + - pip install .[test] + + # Install test dependencies. 
+ +script: py.test --cov pyim --cov-report term-missing + +#after_success: +# - coveralls diff --git a/setup.py b/setup.py index 69c4ec3..5992fda 100644 --- a/setup.py +++ b/setup.py @@ -10,9 +10,10 @@ history = history_file.read() requirements = ['pyfaidx', 'intervaltree', 'tqdm', 'toolz', 'frozendict', - 'rpy2'] + 'rpy2', 'numpy', 'pandas', 'pysam'] -test_requirements = [] +test_requirements = ['pytest', 'pytest-cov', 'pytest-mock', + 'pytest-helpers-namespace', 'python-coveralls'] setup( name='pyim', @@ -39,8 +40,9 @@ 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', ], - test_suite='tests', - tests_require=test_requirements, + extras_require={ + 'test': test_requirements + }, entry_points={'console_scripts': [ 'pyim-align = pyim.main.pyim_align:main', 'pyim-demultiplex = pyim.main.pyim_demultiplex:main', diff --git a/src/pyim/annotate/annotators/base.py b/src/pyim/annotate/annotators/base.py index 63e86b2..b740703 100644 --- a/src/pyim/annotate/annotators/base.py +++ b/src/pyim/annotate/annotators/base.py @@ -59,7 +59,10 @@ def _preprocess_sites(self, cis_sites): """Pre-process cis sites, fixing unstrandedness etc.""" # Copy CISs that are unstranded to both strands. - return list(self._expand_unstranded_sites(cis_sites)) + if cis_sites is None: + return None + else: + return list(self._expand_unstranded_sites(cis_sites)) @staticmethod def _expand_unstranded_sites(cis_sites): diff --git a/src/pyim/annotate/annotators/rbm.py b/src/pyim/annotate/annotators/rbm.py index 2f1f1e6..518988f 100644 --- a/src/pyim/annotate/annotators/rbm.py +++ b/src/pyim/annotate/annotators/rbm.py @@ -18,7 +18,7 @@ def __init__(self, preset=None, closest=False, blacklist=None, - cis_sites=None): + verbose=True): super().__init__() if window_sizes is None: @@ -33,8 +33,8 @@ def __init__(self, reference_gtf, windows=windows, closest=closest, - blacklist=blacklist) - self._cis_sites = cis_sites + blacklist=blacklist, + verbose=verbose) @classmethod def configure_args(cls, parser): @@ -93,5 +93,3 @@ class RbmCisAnnotator(CisAnnotator, RbmAnnotator): register_annotator('rbm', RbmCisAnnotator) - -# register_annotator('rbm', RbmAnnotator) diff --git a/src/pyim/annotate/annotators/window.py b/src/pyim/annotate/annotators/window.py index 802f97f..fdedb89 100644 --- a/src/pyim/annotate/annotators/window.py +++ b/src/pyim/annotate/annotators/window.py @@ -9,7 +9,7 @@ from pyim.util.tabix import GtfFile, GtfFrame from .base import Annotator, CisAnnotator, register_annotator -from ..filter_ import select_closest, filter_blacklist +from ..filter_ import select_closest from ..metadata import add_metadata from ..util import build_interval_trees, numeric_strand @@ -77,10 +77,6 @@ def annotate(self, insertions): if self._closest: annotated = list(select_closest(annotated)) - # Filter blacklist. - if self._blacklist is not None: - annotated = filter_blacklist(annotated, self._blacklist) - return annotated def _annotate_insertion(self, insertion): @@ -95,6 +91,10 @@ def _annotate_insertion(self, insertion): hits |= {(feature['gene_id'], feature['gene_name'], window.name) for feature in applied_window.get_overlap(trees)} + # Filter for blacklist. + if self._blacklist is not None: + hits = {hit for hit in hits if hit[1] not in self._blacklist} + if len(hits) > 0: # Annotate insertion with overlapping genes. for gene_id, gene_name, window_name in hits: @@ -175,10 +175,10 @@ class AppliedWindow(_AppliedWindow): __slots__ = () def get_overlap(self, interval_trees): - # Find overlapping features. 
+ # Find overlapping features (end-inclusive). try: tree = interval_trees[self.chromosome] - overlap = tree[self.start:self.end] + overlap = tree[self.start:self.end + 1] except KeyError: overlap = [] diff --git a/src/pyim/external/cutadapt.py b/src/pyim/external/cutadapt.py index 1b873b2..54aa366 100644 --- a/src/pyim/external/cutadapt.py +++ b/src/pyim/external/cutadapt.py @@ -10,25 +10,6 @@ def cutadapt(in1_path, out1_path, options, in2_path=None, out2_path=None): """Runs cutadapt using the given options.""" - cmdline_args = _build_arguments( - in1_path, out1_path, options, in2_path=in2_path, out2_path=out2_path) - - return shell.run(cmdline_args) - #process = subprocess.run(cmdline_args, - # stdout=subprocess.PIPE, - # stderr=subprocess.PIPE) - #process.check_returncode() - - # return process - - -def _build_arguments(in1_path=None, - out1_path=None, - options=None, - in2_path=None, - out2_path=None): - """Builds argument list for cutadapt.""" - in1_path = in1_path or '-' options = dict(options) if options is not None else {} @@ -38,30 +19,13 @@ def _build_arguments(in1_path=None, if out2_path is not None: options['-p'] = str(out2_path) - cmdline_opts = shell.flatten_options(options) - cmdline_opts = ['cutadapt'] + cmdline_opts + [str(in1_path)] + cmdline_args = shell.flatten_options(options) + cmdline_args = ['cutadapt'] + cmdline_args + [str(in1_path)] if in2_path is not None: - cmdline_opts += [str(in2_path)] + cmdline_args += [str(in2_path)] - return cmdline_opts - -# def cutadapt_piped(input_path, output_path, options_list, log_paths=None): -# """Runs multiple cutadapt commands in a piped fashion.""" - -# arg_list = [] -# for i, opts in enumerate(options_list): -# in_ = input_path if i == 0 else None -# out_ = output_path if i == (len(options_list) - 1) else None -# arg_list.append(_build_arguments(in_, out_, opts)) - -# if '-o' in arg_list[-1]: -# stdout = log_paths[-1] -# log_paths = log_paths[:-1] + [None] -# else: -# stdout = None - -# shell.run_piped(arg_list, stdout=stdout, stderrs=log_paths) + return shell.run(cmdline_args) def demultiplex_samples(reads_path, diff --git a/src/pyim/external/util.py b/src/pyim/external/util.py index 24b7341..a5f393e 100644 --- a/src/pyim/external/util.py +++ b/src/pyim/external/util.py @@ -95,12 +95,20 @@ def _close_stdstream(stdstream): def flatten_options(option_dict): """Flattens a dict of options into an argument list.""" + # Iterate over keys in lexical order, so that we have a + # reproducible order of iteration (useful for tests). + opt_names = sorted(option_dict.keys()) + + # Flatten values. 
options = [] - for opt_name, opt_value in option_dict.items(): + for opt_name in opt_names: + opt_value = option_dict[opt_name] + if isinstance(opt_value, (tuple, list)): - options += [str(v) for v in opt_value] + options += [opt_name] + [str(v) for v in opt_value] elif opt_value is True: options += [opt_name] elif not (opt_value is False or opt_value is None): options += [opt_name, str(opt_value)] + return options diff --git a/src/pyim/util/path.py b/src/pyim/util/path.py index f074420..43dc071 100644 --- a/src/pyim/util/path.py +++ b/src/pyim/util/path.py @@ -4,7 +4,11 @@ def build_path(file_path, suffix='', dir_=None, ext=None): file_path = Path(file_path) - ext = ext or file_path.suffixes[-1] + try: + ext = ext or file_path.suffixes[-1] + except IndexError: + ext = '' + suffix = suffix + ext new_path = file_path.with_suffix(suffix) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..5ef04ed --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,23 @@ +pytest_plugins = ['helpers_namespace'] + +from pathlib import Path + +import pytest + +BASE_DIR = Path(__file__).parent + + +@pytest.helpers.register +def data_path(relative_path, relative_to=None): + if relative_to is None: + # Use BASE_DIR as default. + relative_to = BASE_DIR + elif not isinstance(relative_to, Path): + # Ensure relative_to is a Path. + relative_to = Path(relative_to) + + # If relative_to is not a path, move up one level. + if not relative_to.is_dir(): + relative_to = relative_to.parent + + return relative_to / 'data' / relative_path diff --git a/tests/data/reference.gtf b/tests/data/reference.gtf new file mode 100644 index 0000000..19b828d --- /dev/null +++ b/tests/data/reference.gtf @@ -0,0 +1,408 @@ +1 protein_coding gene 182409172 182462432 . + . gene_id "ENSMUSG00000026510"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; +1 protein_coding transcript 182409172 182462432 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; +1 protein_coding exon 182409172 182409450 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "1"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000720180"; +1 protein_coding CDS 182409424 182409450 . + 0 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "1"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding start_codon 182409424 182409426 . + 0 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "1"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; +1 protein_coding exon 182428894 182429041 . + . 
gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "2"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000348882"; +1 protein_coding CDS 182428894 182429041 . + 0 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "2"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182431559 182431672 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "3"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000393286"; +1 protein_coding CDS 182431559 182431672 . + 2 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "3"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182435352 182435434 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "4"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000160090"; +1 protein_coding CDS 182435352 182435434 . + 2 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "4"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182436898 182436999 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "5"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000160078"; +1 protein_coding CDS 182436898 182436999 . + 0 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "5"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182440837 182441011 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "6"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000160082"; +1 protein_coding CDS 182440837 182441011 . + 0 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "6"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182441622 182441803 . + . 
gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "7"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000160081"; +1 protein_coding CDS 182441622 182441803 . + 2 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "7"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182442133 182442297 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "8"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000160080"; +1 protein_coding CDS 182442133 182442297 . + 0 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "8"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182442633 182442861 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "9"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000304580"; +1 protein_coding CDS 182442633 182442861 . + 0 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "9"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182444325 182444432 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "10"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000304575"; +1 protein_coding CDS 182444325 182444432 . + 2 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "10"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182444661 182444809 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "11"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000304568"; +1 protein_coding CDS 182444661 182444809 . + 2 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "11"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182446274 182446736 . + . 
gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "12"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000160085"; +1 protein_coding CDS 182446274 182446736 . + 0 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "12"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182448401 182449179 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "13"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000394841"; +1 protein_coding CDS 182448401 182449179 . + 2 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "13"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182452655 182452792 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "14"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000415385"; +1 protein_coding CDS 182452655 182452792 . + 0 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "14"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182453698 182453831 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "15"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000359579"; +1 protein_coding CDS 182453698 182453831 . + 0 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "15"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182455697 182455863 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "16"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000534391"; +1 protein_coding CDS 182455697 182455863 . + 1 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "16"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182458760 182458959 . + . 
gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "17"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000351872"; +1 protein_coding CDS 182458760 182458959 . + 2 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "17"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding exon 182461687 182462432 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "18"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; exon_id "ENSMUSE00000709152"; +1 protein_coding CDS 182461687 182461725 . + 0 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "18"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; protein_id "ENSMUSP00000112508"; +1 protein_coding stop_codon 182461726 182461728 . + 0 gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; exon_number "18"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; +1 protein_coding UTR 182409172 182409423 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; +1 protein_coding UTR 182461729 182462432 . + . gene_id "ENSMUSG00000026510"; transcript_id "ENSMUST00000117245"; gene_name "Trp53bp2"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Trp53bp2-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS48476"; +1 protein_coding gene 134754658 134955942 . - . gene_id "ENSMUSG00000073557"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; +1 protein_coding transcript 134754658 134955940 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; +1 protein_coding exon 134955492 134955940 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00000431438"; +1 protein_coding CDS 134955492 134955782 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding start_codon 134955780 134955782 . 
- 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; +1 protein_coding exon 134902378 134902508 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "2"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001237379"; +1 protein_coding CDS 134902378 134902508 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "2"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134896343 134896461 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "3"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001212763"; +1 protein_coding CDS 134896343 134896461 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "3"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134893350 134893509 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "4"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001257351"; +1 protein_coding CDS 134893350 134893509 . - 2 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "4"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134892158 134892302 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "5"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001310095"; +1 protein_coding CDS 134892158 134892302 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "5"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134890756 134890830 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "6"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001254946"; +1 protein_coding CDS 134890756 134890830 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "6"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134887275 134887354 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "7"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001306209"; +1 protein_coding CDS 134887275 134887354 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "7"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134886423 134886562 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "8"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001221227"; +1 protein_coding CDS 134886423 134886562 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "8"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134879633 134879745 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "9"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001209742"; +1 protein_coding CDS 134879633 134879745 . - 2 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "9"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134875965 134876165 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "10"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001204951"; +1 protein_coding CDS 134875965 134876165 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "10"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134873939 134874021 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "11"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001293729"; +1 protein_coding CDS 134873939 134874021 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "11"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134872807 134872932 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "12"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001310503"; +1 protein_coding CDS 134872807 134872932 . 
- 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "12"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134865778 134865960 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "13"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001220925"; +1 protein_coding CDS 134865778 134865960 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "13"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134842643 134842733 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "14"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001306218"; +1 protein_coding CDS 134842643 134842733 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "14"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134837853 134838041 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "15"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001251618"; +1 protein_coding CDS 134837853 134838041 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "15"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134835978 134836167 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "16"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001227684"; +1 protein_coding CDS 134835978 134836167 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "16"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134835785 134835897 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "17"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001208140"; +1 protein_coding CDS 134835785 134835897 . - 2 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "17"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134834435 134834476 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "18"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001258898"; +1 protein_coding CDS 134834435 134834476 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "18"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134782151 134782167 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "19"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001319977"; +1 protein_coding CDS 134782151 134782167 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "19"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134777315 134777459 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "20"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001334232"; +1 protein_coding CDS 134777315 134777459 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "20"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134776376 134776480 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "21"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001323797"; +1 protein_coding CDS 134776376 134776480 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "21"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134773426 134773479 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "22"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001327213"; +1 protein_coding CDS 134773426 134773479 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "22"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134771998 134772048 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "23"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00001327926"; +1 protein_coding CDS 134771998 134772048 . 
- 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "23"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding exon 134754658 134760495 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "24"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; exon_id "ENSMUSE00000596382"; +1 protein_coding CDS 134760412 134760495 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "24"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; protein_id "ENSMUSP00000047463"; +1 protein_coding stop_codon 134760409 134760411 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; exon_number "24"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; +1 protein_coding UTR 134955783 134955940 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; +1 protein_coding UTR 134754658 134760408 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000045665"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-001"; transcript_source "havana"; +1 processed_transcript transcript 134755568 134760353 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000137022"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-007"; transcript_source "havana"; +1 processed_transcript exon 134759937 134760353 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000137022"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-007"; transcript_source "havana"; exon_id "ENSMUSE00000816793"; +1 processed_transcript exon 134755568 134755810 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000137022"; exon_number "2"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-007"; transcript_source "havana"; exon_id "ENSMUSE00000819978"; +1 processed_transcript transcript 134759067 134846420 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000156348"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-002"; transcript_source "havana"; +1 processed_transcript exon 134846357 134846420 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000156348"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-002"; transcript_source "havana"; exon_id "ENSMUSE00000827849"; +1 processed_transcript exon 134842643 134842733 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000156348"; exon_number "2"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-002"; transcript_source "havana"; exon_id "ENSMUSE00001263521"; +1 processed_transcript exon 134837853 134838041 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000156348"; exon_number "3"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-002"; transcript_source "havana"; exon_id "ENSMUSE00001213563"; +1 processed_transcript exon 134835978 134836167 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000156348"; exon_number "4"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-002"; transcript_source "havana"; exon_id "ENSMUSE00001206601"; +1 processed_transcript exon 134835785 134835897 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000156348"; exon_number "5"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-002"; transcript_source "havana"; exon_id "ENSMUSE00001231195"; +1 processed_transcript exon 134834435 134834476 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000156348"; exon_number "6"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-002"; transcript_source "havana"; exon_id "ENSMUSE00001298430"; +1 processed_transcript exon 134782151 134782167 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000156348"; exon_number "7"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-002"; transcript_source "havana"; exon_id "ENSMUSE00001319029"; +1 processed_transcript exon 134777315 134777459 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000156348"; exon_number "8"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-002"; transcript_source "havana"; exon_id "ENSMUSE00001326554"; +1 processed_transcript exon 134776376 134776480 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000156348"; exon_number "9"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-002"; transcript_source "havana"; exon_id "ENSMUSE00001329673"; +1 processed_transcript exon 134773426 134773479 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000156348"; exon_number "10"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-002"; transcript_source "havana"; exon_id "ENSMUSE00001326814"; +1 processed_transcript exon 134771998 134772048 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000156348"; exon_number "11"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-002"; transcript_source "havana"; exon_id "ENSMUSE00001322754"; +1 processed_transcript exon 134765897 134766077 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000156348"; exon_number "12"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-002"; transcript_source "havana"; exon_id "ENSMUSE00001334760"; +1 processed_transcript exon 134759067 134760495 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000156348"; exon_number "13"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-002"; transcript_source "havana"; exon_id "ENSMUSE00000758068"; +1 protein_coding transcript 134759969 134955851 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; +1 protein_coding exon 134955492 134955851 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00000874253"; +1 protein_coding CDS 134955492 134955782 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding start_codon 134955780 134955782 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; +1 protein_coding exon 134902378 134902508 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "2"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001237379"; +1 protein_coding CDS 134902378 134902508 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "2"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134896343 134896461 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "3"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001212763"; +1 protein_coding CDS 134896343 134896461 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "3"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134893350 134893509 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "4"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001257351"; +1 protein_coding CDS 134893350 134893509 . - 2 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "4"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134892158 134892302 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "5"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001310095"; +1 protein_coding CDS 134892158 134892302 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "5"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134890756 134890830 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "6"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001254946"; +1 protein_coding CDS 134890756 134890830 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "6"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134887275 134887354 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "7"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001306209"; +1 protein_coding CDS 134887275 134887354 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "7"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134886423 134886562 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "8"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001221227"; +1 protein_coding CDS 134886423 134886562 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "8"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134879633 134879745 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "9"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001209742"; +1 protein_coding CDS 134879633 134879745 . - 2 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "9"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134875965 134876165 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "10"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001204951"; +1 protein_coding CDS 134875965 134876165 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "10"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134873939 134874021 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "11"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001293729"; +1 protein_coding CDS 134873939 134874021 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "11"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134872807 134872932 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "12"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001310503"; +1 protein_coding CDS 134872807 134872932 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "12"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134865778 134865960 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "13"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001220925"; +1 protein_coding CDS 134865778 134865960 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "13"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134842643 134842733 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "14"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001306218"; +1 protein_coding CDS 134842643 134842733 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "14"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134837853 134838041 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "15"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001251618"; +1 protein_coding CDS 134837853 134838041 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "15"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134835978 134836167 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "16"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001227684"; +1 protein_coding CDS 134835978 134836167 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "16"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134835785 134835897 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "17"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001208140"; +1 protein_coding CDS 134835785 134835897 . - 2 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "17"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134834435 134834476 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "18"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001258898"; +1 protein_coding CDS 134834435 134834476 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "18"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134782151 134782167 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "19"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001319977"; +1 protein_coding CDS 134782151 134782167 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "19"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134777315 134777459 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "20"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001334232"; +1 protein_coding CDS 134777315 134777459 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "20"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134776376 134776480 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "21"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001323797"; +1 protein_coding CDS 134776376 134776480 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "21"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134773426 134773479 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "22"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001327213"; +1 protein_coding CDS 134773426 134773479 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "22"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134771998 134772048 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "23"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001327926"; +1 protein_coding CDS 134771998 134772048 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "23"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding exon 134765897 134766077 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "24"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001333658"; +1 protein_coding CDS 134765946 134766077 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "24"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000131406"; +1 protein_coding stop_codon 134765943 134765945 . 
- 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "24"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; +1 protein_coding exon 134759969 134760495 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; exon_number "25"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00000913878"; +1 protein_coding UTR 134955783 134955851 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; +1 protein_coding UTR 134765897 134765942 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; +1 protein_coding UTR 134759969 134760495 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000168381"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-202"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; +1 protein_coding transcript 134765943 134955940 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; +1 protein_coding exon 134955492 134955940 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00000431438"; +1 protein_coding CDS 134955492 134955782 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding start_codon 134955780 134955782 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; +1 protein_coding exon 134902378 134902508 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "2"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001237379"; +1 protein_coding CDS 134902378 134902508 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "2"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134896343 134896461 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "3"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001212763"; +1 protein_coding CDS 134896343 134896461 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "3"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134893350 134893509 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "4"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001257351"; +1 protein_coding CDS 134893350 134893509 . - 2 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "4"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134892158 134892302 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "5"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001310095"; +1 protein_coding CDS 134892158 134892302 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "5"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134890756 134890830 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "6"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001254946"; +1 protein_coding CDS 134890756 134890830 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "6"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134887275 134887354 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "7"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001306209"; +1 protein_coding CDS 134887275 134887354 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "7"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134886423 134886562 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "8"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001221227"; +1 protein_coding CDS 134886423 134886562 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "8"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134879633 134879745 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "9"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001209742"; +1 protein_coding CDS 134879633 134879745 . - 2 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "9"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134875965 134876165 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "10"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001204951"; +1 protein_coding CDS 134875965 134876165 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "10"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134873939 134874021 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "11"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001293729"; +1 protein_coding CDS 134873939 134874021 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "11"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134872807 134872932 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "12"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001310503"; +1 protein_coding CDS 134872807 134872932 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "12"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134865778 134865960 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "13"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001220925"; +1 protein_coding CDS 134865778 134865960 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "13"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134842643 134842733 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "14"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001306218"; +1 protein_coding CDS 134842643 134842733 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "14"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134837853 134838041 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "15"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001251618"; +1 protein_coding CDS 134837853 134838041 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "15"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134835978 134836167 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "16"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001227684"; +1 protein_coding CDS 134835978 134836167 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "16"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134835785 134835897 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "17"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001208140"; +1 protein_coding CDS 134835785 134835897 . - 2 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "17"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134834435 134834476 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "18"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001258898"; +1 protein_coding CDS 134834435 134834476 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "18"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134782151 134782167 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "19"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001319977"; +1 protein_coding CDS 134782151 134782167 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "19"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134777315 134777459 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "20"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001334232"; +1 protein_coding CDS 134777315 134777459 . - 1 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "20"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134776376 134776480 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "21"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001323797"; +1 protein_coding CDS 134776376 134776480 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "21"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134773426 134773479 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "22"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001327213"; +1 protein_coding CDS 134773426 134773479 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "22"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134771998 134772048 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "23"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00001327926"; +1 protein_coding CDS 134771998 134772048 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "23"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding exon 134765943 134766077 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "24"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; exon_id "ENSMUSE00000690651"; +1 protein_coding CDS 134765946 134766077 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "24"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; protein_id "ENSMUSP00000083633"; +1 protein_coding stop_codon 134765943 134765945 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; exon_number "24"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; +1 protein_coding UTR 134955783 134955940 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000086444"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-201"; transcript_source "ensembl"; tag "CCDS"; ccds_id "CCDS35717"; +1 retained_intron transcript 134779110 134803851 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000141419"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-006"; transcript_source "havana"; +1 retained_intron exon 134803102 134803851 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000141419"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-006"; transcript_source "havana"; exon_id "ENSMUSE00000748331"; +1 retained_intron exon 134779110 134782167 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000141419"; exon_number "2"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-006"; transcript_source "havana"; exon_id "ENSMUSE00000776995"; +1 retained_intron transcript 134780947 134783297 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000191540"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-008"; transcript_source "havana"; +1 retained_intron exon 134780947 134783297 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000191540"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-008"; transcript_source "havana"; exon_id "ENSMUSE00001330298"; +1 processed_transcript transcript 134834063 134835820 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000146639"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-005"; transcript_source "havana"; +1 processed_transcript exon 134835785 134835820 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000146639"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-005"; transcript_source "havana"; exon_id "ENSMUSE00000756520"; +1 processed_transcript exon 134834435 134834476 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000146639"; exon_number "2"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-005"; transcript_source "havana"; exon_id "ENSMUSE00001298430"; +1 processed_transcript exon 134834063 134834261 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000146639"; exon_number "3"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-005"; transcript_source "havana"; exon_id "ENSMUSE00000823129"; +1 retained_intron transcript 134834066 134955928 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; +1 retained_intron exon 134955492 134955928 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00000730901"; +1 retained_intron exon 134902378 134902508 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "2"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001308177"; +1 retained_intron exon 134896343 134896461 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "3"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001261512"; +1 retained_intron exon 134893350 134893509 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "4"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001282761"; +1 retained_intron exon 134892158 134892302 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "5"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001270405"; +1 retained_intron exon 134890756 134890830 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "6"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001262573"; +1 retained_intron exon 134887275 134887354 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "7"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001299027"; +1 retained_intron exon 134886423 134886562 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "8"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001217619"; +1 retained_intron exon 134879633 134879745 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "9"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001286528"; +1 retained_intron exon 134875965 134876165 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "10"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001208248"; +1 retained_intron exon 134873939 134874021 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "11"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001262753"; +1 retained_intron exon 134872807 134872932 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "12"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001306801"; +1 retained_intron exon 134872268 134872310 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "13"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00000828790"; +1 retained_intron exon 134870068 134870250 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "14"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00000783558"; +1 retained_intron exon 134865778 134865960 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "15"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001231552"; +1 retained_intron exon 134842643 134842733 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "16"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001263521"; +1 retained_intron exon 134837853 134838041 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "17"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001213563"; +1 retained_intron exon 134835978 134836167 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "18"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001206601"; +1 retained_intron exon 134835785 134835897 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "19"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00001231195"; +1 retained_intron exon 134834066 134834476 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000132025"; exon_number "20"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-004"; transcript_source "havana"; exon_id "ENSMUSE00000744822"; +1 protein_coding transcript 134947943 134955942 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000112163"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-003"; transcript_source "havana"; +1 protein_coding exon 134955492 134955942 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000112163"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-003"; transcript_source "havana"; exon_id "ENSMUSE00000690646"; +1 protein_coding CDS 134955492 134955782 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000112163"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-003"; transcript_source "havana"; protein_id "ENSMUSP00000107788"; +1 protein_coding start_codon 134955780 134955782 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000112163"; exon_number "1"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-003"; transcript_source "havana"; +1 protein_coding exon 134948996 134949037 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000112163"; exon_number "2"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-003"; transcript_source "havana"; exon_id "ENSMUSE00000690644"; +1 protein_coding CDS 134949014 134949037 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000112163"; exon_number "2"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-003"; transcript_source "havana"; protein_id "ENSMUSP00000107788"; +1 protein_coding stop_codon 134949011 134949013 . - 0 gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000112163"; exon_number "2"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-003"; transcript_source "havana"; +1 protein_coding exon 134947943 134948277 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000112163"; exon_number "3"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-003"; transcript_source "havana"; exon_id "ENSMUSE00000690642"; +1 protein_coding UTR 134955783 134955942 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000112163"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-003"; transcript_source "havana"; +1 protein_coding UTR 134948996 134949010 . - . 
gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000112163"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-003"; transcript_source "havana"; +1 protein_coding UTR 134947943 134948277 . - . gene_id "ENSMUSG00000073557"; transcript_id "ENSMUST00000112163"; gene_name "Ppp1r12b"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Ppp1r12b-003"; transcript_source "havana"; +15 protein_coding gene 77760587 77842175 . - . gene_id "ENSMUSG00000022443"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; +15 protein_coding transcript 77760587 77842175 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; +15 protein_coding exon 77841957 77842175 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "1"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000483485"; +15 protein_coding exon 77812880 77813231 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "2"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00001248482"; +15 protein_coding CDS 77812880 77813212 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "2"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding start_codon 77813210 77813212 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "2"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; +15 protein_coding exon 77807868 77808024 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "3"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00001251580"; +15 protein_coding CDS 77807868 77808024 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "3"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77797001 77797028 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "4"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00001305133"; +15 protein_coding CDS 77797001 77797028 . 
- 2 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "4"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77795961 77796054 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "5"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00001300167"; +15 protein_coding CDS 77795961 77796054 . - 1 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "5"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77792254 77792346 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "6"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00001281542"; +15 protein_coding CDS 77792254 77792346 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "6"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77791716 77791779 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "7"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00001293984"; +15 protein_coding CDS 77791716 77791779 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "7"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77791048 77791146 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "8"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00001207975"; +15 protein_coding CDS 77791048 77791146 . - 2 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "8"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77790704 77790847 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "9"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00001250903"; +15 protein_coding CDS 77790704 77790847 . 
- 2 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "9"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77789868 77789963 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "10"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434618"; +15 protein_coding CDS 77789868 77789963 . - 2 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "10"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77788919 77789037 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "11"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434614"; +15 protein_coding CDS 77788919 77789037 . - 2 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "11"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77787510 77787662 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "12"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434608"; +15 protein_coding CDS 77787510 77787662 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "12"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77786487 77786660 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "13"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434604"; +15 protein_coding CDS 77786487 77786660 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "13"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77785093 77785266 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "14"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434601"; +15 protein_coding CDS 77785093 77785266 . 
- 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "14"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77783389 77783503 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "15"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000558274"; +15 protein_coding CDS 77783389 77783503 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "15"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77781322 77781515 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "16"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000558272"; +15 protein_coding CDS 77781322 77781515 . - 2 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "16"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77781111 77781232 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "17"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000558270"; +15 protein_coding CDS 77781111 77781232 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "17"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77779990 77780059 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "18"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434320"; +15 protein_coding CDS 77779990 77780059 . - 1 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "18"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77777768 77777928 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "19"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434315"; +15 protein_coding CDS 77777768 77777928 . 
- 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "19"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77776891 77776999 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "20"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00001078946"; +15 protein_coding CDS 77776891 77776999 . - 1 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "20"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77775803 77775934 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "21"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434307"; +15 protein_coding CDS 77775803 77775934 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "21"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77775074 77775280 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "22"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434304"; +15 protein_coding CDS 77775074 77775280 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "22"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77774557 77774694 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "23"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434300"; +15 protein_coding CDS 77774557 77774694 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "23"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77773935 77774058 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "24"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000878257"; +15 protein_coding CDS 77773935 77774058 . 
- 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "24"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77773272 77773443 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "25"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434293"; +15 protein_coding CDS 77773272 77773443 . - 2 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "25"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77771824 77772036 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "26"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434290"; +15 protein_coding CDS 77771824 77772036 . - 1 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "26"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77771125 77771269 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "27"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434287"; +15 protein_coding CDS 77771125 77771269 . - 1 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "27"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77769767 77769973 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "28"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434283"; +15 protein_coding CDS 77769767 77769973 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "28"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77769505 77769609 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "29"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434280"; +15 protein_coding CDS 77769505 77769609 . 
- 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "29"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77769201 77769353 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "30"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000622974"; +15 protein_coding CDS 77769201 77769353 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "30"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77768792 77769040 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "31"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434272"; +15 protein_coding CDS 77768792 77769040 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "31"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77767451 77767663 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "32"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434269"; +15 protein_coding CDS 77767451 77767663 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "32"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77767131 77767343 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "33"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000622973"; +15 protein_coding CDS 77767131 77767343 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "33"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77766714 77766875 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "34"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00001253968"; +15 protein_coding CDS 77766714 77766875 . 
- 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "34"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77765700 77765828 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "35"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000434259"; +15 protein_coding CDS 77765700 77765828 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "35"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77764505 77764593 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "36"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000126905"; +15 protein_coding CDS 77764505 77764593 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "36"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77764300 77764423 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "37"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000126981"; +15 protein_coding CDS 77764300 77764423 . - 1 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "37"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77763792 77764000 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "38"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000126919"; +15 protein_coding CDS 77763792 77764000 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "38"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77763222 77763330 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "39"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000126911"; +15 protein_coding CDS 77763222 77763330 . 
- 1 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "39"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77762952 77763124 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "40"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000558252"; +15 protein_coding CDS 77762952 77763124 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "40"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding exon 77760587 77762016 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "41"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; exon_id "ENSMUSE00000233349"; +15 protein_coding CDS 77761902 77762016 . - 1 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "41"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; protein_id "ENSMUSP00000016771"; +15 protein_coding stop_codon 77761899 77761901 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; exon_number "41"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; +15 protein_coding UTR 77841957 77842175 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; +15 protein_coding UTR 77813213 77813231 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; +15 protein_coding UTR 77760587 77761898 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000016771"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS27605"; +15 retained_intron transcript 77765328 77767190 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000139729"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-007"; transcript_source "havana"; +15 retained_intron exon 77767131 77767190 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000139729"; exon_number "1"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-007"; transcript_source "havana"; exon_id "ENSMUSE00000815801"; +15 retained_intron exon 77766714 77766875 . - . 
gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000139729"; exon_number "2"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-007"; transcript_source "havana"; exon_id "ENSMUSE00001218044"; +15 retained_intron exon 77765328 77765828 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000139729"; exon_number "3"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-007"; transcript_source "havana"; exon_id "ENSMUSE00000761299"; +15 retained_intron transcript 77788191 77842056 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000126796"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-004"; transcript_source "havana"; +15 retained_intron exon 77841957 77842056 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000126796"; exon_number "1"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-004"; transcript_source "havana"; exon_id "ENSMUSE00000805930"; +15 retained_intron exon 77812880 77813231 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000126796"; exon_number "2"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-004"; transcript_source "havana"; exon_id "ENSMUSE00001226645"; +15 retained_intron exon 77807868 77808024 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000126796"; exon_number "3"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-004"; transcript_source "havana"; exon_id "ENSMUSE00001253622"; +15 retained_intron exon 77797001 77797028 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000126796"; exon_number "4"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-004"; transcript_source "havana"; exon_id "ENSMUSE00001275357"; +15 retained_intron exon 77795961 77796054 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000126796"; exon_number "5"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-004"; transcript_source "havana"; exon_id "ENSMUSE00001252584"; +15 retained_intron exon 77792254 77792346 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000126796"; exon_number "6"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-004"; transcript_source "havana"; exon_id "ENSMUSE00001225534"; +15 retained_intron exon 77791716 77791779 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000126796"; exon_number "7"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-004"; transcript_source "havana"; exon_id "ENSMUSE00001235129"; +15 retained_intron exon 77791048 77791146 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000126796"; exon_number "8"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-004"; transcript_source "havana"; exon_id "ENSMUSE00001255619"; +15 retained_intron exon 77790704 77790847 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000126796"; exon_number "9"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-004"; transcript_source "havana"; exon_id "ENSMUSE00001253562"; +15 retained_intron exon 77788191 77789963 . - . 
gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000126796"; exon_number "10"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-004"; transcript_source "havana"; exon_id "ENSMUSE00000802330"; +15 retained_intron transcript 77790225 77842105 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000129453"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-002"; transcript_source "havana"; +15 retained_intron exon 77841957 77842105 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000129453"; exon_number "1"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-002"; transcript_source "havana"; exon_id "ENSMUSE00000479844"; +15 retained_intron exon 77812880 77813231 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000129453"; exon_number "2"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-002"; transcript_source "havana"; exon_id "ENSMUSE00001226645"; +15 retained_intron exon 77807868 77808024 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000129453"; exon_number "3"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-002"; transcript_source "havana"; exon_id "ENSMUSE00001253622"; +15 retained_intron exon 77797001 77797028 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000129453"; exon_number "4"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-002"; transcript_source "havana"; exon_id "ENSMUSE00001275357"; +15 retained_intron exon 77795961 77796054 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000129453"; exon_number "5"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-002"; transcript_source "havana"; exon_id "ENSMUSE00001252584"; +15 retained_intron exon 77792254 77792346 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000129453"; exon_number "6"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-002"; transcript_source "havana"; exon_id "ENSMUSE00001225534"; +15 retained_intron exon 77791048 77791779 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000129453"; exon_number "7"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-002"; transcript_source "havana"; exon_id "ENSMUSE00000835137"; +15 retained_intron exon 77790225 77790847 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000129453"; exon_number "8"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-002"; transcript_source "havana"; exon_id "ENSMUSE00000798143"; +15 nonsense_mediated_decay transcript 77790741 77842047 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; +15 nonsense_mediated_decay exon 77841957 77842047 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; exon_number "1"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; exon_id "ENSMUSE00000744221"; +15 nonsense_mediated_decay exon 77812880 77813231 . - . 
gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; exon_number "2"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; exon_id "ENSMUSE00001248482"; +15 nonsense_mediated_decay CDS 77812880 77813212 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; exon_number "2"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; protein_id "ENSMUSP00000116198"; +15 nonsense_mediated_decay start_codon 77813210 77813212 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; exon_number "2"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; +15 nonsense_mediated_decay exon 77808353 77808458 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; exon_number "3"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; exon_id "ENSMUSE00000815234"; +15 nonsense_mediated_decay CDS 77808432 77808458 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; exon_number "3"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; protein_id "ENSMUSP00000116198"; +15 nonsense_mediated_decay stop_codon 77808429 77808431 . - 0 gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; exon_number "3"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; +15 nonsense_mediated_decay exon 77807868 77808024 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; exon_number "4"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; exon_id "ENSMUSE00001253622"; +15 nonsense_mediated_decay exon 77797001 77797028 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; exon_number "5"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; exon_id "ENSMUSE00001275357"; +15 nonsense_mediated_decay exon 77795961 77796054 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; exon_number "6"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; exon_id "ENSMUSE00001252584"; +15 nonsense_mediated_decay exon 77792254 77792346 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; exon_number "7"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; exon_id "ENSMUSE00001225534"; +15 nonsense_mediated_decay exon 77791716 77791779 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; exon_number "8"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; exon_id "ENSMUSE00001235129"; +15 nonsense_mediated_decay exon 77791048 77791146 . - . 
gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; exon_number "9"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; exon_id "ENSMUSE00001255619"; +15 nonsense_mediated_decay exon 77790741 77790847 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; exon_number "10"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; exon_id "ENSMUSE00000725891"; +15 nonsense_mediated_decay UTR 77841957 77842047 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; +15 nonsense_mediated_decay UTR 77813213 77813231 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; +15 nonsense_mediated_decay UTR 77808353 77808428 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; +15 nonsense_mediated_decay UTR 77807868 77808024 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; +15 nonsense_mediated_decay UTR 77797001 77797028 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; +15 nonsense_mediated_decay UTR 77795961 77796054 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; +15 nonsense_mediated_decay UTR 77792254 77792346 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; +15 nonsense_mediated_decay UTR 77791716 77791779 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; +15 nonsense_mediated_decay UTR 77791048 77791146 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; +15 nonsense_mediated_decay UTR 77790741 77790847 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000123101"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-006"; transcript_source "havana"; +15 processed_transcript transcript 77794652 77813231 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000124844"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-005"; transcript_source "havana"; +15 processed_transcript exon 77812880 77813231 . - . 
gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000124844"; exon_number "1"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-005"; transcript_source "havana"; exon_id "ENSMUSE00001226645"; +15 processed_transcript exon 77807868 77808024 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000124844"; exon_number "2"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-005"; transcript_source "havana"; exon_id "ENSMUSE00001253622"; +15 processed_transcript exon 77797001 77797028 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000124844"; exon_number "3"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-005"; transcript_source "havana"; exon_id "ENSMUSE00001275357"; +15 processed_transcript exon 77795961 77796054 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000124844"; exon_number "4"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-005"; transcript_source "havana"; exon_id "ENSMUSE00001252584"; +15 processed_transcript exon 77794652 77794740 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000124844"; exon_number "5"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-005"; transcript_source "havana"; exon_id "ENSMUSE00000752813"; +15 retained_intron transcript 77805546 77842105 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000134878"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-003"; transcript_source "havana"; +15 retained_intron exon 77841957 77842105 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000134878"; exon_number "1"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-003"; transcript_source "havana"; exon_id "ENSMUSE00000479844"; +15 retained_intron exon 77812880 77813231 . - . gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000134878"; exon_number "2"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-003"; transcript_source "havana"; exon_id "ENSMUSE00001226645"; +15 retained_intron exon 77805546 77808024 . - . 
gene_id "ENSMUSG00000022443"; transcript_id "ENSMUST00000134878"; exon_number "3"; gene_name "Myh9"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "Myh9-003"; transcript_source "havana"; exon_id "ENSMUSE00000768616"; diff --git a/tests/data/reference.gtf.gz b/tests/data/reference.gtf.gz new file mode 100644 index 0000000000000000000000000000000000000000..abc3ca03767c54df1774446020c8e5a8af41230b GIT binary patch literal 6235 zcmZWubyQSa`zDlhB&3Fc3y5^*FhdDL3^k;PFog8bEhP;~H%Lo2jx-X2goL!f5F#Kc z4FXc~f%m)Lb^U(lkF(BM@80iz-sgGVv-VzxA(G%0*3ET=hb0Tc!7?P?&YE=qfl0&O zd@1Jl%?Vn=^-jZUwIN$}vKOn5qqR1NxkMaqt0*-mu#hxyLXdxD-omDtjVmO%u=>T+ z1oPf+^!C)B<9p-Cu(OkD%qQi4<|NZJqQ#D?XO_PuKrL&^QbAdOo6eG9ny+oFsO zrG1URynDl!vf16h=WlrBb$Bv(V0p2Vdj6u=pW#bG?AN{hHps1EZwGb#3E`7*EF!itDcbcxorG%AS3A|A7RABEII|6^W9tL#n zb;*)Z{^(i)wLkq3H30$mr(eww>3oCnJzU*fnV4;O@8|YycDDKG zILqB}^nJc#8#|*+JY+F7RhN)@uv<(dl+aV!tqI_1G+@=ZWa6AdXrmZ$ zBwLtClsB|Dny%k$P(2a#a(SCOHkKDXLa~+)9M=k2z28Or-joYnAF2IzFz`-Hcnvcb z=w5uTx2Y+1iEFnU+DIlQY`bLk9-z+R5N8uUo6NYaG;PHQctX$nidJ#PvKm?dEYH6K zFrW{hZkymhccqVhXJiiUT-@phxe^OX0yn^Ug$($mP< zV5bPTRHByV2h)mynB4`atxStzB1mwD8XNs zl(5~)UpA)BN_;ZaW&(X*@hGGs+;wD9nH1CK+$|{z6YB|vFxS6h4`q#E(%p)w1r6Dq zb-mRga)CM%>9n!(GLTvhLhi9qUk_26)aAv9O)P}cqwD*}w;5sf@|77 z-D3m7PDoaLSXi@D-wQHrg3n1yd?l1(Y&>Qqz@Ci`T7-L}*ARj>?Jjz%d(MF$ z2@pgYxaD3Ga1no1KuO+1fm` z*vVSM$A{ZUAp$g5DeiLPVB#pZr8U(IKGU$ zGsJM+0^0!1TyH@*c1dU}lMTO};uj}jbA8}mTFt-Sr5FnUcUchmbmI8nIM46(u7tGY zx_<6-rh>_xX)=z0L|vvSEs_QZ^KlnGoP-@2wxlir;^R9`^V=YJdG~55;r0}O3Az5|iFM6X$8`q}F}&IQKxtQ!eP?Vxl*`0O;l?!}Kha2b(WmC< zz<&*HIiixS?UDwcVmRLR#CY&eDi`WHl=`>h1#OUE=w+-R_QGz$j?z)7EU7*9R z7mTB`^=>q)T-wtvb6(TiWy_88SUu0X!p~qrqU5dtu4G(s;(0HDmbCz>>B=-FZ<;4V zf{+a&N2!QsW0>9ryV>P0iu(v!U>Vz2QLl5&pv16}CGicS3BVg;&HGAw5Z*bU%Yomdzf@WuwU}POJYkC@+2*$CeB2 z##QTvLFX+`JJ%e;3Ki*2IB{C>6f*R2?1NWy=B6f$b_YB0w_<|_C}Kj?jHD7wov}*R zsfuM$KKFCYNL!33vK~u+s0y)Gtj@mw{RG?#m+>E0O@!n>Af5pg%k;d6*Gap(tb>V^ zoS!$jhnsJbIEeYS4G`28yDK^!E!i>uP-ApkEp+Udj%NNxZhG<`$?CuuxQx%ZYL0Lj zB|xglN_gCb#+l<_OY;#%S)yZ}@prb?u6tVm-rw0&CA#Oa{+msTOZ|hQ0@~|{c}w}F z$Ae9dJU^e{D2iI}VC`9TO8Z?ORc;U0?$7*!nagj%lpdWi4PZ=HJ3Gc)e9P14n(3He zwc}0%8ceF==A)fky9j3wr93?Nu82t{-G)zU%e5ZkfjF!kXEt=Z3+}Q}DHH+~GRwi7yJ)( z&)xq=ogVmK>M~{B;`!&2?q>WUMZ=A*KI3pMvwn(X3SUo0IClzvn=PLMxV2g>y9P{o z42`Y_um5JSX588*c<_cn5Ao;*FylWA_KHk(1f$nm&d5AiC^U0gWfX~KGCaI# zKxG~I-8s_FF|y<{J9Doo!%$gK15Y#_wO{P>X9e7H#b&pE!S=Ep?8B}?F*mTZr^WWU zp!WWA;>KiDO=|R8-ED;wMkK*_|C)5~Dg-NAN%ZyyyB51!UADhBO0l2k%yZAl)a&=7 zH4djlKzEkRX|!C7abfQ$-*~j;=g*_g%VFuOOKm%8Ny+z)@5U|+fNU8?$L}4c zy1&lwa+Zd+UTyFl?CgQQh&*pgRK_=R3#eL&ZaJU*^6}>h{^u(E=3kf98#&Ktqo8Tl zBY8T5)PtT z8NDy7?GvV#KdX@v(kD_n$oX3}M`hEqAXhj`;4U~$A8!iXux3yq<;CK# ztSmbSY9tJ#kw-mDmAd=sK)@6&O`5}v*psSVW+dR*jKghPW_{@@E^BJnrwAXi8zs|1 z?Mf~Smz)16G8}m4v5aj~6(KN%Gk6^r8A#))ntt7Gbyy+T|8Qzdd1|!6@>88$4S|8n zpo?Xsjjlcu4)or%&E(lavo1ky4A{i(G*UPc%`(MpRj)J%rLcSRgr~*DUPXb^UDy`r z$}_^>Gg2tcoRF9+P03A~aiTIVy>5Fc_p$W-H_5eo$fc z?|K=Nc9X+4XoISG-2=QpfvNk0cc*XW82Em9Xf2fYq_W+uvaMfs2f&#e$7@_mfIK5- zwQ5wFfF?e!jE20n_afsgO>vefQmunE{wQ0y11{CiFcwEIxJ|i~=~BkKb`;6g5g@%M zT*5Lu>Q0fHb?aj?aaFoiE68dNO&*+7qq@vR*6&aESGLXq!)sNiT(CJ;^oQg#a3)Nh zHXaQCdHp^s(^+lsSe6Ik?n<#d8``4kzB0`X8b8!YH8Q|LBb_AU#H$U`pm9_`ey70u1I$UzW!)>i>LR~nbs1&-=rWjvJ*k) zBMO+_3K+=ZyEUS}0wF|YiR+jBIyz?v%+x&0rSN@?9dIyh-h}&*)JG9--ARZGX71^L zTbOj0ePs1~2lt4!K!^>yP(IkVp|!?sylv6Gt84z}XN#y2(V)0#5Sas~AgNO{{r1C- zr@Tst3&(K1v5;fKPmG}^`UT2CRS#sHLGT9Hd>qGR+{bv1HaAm=S`VpG&D$gUI?A63 zeJE)TxJTFHIGPlm2zC!lngoAZ)11&LN)L&&MtLLA%l`T&d~%eZ*F$ z{H61_^=?pUydpg;n}wEMuyBf9u}CTU%=fPeg;!N%I)xH8zyt2F^#G1m1&E-UO>ItC 
z-s37!#%P?cq8GiPDw538h`*gu)z7!2c8JM-6}}1hi&LjdMP}1A$u!fuFNn%m$pR0Wj*w) zJYH{}tWY?(sT{z^lMQ1ob?jk2fwp(F zi+TEF$$;(WPD_^YK#gP%G|opYyLg3bj3WKGHXmjBIMC|9Qm%7f&@5wSst8Z|Bc~yQdr{Md-#9_pP4)LU{>iCkD=a-}z+gUW&ET(X zW;FbkRV#0l>30zkJ0$mp@S}|%joQ{S&4|iYUiK!KsTIiO!-|$^E&7Dh6fKTJ-bco_ zO@G}El=*(!l_(}^B|7+vx;8b=CnM@r3U7w%; znUquh$6nfgpI3(?Jj-W6RIPROW3<2ag(UU%e)rCAp>2h86tm*D={`R(upR39qtf)r z^q*y%~6Y>BDRv_`?JJkp(sLU0Xn8~EV0K(H4sz2k%@T-oyL7W zMVC2yJz-fbqFwK}B_aT)r)0`0iWvxEvbqJe51ETxe*1~UWQK)CqX7ag!(Bjnr=^_c z7ivv^ppgxiICP!sG)OpA`WuNiN;t~gZwcNbc(r0zOB1`?B=*!b9|%!P1JPTx6nCHd z+^TD^dly=0f37k1+VFJn19`1uwt^FZZBO)~Wxrf?Q=ClMor?UG)w@qz_h3qb=BYC4 z6e<=$_T>};xs=A_?W0-~*KFJpa&7`fNpp0Nc77oLv=TC`gyz436v+9tr+qukB)?MP zY}u5F;MB2)iY=ADWCkAQ2;pQkioTD$0rbmN0Wp5D_E)ih_@dnTTmXjA)0!0~u8H5? z^h0nEA(nq!rS=I!XsY)r>y`8QH(_yS0^*u`Fy-Y!zZgXhm91N)dsX0u!Ak4N&T0BIta1 zSDV+x?m(v=!(!?x$vRj8l}A+#Re)<@a(JX(KTcFL!bR`ZX68Qps7t;AR}VH7u9@dw z_7^Nt_=`I^@wH5>6f+VJpLMa2DhjSbuVp%d4~ZtMErm1<4L~f#hESgnKUdRIGs4=s zujgZ}s1tT5Kn= OzyV?5{{EYAbNdhe3-Y@F literal 0 HcmV?d00001 diff --git a/tests/data/reference.gtf.gz.tbi b/tests/data/reference.gtf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..3731309402d8d87c207c07d20f2945539c905712 GIT binary patch literal 452 zcmb2|=3rp}f&Xj_PR>jW_Zi-vHRNJ46kxruLqp6=scEj*w6N$7g&Q|It|h&U-P5#J zeZyygS556klUTNWW_o|{!{n0&mU-XjYN^bPxErxI;`5!~b6*GOJKM|6n{84eF^_%! z@vkaRwZEVKYvTR$+~1=5H9=-SH!pgV{i$Z=?)O1*>l8!Q{=D>l>vWgPd%hccz29DV z>a-HrMdTBIKC^1KC46D{&%hv$=D*_%Z%^;#JQN_(aB=mGn;u%+(@(HWJ0z`S?D=Ms zQ`pIXlb;nQ34HiEZ@pzfz0wl Date: Mon, 20 Mar 2017 16:58:57 +0100 Subject: [PATCH 080/100] Added internal version of frozendict. --- conda/meta.yaml | 22 +++--- setup.py | 4 +- src/pyim/align/common/insertions.py | 2 +- src/pyim/annotate/annotators/base.py | 2 +- src/pyim/annotate/annotators/window.py | 2 +- src/pyim/cis/callers/cimpl.py | 2 +- src/pyim/cis/util.py | 2 +- src/pyim/main/pyim_cis.py | 2 +- src/pyim/model.py | 2 +- src/pyim/util/frozendict.py | 89 ++++++++++++++++++++++ tests/pyim/annotate/annotators/conftest.py | 2 +- 11 files changed, 109 insertions(+), 22 deletions(-) create mode 100644 src/pyim/util/frozendict.py diff --git a/conda/meta.yaml b/conda/meta.yaml index da8a6e8..f35b6b4 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -20,37 +20,35 @@ requirements: - setuptools # Basic dependencies - - pandas ==0.18.1 - - pyfaidx ==0.4.7.1 - - pysam ==0.9.1 - - toolz ==0.8.0 + - pandas >=0.18.1 + - pyfaidx >=0.4.8.1 + - pysam >=0.9.1 + - toolz >=0.8.0 - tqdm - intervaltree - - frozendict - cutadapt # R dependencies - r - r-cimpl - - rpy2 >=2.7.4 + - rpy2 >=2.8.2 run: - python # Basic dependencies - - pandas ==0.18.1 - - pyfaidx ==0.4.7.1 - - pysam ==0.9.1 - - toolz ==0.8.0 + - pandas >=0.18.1 + - pyfaidx >=0.4.8.1 + - pysam >=0.9.1 + - toolz >=0.8.0 - tqdm - intervaltree - - frozendict - cutadapt # R dependencies - r - r-cimpl - - rpy2 >=2.7.4 + - rpy2 >=2.8.2 # External dependencies - bowtie2 diff --git a/setup.py b/setup.py index 5992fda..35e9833 100644 --- a/setup.py +++ b/setup.py @@ -9,8 +9,8 @@ with open('HISTORY.rst') as history_file: history = history_file.read() -requirements = ['pyfaidx', 'intervaltree', 'tqdm', 'toolz', 'frozendict', - 'rpy2', 'numpy', 'pandas', 'pysam'] +requirements = ['pyfaidx', 'intervaltree', 'tqdm', 'toolz', 'rpy2', 'numpy', + 'pandas', 'pysam'] test_requirements = ['pytest', 'pytest-cov', 'pytest-mock', 'pytest-helpers-namespace', 'python-coveralls'] diff --git a/src/pyim/align/common/insertions.py b/src/pyim/align/common/insertions.py index 
43c6f7a..c42a161 100644 --- a/src/pyim/align/common/insertions.py +++ b/src/pyim/align/common/insertions.py @@ -3,7 +3,7 @@ import logging import operator -from frozendict import frozendict +from pyim.util.frozendict import frozendict import numpy as np import pysam import toolz diff --git a/src/pyim/annotate/annotators/base.py b/src/pyim/annotate/annotators/base.py index b740703..44bc6dd 100644 --- a/src/pyim/annotate/annotators/base.py +++ b/src/pyim/annotate/annotators/base.py @@ -4,7 +4,7 @@ import operator from pathlib import Path -from frozendict import frozendict +from pyim.util.frozendict import frozendict import numpy as np import toolz diff --git a/src/pyim/annotate/annotators/window.py b/src/pyim/annotate/annotators/window.py index fdedb89..5b7e6a6 100644 --- a/src/pyim/annotate/annotators/window.py +++ b/src/pyim/annotate/annotators/window.py @@ -2,7 +2,7 @@ import itertools from pathlib import Path -from frozendict import frozendict +from pyim.util.frozendict import frozendict from tqdm import tqdm import toolz diff --git a/src/pyim/cis/callers/cimpl.py b/src/pyim/cis/callers/cimpl.py index 14de900..e7d2c67 100644 --- a/src/pyim/cis/callers/cimpl.py +++ b/src/pyim/cis/callers/cimpl.py @@ -100,7 +100,7 @@ def call(self, insertions): scales=robjects.vectors.IntVector(self._scales), n_iterations=self._iterations, lhc_method=self._lhc_method, - threads=self._threads, + cores=self._threads, BSgenome=genome_obj, chromosomes=robjects.vectors.StrVector(self._chromosomes), verbose=1) diff --git a/src/pyim/cis/util.py b/src/pyim/cis/util.py index d79b363..777a7d5 100644 --- a/src/pyim/cis/util.py +++ b/src/pyim/cis/util.py @@ -1,7 +1,7 @@ import itertools import operator -from frozendict import frozendict +from pyim.util.frozendict import frozendict import numpy as np import toolz diff --git a/src/pyim/main/pyim_cis.py b/src/pyim/main/pyim_cis.py index d71b0eb..76cde95 100644 --- a/src/pyim/main/pyim_cis.py +++ b/src/pyim/main/pyim_cis.py @@ -1,6 +1,6 @@ import argparse -from frozendict import frozendict +from pyim.util.frozendict import frozendict import toolz from pyim.cis import get_callers diff --git a/src/pyim/model.py b/src/pyim/model.py index f4843ac..0dead5c 100644 --- a/src/pyim/model.py +++ b/src/pyim/model.py @@ -2,7 +2,7 @@ import collections -from frozendict import frozendict +from pyim.util.frozendict import frozendict import numpy as np import pandas as pd import toolz diff --git a/src/pyim/util/frozendict.py b/src/pyim/util/frozendict.py new file mode 100644 index 0000000..2cab983 --- /dev/null +++ b/src/pyim/util/frozendict.py @@ -0,0 +1,89 @@ +""" +Frozendict implementation (version 1.2), obtained from +slezica/python-frozendict on Github +(https://github.com/slezica/python-frozendict). + +Copyright (c) 2012 Santiago Lezica + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import collections +import operator +import functools +import sys + +try: + from collections import OrderedDict +except ImportError: # python < 2.7 + OrderedDict = NotImplemented + +iteritems = getattr(dict, 'iteritems', dict.items) # py2-3 compatibility + + +class frozendict(collections.Mapping): + """ + An immutable wrapper around dictionaries that implements the complete + :py:class:`collections.Mapping` interface. It can be used as a drop-in + replacement for dictionaries where immutability is desired. + """ + + dict_cls = dict + + def __init__(self, *args, **kwargs): + self._dict = self.dict_cls(*args, **kwargs) + self._hash = None + + def __getitem__(self, key): + return self._dict[key] + + def __contains__(self, key): + return key in self._dict + + def copy(self, **add_or_replace): + return self.__class__(self, **add_or_replace) + + def __iter__(self): + return iter(self._dict) + + def __len__(self): + return len(self._dict) + + def __repr__(self): + return '<%s %r>' % (self.__class__.__name__, self._dict) + + def __hash__(self): + if self._hash is None: + h = 0 + for key, value in iteritems(self._dict): + h ^= hash((key, value)) + self._hash = h + return self._hash + + +class FrozenOrderedDict(frozendict): + """ + A frozendict subclass that maintains key order + """ + + dict_cls = OrderedDict + + +if OrderedDict is NotImplemented: + del FrozenOrderedDict diff --git a/tests/pyim/annotate/annotators/conftest.py b/tests/pyim/annotate/annotators/conftest.py index 9ea8894..ec964a0 100644 --- a/tests/pyim/annotate/annotators/conftest.py +++ b/tests/pyim/annotate/annotators/conftest.py @@ -1,5 +1,5 @@ import pytest -from frozendict import frozendict +from pyim.util.frozendict import frozendict from pyim.model import Insertion, CisSite From 072c7464a5d795940893f33a444f03bb4e9ed201 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 21 Mar 2017 11:31:16 +0100 Subject: [PATCH 081/100] Renamed single pipeline to ShearSplink. 
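
Besides the rename, run() now takes an output directory and writes its own
insertions.txt instead of yielding Insertion objects to the caller, and it
accepts an optional second reads file (which the single-end ShearSplink
pipelines reject). A rough sketch of the new calling convention, with
hypothetical paths and a pipeline instance assumed to have been built via
from_args() or the constructor:

    from pathlib import Path

    # `pipeline` is assumed to be a ShearSplinkPipeline constructed
    # elsewhere, e.g. ShearSplinkPipeline.from_args(args) as done in
    # pyim_align.main.
    pipeline.run(reads_path=Path('sample.fastq.gz'),
                 output_dir=Path('out/sample'))

    # Results are written to out/sample/insertions.txt rather than returned.
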
--- src/pyim/align/pipelines/__init__.py | 2 +- src/pyim/align/pipelines/base.py | 21 ++-- src/pyim/align/pipelines/paired.py | 26 ---- .../pipelines/{single.py => shear_splink.py} | 78 ++++++++---- src/pyim/external/bowtie2.py | 4 +- src/pyim/external/cutadapt.py | 4 +- src/pyim/external/util.py | 111 ++++++++++++------ src/pyim/main/pyim_align.py | 9 +- tests/pyim/external/test_util.py | 18 +-- 9 files changed, 164 insertions(+), 109 deletions(-) delete mode 100644 src/pyim/align/pipelines/paired.py rename src/pyim/align/pipelines/{single.py => shear_splink.py} (79%) diff --git a/src/pyim/align/pipelines/__init__.py b/src/pyim/align/pipelines/__init__.py index 64e4aed..6b387e1 100644 --- a/src/pyim/align/pipelines/__init__.py +++ b/src/pyim/align/pipelines/__init__.py @@ -1,2 +1,2 @@ from .base import Pipeline, get_pipelines, register_pipeline -from .single import SinglePipeline, SingleMultiplexedPipeline +from .shear_splink import ShearSplinkPipeline, MultiplexedShearSplinkPipeline diff --git a/src/pyim/align/pipelines/base.py b/src/pyim/align/pipelines/base.py index 5f53c18..5e805ce 100644 --- a/src/pyim/align/pipelines/base.py +++ b/src/pyim/align/pipelines/base.py @@ -13,22 +13,27 @@ def get_pipelines(): class Pipeline(abc.ABC): + """Base pipeline class.""" + def __init__(self): pass @abc.abstractclassmethod def configure_args(cls, parser): + """Configures argument parser for the pipeline.""" parser.add_argument('--reads', type=Path, required=True) - parser.add_argument('--output', type=Path, required=True) - - @abc.abstractclassmethod - def extract_args(cls, args): - raise NotImplementedError() + parser.add_argument('--reads2', type=Path, required=False) + parser.add_argument('--output_dir', type=Path, required=True) @classmethod def from_args(cls, args): - return cls(**cls.extract_args(args)) + """Builds a pipeline instance from the given arguments.""" + return cls(**cls._extract_args(args)) @abc.abstractclassmethod - def run(self, reads_path, work_dir): - raise NotImplementedError() + def _extract_args(cls, args): + """Extract arguments from args for from_args.""" + + @abc.abstractmethod + def run(self, reads_path, output_dir, reads2_path=None): + """Runs the pipeline with the given input.""" diff --git a/src/pyim/align/pipelines/paired.py b/src/pyim/align/pipelines/paired.py deleted file mode 100644 index 5e528f4..0000000 --- a/src/pyim/align/pipelines/paired.py +++ /dev/null @@ -1,26 +0,0 @@ -import abc -from pathlib import Path - -from pyim.util.path import build_path - -from ..external.cutadapt import cutadapt -from .base import Pipeline - - -class PairedPipeline(Pipeline): - @abc.abstractclassmethod - def configure_args(cls, parser): - parser.add_argument('--reads', type=Path, required=True) - parser.add_argument('--output', type=Path, required=True) - - @abc.abstractclassmethod - def from_args(cls, args): - raise NotImplementedError() - - @abc.abstractclassmethod - def run(self, reads_path, work_dir): - raise NotImplementedError() - - def extract_genomic(self, reads_path, output_base): - # Ensure output dir exists. 
- output_base.parent.mkdir(exist_ok=True) diff --git a/src/pyim/align/pipelines/single.py b/src/pyim/align/pipelines/shear_splink.py similarity index 79% rename from src/pyim/align/pipelines/single.py rename to src/pyim/align/pipelines/shear_splink.py index e6a1acb..9beaed9 100644 --- a/src/pyim/align/pipelines/single.py +++ b/src/pyim/align/pipelines/shear_splink.py @@ -7,14 +7,16 @@ import pandas as pd from pyim.external import bowtie2 -from pyim.external.util import flatten_options -from pyim.util.path import build_path +from pyim.external.util import flatten_arguments +from pyim.model import Insertion from ..common import genomic as cm_gen, insertions as cm_ins from .base import Pipeline, register_pipeline -class SinglePipeline(Pipeline): +class ShearSplinkPipeline(Pipeline): + """ShearSplink pipeline.""" + def __init__(self, transposon_path, bowtie_index_path, @@ -71,7 +73,7 @@ def configure_args(cls, parser): parser.add_argument('--linker_overlap', default=3, type=int) @classmethod - def extract_args(cls, args): + def _extract_args(cls, args): bowtie_options = {'--local': args.local} min_overlaps = { @@ -99,11 +101,15 @@ def extract_args(cls, args): min_overlaps=min_overlaps, error_rates=error_rates) - def run(self, reads_path, work_dir): + def run(self, reads_path, output_dir, reads2_path=None): + if reads2_path is not None: + raise ValueError('Pipeline does not support paired-end data') + logger = logging.getLogger() # Extract genomic sequences and align to reference. - alignment_path = self._extract_and_align(reads_path, work_dir, logger) + alignment_path = self._extract_and_align(reads_path, output_dir, + logger) # Extract alignment groups (grouped by position) from bam file. logger.info('Summarizing alignments') @@ -118,12 +124,20 @@ def run(self, reads_path, work_dir): logger.info(' %-18s: %d', 'Minimum support', self._min_support) logger.info(' %-18s: %d', 'Merge distance', self._merge_distance) - yield from cm_ins.convert_groups_to_insertions( + insertions = cm_ins.convert_groups_to_insertions( aln_summary, min_support=self._min_support, merge_distance=self._merge_distance) - def _extract_and_align(self, reads_path, work_dir, logger): + # Write insertions to output file. + insertion_path = output_dir / 'insertions.txt' + + ins_frame = Insertion.to_frame(insertions) + ins_frame.to_csv(str(insertion_path), sep='\t', index=False) + + def _extract_and_align(self, reads_path, output_dir, logger): + output_dir.mkdir(exist_ok=True, parents=True) + # Extract genomic sequences. 
logger.info('Extracting genomic sequences') logger.info(' %-18s: %s', 'Transposon', @@ -133,8 +147,7 @@ def _extract_and_align(self, reads_path, work_dir, logger): shorten_path(self._contaminant_path)) logger.info(' %-18s: %s', 'Minimum length', self._min_length) - genomic_path = build_path(reads_path, dir_=work_dir, suffix='.genomic') - genomic_path.parent.mkdir(exist_ok=True, parents=True) + genomic_path = output_dir / ('genomic' + reads_path.suffixes[-1]) cm_gen.extract_genomic( reads_path, @@ -150,10 +163,9 @@ def _extract_and_align(self, reads_path, work_dir, logger): logger.info('Aligning to reference') logger.info(' %-18s: %s', 'Reference', shorten_path(self._index_path)) logger.info(' %-18s: %s', 'Bowtie options', - flatten_options(self._bowtie_options)) + flatten_arguments(self._bowtie_options)) - alignment_path = build_path(reads_path, dir_=work_dir, ext='.bam') - alignment_path.parent.mkdir(exist_ok=True, parents=True) + alignment_path = output_dir / 'alignment.bam' bowtie2.bowtie2( [genomic_path], @@ -165,10 +177,12 @@ def _extract_and_align(self, reads_path, work_dir, logger): return alignment_path -register_pipeline(name='single', pipeline=SinglePipeline) +register_pipeline(name='shearsplink', pipeline=ShearSplinkPipeline) + +class MultiplexedShearSplinkPipeline(ShearSplinkPipeline): + """ShearSplink pipeline with multiplexed reads.""" -class SingleMultiplexedPipeline(SinglePipeline): def __init__(self, transposon_path, bowtie_index_path, @@ -208,8 +222,8 @@ def configure_args(cls, parser): '--barcode_mapping', required=False, type=Path, default=None) @classmethod - def extract_args(cls, args): - arg_dict = super().extract_args(args) + def _extract_args(cls, args): + arg_dict = super()._extract_args(args) if args.barcode_mapping is not None: map_df = pd.read_csv(args.barcode_mapping, sep='\t') @@ -222,11 +236,15 @@ def extract_args(cls, args): return arg_dict - def run(self, reads_path, work_dir): + def run(self, reads_path, output_dir, reads2_path=None): + if reads2_path is not None: + raise ValueError('Pipeline does not support paired-end data') + logger = logging.getLogger() # Extract genomic sequences and align to reference. - alignment_path = self._extract_and_align(reads_path, work_dir, logger) + alignment_path = self._extract_and_align(reads_path, output_dir, + logger) # Map reads to specific barcodes/samples. logger.info('Extracting barcode/sample mapping') @@ -249,7 +267,7 @@ def run(self, reads_path, work_dir): # adding sample name and sample prefix to the ID. logger.info('Converting to insertions') logger.info(' %-18s: %d', 'Minimum support', self._min_support) - logger.info(' %-18s: %d', 'Merge distance', self._merge_distance) + logger.info(' %-18s: %s', 'Merge distance', self._merge_distance) insertion_grps = ( cm_ins.convert_summary_to_insertions( @@ -261,7 +279,13 @@ def run(self, reads_path, work_dir): for barcode, aln_summ in aln_summaries.items()) # yapf: disable # Return concatenated list of insertions. - yield from itertools.chain.from_iterable(insertion_grps) + insertions = itertools.chain.from_iterable(insertion_grps) + + # Write insertions to output file. + insertion_path = output_dir / 'insertions.txt' + + ins_frame = Insertion.to_frame(insertions) + ins_frame.to_csv(str(insertion_path), sep='\t', index=False) def _get_barcode_mapping(self, reads_path): # Read barcode sequences. 
@@ -275,9 +299,15 @@ def _get_barcode_mapping(self, reads_path): register_pipeline( - name='single-multiplexed', pipeline=SingleMultiplexedPipeline) + name='shearsplink-multiplexed', pipeline=MultiplexedShearSplinkPipeline) def shorten_path(file_name, limit=40): - f = os.path.split(str(file_name))[1] - return "%s~%s" % (f[:3], f[-(limit - 3):]) if len(f) > limit else f + """Shorten path for str to limit for logging.""" + + name = os.path.split(str(file_name))[1] + + if len(name) > limit: + return "%s~%s" % (name[:3], name[-(limit - 3):]) + else: + return name diff --git a/src/pyim/external/bowtie2.py b/src/pyim/external/bowtie2.py index 8be223c..c1b5ac7 100644 --- a/src/pyim/external/bowtie2.py +++ b/src/pyim/external/bowtie2.py @@ -1,3 +1,5 @@ +"""Module with functions for calling bowtie2.""" + import sys from . import util as shell @@ -41,7 +43,7 @@ def bowtie2(in1_paths, options['-x'] = str(index_path) # Build bowtie2 arguments. - bowtie_args = ['bowtie2'] + shell.flatten_options(options) + bowtie_args = ['bowtie2'] + shell.flatten_arguments(options) # Sort arguments for samtools. sort_args = ['samtools', 'sort', '-o', str(output_path), '-'] diff --git a/src/pyim/external/cutadapt.py b/src/pyim/external/cutadapt.py index 54aa366..ebb87fb 100644 --- a/src/pyim/external/cutadapt.py +++ b/src/pyim/external/cutadapt.py @@ -1,3 +1,5 @@ +"""Module with functions for calling cutadapt.""" + import itertools from pathlib import Path import shutil @@ -19,7 +21,7 @@ def cutadapt(in1_path, out1_path, options, in2_path=None, out2_path=None): if out2_path is not None: options['-p'] = str(out2_path) - cmdline_args = shell.flatten_options(options) + cmdline_args = shell.flatten_arguments(options) cmdline_args = ['cutadapt'] + cmdline_args + [str(in1_path)] if in2_path is not None: diff --git a/src/pyim/external/util.py b/src/pyim/external/util.py index a5f393e..0605e7e 100644 --- a/src/pyim/external/util.py +++ b/src/pyim/external/util.py @@ -1,12 +1,36 @@ +"""Utility module for running with external commands.""" + import subprocess -def run(arguments, stdout=None, stderr=None, check=True): +def run(args, stdout=None, stderr=None, check=True): + """Runs command for given arguments. + + Compared to the subprocess.run command, this function adds + automatic opening/closing of any file paths passed to stdout/stderr. + + Parameters + ---------- + args : List[str] + Arguments for launching the process, passed as a list. + stdout and stderr: Union[Path, int] + These specify the executed programs' standard + input, standard output and standard error file handles, respectively. + check : bool + Whether to check the returncode of the process. + + Returns + ------- + subprocess.Popen + Handle to completed process. + + """ + stdout_ = _open_stdstream(stdout) stderr_ = _open_stdstream(stderr) try: - process = subprocess.Popen(arguments, stdout=stdout_, stderr=stderr_) + process = subprocess.Popen(args, stdout=stdout_, stderr=stderr_) process.wait() finally: for std in [stdout_, stderr_]: @@ -14,18 +38,34 @@ def run(arguments, stdout=None, stderr=None, check=True): # Check return code. 
if check and process.returncode != 0: - raise ValueError('Process terminated with errorcode {}' - .format(process.returncode)) + stderr_msg = process.stderr.read().decode() + raise ValueError('Process terminated with errorcode {}\n\n' + 'Output from stderr:\n\n' + .format(process.returncode) + stderr_msg) return process -def run_piped(arguments_list, stdout=None, stderrs=None, check=True): - if len(arguments_list) < 2: +def _open_stdstream(file_path, mode='w'): + if file_path is None: + return subprocess.PIPE + else: + return file_path.open(mode) + + +def _close_stdstream(stdstream): + if stdstream != subprocess.PIPE: + stdstream.close() + + +def run_piped(args_list, stdout=None, stderrs=None, check=True): + """Runs piped commands for given argument lists.""" + + if len(args_list) < 2: raise ValueError('At least two sets of arguments should be given') if stderrs is None: - stderrs = [None] * len(arguments_list) + stderrs = [None] * len(args_list) # Handle processes 1 to n-1. processes = [] @@ -33,7 +73,7 @@ def run_piped(arguments_list, stdout=None, stderrs=None, check=True): try: prev_out = None - for arg_list, stderr in zip(arguments_list[:-1], stderrs[:-1]): + for arg_list, stderr in zip(args_list[:-1], stderrs[:-1]): # Setup processes. stderr_fh = _open_stdstream(stderr) stream_handles.append(stderr_fh) @@ -52,57 +92,58 @@ def run_piped(arguments_list, stdout=None, stderrs=None, check=True): stderr_fh = _open_stdstream(stderrs[-1]) stream_handles += [stdout_fh, stderr_fh] - process = subprocess.Popen( - arguments_list[-1], - stdout=stdout_fh, - stderr=stderr_fh, - stdin=prev_out) + final_process = subprocess.Popen( + args_list[-1], stdout=stdout_fh, stderr=stderr_fh, stdin=prev_out) - processes.append(process) + processes.append(final_process) # Allow pi to receive a SIGPIPE. - for p in processes[:-1]: - p.stdout.close() + for process in processes[:-1]: + process.stdout.close() - process.wait() + final_process.wait() # Check return codes. if check: - if process.returncode != 0: - raise ValueError('Process terminated with errorcode {}' - .format(process.returncode)) + if final_process.returncode != 0: + stderr_msg = final_process.stderr.read().decode() + raise ValueError('Process terminated with errorcode {}\n\n' + 'Output from stderr:\n\n'.format( + final_process.returncode) + stderr_msg) finally: # Close all file handles. - for fh in stream_handles: - _close_stdstream(fh) + for handle in stream_handles: + _close_stdstream(handle) return processes -def _open_stdstream(file_path, mode='w'): - if file_path is None: - return subprocess.PIPE - else: - return file_path.open(mode) - +def flatten_arguments(arg_dict): + """Flattens a dict of options into an argument list. -def _close_stdstream(stdstream): - if stdstream != subprocess.PIPE: - stdstream.close() + Parameters + ---------- + arg_dict : Dict[Str, Any] + Dictionary of arguments. Keys should be strings, values may be + lists or tuples (for multiple values), booleans (for flags) + or any other value that can be converted to a string. + Returns + ------- + List[str] + List of flattened arguments. -def flatten_options(option_dict): - """Flattens a dict of options into an argument list.""" + """ # Iterate over keys in lexical order, so that we have a # reproducible order of iteration (useful for tests). - opt_names = sorted(option_dict.keys()) + opt_names = sorted(arg_dict.keys()) # Flatten values. 
options = [] for opt_name in opt_names: - opt_value = option_dict[opt_name] + opt_value = arg_dict[opt_name] if isinstance(opt_value, (tuple, list)): options += [opt_name] + [str(v) for v in opt_value] diff --git a/src/pyim/main/pyim_align.py b/src/pyim/main/pyim_align.py index b73b9b1..5f51696 100644 --- a/src/pyim/main/pyim_align.py +++ b/src/pyim/main/pyim_align.py @@ -17,12 +17,9 @@ def main(): # Run pipeline. pipeline = args.pipeline.from_args(args) - insertions = pipeline.run(reads_path=args.reads, - work_dir=args.output.parent) - - # Write insertions to output file. - ins_frame = Insertion.to_frame(insertions) - ins_frame.to_csv(str(args.output), sep='\t', index=False) + pipeline.run(reads_path=args.reads, + output_dir=args.output_dir, + reads2_path=args.reads2) def parse_args(): diff --git a/tests/pyim/external/test_util.py b/tests/pyim/external/test_util.py index a52ff3f..ba9fb66 100644 --- a/tests/pyim/external/test_util.py +++ b/tests/pyim/external/test_util.py @@ -4,6 +4,8 @@ class TestRun(object): + """Unit tests for the run function.""" + def test_simple(self): """Tests a simple command.""" process = util.run(['echo', 'test']) @@ -48,12 +50,14 @@ def test_stdout(self, tmpdir): assert log_file.read() == b'testing\n' -class TestFlattenOptions(object): +class TestFlattenArguments(object): + """Unit tests for the flatten_arguments function.""" + def test_simple(self): """Tests a basic set of options.""" options = OrderedDict([('--opt1', 'a'), ('--opt2', 'b')]) - result = util.flatten_options(options) + result = util.flatten_arguments(options) assert result == ['--opt1', 'a', '--opt2', 'b'] @@ -61,29 +65,29 @@ def test_non_str_type(self): """Tests options with a non-string type.""" options = {'--opt1': 1} - assert util.flatten_options(options) == ['--opt1', '1'] + assert util.flatten_arguments(options) == ['--opt1', '1'] def test_list(self): """Tests options with a list value.""" options = OrderedDict([('--opt1', ['a', 'b']), ('--opt2', 'c')]) - result = util.flatten_options(options) + result = util.flatten_arguments(options) assert result == ['--opt1', 'a', 'b', '--opt2', 'c'] def test_flag_true(self): """Tests options with positive flag.""" options = {'--opt1': True} - assert util.flatten_options(options) == ['--opt1'] + assert util.flatten_arguments(options) == ['--opt1'] def test_flag_false(self): """Tests options with negative flag.""" options = {'--opt1': False} - assert util.flatten_options(options) == [] + assert util.flatten_arguments(options) == [] def test_flag_none(self): """Tests options with negative (None) flag.""" options = {'--opt1': None} - assert util.flatten_options(options) == [] + assert util.flatten_arguments(options) == [] From af7cc2e1b3ff92c2e99f801dfed4ca8644c13036 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 21 Mar 2017 11:31:28 +0100 Subject: [PATCH 082/100] Added dev environment file. 
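
The development environment can typically be created with
"conda env create -f environment.yml". The expanded docstrings in
pyim.external.util describe the subprocess helpers; a small usage sketch of
the documented interface follows (file names are hypothetical, and the input
file must exist for the first command to succeed):

    from pathlib import Path

    from pyim.external import util

    # Pipe `gzip -cd` into `wc -l`, writing the line count to a file.
    processes = util.run_piped(
        [['gzip', '-cd', 'reads.fastq.gz'], ['wc', '-l']],
        stdout=Path('line_count.txt'))

    # run() behaves the same way for a single command.
    util.run(['echo', 'done'], stdout=Path('run.log'))
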
--- environment.yml | 33 +++++++++++++++++++++++++++++++++ src/pyim/external/util.py | 27 +++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 4 deletions(-) create mode 100644 environment.yml diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..9421dc9 --- /dev/null +++ b/environment.yml @@ -0,0 +1,33 @@ +name: pyim-dev +channels: + - bioconda + - r + - defaults + - conda-forge + - jrderuiter +dependencies: + # Basic dependencies + - python=3.5.* + - setuptools + - pandas >=0.18.1 + - pyfaidx >=0.4.8.1 + - pysam >=0.9.1 + - toolz >=0.8.0 + - tqdm + - intervaltree + - cutadapt =1.12 + - bowtie2 =2.3.0 + + # R dependencies + - r-base =3.3.1 + - r-cimpl + - rpy2 >=2.8.2 + + # Dev dependencies + - pytest + - pytest-cov =2.3.1 + - pytest-mock =1.1 + - python-coveralls =2.9.* + + - pip: + - pytest-helpers-namespace diff --git a/src/pyim/external/util.py b/src/pyim/external/util.py index 0605e7e..6645294 100644 --- a/src/pyim/external/util.py +++ b/src/pyim/external/util.py @@ -13,9 +13,10 @@ def run(args, stdout=None, stderr=None, check=True): ---------- args : List[str] Arguments for launching the process, passed as a list. - stdout and stderr: Union[Path, int] - These specify the executed programs' standard - input, standard output and standard error file handles, respectively. + stdout : Union[Path, int] + Specifies the standard output handle for the process. + stdout : Union[Path, int] + Specifies the standard error handle for the processe. check : bool Whether to check the returncode of the process. @@ -59,7 +60,25 @@ def _close_stdstream(stdstream): def run_piped(args_list, stdout=None, stderrs=None, check=True): - """Runs piped commands for given argument lists.""" + """Runs piped command for given list of arguments. + + Parameters + ---------- + args : List[str] + Arguments for launching the process, passed as a list. + stdout : Union[Path, int] + Specifies the standard output handle for the final process. + stdout : List[Union[Path, int]] + Specifies the standard error handles for the processes. + check : bool + Whether to check the returncode of the process. + + Returns + ------- + subprocess.Popen + Handle to completed process. + + """ if len(args_list) < 2: raise ValueError('At least two sets of arguments should be given') From 36e2bb62e52da9cd2bd7a7df734c8f9e2b5410d7 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 21 Mar 2017 12:23:31 +0100 Subject: [PATCH 083/100] Bump version number. --- conda/meta.yaml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index f35b6b4..d405faa 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.2.0.dev0" %} +{% set version = "0.2.0" %} package: name: pyim diff --git a/setup.py b/setup.py index 35e9833..408407a 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setup( name='pyim', - version='0.2.0.dev0', + version='0.2.0', description="Tools for analyzing insertional mutagenesis data.", long_description=readme + '\n\n' + history, author="Julian de Ruiter", From 7bb7ebd2d989598a06c4bb647e4109467e78caf8 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 21 Mar 2017 13:07:08 +0100 Subject: [PATCH 084/100] Add lower bounds for dependencies. 
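
An optional way to check that an existing installation satisfies the new
lower bounds is pkg_resources; the requirement strings below mirror setup.py
and this snippet is purely illustrative, not part of the package:

    import pkg_resources

    # Raises VersionConflict or DistributionNotFound if any bound is unmet.
    pkg_resources.require([
        'pyfaidx>=0.4.8.1', 'intervaltree>=2.1', 'tqdm>=4.7', 'toolz>=0.8',
        'rpy2>=2.8.2', 'pandas>=0.18', 'pysam>=0.9'])
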
--- conda/meta.yaml | 40 ++++++++++++++++++++++------------------ setup.py | 17 +++++++++-------- 2 files changed, 31 insertions(+), 26 deletions(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index d405faa..775a534 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -16,38 +16,42 @@ build: requirements: build: - - python - - setuptools - # Basic dependencies - - pandas >=0.18.1 + - python >=3.3 + - setuptools + - pandas >=0.18 - pyfaidx >=0.4.8.1 - - pysam >=0.9.1 - - toolz >=0.8.0 - - tqdm - - intervaltree - - cutadapt + - pysam >=0.9 + - toolz >=0.8 + - tqdm >=4.7 + - intervaltree >=2.1 + - cutadapt >=1.8 + + # Fix htslib version for pysam + - htslib >=1.3,<1.4 # R dependencies - r - - r-cimpl + - r-cimpl >=1.1 - rpy2 >=2.8.2 run: - - python - # Basic dependencies - - pandas >=0.18.1 + - python >=3.3 + - pandas >=0.18 - pyfaidx >=0.4.8.1 - - pysam >=0.9.1 - - toolz >=0.8.0 - - tqdm + - pysam >=0.9 + - toolz >=0.8 + - tqdm >=4.7 - intervaltree - - cutadapt + - cutadapt >=1.8 + + # Fix htslib version for pysam + - htslib >=1.3,<1.4 # R dependencies - r - - r-cimpl + - r-cimpl >=1.1 - rpy2 >=2.8.2 # External dependencies diff --git a/setup.py b/setup.py index 408407a..952609c 100644 --- a/setup.py +++ b/setup.py @@ -9,8 +9,9 @@ with open('HISTORY.rst') as history_file: history = history_file.read() -requirements = ['pyfaidx', 'intervaltree', 'tqdm', 'toolz', 'rpy2', 'numpy', - 'pandas', 'pysam'] +requirements = ['pyfaidx>=0.4.8.1', 'intervaltree>=2.1', 'tqdm>=4.7', + 'toolz>=0.8', 'rpy2>=2.8.2', 'numpy', 'pandas>=0.18', + 'pysam>=0.9'] test_requirements = ['pytest', 'pytest-cov', 'pytest-mock', 'pytest-helpers-namespace', 'python-coveralls'] @@ -18,24 +19,24 @@ setup( name='pyim', version='0.2.0', - description="Tools for analyzing insertional mutagenesis data.", + description=('Tool for identifying transposon insertions ' + 'from targeted DNA-sequencing data.'), long_description=readme + '\n\n' + history, - author="Julian de Ruiter", + author='Julian de Ruiter', author_email='julianderuiter@gmail.com', url='https://github.com/jrderuiter/pyim', packages=find_packages('src'), package_dir={'': 'src'}, include_package_data=True, install_requires=requirements, - license="MIT license", + license='MIT license', zip_safe=False, keywords='pyim', classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'Intended Audience :: Developers', + 'Development Status :: 4 - Beta', + 'Intended Audience :: Science/Research', 'License :: OSI Approved :: MIT License', 'Natural Language :: English', - 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', From 86390fc24196dd3904a3916a387946cd5965b074 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 21 Mar 2017 13:07:23 +0100 Subject: [PATCH 085/100] Add documentation. --- tests/pyim/external/test_util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/pyim/external/test_util.py b/tests/pyim/external/test_util.py index ba9fb66..627ff33 100644 --- a/tests/pyim/external/test_util.py +++ b/tests/pyim/external/test_util.py @@ -27,6 +27,8 @@ def test_stdout(self, tmpdir): class TestRunPiped(object): + """Unit tests for the run_piped function.""" + def test_simple(self): """Tests a simple piped command.""" From ead7e31c8d54410d85160e2de61aefe3a6a4374f Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 21 Mar 2017 14:27:33 +0100 Subject: [PATCH 086/100] Styling + renaming of parameters. 
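
The external tool wrappers now use read-oriented parameter names (read_path,
read2_path, read_paths, read2_paths) instead of the in1/in2 style. A short
sketch of the renamed cutadapt wrapper with hypothetical file names; option
values follow the format expected by flatten_arguments:

    from pathlib import Path

    from pyim.external.cutadapt import cutadapt

    # Trim a transposon sequence and keep only reads that contained it.
    cutadapt(
        read_path=Path('sample.fastq.gz'),
        out_path=Path('sample.genomic.fastq.gz'),
        options={'-g': 'file:transposon.fa',
                 '--discard-untrimmed': True,
                 '--minimum-length': 15})
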
--- src/pyim/align/common/genomic.py | 51 ++++++++++++++++-------- src/pyim/align/common/insertions.py | 2 +- src/pyim/align/pipelines/shear_splink.py | 12 +++--- src/pyim/external/bowtie2.py | 29 ++++++++------ src/pyim/external/cutadapt.py | 24 +++++------ tests/pyim/external/test_bowtie2.py | 20 +++++----- tests/pyim/external/test_cutadapt.py | 18 ++++----- 7 files changed, 91 insertions(+), 65 deletions(-) diff --git a/src/pyim/align/common/genomic.py b/src/pyim/align/common/genomic.py index c116a11..ede2f8a 100644 --- a/src/pyim/align/common/genomic.py +++ b/src/pyim/align/common/genomic.py @@ -9,14 +9,20 @@ def extract_genomic(reads_path, - output_path, + output_dir, transposon_path, linker_path=None, contaminant_path=None, min_length=None, min_overlaps=None, error_rates=None): - """Extracts genomic sequences from single-read data.""" + """Extracts genomic sequences from single-read data. + + Process reads of the following structure: + + [Transposon-Genomic-Linker] + + """ logger = logging.getLogger() @@ -24,14 +30,16 @@ def extract_genomic(reads_path, error_rates = error_rates or {} # Ensure output dir exists. - output_path.parent.mkdir(exist_ok=True) + output_dir.mkdir(exist_ok=True) + + suffix = _extract_suffix(reads_path) # Track interim files for cleaning. interim_files = [] if contaminant_path is not None: # Remove contaminants. - contaminant_out_path = build_path(output_path, suffix='.contaminant') + contaminant_out_path = output_dir / ('filt_contaminant' + suffix) contaminant_opts = { '-g': 'file:' + str(contaminant_path), '--discard-trimmed': True, @@ -39,9 +47,9 @@ def extract_genomic(reads_path, '-e': error_rates.get('contaminant', DEFAULT_ERROR_RATE) } - p = cutadapt(reads_path, contaminant_out_path, contaminant_opts) + process = cutadapt(reads_path, contaminant_out_path, contaminant_opts) logger.info('Trimmed contaminant sequences' + - cutadapt_summary(p.stdout)) # yapf: disable + cutadapt_summary(process.stdout)) # yapf: disable interim_files.append(contaminant_out_path) else: @@ -49,7 +57,7 @@ def extract_genomic(reads_path, if linker_path is not None: # Remove linker. - linker_out_path = build_path(output_path, suffix='.linker') + linker_out_path = output_dir / ('filt_linker' + suffix) linker_opts = { '-a': 'file:' + str(linker_path), '--discard-untrimmed': True, @@ -57,9 +65,9 @@ def extract_genomic(reads_path, '-e': error_rates.get('linker', DEFAULT_ERROR_RATE) } - p = cutadapt(contaminant_out_path, linker_out_path, linker_opts) + process = cutadapt(contaminant_out_path, linker_out_path, linker_opts) logger.info('Trimmed linker sequence' + - cutadapt_summary(p.stdout)) # yapf: disable + cutadapt_summary(process.stdout)) # yapf: disable interim_files.append(linker_out_path) else: @@ -76,13 +84,24 @@ def extract_genomic(reads_path, if min_length is not None: transposon_opts['--minimum-length'] = min_length - p = cutadapt(linker_out_path, output_path, transposon_opts) + genomic_path = output_dir / ('genomic' + suffix) + process = cutadapt(linker_out_path, genomic_path, transposon_opts) logger.info('Trimmed transposon sequence and filtered for length' + - cutadapt_summary(p.stdout)) # yapf: disable + cutadapt_summary(process.stdout)) # yapf: disable # Clean-up interim files. 
- for fp in interim_files: - fp.unlink() + for file_path in interim_files: + file_path.unlink() + + return genomic_path + + +def _extract_suffix(file_path): + if file_path.suffixes[-1] == '.gz': + suffix = ''.join(file_path.suffixes[-2:]) + else: + suffix = file_path.suffixes[-1] + return suffix def extract_genomic_paired(reads_paths, @@ -112,7 +131,7 @@ def extract_genomic_paired(reads_paths, contaminant_opts = {'-g': 'file:' + str(contaminant_path), '--discard-trimmed': True} cutadapt(in1_path, cont1_path, contaminant_opts, - in2_path=in2_path, out2_path=out2_path) # yapf: disable + reads2_path=in2_path, out2_path=out2_path) # yapf: disable interim_files += [cont1_path, cont2_path] else: @@ -126,7 +145,7 @@ def extract_genomic_paired(reads_paths, linker_opts = {'-A': 'file:' + str(linker_path), '--discard-untrimmed': True} cutadapt(cont1_path, link1_path, linker_opts, - in2_path=cont2_path, out2_path=link2_path) # yapf: disable + reads2_path=cont2_path, out2_path=link2_path) # yapf: disable interim_files += [link1_path, link2_path] else: @@ -140,7 +159,7 @@ def extract_genomic_paired(reads_paths, transposon_opts['--minimum-length'] = min_length cutadapt(link1_path, out1_path, transposon_opts, - in2_path=link2_path, out2_path=out2_path) # yapf: disable + reads2_path=link2_path, out2_path=out2_path) # yapf: disable # Clean-up intermediary files. for fp in interim_files: diff --git a/src/pyim/align/common/insertions.py b/src/pyim/align/common/insertions.py index c42a161..398680a 100644 --- a/src/pyim/align/common/insertions.py +++ b/src/pyim/align/common/insertions.py @@ -3,12 +3,12 @@ import logging import operator -from pyim.util.frozendict import frozendict import numpy as np import pysam import toolz from pyim.model import Insertion +from pyim.util.frozendict import frozendict def fetch_alignments(bam_path, only_primary=True, min_mapq=None): diff --git a/src/pyim/align/pipelines/shear_splink.py b/src/pyim/align/pipelines/shear_splink.py index 9beaed9..00d94ca 100644 --- a/src/pyim/align/pipelines/shear_splink.py +++ b/src/pyim/align/pipelines/shear_splink.py @@ -10,9 +10,13 @@ from pyim.external.util import flatten_arguments from pyim.model import Insertion -from ..common import genomic as cm_gen, insertions as cm_ins +from ..common.genomic import extract_genomic +from ..common import insertions as cm_ins from .base import Pipeline, register_pipeline +DEFAULT_OVERLAP = 3 +DEFAULT_ERROR_RATE = 0.1 + class ShearSplinkPipeline(Pipeline): """ShearSplink pipeline.""" @@ -147,11 +151,9 @@ def _extract_and_align(self, reads_path, output_dir, logger): shorten_path(self._contaminant_path)) logger.info(' %-18s: %s', 'Minimum length', self._min_length) - genomic_path = output_dir / ('genomic' + reads_path.suffixes[-1]) - - cm_gen.extract_genomic( + genomic_path = extract_genomic( reads_path, - genomic_path, + output_dir, transposon_path=self._transposon_path, linker_path=self._linker_path, contaminant_path=self._contaminant_path, diff --git a/src/pyim/external/bowtie2.py b/src/pyim/external/bowtie2.py index c1b5ac7..0c29f7f 100644 --- a/src/pyim/external/bowtie2.py +++ b/src/pyim/external/bowtie2.py @@ -5,39 +5,42 @@ from . import util as shell -def bowtie2(in1_paths, +def bowtie2(read_paths, index_path, output_path, options=None, - in2_paths=None, + read2_paths=None, verbose=False): """ Aligns reads to a reference genome using Bowtie2. Parameters ---------- - in1_paths : List[Path] - Path to input files containings reads. For single read data, - a list of Paths is expected. 
For paired-end sequencing data, - Paths should be passed as a tuple of lists, in which the first - element is taken as #1 mates and the second as #2 mates. + read_paths : List[Path] + Path to input files containing reads. output_path : Path Output path for the aligned (and sorted) bam file. options : dict - Dict of extra options to pass to Bowtie2. + Dict of extra options to pass to Bowtie2. Should conform to the + format expected by flatten_arguments. + read2_paths : List[Path] + Path to input files containing the second end (for paired-end data). + verbose : bool + Whether to print output from bowtie2 to stderr. + """ # Ensure we have a copy of options to work on. options = dict(options) if options is not None else {} # Inject inputs + index into options. - if in2_paths is not None: - options['-1'] = ','.join(str(fp) for fp in in1_paths) - options['-2'] = ','.join(str(fp) for fp in in2_paths) + if read2_paths is not None: + options['-1'] = ','.join(str(fp) for fp in read_paths) + options['-2'] = ','.join(str(fp) for fp in read2_paths) else: - options['-U'] = ','.join(str(fp) for fp in in1_paths) + options['-U'] = ','.join(str(fp) for fp in read_paths) - if any(ext in in1_paths[0].suffixes for ext in {'.fa', '.fna'}): + if any(ext in read_paths[0].suffixes for ext in {'.fa', '.fna'}): options['-f'] = True options['-x'] = str(index_path) diff --git a/src/pyim/external/cutadapt.py b/src/pyim/external/cutadapt.py index ebb87fb..c199ee1 100644 --- a/src/pyim/external/cutadapt.py +++ b/src/pyim/external/cutadapt.py @@ -9,14 +9,14 @@ from . import util as shell -def cutadapt(in1_path, out1_path, options, in2_path=None, out2_path=None): +def cutadapt(read_path, out_path, options, read2_path=None, out2_path=None): """Runs cutadapt using the given options.""" - in1_path = in1_path or '-' + in1_path = read_path or '-' options = dict(options) if options is not None else {} - if out1_path is not None: - options['-o'] = str(out1_path) + if out_path is not None: + options['-o'] = str(out_path) if out2_path is not None: options['-p'] = str(out2_path) @@ -24,13 +24,13 @@ def cutadapt(in1_path, out1_path, options, in2_path=None, out2_path=None): cmdline_args = shell.flatten_arguments(options) cmdline_args = ['cutadapt'] + cmdline_args + [str(in1_path)] - if in2_path is not None: - cmdline_args += [str(in2_path)] + if read2_path is not None: + cmdline_args += [str(read2_path)] return shell.run(cmdline_args) -def demultiplex_samples(reads_path, +def demultiplex_samples(read_path, output_dir, barcode_path, error_rate=0.0, @@ -58,12 +58,12 @@ def demultiplex_samples(reads_path, if sample_mapping is None: # Directly de-multiplex using barcodes. sample_paths = _demultiplex( - reads_path, output_dir, barcode_path, error_rate=error_rate) + read_path, output_dir, barcode_path, error_rate=error_rate) else: # First demultiplex to barcodes in temp dir. tmp_dir = output_dir / '_barcodes' barcode_paths = _demultiplex( - reads_path, tmp_dir, barcode_path, error_rate=error_rate) + read_path, tmp_dir, barcode_path, error_rate=error_rate) # Then rename files using mapping and delete files for unused barcodes. 
sample_paths = {} @@ -85,7 +85,7 @@ def demultiplex_samples(reads_path, return sample_paths -def _demultiplex(reads_path, output_dir, barcode_path, error_rate): +def _demultiplex(read_path, output_dir, barcode_path, error_rate): """Runs cutadapt to de-multiplex reads into seperate files per barcode.""" output_dir.mkdir(parents=True) @@ -94,8 +94,8 @@ def _demultiplex(reads_path, output_dir, barcode_path, error_rate): options = {'-g': 'file:' + str(barcode_path), '--discard-untrimmed': True, '-e': error_rate} - output_base = output_dir / ('{name}' + reads_path.suffixes[-1]) - cutadapt(reads_path, output_base, options=options) + output_base = output_dir / ('{name}' + read_path.suffixes[-1]) + cutadapt(read_path, output_base, options=options) # Identify output files. barcode_keys = pyfaidx.Fasta(str(barcode_path)).keys() diff --git a/tests/pyim/external/test_bowtie2.py b/tests/pyim/external/test_bowtie2.py index 7c67238..0c9a3d7 100644 --- a/tests/pyim/external/test_bowtie2.py +++ b/tests/pyim/external/test_bowtie2.py @@ -12,8 +12,8 @@ def bowtie_args(): """Basic arguments for bowtie2 function.""" return { - 'in1_paths': [Path('/path/to/reads.R1.fastq')], - 'in2_paths': [Path('/path/to/reads.R2.fastq')], + 'read_paths': [Path('/path/to/reads.R1.fastq')], + 'read2_paths': [Path('/path/to/reads.R2.fastq')], 'output_path': Path('/path/to/output.bam'), 'index_path': Path('/path/to/index'), 'options': {'--threads': 10}, @@ -27,8 +27,8 @@ def test_paired(mocker, bowtie_args): bowtie2(**bowtie_args) expected_bt2 = ['bowtie2', '--threads', '10', '-1', - str(bowtie_args['in1_paths'][0]), '-2', - str(bowtie_args['in2_paths'][0]), '-x', + str(bowtie_args['read_paths'][0]), '-2', + str(bowtie_args['read2_paths'][0]), '-x', str(bowtie_args['index_path'])] expected_st = ['samtools', 'sort', '-o', str(bowtie_args['output_path']), '-'] @@ -39,13 +39,13 @@ def test_paired(mocker, bowtie_args): def test_single(mocker, bowtie_args): """Tests single-end invocation of bowtie2.""" - bowtie_args['in2_paths'] = None + bowtie_args['read2_paths'] = None mock = mocker.patch.object(shell, 'run_piped') bowtie2(**bowtie_args) expected_bt2 = ['bowtie2', '--threads', '10', '-U', - str(bowtie_args['in1_paths'][0]), '-x', + str(bowtie_args['read_paths'][0]), '-x', str(bowtie_args['index_path'])] expected_st = ['samtools', 'sort', '-o', str(bowtie_args['output_path']), '-'] @@ -56,14 +56,16 @@ def test_single(mocker, bowtie_args): def test_single_fa(mocker, bowtie_args): """Tests single-end invocation of bowtie2 with fasta file.""" - bowtie_args['in1_paths'] = [bowtie_args['in1_paths'][0].with_suffix('.fa')] - bowtie_args['in2_paths'] = None + bowtie_args['read_paths'] = [ + bowtie_args['read_paths'][0].with_suffix('.fa') + ] + bowtie_args['read2_paths'] = None mock = mocker.patch.object(shell, 'run_piped') bowtie2(**bowtie_args) expected_bt2 = ['bowtie2', '--threads', '10', '-U', - str(bowtie_args['in1_paths'][0]), '-f', '-x', + str(bowtie_args['read_paths'][0]), '-f', '-x', str(bowtie_args['index_path'])] expected_st = ['samtools', 'sort', '-o', str(bowtie_args['output_path']), '-'] diff --git a/tests/pyim/external/test_cutadapt.py b/tests/pyim/external/test_cutadapt.py index 218fc4f..4c18077 100644 --- a/tests/pyim/external/test_cutadapt.py +++ b/tests/pyim/external/test_cutadapt.py @@ -12,9 +12,9 @@ def cutadapt_args(): """Basic arguments for bowtie2 function.""" return { - 'in1_path': Path('/path/to/reads.R1.fastq'), - 'in2_path': Path('/path/to/reads.R2.fastq'), - 'out1_path': Path('/path/to/output.R1.fastq'), + 
'read_path': Path('/path/to/reads.R1.fastq'), + 'read2_path': Path('/path/to/reads.R2.fastq'), + 'out_path': Path('/path/to/output.R1.fastq'), 'out2_path': Path('/path/to/output.R2.fastq'), 'options': {'-m': 10}, } @@ -27,23 +27,23 @@ def test_paired(mocker, cutadapt_args): cutadapt(**cutadapt_args) expected = ['cutadapt', '-m', '10', - '-o', str(cutadapt_args['out1_path']), + '-o', str(cutadapt_args['out_path']), '-p', str(cutadapt_args['out2_path']), - str(cutadapt_args['in1_path']), - str(cutadapt_args['in2_path'])] # yapf: disable + str(cutadapt_args['read_path']), + str(cutadapt_args['read2_path'])] # yapf: disable mock.assert_called_with(expected) def test_single(mocker, cutadapt_args): """Tests single-end invocation of cutadapt.""" - cutadapt_args['in2_path'] = None + cutadapt_args['read2_path'] = None cutadapt_args['out2_path'] = None mock = mocker.patch.object(shell, 'run') cutadapt(**cutadapt_args) expected = ['cutadapt', '-m', '10', - '-o', str(cutadapt_args['out1_path']), - str(cutadapt_args['in1_path'])] # yapf: disable + '-o', str(cutadapt_args['out_path']), + str(cutadapt_args['read_path'])] # yapf: disable mock.assert_called_with(expected) From 699fb7767ef2f9aae0e175bd2dfd917eafe2b163 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 22 Mar 2017 17:21:57 +0100 Subject: [PATCH 087/100] Added extract_insertions function. --- src/pyim/align/common/insertions.py | 197 ++++++++++++++--------- src/pyim/align/pipelines/shear_splink.py | 128 +++++++++------ 2 files changed, 197 insertions(+), 128 deletions(-) diff --git a/src/pyim/align/common/insertions.py b/src/pyim/align/common/insertions.py index 398680a..714b8be 100644 --- a/src/pyim/align/common/insertions.py +++ b/src/pyim/align/common/insertions.py @@ -1,35 +1,104 @@ from collections import defaultdict import itertools -import logging import operator import numpy as np -import pysam import toolz from pyim.model import Insertion from pyim.util.frozendict import frozendict -def fetch_alignments(bam_path, only_primary=True, min_mapq=None): - bam_file = pysam.AlignmentFile(str(bam_path)) +def extract_insertions(alignments, + func, + paired=False, + group_func=None, + merge_dist=None, + min_mapq=None, + min_support=1, + logger=None): + """Extracts insertions from given alignments.""" - try: - alignments = iter(bam_file) + if logger is not None: + logger.info('Summarizing alignments') + logger.info(' %-18s: %s', 'Minimum mapq', min_mapq) - if only_primary: - alignments = (aln for aln in alignments if not aln.is_secondary) + alignments = filter_alignments(alignments, min_mapq=min_mapq) - if min_mapq is not None: - alignments = (aln for aln in alignments - if aln.mapping_quality >= min_mapq) + if group_func is None: + summ_func = summarize_mates if paired else summarize_alignments + summary = summ_func(alignments, func=func) - yield from alignments - finally: - bam_file.close() + if logger is not None: + _log_insertions(logger, min_support, merge_dist) + + insertions = convert_summary_to_insertions( + summary, merge_dist=merge_dist, min_support=min_support) + else: + if paired: + raise NotImplementedError( + 'Grouping is not yet supported for paired-end data') + else: + aln_summaries = summarize_alignments_by_group( + alignments, func, group_func=group_func) + + if logger is not None: + _log_insertions(logger, min_support, merge_dist) + + insertion_grps = ( + convert_summary_to_insertions( + aln_summ, + min_support=min_support, + merge_dist=merge_dist, + sample=barcode, + id_fmt=barcode + '.INS_{}') + for barcode, aln_summ 
in aln_summaries.items()) # yapf: disable + + insertions = list(itertools.chain.from_iterable(insertion_grps)) + + return insertions + + +def _log_insertions(logger, min_support, merge_dist): + logger.info('Converting to insertions') + logger.info(' %-18s: %d', 'Minimum support', min_support) + logger.info(' %-18s: %s', 'Merge distance', merge_dist) -def summarize_alignments(alignments): +def filter_alignments(alignments, only_primary=True, min_mapq=None): + """Filters alignments on mapping quality, etc.""" + + if only_primary: + alignments = (aln for aln in alignments if not aln.is_secondary) + + if min_mapq is not None: + alignments = (aln for aln in alignments + if aln.mapping_quality >= min_mapq) + + yield from alignments + + +def iter_mates(alignments): + """Iterates over mate pairs in alignments.""" + + cache = {} + for aln in alignments: + if aln.is_proper_pair: + # Try to get mate from cache. + mate = cache.pop(aln.query_name, None) + + if mate is None: + # If not found, cache this mate. + cache[aln.query_name] = aln + else: + # Otherwise, yield with mate. + if aln.is_read1: + yield aln, mate + else: + yield mate, aln + + +def summarize_alignments(alignments, func): """Summarizes alignments into a dict of chromosomal positions. This function summarizes an iterable of alignments into a dict that @@ -45,6 +114,13 @@ def summarize_alignments(alignments): alignments : iterable[pysam.AlignedSegment] Alignments to summarize. May be prefiltered (on mapping quality for example), as this function does not perform any filtering itself. + func : Function + Function that takes an alignment and returns a Tuple containing + (a) the location of the breakpoint with the transposon and (b) the + position of the breakpoint with the linker (or the end of the read + if no linkeris used). The former should be returned as a tuple of + (chromosome, position, strand), whereas the latter should only be + a position. Returns ------- @@ -53,77 +129,45 @@ def summarize_alignments(alignments): (chromosome, position, strand) tuple to ligation points. """ + alignment_map = defaultdict(list) for aln in alignments: - tup = _process_alignment(aln) - if tup is not None: - alignment_map[tup[0]].append(tup[1]) + transposon_position, linker_position = func(aln) + alignment_map[transposon_position].append(linker_position) return dict(alignment_map) -def summarize_alignments_by_group(alignments, group_func): - # Take subgroups of alignments into account. This allows us to make - # arbitrary subgroups of alignment summaries, for example by grouping - # reads by sample barcodes. 
- alignment_map = defaultdict(lambda: defaultdict(list)) - - for aln in alignments: - tup = _process_alignment(aln) - if tup is not None: - grp = group_func(aln) - if grp is not None: - alignment_map[grp][tup[0]].append(tup[1]) - - return {k: dict(v) for k, v in alignment_map.items()} - - -def _process_alignment(aln): - if aln.reference_id != -1: - ref = aln.reference_name - - if aln.is_reverse: - transposon_pos = aln.reference_end - linker_pos = aln.reference_start - strand = -1 - else: - transposon_pos = aln.reference_start - linker_pos = aln.reference_end - strand = 1 +def summarize_mates(alignments, func): + """Summarizes mate pairs into a dict of chromosomal positions.""" - key = (ref, transposon_pos, strand) + alignment_map = defaultdict(list) - return key, linker_pos - else: - return None + for mate1, mate2 in iter_mates(alignments): + transposon_position, linker_position = func(mate1, mate2) + alignment_map[transposon_position].append(linker_position) + return dict(alignment_map) -def extract_barcode_mapping(reads, barcodes, barcode_mapping=None): - # Create barcode/sample dict. - barcode_dict = {bc.name: bc.sequence for bc in barcodes} +def summarize_alignments_by_group(alignments, func, group_func): + """Summarizes groups of alignments into a dict of chromosomal positions.""" - if barcode_mapping is not None: - barcode_dict = {sample: barcode_dict[barcode] - for barcode, sample in barcode_mapping.items()} + # Take subgroups of alignments into account. This allows us to make + # arbitrary subgroups of alignment summaries, for example by grouping + # reads by sample barcodes. - # Build mapping. - mapping = {} + alignment_map = defaultdict(lambda: defaultdict(list)) - for read in reads: - # Check each barcode for match in read. - matched = [k for k, v in barcode_dict.items() if v in read.sequence] + for aln in alignments: + transposon_position, linker_position = func(aln) + group = group_func(aln) - if len(matched) == 1: - # Record single matches. - name = read.name.split()[0] - mapping[name] = matched[0] - elif len(matched) > 1: - logging.warning('Skipping %s due to multiple matching barcodes', - read.name.split()[0]) + if group is not None: + alignment_map[group][transposon_position].append(linker_position) - return mapping + return {k: dict(v) for k, v in alignment_map.items()} def merge_summary_within_distance(aln_summary, max_distance=10): @@ -188,15 +232,15 @@ def _merge_entries(alignment_map, keys): def convert_summary_to_insertions(aln_summary, min_support=1, - merge_distance=None, + merge_dist=None, id_fmt='INS_{}', **kwargs): """Converts an alignment map to a list of Insertions.""" # Optionally merge insertions within x distance. - if merge_distance is not None: + if merge_dist is not None: aln_summary = merge_summary_within_distance( - aln_summary, max_distance=merge_distance) + aln_summary, max_distance=merge_dist) # Convert to insertions. insertions = (_to_insertion(ref, pos, strand, ends, id_=None, **kwargs) @@ -206,12 +250,11 @@ def convert_summary_to_insertions(aln_summary, # Filter for support. insertions = (ins for ins in insertions if ins.support >= min_support) - # Sort by depth and add IDs. - insertions = sorted(insertions, key=operator.attrgetter('support'))[::-1] - insertions = [ins._replace(id=id_fmt.format(i + 1)) - for i, ins in enumerate(insertions)] + # Add ids. 
+ insertions = (ins._replace(id=id_fmt.format(i + 1)) + for i, ins in enumerate(insertions)) - return insertions + return list(insertions) def _to_insertion(ref, pos, strand, ends, id_=None, **kwargs): diff --git a/src/pyim/align/pipelines/shear_splink.py b/src/pyim/align/pipelines/shear_splink.py index 00d94ca..84c6152 100644 --- a/src/pyim/align/pipelines/shear_splink.py +++ b/src/pyim/align/pipelines/shear_splink.py @@ -5,13 +5,14 @@ from cutadapt import seqio import pandas as pd +import pysam from pyim.external import bowtie2 from pyim.external.util import flatten_arguments from pyim.model import Insertion from ..common.genomic import extract_genomic -from ..common import insertions as cm_ins +from ..common.insertions import extract_insertions from .base import Pipeline, register_pipeline DEFAULT_OVERLAP = 3 @@ -115,23 +116,19 @@ def run(self, reads_path, output_dir, reads2_path=None): alignment_path = self._extract_and_align(reads_path, output_dir, logger) - # Extract alignment groups (grouped by position) from bam file. - logger.info('Summarizing alignments') - logger.info(' %-18s: %s', 'Minimum mapq', self._min_mapq) + # Extract insertions from bam file. + bam_file = pysam.AlignmentFile(str(alignment_path)) - alignments = cm_ins.fetch_alignments( - alignment_path, min_mapq=self._min_mapq) - aln_summary = cm_ins.summarize_alignments(alignments) - - # Convert groups to insertions and return. - logger.info('Converting to insertions') - logger.info(' %-18s: %d', 'Minimum support', self._min_support) - logger.info(' %-18s: %d', 'Merge distance', self._merge_distance) - - insertions = cm_ins.convert_groups_to_insertions( - aln_summary, - min_support=self._min_support, - merge_distance=self._merge_distance) + try: + insertions = extract_insertions( + iter(bam_file), + func=_process_alignment, + merge_dist=self._merge_distance, + min_mapq=self._min_mapq, + min_support=self._min_support, + logger=logger) + finally: + bam_file.close() # Write insertions to output file. insertion_path = output_dir / 'insertions.txt' @@ -145,10 +142,10 @@ def _extract_and_align(self, reads_path, output_dir, logger): # Extract genomic sequences. logger.info('Extracting genomic sequences') logger.info(' %-18s: %s', 'Transposon', - shorten_path(self._transposon_path)) - logger.info(' %-18s: %s', 'Linker', shorten_path(self._linker_path)) + _shorten_path(self._transposon_path)) + logger.info(' %-18s: %s', 'Linker', _shorten_path(self._linker_path)) logger.info(' %-18s: %s', 'Contaminants', - shorten_path(self._contaminant_path)) + _shorten_path(self._contaminant_path)) logger.info(' %-18s: %s', 'Minimum length', self._min_length) genomic_path = extract_genomic( @@ -163,7 +160,8 @@ def _extract_and_align(self, reads_path, output_dir, logger): # Align reads to genome. logger.info('Aligning to reference') - logger.info(' %-18s: %s', 'Reference', shorten_path(self._index_path)) + logger.info(' %-18s: %s', 'Reference', + _shorten_path(self._index_path)) logger.info(' %-18s: %s', 'Bowtie options', flatten_arguments(self._bowtie_options)) @@ -251,37 +249,23 @@ def run(self, reads_path, output_dir, reads2_path=None): # Map reads to specific barcodes/samples. logger.info('Extracting barcode/sample mapping') logger.info(' %-18s: %s', 'Barcodes', - shorten_path(self._barcode_path)) + _shorten_path(self._barcode_path)) read_map = self._get_barcode_mapping(reads_path) - # Extract alignment groups (grouped by position) from bam file. 
- logger.info('Summarizing alignments') - logger.info(' %-18s: %s', 'Minimum mapq', self._min_mapq) - - alignments = cm_ins.fetch_alignments( - alignment_path, min_mapq=self._min_mapq) - - aln_summaries = cm_ins.summarize_alignments_by_group( - alignments, - group_func=lambda aln: read_map.get(aln.query_name, None)) + # Extract insertions from bam file. + bam_file = pysam.AlignmentFile(str(alignment_path)) - # Convert groups from each sample into insertions, - # adding sample name and sample prefix to the ID. - logger.info('Converting to insertions') - logger.info(' %-18s: %d', 'Minimum support', self._min_support) - logger.info(' %-18s: %s', 'Merge distance', self._merge_distance) - - insertion_grps = ( - cm_ins.convert_summary_to_insertions( - aln_summ, + try: + insertions = extract_insertions( + iter(bam_file), + func=_process_alignment, + group_func=lambda aln: read_map.get(aln.query_name, None), + merge_dist=self._merge_distance, + min_mapq=self._min_mapq, min_support=self._min_support, - merge_distance=self._merge_distance, - sample=barcode, - id_fmt=barcode + '.INS_{}') - for barcode, aln_summ in aln_summaries.items()) # yapf: disable - - # Return concatenated list of insertions. - insertions = itertools.chain.from_iterable(insertion_grps) + logger=logger) + finally: + bam_file.close() # Write insertions to output file. insertion_path = output_dir / 'insertions.txt' @@ -296,15 +280,15 @@ def _get_barcode_mapping(self, reads_path): # Extract read --> barcode mapping. with seqio.open(str(reads_path)) as reads: - return cm_ins.extract_barcode_mapping(reads, barcodes, - self._barcode_mapping) + return _extract_barcode_mapping(reads, barcodes, + self._barcode_mapping) register_pipeline( name='shearsplink-multiplexed', pipeline=MultiplexedShearSplinkPipeline) -def shorten_path(file_name, limit=40): +def _shorten_path(file_name, limit=40): """Shorten path for str to limit for logging.""" name = os.path.split(str(file_name))[1] @@ -313,3 +297,45 @@ def shorten_path(file_name, limit=40): return "%s~%s" % (name[:3], name[-(limit - 3):]) else: return name + + +def _extract_barcode_mapping(reads, barcodes, barcode_mapping=None): + + # Create barcode/sample dict. + barcode_dict = {bc.name: bc.sequence for bc in barcodes} + + if barcode_mapping is not None: + barcode_dict = {sample: barcode_dict[barcode] + for barcode, sample in barcode_mapping.items()} + + # Build mapping. + mapping = {} + + for read in reads: + # Check each barcode for match in read. + matched = [k for k, v in barcode_dict.items() if v in read.sequence] + + if len(matched) == 1: + # Record single matches. + name = read.name.split()[0] + mapping[name] = matched[0] + elif len(matched) > 1: + logging.warning('Skipping %s due to multiple matching barcodes', + read.name.split()[0]) + + return mapping + + +def _process_alignment(aln): + ref = aln.reference_name + + if aln.is_reverse: + transposon_pos = aln.reference_end + linker_pos = aln.reference_start + strand = -1 + else: + transposon_pos = aln.reference_start + linker_pos = aln.reference_end + strand = 1 + + return (ref, transposon_pos, strand), linker_pos From c04519790b42959c85e069eac2e0b63fccce0ae1 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 22 Mar 2017 17:32:37 +0100 Subject: [PATCH 088/100] Moved shorten_path to util. 
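The extract_insertions helper added in the previous patch is driven from a BAM file by the pipelines. Stripped of logging, that call pattern looks roughly as follows (a sketch mirroring the ShearSplink code above; the alignment path and thresholds are illustrative, and _process_alignment is the module-level breakpoint helper defined in shear_splink.py at this point in the series):

    import pysam

    from pyim.align.common.insertions import extract_insertions
    from pyim.align.pipelines.shear_splink import _process_alignment

    # Summarize aligned reads into insertion sites.
    bam_file = pysam.AlignmentFile('alignment.bam')
    try:
        insertions = extract_insertions(
            iter(bam_file),
            func=_process_alignment,  # maps an alignment to breakpoint positions
            min_mapq=23,
            min_support=2,
            merge_dist=10)
    finally:
        bam_file.close()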
--- src/pyim/align/pipelines/shear_splink.py | 24 ++++++------------------ src/pyim/util/path.py | 13 +++++++++++++ 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/pyim/align/pipelines/shear_splink.py b/src/pyim/align/pipelines/shear_splink.py index 84c6152..7272252 100644 --- a/src/pyim/align/pipelines/shear_splink.py +++ b/src/pyim/align/pipelines/shear_splink.py @@ -1,6 +1,5 @@ import itertools import logging -import os from pathlib import Path from cutadapt import seqio @@ -10,6 +9,7 @@ from pyim.external import bowtie2 from pyim.external.util import flatten_arguments from pyim.model import Insertion +from pyim.util.path import shorten_path from ..common.genomic import extract_genomic from ..common.insertions import extract_insertions @@ -142,10 +142,10 @@ def _extract_and_align(self, reads_path, output_dir, logger): # Extract genomic sequences. logger.info('Extracting genomic sequences') logger.info(' %-18s: %s', 'Transposon', - _shorten_path(self._transposon_path)) - logger.info(' %-18s: %s', 'Linker', _shorten_path(self._linker_path)) + shorten_path(self._transposon_path)) + logger.info(' %-18s: %s', 'Linker', shorten_path(self._linker_path)) logger.info(' %-18s: %s', 'Contaminants', - _shorten_path(self._contaminant_path)) + shorten_path(self._contaminant_path)) logger.info(' %-18s: %s', 'Minimum length', self._min_length) genomic_path = extract_genomic( @@ -160,8 +160,7 @@ def _extract_and_align(self, reads_path, output_dir, logger): # Align reads to genome. logger.info('Aligning to reference') - logger.info(' %-18s: %s', 'Reference', - _shorten_path(self._index_path)) + logger.info(' %-18s: %s', 'Reference', shorten_path(self._index_path)) logger.info(' %-18s: %s', 'Bowtie options', flatten_arguments(self._bowtie_options)) @@ -249,7 +248,7 @@ def run(self, reads_path, output_dir, reads2_path=None): # Map reads to specific barcodes/samples. logger.info('Extracting barcode/sample mapping') logger.info(' %-18s: %s', 'Barcodes', - _shorten_path(self._barcode_path)) + shorten_path(self._barcode_path)) read_map = self._get_barcode_mapping(reads_path) # Extract insertions from bam file. @@ -288,17 +287,6 @@ def _get_barcode_mapping(self, reads_path): name='shearsplink-multiplexed', pipeline=MultiplexedShearSplinkPipeline) -def _shorten_path(file_name, limit=40): - """Shorten path for str to limit for logging.""" - - name = os.path.split(str(file_name))[1] - - if len(name) > limit: - return "%s~%s" % (name[:3], name[-(limit - 3):]) - else: - return name - - def _extract_barcode_mapping(reads, barcodes, barcode_mapping=None): # Create barcode/sample dict. diff --git a/src/pyim/util/path.py b/src/pyim/util/path.py index 43dc071..be1eab3 100644 --- a/src/pyim/util/path.py +++ b/src/pyim/util/path.py @@ -1,3 +1,5 @@ +import os + from pathlib import Path @@ -16,3 +18,14 @@ def build_path(file_path, suffix='', dir_=None, ext=None): new_path = Path(dir_) / new_path.name return new_path + + +def shorten_path(file_name, limit=40): + """Shorten path for str to limit for logging.""" + + name = os.path.split(str(file_name))[1] + + if len(name) > limit: + return "%s~%s" % (name[:3], name[-(limit - 3):]) + else: + return name From 3a41fd6ff9da075e2bdd87ad8ef21b80029ade01 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 22 Mar 2017 17:32:49 +0100 Subject: [PATCH 089/100] Start of nextera implementation. 
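For illustration, the shorten_path helper relocated in the previous patch abbreviates long file names for log output: names longer than the limit are reduced to their first three characters, a '~', and the last (limit - 3) characters (the example path and limit below are made up):

    from pyim.util.path import shorten_path

    shorten_path('/data/very_long_sample_name_R1.trimmed.fastq.gz', limit=20)
    # roughly 'ver~.trimmed.fastq.gz'

    shorten_path('reads.fastq')
    # short names are returned unchanged: 'reads.fastq'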
--- src/pyim/align/pipelines/nextera.py | 167 ++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 src/pyim/align/pipelines/nextera.py diff --git a/src/pyim/align/pipelines/nextera.py b/src/pyim/align/pipelines/nextera.py new file mode 100644 index 0000000..3abd60a --- /dev/null +++ b/src/pyim/align/pipelines/nextera.py @@ -0,0 +1,167 @@ +import logging +from pathlib import Path + +import pysam + +from pyim.external.bowtie2 import bowtie2 +from pyim.external.cutadapt import cutadapt +from pyim.external.util import flatten_arguments +from pyim.model import Insertion +from pyim.util.path import shorten_path + +from ..common.insertions import extract_insertions +from .base import Pipeline + + +class NexteraPipeline(Pipeline): + """Nextera-based transposon pipeline.""" + + def __init__(self, + transposon_path, + read1_adapter_path, + read2_adapter_path, + min_length=20, + logger=None): + super().__init__(logger=logger) + + self._read1_adapter_path = read1_adapter_path + self._read2_adapter_path = read2_adapter_path + self._transposon_path = transposon_path + self._min_length = min_length + + def _extract_args(cls, args): + raise NotImplementedError() + + def run(self, read_path, output_dir, read2_path=None): + logger = logging.getLogger() + + trimmed_path, trimmed2_path = self._trim_adapters( + read_path, read2_path, output_dir, logger=logger) + alignment_path = self._align( + trimmed_path, trimmed2_path, output_dir, logger=logger) + + # Extract insertions from bam file. + bam_file = pysam.AlignmentFile(str(alignment_path)) + + try: + insertions = extract_insertions( + iter(bam_file), + func=_process_mates, + paired=True, + merge_dist=self._merge_distance, + min_mapq=self._min_mapq, + min_support=self._min_support, + logger=logger) + finally: + bam_file.close() + + # Write insertions to output file. + insertion_path = output_dir / 'insertions.txt' + + ins_frame = Insertion.to_frame(insertions) + ins_frame.to_csv(str(insertion_path), sep='\t', index=False) + + # genomic_paths = self._extract_genomic(trimmed_paths, work_dir) + # alignment_path = self._align(genomic_paths) + # yield from self._extract_insertions(alignment_path) + + # def _trim_nextera_adapters(self, read_paths, work_dir): + # output_paths = path.build_paths( + # read_paths, dir_=work_dir / '_trim_nextera') + + # cutadapt_opts = {'--minimum-length': self._min_length, + # '-g': 'file:' + str(self._read1_adapter_path), + # '-G': 'file:' + str(self._read2_adapter_path)} + + # cutadapt( + # read_paths[0], + # output_paths[0], + # cutadapt_opts, + # read2_path=read_paths[1], + # out2_path=output_paths[1]) + + # return output_paths + + # def _extract_genomic(self, read_paths, work_dir): + # output_paths = path.build_paths( + # read_paths, dir_=work_dir / '_trim_sequence') + + # cutadapt_opts = {'--minimum-length': self._min_length, + # '--discard-untrimmed': True, + # '--pair-filter=both': True, + # '-G': 'file:' + str(self._transposon_path)} + + # cutadapt( + # read_paths[0], + # output_paths[0], + # cutadapt_opts, + # read2_path=read_paths[1], + # out2_path=output_paths[1]) + + # return output_paths + + def _trim_adapters(self, read_path, read2_path, output_dir, logger): + raise NotImplementedError() + + def _align(self, read_path, read2_path, output_dir, logger): + + # Align reads to genome. 
+ logger.info('Aligning to reference') + logger.info(' %-18s: %s', 'Reference', shorten_path(self._index_path)) + logger.info(' %-18s: %s', 'Bowtie options', + flatten_arguments(self._bowtie_options)) + + alignment_path = output_dir / 'alignment.bam' + + bowtie2( + read_paths=[read_path], + read2_paths=[read2_path], + index_path=self._index_path, + output_path=alignment_path, + options=self._bowtie_options, + verbose=True) + + return alignment_path + + +def _trim_adapters(read_paths, + output_paths, + adapter1_path=None, + adapter2_path=None, + discard=False, + min_length=None): + cutadapt_opts = {} + + if adapter1_path is not None: + cutadapt_opts['-g'] = 'file:' + str(adapter1_path) + + if adapter2_path is not None: + cutadapt_opts['-G'] = 'file:' + str(adapter2_path) + + if discard: + cutadapt_opts['--discard-trimmed'] = True + + if min_length is not None: + cutadapt_opts['--minimum-length'] = min_length + + cutadapt( + read_paths[0], + output_paths[0], + cutadapt_opts, + in2_path=read_paths[1], + out2_path=output_paths[1]) + + +def _process_mates(mate1, mate2): + ref = mate1.reference_name + + if mate1.is_reverse: + transposon_pos = mate2.reference_start + linker_pos = mate1.reference_end + strand = 1 + else: + transposon_pos = mate2.reference_end + linker_pos = mate1.reference_start + strand = -1 + + return (ref, transposon_pos, strand), linker_pos From 76a21d2104f4e21305bf9fa7fb4f481811b78c59 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Fri, 24 Mar 2017 10:40:53 +0100 Subject: [PATCH 090/100] Initial full implementation of nextera pipeline. --- setup.py | 2 +- src/pyim/align/common/insertions.py | 4 + src/pyim/align/pipelines/__init__.py | 1 + src/pyim/align/pipelines/nextera.py | 172 +++++++++++++---------- src/pyim/align/pipelines/shear_splink.py | 2 +- src/pyim/main/pyim_align.py | 4 +- src/pyim/main/pyim_annotate.py | 7 +- src/pyim/util/path.py | 9 ++ 8 files changed, 121 insertions(+), 80 deletions(-) diff --git a/setup.py b/setup.py index 952609c..209d91d 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ requirements = ['pyfaidx>=0.4.8.1', 'intervaltree>=2.1', 'tqdm>=4.7', 'toolz>=0.8', 'rpy2>=2.8.2', 'numpy', 'pandas>=0.18', - 'pysam>=0.9'] + 'pysam>=0.9', 'natsort'] test_requirements = ['pytest', 'pytest-cov', 'pytest-mock', 'pytest-helpers-namespace', 'python-coveralls'] diff --git a/src/pyim/align/common/insertions.py b/src/pyim/align/common/insertions.py index 714b8be..70dc62a 100644 --- a/src/pyim/align/common/insertions.py +++ b/src/pyim/align/common/insertions.py @@ -234,6 +234,7 @@ def convert_summary_to_insertions(aln_summary, min_support=1, merge_dist=None, id_fmt='INS_{}', + sort=True, **kwargs): """Converts an alignment map to a list of Insertions.""" @@ -250,6 +251,9 @@ def convert_summary_to_insertions(aln_summary, # Filter for support. insertions = (ins for ins in insertions if ins.support >= min_support) + if sort: + insertions = sorted(insertions, key=lambda ins: -ins.support) + # Add ids. 
insertions = (ins._replace(id=id_fmt.format(i + 1)) for i, ins in enumerate(insertions)) diff --git a/src/pyim/align/pipelines/__init__.py b/src/pyim/align/pipelines/__init__.py index 6b387e1..7f37f8c 100644 --- a/src/pyim/align/pipelines/__init__.py +++ b/src/pyim/align/pipelines/__init__.py @@ -1,2 +1,3 @@ from .base import Pipeline, get_pipelines, register_pipeline from .shear_splink import ShearSplinkPipeline, MultiplexedShearSplinkPipeline +from .nextera import NexteraPipeline diff --git a/src/pyim/align/pipelines/nextera.py b/src/pyim/align/pipelines/nextera.py index 3abd60a..76dd11e 100644 --- a/src/pyim/align/pipelines/nextera.py +++ b/src/pyim/align/pipelines/nextera.py @@ -2,15 +2,16 @@ from pathlib import Path import pysam +import toolz from pyim.external.bowtie2 import bowtie2 from pyim.external.cutadapt import cutadapt from pyim.external.util import flatten_arguments from pyim.model import Insertion -from pyim.util.path import shorten_path +from pyim.util.path import shorten_path, extract_suffix from ..common.insertions import extract_insertions -from .base import Pipeline +from .base import Pipeline, register_pipeline class NexteraPipeline(Pipeline): @@ -18,27 +19,73 @@ class NexteraPipeline(Pipeline): def __init__(self, transposon_path, - read1_adapter_path, - read2_adapter_path, - min_length=20, - logger=None): - super().__init__(logger=logger) - - self._read1_adapter_path = read1_adapter_path - self._read2_adapter_path = read2_adapter_path + bowtie_index_path, + bowtie_options=None, + min_length=15, + min_support=2, + min_mapq=23, + merge_distance=None, + threads=1): + super().__init__() + self._transposon_path = transposon_path + self._index_path = bowtie_index_path + self._bowtie_options = bowtie_options or {} + self._min_length = min_length + self._min_support = min_support + self._min_mapq = min_mapq + + self._merge_distance = merge_distance + self._threads = threads + + @classmethod + def configure_args(cls, parser): + super().configure_args(parser) + + parser.add_argument('--transposon', type=Path, required=True) + parser.add_argument('--bowtie_index', type=Path, required=True) + parser.add_argument('--min_length', type=int, default=15) + parser.add_argument('--min_support', type=int, default=2) + parser.add_argument('--min_mapq', type=int, default=23) + parser.add_argument('--merge_distance', type=int, default=None) + + parser.add_argument('--local', default=False, action='store_true') + parser.add_argument('--threads', default=1, type=int) + + @classmethod def _extract_args(cls, args): - raise NotImplementedError() + bowtie_options = {'--local': args.local, '--threads': args.threads} + + return dict( + transposon_path=args.transposon, + bowtie_index_path=args.bowtie_index, + min_length=args.min_length, + min_support=args.min_support, + min_mapq=args.min_mapq, + merge_distance=args.merge_distance, + bowtie_options=bowtie_options, + threads=args.threads) def run(self, read_path, output_dir, read2_path=None): + if read2_path is None: + raise ValueError('This pipeline requires paired-end data') + logger = logging.getLogger() - trimmed_path, trimmed2_path = self._trim_adapters( + output_dir.mkdir(exist_ok=True, parents=True) + + # Trim reads and align to reference. 
+ + trimmed_tr_path, trimmed_tr2_path = self._trim_transposon( read_path, read2_path, output_dir, logger=logger) + + trimmed_nt_path, trimmed_nt2_path = self._trim_nextera( + trimmed_tr_path, trimmed_tr2_path, output_dir, logger=logger) + alignment_path = self._align( - trimmed_path, trimmed2_path, output_dir, logger=logger) + trimmed_nt_path, trimmed_nt2_path, output_dir, logger=logger) # Extract insertions from bam file. bam_file = pysam.AlignmentFile(str(alignment_path)) @@ -61,55 +108,55 @@ def run(self, read_path, output_dir, read2_path=None): ins_frame = Insertion.to_frame(insertions) ins_frame.to_csv(str(insertion_path), sep='\t', index=False) - # genomic_paths = self._extract_genomic(trimmed_paths, work_dir) - # alignment_path = self._align(genomic_paths) - # yield from self._extract_insertions(alignment_path) - - # def _trim_nextera_adapters(self, read_paths, work_dir): - # output_paths = path.build_paths( - # read_paths, dir_=work_dir / '_trim_nextera') + def _trim_nextera(self, read_path, read2_path, output_dir, logger): + cutadapt_opts = { + '-a': 'CTGTCTCTTATA', + '-A': 'CTGTCTCTTATA', + '--minimum-length': self._min_length, + } - # cutadapt_opts = {'--minimum-length': self._min_length, - # '-g': 'file:' + str(self._read1_adapter_path), - # '-G': 'file:' + str(self._read2_adapter_path)} + suffix = extract_suffix(read_path) + trimmed_path = output_dir / ('trimmed_nextera.R1' + suffix) + trimmed2_path = output_dir / ('trimmed_nextera.R2' + suffix) - # cutadapt( - # read_paths[0], - # output_paths[0], - # cutadapt_opts, - # read2_path=read_paths[1], - # out2_path=output_paths[1]) + cutadapt( + read_path=read_path, + read2_path=read2_path, + out_path=trimmed_path, + out2_path=trimmed2_path, + options=cutadapt_opts) - # return output_paths + return trimmed_path, trimmed2_path - # def _extract_genomic(self, read_paths, work_dir): - # output_paths = path.build_paths( - # read_paths, dir_=work_dir / '_trim_sequence') + def _trim_transposon(self, read_path, read2_path, output_dir, logger): + cutadapt_opts = {'-G': 'file:' + str(self._transposon_path), + '--discard-untrimmed': True, + '--pair-filter=both': True} - # cutadapt_opts = {'--minimum-length': self._min_length, - # '--discard-untrimmed': True, - # '--pair-filter=both': True, - # '-G': 'file:' + str(self._transposon_path)} + suffix = extract_suffix(read_path) + trimmed_path = output_dir / ('trimmed_transposon.R1' + suffix) + trimmed2_path = output_dir / ('trimmed_transposon.R2' + suffix) - # cutadapt( - # read_paths[0], - # output_paths[0], - # cutadapt_opts, - # read2_path=read_paths[1], - # out2_path=output_paths[1]) + cutadapt( + read_path=read_path, + read2_path=read2_path, + out_path=trimmed_path, + out2_path=trimmed2_path, + options=cutadapt_opts) - # return output_paths - - def _trim_adapters(self, read_path, read2_path, output_dir, logger): - raise NotImplementedError() + return trimmed_path, trimmed2_path def _align(self, read_path, read2_path, output_dir, logger): + options = toolz.merge(self._bowtie_options, {'--fr': True, + '--threads': + self._threads}) + # Align reads to genome. 
logger.info('Aligning to reference') logger.info(' %-18s: %s', 'Reference', shorten_path(self._index_path)) logger.info(' %-18s: %s', 'Bowtie options', - flatten_arguments(self._bowtie_options)) + flatten_arguments(options)) alignment_path = output_dir / 'alignment.bam' @@ -118,38 +165,13 @@ def _align(self, read_path, read2_path, output_dir, logger): read2_paths=[read2_path], index_path=self._index_path, output_path=alignment_path, - options=self._bowtie_options, + options=options, verbose=True) return alignment_path -def _trim_adapters(read_paths, - output_paths, - adapter1_path=None, - adapter2_path=None, - discard=False, - min_length=None): - cutadapt_opts = {} - - if adapter1_path is not None: - cutadapt_opts['-g'] = 'file:' + str(adapter1_path) - - if adapter2_path is not None: - cutadapt_opts['-G'] = 'file:' + str(adapter2_path) - - if discard: - cutadapt_opts['--discard-trimmed'] = True - - if min_length is not None: - cutadapt_opts['--minimum-length'] = min_length - - cutadapt( - read_paths[0], - output_paths[0], - cutadapt_opts, - in2_path=read_paths[1], - out2_path=output_paths[1]) +register_pipeline(name='nextera', pipeline=NexteraPipeline) def _process_mates(mate1, mate2): diff --git a/src/pyim/align/pipelines/shear_splink.py b/src/pyim/align/pipelines/shear_splink.py index 7272252..14d7472 100644 --- a/src/pyim/align/pipelines/shear_splink.py +++ b/src/pyim/align/pipelines/shear_splink.py @@ -30,7 +30,7 @@ def __init__(self, min_length=15, min_support=2, min_mapq=23, - merge_distance=0, + merge_distance=None, bowtie_options=None, min_overlaps=None, error_rates=None): diff --git a/src/pyim/main/pyim_align.py b/src/pyim/main/pyim_align.py index 5f51696..612ba57 100644 --- a/src/pyim/main/pyim_align.py +++ b/src/pyim/main/pyim_align.py @@ -17,9 +17,9 @@ def main(): # Run pipeline. pipeline = args.pipeline.from_args(args) - pipeline.run(reads_path=args.reads, + pipeline.run(read_path=args.reads, output_dir=args.output_dir, - reads2_path=args.reads2) + read2_path=args.reads2) def parse_args(): diff --git a/src/pyim/main/pyim_annotate.py b/src/pyim/main/pyim_annotate.py index 7e83128..f6d1674 100644 --- a/src/pyim/main/pyim_annotate.py +++ b/src/pyim/main/pyim_annotate.py @@ -1,5 +1,7 @@ import argparse +from natsort import order_by_index, index_natsorted + from pyim.annotate import get_annotators from pyim.model import Insertion @@ -14,7 +16,10 @@ def main(): annotated = list(annotator.annotate(insertions)) annotated_frame = Insertion.to_frame(annotated) - annotated_frame = annotated_frame.sort_values(by='id') + + annotated_frame = annotated_frame.reindex(index=order_by_index( + annotated_frame.index, index_natsorted(annotated_frame.id))) + annotated_frame.to_csv(str(args.output), sep='\t', index=False) diff --git a/src/pyim/util/path.py b/src/pyim/util/path.py index be1eab3..c113b6b 100644 --- a/src/pyim/util/path.py +++ b/src/pyim/util/path.py @@ -29,3 +29,12 @@ def shorten_path(file_name, limit=40): return "%s~%s" % (name[:3], name[-(limit - 3):]) else: return name + + +def extract_suffix(file_path): + """Extracts suffix from file path.""" + if file_path.suffixes[-1] == '.gz': + suffix = ''.join(file_path.suffixes[-2:]) + else: + suffix = file_path.suffixes[-1] + return suffix From cf95f8c38a33ada23e86522a3a78c8aecc8c333a Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Sun, 26 Mar 2017 13:37:09 +0200 Subject: [PATCH 091/100] Remove unused import. 
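The switch from sort_values(by='id') to natsort in pyim_annotate (previous patch) is about ordering insertion ids naturally, so that INS_2 sorts before INS_10. On a toy frame (values invented for illustration) the same reindexing behaves as follows:

    import pandas as pd
    from natsort import index_natsorted, order_by_index

    frame = pd.DataFrame({'id': ['INS_10', 'INS_2', 'INS_1']})

    # Reindex using a naturally sorted order, as done for the annotated
    # insertion frame.
    frame = frame.reindex(index=order_by_index(frame.index,
                                               index_natsorted(frame.id)))

    # frame['id'] is now INS_1, INS_2, INS_10; plain lexicographic sorting
    # would give INS_1, INS_10, INS_2.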
--- src/pyim/main/pyim_align.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pyim/main/pyim_align.py b/src/pyim/main/pyim_align.py index 612ba57..c7a6f3a 100644 --- a/src/pyim/main/pyim_align.py +++ b/src/pyim/main/pyim_align.py @@ -2,7 +2,6 @@ import logging from pyim.align.pipelines import get_pipelines -from pyim.model import Insertion logging.basicConfig( format='[%(asctime)-15s] %(message)s', From 12fb29023e4a9374f7c5e3fa59f72630524dce99 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Sun, 26 Mar 2017 15:21:37 +0200 Subject: [PATCH 092/100] Refactored nextera + shearsplink pipelines to common structure. --- src/pyim/align/common/__init__.py | 0 src/pyim/align/common/genomic.py | 166 --------------- src/pyim/align/pipelines/base.py | 8 +- src/pyim/align/pipelines/nextera.py | 95 ++++++--- src/pyim/align/pipelines/shear_splink.py | 199 ++++++++++++------ .../align/{common/insertions.py => util.py} | 0 src/pyim/external/cutadapt.py | 5 +- src/pyim/main/pyim_align.py | 4 +- 8 files changed, 216 insertions(+), 261 deletions(-) delete mode 100644 src/pyim/align/common/__init__.py delete mode 100644 src/pyim/align/common/genomic.py rename src/pyim/align/{common/insertions.py => util.py} (100%) diff --git a/src/pyim/align/common/__init__.py b/src/pyim/align/common/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/pyim/align/common/genomic.py b/src/pyim/align/common/genomic.py deleted file mode 100644 index ede2f8a..0000000 --- a/src/pyim/align/common/genomic.py +++ /dev/null @@ -1,166 +0,0 @@ -import logging - -from pyim.util.path import build_path - -from pyim.external.cutadapt import cutadapt, cutadapt_summary - -DEFAULT_OVERLAP = 3 -DEFAULT_ERROR_RATE = 0.1 - - -def extract_genomic(reads_path, - output_dir, - transposon_path, - linker_path=None, - contaminant_path=None, - min_length=None, - min_overlaps=None, - error_rates=None): - """Extracts genomic sequences from single-read data. - - Process reads of the following structure: - - [Transposon-Genomic-Linker] - - """ - - logger = logging.getLogger() - - min_overlaps = min_overlaps or {} - error_rates = error_rates or {} - - # Ensure output dir exists. - output_dir.mkdir(exist_ok=True) - - suffix = _extract_suffix(reads_path) - - # Track interim files for cleaning. - interim_files = [] - - if contaminant_path is not None: - # Remove contaminants. - contaminant_out_path = output_dir / ('filt_contaminant' + suffix) - contaminant_opts = { - '-g': 'file:' + str(contaminant_path), - '--discard-trimmed': True, - '-O': min_overlaps.get('contaminant', DEFAULT_OVERLAP), - '-e': error_rates.get('contaminant', DEFAULT_ERROR_RATE) - } - - process = cutadapt(reads_path, contaminant_out_path, contaminant_opts) - logger.info('Trimmed contaminant sequences' + - cutadapt_summary(process.stdout)) # yapf: disable - - interim_files.append(contaminant_out_path) - else: - contaminant_out_path = reads_path - - if linker_path is not None: - # Remove linker. - linker_out_path = output_dir / ('filt_linker' + suffix) - linker_opts = { - '-a': 'file:' + str(linker_path), - '--discard-untrimmed': True, - '-O': min_overlaps.get('linker', DEFAULT_OVERLAP), - '-e': error_rates.get('linker', DEFAULT_ERROR_RATE) - } - - process = cutadapt(contaminant_out_path, linker_out_path, linker_opts) - logger.info('Trimmed linker sequence' + - cutadapt_summary(process.stdout)) # yapf: disable - - interim_files.append(linker_out_path) - else: - linker_out_path = contaminant_out_path - - # Trim transposon and check minimum length. 
- transposon_opts = { - '-g': 'file:' + str(transposon_path), - '--discard-untrimmed': True, - '-O': min_overlaps.get('transposon', DEFAULT_OVERLAP), - '-e': error_rates.get('transposon', DEFAULT_ERROR_RATE) - } - - if min_length is not None: - transposon_opts['--minimum-length'] = min_length - - genomic_path = output_dir / ('genomic' + suffix) - process = cutadapt(linker_out_path, genomic_path, transposon_opts) - logger.info('Trimmed transposon sequence and filtered for length' + - cutadapt_summary(process.stdout)) # yapf: disable - - # Clean-up interim files. - for file_path in interim_files: - file_path.unlink() - - return genomic_path - - -def _extract_suffix(file_path): - if file_path.suffixes[-1] == '.gz': - suffix = ''.join(file_path.suffixes[-2:]) - else: - suffix = file_path.suffixes[-1] - return suffix - - -def extract_genomic_paired(reads_paths, - output_paths, - transposon_path, - linker_path=None, - contaminant_path=None, - min_length=None): - """Extracts genomic sequences from paired-end data.""" - - # Extract file paths. - in1_path, in2_path = reads_paths - out1_path, out2_path = output_paths - - # Ensure output dirs exists. - out1_path.parent.mkdir(exist_ok=True) - out2_path.parent.mkdir(exist_ok=True) - - # Track interim files. - interim_files = [] - - if contaminant_path is not None: - # Remove contaminants. - cont1_path = build_path(out1_path, suffix='.contaminant') - cont2_path = build_path(out2_path, suffix='.contaminant') - - contaminant_opts = {'-g': 'file:' + str(contaminant_path), - '--discard-trimmed': True} - cutadapt(in1_path, cont1_path, contaminant_opts, - reads2_path=in2_path, out2_path=out2_path) # yapf: disable - - interim_files += [cont1_path, cont2_path] - else: - cont1_path, cont2_path = in1_path, in2_path - - if linker_path is not None: - # Remove linker. - link1_path = build_path(out1_path, suffix='.linker') - link2_path = build_path(out2_path, suffix='.linker') - - linker_opts = {'-A': 'file:' + str(linker_path), - '--discard-untrimmed': True} - cutadapt(cont1_path, link1_path, linker_opts, - reads2_path=cont2_path, out2_path=link2_path) # yapf: disable - - interim_files += [link1_path, link2_path] - else: - link1_path, link2_path = cont1_path, cont2_path - - # Trim transposon and check minimum length. - transposon_opts = {'-g': 'file:' + str(transposon_path), - '--discard-untrimmed': True} - - if min_length is not None: - transposon_opts['--minimum-length'] = min_length - - cutadapt(link1_path, out1_path, transposon_opts, - reads2_path=link2_path, out2_path=out2_path) # yapf: disable - - # Clean-up intermediary files. 
- for fp in interim_files: - fp.unlink() diff --git a/src/pyim/align/pipelines/base.py b/src/pyim/align/pipelines/base.py index 5e805ce..9f86237 100644 --- a/src/pyim/align/pipelines/base.py +++ b/src/pyim/align/pipelines/base.py @@ -21,8 +21,14 @@ def __init__(self): @abc.abstractclassmethod def configure_args(cls, parser): """Configures argument parser for the pipeline.""" + + @classmethod + def _setup_base_args(cls, parser, paired=False): parser.add_argument('--reads', type=Path, required=True) - parser.add_argument('--reads2', type=Path, required=False) + + if paired: + parser.add_argument('--reads2', type=Path, required=False) + parser.add_argument('--output_dir', type=Path, required=True) @classmethod diff --git a/src/pyim/align/pipelines/nextera.py b/src/pyim/align/pipelines/nextera.py index 76dd11e..8828a8e 100644 --- a/src/pyim/align/pipelines/nextera.py +++ b/src/pyim/align/pipelines/nextera.py @@ -1,3 +1,5 @@ +"""Module containing the nextera pipeline.""" + import logging from pathlib import Path @@ -5,13 +7,13 @@ import toolz from pyim.external.bowtie2 import bowtie2 -from pyim.external.cutadapt import cutadapt +from pyim.external.cutadapt import cutadapt, cutadapt_summary from pyim.external.util import flatten_arguments from pyim.model import Insertion from pyim.util.path import shorten_path, extract_suffix -from ..common.insertions import extract_insertions from .base import Pipeline, register_pipeline +from ..util import extract_insertions class NexteraPipeline(Pipeline): @@ -41,7 +43,7 @@ def __init__(self, @classmethod def configure_args(cls, parser): - super().configure_args(parser) + cls._setup_base_args(parser, paired=True) parser.add_argument('--transposon', type=Path, required=True) parser.add_argument('--bowtie_index', type=Path, required=True) @@ -76,16 +78,18 @@ def run(self, read_path, output_dir, read2_path=None): output_dir.mkdir(exist_ok=True, parents=True) - # Trim reads and align to reference. + # Extract genomic sequences. + if logger is not None: + logger.info('Extracting genomic sequences') + logger.info(' %-18s: %s', 'Transposon', + shorten_path(self._transposon_path)) + logger.info(' %-18s: %s', 'Minimum length', self._min_length) - trimmed_tr_path, trimmed_tr2_path = self._trim_transposon( + # Trim reads and align to reference. + genomic_path, genomic2_path = self._extract_genomic( read_path, read2_path, output_dir, logger=logger) - - trimmed_nt_path, trimmed_nt2_path = self._trim_nextera( - trimmed_tr_path, trimmed_tr2_path, output_dir, logger=logger) - alignment_path = self._align( - trimmed_nt_path, trimmed_nt2_path, output_dir, logger=logger) + genomic_path, genomic2_path, output_dir, logger=logger) # Extract insertions from bam file. bam_file = pysam.AlignmentFile(str(alignment_path)) @@ -108,49 +112,82 @@ def run(self, read_path, output_dir, read2_path=None): ins_frame = Insertion.to_frame(insertions) ins_frame.to_csv(str(insertion_path), sep='\t', index=False) - def _trim_nextera(self, read_path, read2_path, output_dir, logger): - cutadapt_opts = { - '-a': 'CTGTCTCTTATA', - '-A': 'CTGTCTCTTATA', - '--minimum-length': self._min_length, - } + def _extract_genomic(self, read_path, read2_path, output_dir, logger): + """Extracts genomic sequences from reads. + + Extracts genomic sequences by first trimming for mates for the + transposon sequence (dropping reads without a match) and then + trimming any Nextera transposase sequences from the remaining reads. + Filtering for minimum length is performed in the nextera trimming step. 
+ """ + + trimmed_tr_path, trimmed_tr2_path = self._trim_transposon( + read_path, read2_path, output_dir, logger=logger) + + trimmed_nt_path, trimmed_nt2_path = self._trim_nextera( + trimmed_tr_path, trimmed_tr2_path, output_dir, logger=logger) + + trimmed_tr_path.unlink() + trimmed_tr2_path.unlink() + + return trimmed_nt_path, trimmed_nt2_path + + def _trim_transposon(self, read_path, read2_path, output_dir, logger): + """Selects and trims mates with transposon sequence in second read.""" + + cutadapt_opts = {'-G': 'file:' + str(self._transposon_path), + '--discard-untrimmed': True, + '--pair-filter=both': True} suffix = extract_suffix(read_path) - trimmed_path = output_dir / ('trimmed_nextera.R1' + suffix) - trimmed2_path = output_dir / ('trimmed_nextera.R2' + suffix) + trimmed_path = output_dir / ('genomic.R1' + suffix) + trimmed2_path = output_dir / ('genomic.R2' + suffix) - cutadapt( + process = cutadapt( read_path=read_path, read2_path=read2_path, out_path=trimmed_path, out2_path=trimmed2_path, options=cutadapt_opts) + if logger is not None: + summary = cutadapt_summary(process.stdout, padding=' ') + logger.info('Trimmed transposon sequence' + summary) + return trimmed_path, trimmed2_path - def _trim_transposon(self, read_path, read2_path, output_dir, logger): - cutadapt_opts = {'-G': 'file:' + str(self._transposon_path), - '--discard-untrimmed': True, - '--pair-filter=both': True} + def _trim_nextera(self, read_path, read2_path, output_dir, logger): + """Trims nextera sequences from mates and filters for min length.""" + + cutadapt_opts = { + '-a': 'CTGTCTCTTATA', + '-A': 'CTGTCTCTTATA', + '--minimum-length': self._min_length, + } suffix = extract_suffix(read_path) - trimmed_path = output_dir / ('trimmed_transposon.R1' + suffix) - trimmed2_path = output_dir / ('trimmed_transposon.R2' + suffix) + trimmed_path = output_dir / ('trimmed_nextera.R1' + suffix) + trimmed2_path = output_dir / ('trimmed_nextera.R2' + suffix) - cutadapt( + process = cutadapt( read_path=read_path, read2_path=read2_path, out_path=trimmed_path, out2_path=trimmed2_path, options=cutadapt_opts) + if logger is not None: + summary = cutadapt_summary(process.stdout, padding=' ') + logger.info('Trimmed nextera sequences and ' + 'filtered for length' + summary) + return trimmed_path, trimmed2_path def _align(self, read_path, read2_path, output_dir, logger): + """Aligns mates to reference using bowtie2.""" - options = toolz.merge(self._bowtie_options, {'--fr': True, - '--threads': - self._threads}) + extra_opts = {'--threads': self._threads} + options = toolz.merge(self._bowtie_options, extra_opts) # Align reads to genome. 
logger.info('Aligning to reference') diff --git a/src/pyim/align/pipelines/shear_splink.py b/src/pyim/align/pipelines/shear_splink.py index 14d7472..b0ab69d 100644 --- a/src/pyim/align/pipelines/shear_splink.py +++ b/src/pyim/align/pipelines/shear_splink.py @@ -1,4 +1,5 @@ -import itertools +"""Module containing the ShearSplink pipelines.""" + import logging from pathlib import Path @@ -6,14 +7,14 @@ import pandas as pd import pysam -from pyim.external import bowtie2 +from pyim.external.cutadapt import cutadapt, cutadapt_summary +from pyim.external.bowtie2 import bowtie2 from pyim.external.util import flatten_arguments from pyim.model import Insertion -from pyim.util.path import shorten_path +from pyim.util.path import shorten_path, extract_suffix -from ..common.genomic import extract_genomic -from ..common.insertions import extract_insertions from .base import Pipeline, register_pipeline +from ..util import extract_insertions DEFAULT_OVERLAP = 3 DEFAULT_ERROR_RATE = 0.1 @@ -54,7 +55,7 @@ def __init__(self, @classmethod def configure_args(cls, parser): - super().configure_args(parser) + cls._setup_base_args(parser, paired=False) parser.add_argument('--transposon', type=Path, required=True) parser.add_argument('--bowtie_index', type=Path, required=True) @@ -106,15 +107,18 @@ def _extract_args(cls, args): min_overlaps=min_overlaps, error_rates=error_rates) - def run(self, reads_path, output_dir, reads2_path=None): - if reads2_path is not None: + def run(self, read_path, output_dir, read2_path=None): + if read2_path is not None: raise ValueError('Pipeline does not support paired-end data') logger = logging.getLogger() + # Ensure output dir exists. + output_dir.mkdir(exist_ok=True, parents=True) + # Extract genomic sequences and align to reference. - alignment_path = self._extract_and_align(reads_path, output_dir, - logger) + genomic_path = self._extract_genomic(read_path, output_dir, logger) + alignment_path = self._align(genomic_path, output_dir, logger) # Extract insertions from bam file. bam_file = pysam.AlignmentFile(str(alignment_path)) @@ -136,40 +140,108 @@ def run(self, reads_path, output_dir, reads2_path=None): ins_frame = Insertion.to_frame(insertions) ins_frame.to_csv(str(insertion_path), sep='\t', index=False) - def _extract_and_align(self, reads_path, output_dir, logger): - output_dir.mkdir(exist_ok=True, parents=True) + def _extract_genomic(self, read_path, output_dir, logger): + # Log parameters + if logger is not None: + logger.info('Extracting genomic sequences') + logger.info(' %-18s: %s', 'Transposon', + shorten_path(self._transposon_path)) + logger.info(' %-18s: %s', 'Linker', + shorten_path(self._linker_path)) + logger.info(' %-18s: %s', 'Contaminants', + shorten_path(self._contaminant_path)) + logger.info(' %-18s: %s', 'Minimum length', self._min_length) + + # Get suffix to use for intermediate/genomic files. + suffix = extract_suffix(read_path) + + # Track interim files for cleaning. + interim_files = [] + + if self._contaminant_path is not None: + # Remove contaminants. 
+ contaminant_out_path = output_dir / ( + 'trimmed_contaminant' + suffix) + + contaminant_opts = { + '-g': 'file:' + str(self._contaminant_path), + '--discard-trimmed': True, + '-O': self._min_overlaps.get('contaminant', DEFAULT_OVERLAP), + '-e': self._error_rates.get('contaminant', DEFAULT_ERROR_RATE) + } + + process = cutadapt(read_path, contaminant_out_path, + contaminant_opts) + + if logger is not None: + summary = cutadapt_summary(process.stdout, padding=' ') + logger.info('Trimmed contaminant sequences' + summary) + + interim_files.append(contaminant_out_path) + else: + contaminant_out_path = read_path + + if self._linker_path is not None: + # Remove linker. + linker_out_path = output_dir / ('trimmed_linker' + suffix) + linker_opts = { + '-a': 'file:' + str(self._linker_path), + '--discard-untrimmed': True, + '-O': self._min_overlaps.get('linker', DEFAULT_OVERLAP), + '-e': self._error_rates.get('linker', DEFAULT_ERROR_RATE) + } + + process = cutadapt(contaminant_out_path, linker_out_path, + linker_opts) + + if logger is not None: + summary = cutadapt_summary(process.stdout, padding=' ') + logger.info('Trimmed linker sequence' + summary) + + interim_files.append(linker_out_path) + else: + linker_out_path = contaminant_out_path + + # Trim transposon and check minimum length. + transposon_opts = { + '-g': 'file:' + str(self._transposon_path), + '--discard-untrimmed': True, + '-O': self._min_overlaps.get('transposon', DEFAULT_OVERLAP), + '-e': self._error_rates.get('transposon', DEFAULT_ERROR_RATE) + } + + if self._min_length is not None: + transposon_opts['--minimum-length'] = self._min_length + + genomic_path = output_dir / ('genomic' + suffix) + process = cutadapt(linker_out_path, genomic_path, transposon_opts) + + if logger is not None: + summary = cutadapt_summary(process.stdout, padding=' ') + logger.info('Trimmed transposon sequence and filtered ' + 'for length' + summary) + + # Clean-up interim files. + for file_path in interim_files: + file_path.unlink() - # Extract genomic sequences. - logger.info('Extracting genomic sequences') - logger.info(' %-18s: %s', 'Transposon', - shorten_path(self._transposon_path)) - logger.info(' %-18s: %s', 'Linker', shorten_path(self._linker_path)) - logger.info(' %-18s: %s', 'Contaminants', - shorten_path(self._contaminant_path)) - logger.info(' %-18s: %s', 'Minimum length', self._min_length) - - genomic_path = extract_genomic( - reads_path, - output_dir, - transposon_path=self._transposon_path, - linker_path=self._linker_path, - contaminant_path=self._contaminant_path, - min_length=self._min_length, - min_overlaps=self._min_overlaps, - error_rates=self._error_rates) - - # Align reads to genome. 
- logger.info('Aligning to reference') - logger.info(' %-18s: %s', 'Reference', shorten_path(self._index_path)) - logger.info(' %-18s: %s', 'Bowtie options', - flatten_arguments(self._bowtie_options)) + return genomic_path + + def _align(self, read_path, output_dir, logger): + # Log parameters + if logger is not None: + logger.info('Aligning to reference') + logger.info(' %-18s: %s', 'Reference', + shorten_path(self._index_path)) + logger.info(' %-18s: %s', 'Bowtie options', + flatten_arguments(self._bowtie_options)) alignment_path = output_dir / 'alignment.bam' - bowtie2.bowtie2( - [genomic_path], - self._index_path, - alignment_path, + bowtie2( + [read_path], + index_path=self._index_path, + output_path=alignment_path, options=self._bowtie_options, verbose=True) @@ -179,6 +251,21 @@ def _extract_and_align(self, reads_path, output_dir, logger): register_pipeline(name='shearsplink', pipeline=ShearSplinkPipeline) +def _process_alignment(aln): + ref = aln.reference_name + + if aln.is_reverse: + transposon_pos = aln.reference_end + linker_pos = aln.reference_start + strand = -1 + else: + transposon_pos = aln.reference_start + linker_pos = aln.reference_end + strand = 1 + + return (ref, transposon_pos, strand), linker_pos + + class MultiplexedShearSplinkPipeline(ShearSplinkPipeline): """ShearSplink pipeline with multiplexed reads.""" @@ -235,21 +322,24 @@ def _extract_args(cls, args): return arg_dict - def run(self, reads_path, output_dir, reads2_path=None): - if reads2_path is not None: + def run(self, read_path, output_dir, read2_path=None): + if read2_path is not None: raise ValueError('Pipeline does not support paired-end data') logger = logging.getLogger() + # Ensure output dir exists. + output_dir.mkdir(exist_ok=True, parents=True) + # Extract genomic sequences and align to reference. - alignment_path = self._extract_and_align(reads_path, output_dir, - logger) + genomic_path = self._extract_genomic(read_path, output_dir, logger) + alignment_path = self._align(genomic_path, output_dir, logger) # Map reads to specific barcodes/samples. logger.info('Extracting barcode/sample mapping') logger.info(' %-18s: %s', 'Barcodes', shorten_path(self._barcode_path)) - read_map = self._get_barcode_mapping(reads_path) + read_map = self._get_barcode_mapping(read_path) # Extract insertions from bam file. bam_file = pysam.AlignmentFile(str(alignment_path)) @@ -272,13 +362,13 @@ def run(self, reads_path, output_dir, reads2_path=None): ins_frame = Insertion.to_frame(insertions) ins_frame.to_csv(str(insertion_path), sep='\t', index=False) - def _get_barcode_mapping(self, reads_path): + def _get_barcode_mapping(self, read_path): # Read barcode sequences. with seqio.open(str(self._barcode_path)) as barcode_file: barcodes = list(barcode_file) # Extract read --> barcode mapping. 
- with seqio.open(str(reads_path)) as reads: + with seqio.open(str(read_path)) as reads: return _extract_barcode_mapping(reads, barcodes, self._barcode_mapping) @@ -312,18 +402,3 @@ def _extract_barcode_mapping(reads, barcodes, barcode_mapping=None): read.name.split()[0]) return mapping - - -def _process_alignment(aln): - ref = aln.reference_name - - if aln.is_reverse: - transposon_pos = aln.reference_end - linker_pos = aln.reference_start - strand = -1 - else: - transposon_pos = aln.reference_start - linker_pos = aln.reference_end - strand = 1 - - return (ref, transposon_pos, strand), linker_pos diff --git a/src/pyim/align/common/insertions.py b/src/pyim/align/util.py similarity index 100% rename from src/pyim/align/common/insertions.py rename to src/pyim/align/util.py diff --git a/src/pyim/external/cutadapt.py b/src/pyim/external/cutadapt.py index c199ee1..1a653e1 100644 --- a/src/pyim/external/cutadapt.py +++ b/src/pyim/external/cutadapt.py @@ -105,9 +105,10 @@ def _demultiplex(read_path, output_dir, barcode_path, error_rate): return output_paths -def cutadapt_summary(stdstream): +def cutadapt_summary(stdstream, padding=''): sections = _split_log_sections(stdstream.read().decode()) - return '\n'.join([' '] + sections['=== Summary ===']) + delim = '\n' + padding + return padding + delim.join([''] + sections['=== Summary ===']) def _split_log_sections(log_str): diff --git a/src/pyim/main/pyim_align.py b/src/pyim/main/pyim_align.py index c7a6f3a..98ab05a 100644 --- a/src/pyim/main/pyim_align.py +++ b/src/pyim/main/pyim_align.py @@ -15,10 +15,12 @@ def main(): args = parse_args() # Run pipeline. + reads2 = args.reads2 if hasattr(args, 'reads2') else None + pipeline = args.pipeline.from_args(args) pipeline.run(read_path=args.reads, output_dir=args.output_dir, - read2_path=args.reads2) + read2_path=reads2) def parse_args(): From 6191607b4d6b24977b72213c255d092805a07eec Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 28 Mar 2017 10:10:49 +0200 Subject: [PATCH 093/100] Reorganized data files. 
--- data/lam_pcr.sb.barcodes.fa | 8 -------- data/lam_pcr.sb.transposon.fa | 2 -- data/{sb.barcodes.fa => sb/barcodes.fa} | 0 data/{sb.contaminants.fa => sb/contaminants.fa} | 0 data/{sb.linker.fa => sb/linker.fa} | 0 data/{sb.transposon.fa => sb/transposon.fa} | 0 6 files changed, 10 deletions(-) delete mode 100644 data/lam_pcr.sb.barcodes.fa delete mode 100644 data/lam_pcr.sb.transposon.fa rename data/{sb.barcodes.fa => sb/barcodes.fa} (100%) rename data/{sb.contaminants.fa => sb/contaminants.fa} (100%) rename data/{sb.linker.fa => sb/linker.fa} (100%) rename data/{sb.transposon.fa => sb/transposon.fa} (100%) diff --git a/data/lam_pcr.sb.barcodes.fa b/data/lam_pcr.sb.barcodes.fa deleted file mode 100644 index 3600211..0000000 --- a/data/lam_pcr.sb.barcodes.fa +++ /dev/null @@ -1,8 +0,0 @@ ->BC01 -ACACATACGC ->BC02 -ACAGTATATA ->BC03 -ACAGTCGTGC ->BC04 -ACATACGCGT diff --git a/data/lam_pcr.sb.transposon.fa b/data/lam_pcr.sb.transposon.fa deleted file mode 100644 index a8676f1..0000000 --- a/data/lam_pcr.sb.transposon.fa +++ /dev/null @@ -1,2 +0,0 @@ ->SB -TAAACTTCCGACTTCAACTG \ No newline at end of file diff --git a/data/sb.barcodes.fa b/data/sb/barcodes.fa similarity index 100% rename from data/sb.barcodes.fa rename to data/sb/barcodes.fa diff --git a/data/sb.contaminants.fa b/data/sb/contaminants.fa similarity index 100% rename from data/sb.contaminants.fa rename to data/sb/contaminants.fa diff --git a/data/sb.linker.fa b/data/sb/linker.fa similarity index 100% rename from data/sb.linker.fa rename to data/sb/linker.fa diff --git a/data/sb.transposon.fa b/data/sb/transposon.fa similarity index 100% rename from data/sb.transposon.fa rename to data/sb/transposon.fa From 558d0afd4479f6f3e6b3e0911551bf85a20e016c Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 28 Mar 2017 10:11:10 +0200 Subject: [PATCH 094/100] Initial edit of documentation. --- Makefile | 10 +--- docs/conf.py | 27 ++++------- docs/home.rst | 3 ++ docs/index.rst | 5 +- docs/readme.rst | 1 - docs/usage.rst | 119 +++++++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 134 insertions(+), 31 deletions(-) create mode 100644 docs/home.rst delete mode 100644 docs/readme.rst diff --git a/Makefile b/Makefile index 4756a26..a8ec776 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,6 @@ help: clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts - clean-build: ## remove build artifacts rm -fr build/ rm -fr dist/ @@ -52,22 +51,15 @@ lint: ## check style with flake8 test: ## run tests quickly with the default Python py.test - - -test-all: ## run tests on every Python version with tox - tox coverage: ## check code coverage quickly with the default Python coverage run --source pyim py.test - + coverage report -m coverage html $(BROWSER) htmlcov/index.html docs: ## generate Sphinx HTML documentation, including API docs - rm -f docs/pyim.rst - rm -f docs/modules.rst - sphinx-apidoc -o docs/ pyim $(MAKE) -C docs clean $(MAKE) -C docs html $(BROWSER) docs/_build/html/index.html diff --git a/docs/conf.py b/docs/conf.py index 44fe434..9c629c2 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -16,6 +16,8 @@ import sys import os +import sphinx_rtd_theme + # If extensions (or modules to document with autodoc) are in another # directory, add these directories to sys.path here. If the directory is # relative to the documentation root, use os.path.abspath to make it @@ -106,12 +108,13 @@ # documents. 
#keep_warnings = False - # -- Options for HTML output ------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'default' +# html_theme = 'default' +html_theme = "sphinx_rtd_theme" +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the @@ -190,7 +193,6 @@ # Output file base name for HTML help builder. htmlhelp_basename = 'pyimdoc' - # -- Options for LaTeX output ------------------------------------------ latex_elements = { @@ -208,9 +210,8 @@ # (source start file, target name, title, author, documentclass # [howto/manual]). latex_documents = [ - ('index', 'pyim.tex', - u'PyIM Documentation', - u'Julian de Ruiter', 'manual'), + ('index', 'pyim.tex', u'PyIM Documentation', u'Julian de Ruiter', + 'manual'), ] # The name of an image file (relative to this directory) to place at @@ -233,33 +234,25 @@ # If false, no module index is generated. #latex_domain_indices = True - # -- Options for manual page output ------------------------------------ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'pyim', - u'PyIM Documentation', - [u'Julian de Ruiter'], 1) + ('index', 'pyim', u'PyIM Documentation', [u'Julian de Ruiter'], 1) ] # If true, show URL addresses after external links. #man_show_urls = False - # -- Options for Texinfo output ---------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'pyim', - u'PyIM Documentation', - u'Julian de Ruiter', - 'pyim', - 'One line description of project.', - 'Miscellaneous'), + ('index', 'pyim', u'PyIM Documentation', u'Julian de Ruiter', 'pyim', + 'One line description of project.', 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. diff --git a/docs/home.rst b/docs/home.rst new file mode 100644 index 0000000..9e27f2a --- /dev/null +++ b/docs/home.rst @@ -0,0 +1,3 @@ +==== +PyIM +==== \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index a3fdc28..d560f76 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,11 +11,12 @@ Contents: .. toctree:: :maxdepth: 2 - readme + home installation usage contributing - authorshistory + authors + history Indices and tables ================== diff --git a/docs/readme.rst b/docs/readme.rst deleted file mode 100644 index 72a3355..0000000 --- a/docs/readme.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../README.rst diff --git a/docs/usage.rst b/docs/usage.rst index 4c10dfa..c63abd7 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -2,6 +2,121 @@ Usage ===== -To use PyIM in a project:: +Identifying insertions +---------------------- + +Overview +======== + +The **pyim-align** command is used to identify insertions using sequence reads +from targeted DNA-sequencing of insertion sites. The command provides access +to various pipelines which (in essence) perform the following functions: + + - Reads are filtered to remove reads that do not contain the correct + technical sequences (such as transposon sequences or required adapter + sequences). + - Reads are trimmed to remove any non-genomic sequences (including + transposon/adapter sequences and any other technical sequences). 
Reads + that are too short after trimming are removed from the analysis, to + avoid issues during alignment. + - The remaining (genomic) reads are aligned to the reference genome. + - The resulting alignment is analyzed to identify the location and + orientation of the corresponding insertion sites. + +The exact implementation of these steps differs between pipelines and depends +on the design of the sequencing experiment. + +Each pipeline takes... + +Pipelines +========= + +ShearSplink +~~~~~~~~~~~ + +The ``shearsplink`` pipeline is designed to analyze data from samples that +were sequenced using the ShearSplink_ protocol. ShearSplink sequence reads are +expected to have the following structure:: + + [Transposon][Genomic][Linker] + +Here, the ``transposon`` element represents part of the transposon sequence, +which is used (a) verify that the read does indeed involve the transposon and +(b) to determine the exact breakpoint between the transposon sequence and +the flanking genomic sequence (the ``genomic`` element in the sequence). The +``linker`` represents an adapter that is ligated to the (sheared) genomic as +part of the protocol. The position of the linker sequence allows us to assess +the depth/clonality of individual insertions by determining the number of +unique ligation points between the adapter and genomic DNA (see the +ShearSplink_ publication for more details). + +The pipeline can be run using the following basic command: + +.. code-block:: bash + + pyim-align --reads ./reads/Pool3.1.TCA.454Reads.fna \ + --output_dir ./out \ + --transposon ~/Software/python/pyim/data/sb.transposon.fa \ + --linker ~/Software/python/pyim/data/sb.linker.fa \ + --contaminants ~/Software/python/pyim/data/sb.contaminants.fa \ + --bowtie_index ~/References/mus_musculus/mm10/indices/bowtie2/Mus_musculus.GRCm38.dna.primary_assembly + +The ``--linker`` and ``--contaminants`` arguments are optional. If the linker +sequence is omitted, the pipeline assumes that reads were sequenced without +including the linker. If a contaminants file is provided, the sequence reads +are first filtered for the provided contaminant sequences before further +processing. This enables filtering for specific contaminants, such as reads +stemming from the transposon donor locus. + +.. _ShearSplink: https://www.ncbi.nlm.nih.gov/pubmed/21852388 + +Multiplexed ShearSplink +~~~~~~~~~~~~~~~~~~~~~~~ + +The ``shearsplink-multiplexed`` pipeline is an extended version of the +ShearSplink pipeline, which can handle multiplexed datasets. The main advantage +of the pipeline is that it directly tags insertions belonging to specific +samples, rather than us having to first demultiplex the sequence reads and then +having to analyze each sample individually. + +The pipeline takes the same arguments as the basic ShearSplink pipeline, but +adds two arguments ``--barcodes`` and ``--barcode_mapping``. These arguments +specify which barcode sequences have been used to index reads and allow us +to specify an (optional) mapping from barcodes to sample names. If no mapping +is provided, insertions are tagged with the name of the corresponding barcode. + +Multiplexed ShearSplink reads are expected to have the following structure:: + + [Barcode][Transposon][Genomic][Linker] + +The ``transposon``, ``genomic`` and ``linker`` elements are the same as for +the normal ShearSplink pipeline. The ``barcode`` sequence indicates from which +sample the read originated. Barcode sequences should correspond with a sequence +in the provided barcode file. 
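The barcode mapping itself is a small tab-separated file with ``barcode`` and
``sample`` columns. It can be written by hand or generated, for example, with
pandas; the barcode names below match the entries in the example barcode fasta,
while the sample names are purely illustrative:

.. code-block:: python

    import pandas as pd

    # Example mapping from barcode names (as used in the barcode fasta)
    # to sample names; the sample names here are purely illustrative.
    mapping = pd.DataFrame({
        'barcode': ['BC01', 'BC02', 'BC03', 'BC04'],
        'sample': ['sample_1', 'sample_2', 'sample_3', 'sample_4'],
    })

    mapping.to_csv('barcode_mapping.txt', sep='\t', index=False)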
+ +Nextera +~~~~~~~ + +TODO + +Merging/splitting datasets +-------------------------- + +.. code-block:: bash + + pyim-merge --insertions ./out1/insertions.txt ./out2/insertions.txt \ + --output ./merged.txt + +Annotating insertions +--------------------- + +.. code-block:: bash + + pyim-annotate window --insertions ./out/insertions.txt + --output ./out/insertions.ann.txt + --gtf reference.gtf + --window_size 20000 + +Identifying CISs +---------------- - import pyim From b2cd98fc376a29364ee4f7a08f5c17de4dfc67a7 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 28 Mar 2017 11:23:26 +0200 Subject: [PATCH 095/100] Added documentation for base class. --- src/pyim/align/pipelines/base.py | 97 ++++++++++++++++++++++++++++++-- 1 file changed, 91 insertions(+), 6 deletions(-) diff --git a/src/pyim/align/pipelines/base.py b/src/pyim/align/pipelines/base.py index 9f86237..566ab80 100644 --- a/src/pyim/align/pipelines/base.py +++ b/src/pyim/align/pipelines/base.py @@ -1,3 +1,6 @@ +"""Module providing base functionality for insertion identification +pipelines.""" + import abc from pathlib import Path @@ -5,22 +8,63 @@ def register_pipeline(name, pipeline): + """Registers a pipeline class under the given name. + + Parameters + ---------- + name : str + Name to use for the pipeline. + pipeline : Pipeline + The pipeline class. + + """ _registry[name] = pipeline def get_pipelines(): + """Returns a dict of the available pipelines, indexed by pipeline name. + + Returns + ------- + Dict[str, Pipeline] + Available pipelines. + + """ return dict(_registry) class Pipeline(abc.ABC): - """Base pipeline class.""" + """Base pipeline class. + + Pipeline classes implement analyses that derive transposon insertion sites + from sequencing data obtained by targeted (DNA) sequencing of the insertion + sites. + + The main interface of the class is the ``run`` method, whose main + arguments are the paths to the sequence read files and the output + directory. After completion, the output directory contains an + ``insertions.txt`` output file, describing the location of the identified + insertion sites, and any optional extra intermediate/output files. + + Each pipeline should also provide implementations for the + ``configure_args`` and ``from_args`` methods, which are used to instantiate + pipelines from command line arguments as part of the ``pyim-align`` command. + + """ def __init__(self): pass @abc.abstractclassmethod def configure_args(cls, parser): - """Configures argument parser for the pipeline.""" + """Configures argument parser for the pipeline. + + Parameters + ---------- + parser : ArgumentParser + ArgumentParser to configure. + + """ @classmethod def _setup_base_args(cls, parser, paired=False): @@ -33,13 +77,54 @@ def _setup_base_args(cls, parser, paired=False): @classmethod def from_args(cls, args): - """Builds a pipeline instance from the given arguments.""" + """Builds a pipeline instance from the given arguments. + + Parameters + ---------- + args : Namespace + Parsed arguments from argparser. + + Returns + ------- + Pipeline + Instantiated pipeline instance. + + """ return cls(**cls._extract_args(args)) @abc.abstractclassmethod def _extract_args(cls, args): - """Extract arguments from args for from_args.""" + """Extracts arguments from args for from_args. + + Returns arguments as a dict of Dict[str, Any], specifying values for + the various parameters of the corresponding pipeline class. + + Parameters + ---------- + args : Namespace + Parsed arguments from argparser. 
+ + Returns + ------- + Dict[str, Any] + Dictionary of pipeline parameters. + + """ @abc.abstractmethod - def run(self, reads_path, output_dir, reads2_path=None): - """Runs the pipeline with the given input.""" + def run(self, read_path, output_dir, read2_path=None): + """Runs the pipeline, producing a table of identified insertions. + + Parameters + ---------- + read_path : Path + Path to sequence reads. For paired-end data, this should refer + to the first read of the mate pair. + output_dir : Path + Path to the output directory. + read2_path : Path + Optional path to the second read of the mate pair (for paired-end) + sequencing data. Only used in pipelines that support paired-end + sequencing data. + + """ From 4f71d032f81a4bf8481ec5cacfee9ab34602df30 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 10 May 2017 20:49:34 +0200 Subject: [PATCH 096/100] Updated documentation. --- CONTRIBUTING.rst | 20 +- HISTORY.rst | 8 +- Makefile | 8 +- README.rst | 8 +- data/lam_pcr/lam_pcr.sb.barcodes.fa | 8 + data/lam_pcr/lam_pcr.sb.transposon.fa | 2 + docs/api.rst | 23 ++ docs/conf.py | 7 +- docs/home.rst | 3 - docs/index.rst | 33 +-- docs/installation.rst | 29 ++- docs/usage.rst | 105 +++------- environment.yml | 5 +- setup.py | 47 +++-- src/pyim/__init__.py | 2 +- src/pyim/align/pipelines/nextera.py | 51 ++++- src/pyim/align/pipelines/shear_splink.py | 255 +++++++++++++++++++++-- src/pyim/main/pyim_align.py | 9 + src/pyim/main/pyim_merge.py | 2 +- 19 files changed, 455 insertions(+), 170 deletions(-) create mode 100644 data/lam_pcr/lam_pcr.sb.barcodes.fa create mode 100644 data/lam_pcr/lam_pcr.sb.transposon.fa create mode 100644 docs/api.rst delete mode 100644 docs/home.rst diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index a1e9422..e907bdd 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -68,7 +68,7 @@ Ready to contribute? Here's how to set up `pyim` for local development. $ mkvirtualenv pyim $ cd pyim/ - $ python setup.py develop + $ pip install .[dev] 4. Create a branch for local development:: @@ -76,13 +76,9 @@ Ready to contribute? Here's how to set up `pyim` for local development. Now you can make your changes locally. -5. When you're done making changes, check that your changes pass flake8 and the tests, including testing other Python versions with tox:: +5. When you're done making changes, check that your changes pass the tests:: - $ flake8 pyim tests - $ python setup.py test or py.test - $ tox - - To get flake8 and tox, just pip install them into your virtualenv. + $ make test 6. Commit your changes and push your branch to GitHub:: @@ -101,14 +97,6 @@ Before you submit a pull request, check that it meets these guidelines: 2. If the pull request adds functionality, the docs should be updated. Put your new functionality into a function with a docstring, and add the feature to the list in README.rst. -3. The pull request should work for Python 2.6, 2.7, 3.3, 3.4 and 3.5, and for PyPy. Check +3. The pull request should work for Python 3.4 and 3.5. Check https://travis-ci.org/jrderuiter/pyim/pull_requests and make sure that the tests pass for all supported Python versions. - -Tips ----- - -To run a subset of tests:: - -$ py.test tests.test_pyim - diff --git a/HISTORY.rst b/HISTORY.rst index 08d07c2..4541bab 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,7 +2,13 @@ History ======= +0.2.0 (2017-05-10) +------------------ + +* Refactored pipeline structure. +* Added ShearSplink and Nextera pipelines. + 0.1.0 (2016-09-01) ------------------ -* First release on PyPI. 
+* Initial release. diff --git a/Makefile b/Makefile index a8ec776..38bd4ed 100644 --- a/Makefile +++ b/Makefile @@ -60,12 +60,8 @@ coverage: ## check code coverage quickly with the default Python $(BROWSER) htmlcov/index.html docs: ## generate Sphinx HTML documentation, including API docs - $(MAKE) -C docs clean - $(MAKE) -C docs html - $(BROWSER) docs/_build/html/index.html - -servedocs: docs ## compile the docs watching for changes - watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . + rm -rf docs/_build + sphinx-autobuild docs docs/_build release: clean ## package and upload a release python setup.py sdist upload diff --git a/README.rst b/README.rst index 546845e..19d13c3 100644 --- a/README.rst +++ b/README.rst @@ -5,10 +5,6 @@ PyIM .. image:: https://img.shields.io/travis/jrderuiter/pyim.svg :target: https://travis-ci.org/jrderuiter/pyim -.. image:: https://readthedocs.org/projects/pyim/badge/?version=latest - :target: https://pyim.readthedocs.io/en/latest/?badge=latest - :alt: Documentation Status - PyIM (Python Insertional Mutagenesis) is a python package for analyzing insertional mutagenesis data from targeted sequencing of transposon insertion sites. The package provides several command line tools for identifying @@ -19,7 +15,9 @@ the basic building blocks for implementing new pipelines, CIS callers, etc. Documentation ------------- -PyIM's documentation will be made available online soon. +PyIM's documentation is available at +`jrderuiter.github.io/pyim `_. + Requirements ------------ diff --git a/data/lam_pcr/lam_pcr.sb.barcodes.fa b/data/lam_pcr/lam_pcr.sb.barcodes.fa new file mode 100644 index 0000000..3600211 --- /dev/null +++ b/data/lam_pcr/lam_pcr.sb.barcodes.fa @@ -0,0 +1,8 @@ +>BC01 +ACACATACGC +>BC02 +ACAGTATATA +>BC03 +ACAGTCGTGC +>BC04 +ACATACGCGT diff --git a/data/lam_pcr/lam_pcr.sb.transposon.fa b/data/lam_pcr/lam_pcr.sb.transposon.fa new file mode 100644 index 0000000..a8676f1 --- /dev/null +++ b/data/lam_pcr/lam_pcr.sb.transposon.fa @@ -0,0 +1,2 @@ +>SB +TAAACTTCCGACTTCAACTG \ No newline at end of file diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..c12c981 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,23 @@ +=== +API +=== + +.. _api_pipelines: + +Pipelines +--------- + +ShearSplink +~~~~~~~~~~~ + +.. autoclass:: pyim.align.pipelines.ShearSplinkPipeline + :members: + +.. autoclass:: pyim.align.pipelines.MultiplexedShearSplinkPipeline + :members: + +Nextera +~~~~~~~ + +.. autoclass:: pyim.align.pipelines.NexteraPipeline + :members: diff --git a/docs/conf.py b/docs/conf.py index 9c629c2..8badcb0 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -42,7 +42,8 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -266,3 +267,7 @@ # If true, do not generate a @detailmenu in the "Top" node's menu. 
#texinfo_no_detailmenu = False + +# Napoleon settings +napoleon_google_docstring = False +napoleon_numpy_docstring = True diff --git a/docs/home.rst b/docs/home.rst deleted file mode 100644 index 9e27f2a..0000000 --- a/docs/home.rst +++ /dev/null @@ -1,3 +0,0 @@ -==== -PyIM -==== \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index d560f76..22a70d0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,26 +1,29 @@ -.. pyim documentation master file, created by - sphinx-quickstart on Tue Jul 9 22:26:36 2013. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. +==== +PyIM +==== -Welcome to PyIM's documentation! -====================================== +.. image:: https://img.shields.io/travis/jrderuiter/pyim.svg + :target: https://travis-ci.org/jrderuiter/pyim -Contents: +PyIM (Python Insertional Mutagenesis) is a python package for analyzing +insertional mutagenesis data from targeted sequencing of transposon insertion +sites. The package provides several command line tools for identifying +insertions, calling common insertion sites (CISs) and annotating +insertions/CISs directly from the command line. It also aims to provide +the basic building blocks for implementing new pipelines, CIS callers, etc. + +**Disclaimer: the documentation is a work-in-progress and is under +active development. For details on the different commands/pipelines, see +the help of the respective commands.** .. toctree:: :maxdepth: 2 + :hidden: - home + self installation usage + api contributing authors history - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/installation.rst b/docs/installation.rst index 88153b3..e1c0844 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -4,9 +4,25 @@ Installation ============ +Dependencies +------------ + +PyIM requires Python 3.4 and has been tested on macOS and Linux. + +The following external dependencies are also required for full functionality: + +- Bowtie2 +- Cutadapt +- CIMPL (R package, via rpy2) + +These external tools should be available in ``$PATH``. CIMPL, which is an R +package, should be loadable in the default R installation. + +Using pip +--------- Stable release --------------- +~~~~~~~~~~~~~~ To install PyIM, run this command in your terminal: @@ -14,17 +30,14 @@ To install PyIM, run this command in your terminal: $ pip install pyim -This is the preferred method to install PyIM, as it will always install the most recent stable release. - If you don't have `pip`_ installed, this `Python installation guide`_ can guide you through the process. .. _pip: https://pip.pypa.io .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ - From sources ------------- +~~~~~~~~~~~~ The sources for PyIM can be downloaded from the `Github repo`_. You can either clone the public repository: .. code-block:: console $ git clone git://github.com/jrderuiter/pyim + $ git checkout master Or download the `tarball`_: .. code-block:: console @@ -49,3 +63,8 @@ Once you have a copy of the source, you can install it with: .. _Github repo: https://github.com/jrderuiter/pyim .. _tarball: https://github.com/jrderuiter/pyim/tarball/master + +Using bioconda +-------------- + +Coming soon!
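Since the installation docs above list Bowtie2, Cutadapt and CIMPL as external
dependencies, a quick environment check can save a failed pipeline run later.
The snippet below is only an illustrative sketch and not part of the PyIM API;
it assumes the R package is named ``cimpl`` (as suggested by the ``r-cimpl``
conda package used in CI) and relies only on ``shutil.which`` and ``rpy2``:

.. code-block:: python

    import shutil

    from rpy2.robjects.packages import importr

    # Command-line tools that should be available in $PATH.
    for tool in ['bowtie2', 'cutadapt']:
        if shutil.which(tool) is None:
            print('Missing external tool: {}'.format(tool))

    # CIMPL should be loadable in the default R installation (via rpy2).
    # The package name 'cimpl' is an assumption here.
    try:
        importr('cimpl')
    except Exception as error:
        print('Could not load CIMPL: {}'.format(error))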
diff --git a/docs/usage.rst b/docs/usage.rst index c63abd7..9ce8a57 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -5,9 +5,6 @@ Usage Identifying insertions ---------------------- -Overview -======== - The **pyim-align** command is used to identify insertions using sequence reads from targeted DNA-sequencing of insertion sites. The command provides access to various pipelines which (in essence) perform the following functions: @@ -24,88 +21,53 @@ to various pipelines which (in essence) perform the following functions: orientation of the corresponding insertion sites. The exact implementation of these steps differs between pipelines and depends -on the design of the sequencing experiment. - -Each pipeline takes... - -Pipelines -========= - -ShearSplink -~~~~~~~~~~~ - -The ``shearsplink`` pipeline is designed to analyze data from samples that -were sequenced using the ShearSplink_ protocol. ShearSplink sequence reads are -expected to have the following structure:: - - [Transposon][Genomic][Linker] +on the design of the sequencing experiment. For an overview of the available +pipelines, see :ref:`api_pipelines`. -Here, the ``transposon`` element represents part of the transposon sequence, -which is used (a) verify that the read does indeed involve the transposon and -(b) to determine the exact breakpoint between the transposon sequence and -the flanking genomic sequence (the ``genomic`` element in the sequence). The -``linker`` represents an adapter that is ligated to the (sheared) genomic as -part of the protocol. The position of the linker sequence allows us to assess -the depth/clonality of individual insertions by determining the number of -unique ligation points between the adapter and genomic DNA (see the -ShearSplink_ publication for more details). - -The pipeline can be run using the following basic command: +An example of calling ``pyim-align`` using the ``shearsplink`` pipeline is +as follows: .. code-block:: bash - pyim-align --reads ./reads/Pool3.1.TCA.454Reads.fna \ - --output_dir ./out \ - --transposon ~/Software/python/pyim/data/sb.transposon.fa \ - --linker ~/Software/python/pyim/data/sb.linker.fa \ - --contaminants ~/Software/python/pyim/data/sb.contaminants.fa \ - --bowtie_index ~/References/mus_musculus/mm10/indices/bowtie2/Mus_musculus.GRCm38.dna.primary_assembly - -The ``--linker`` and ``--contaminants`` arguments are optional. If the linker -sequence is omitted, the pipeline assumes that reads were sequenced without -including the linker. If a contaminants file is provided, the sequence reads -are first filtered for the provided contaminant sequences before further -processing. This enables filtering for specific contaminants, such as reads -stemming from the transposon donor locus. - -.. _ShearSplink: https://www.ncbi.nlm.nih.gov/pubmed/21852388 - -Multiplexed ShearSplink -~~~~~~~~~~~~~~~~~~~~~~~ - -The ``shearsplink-multiplexed`` pipeline is an extended version of the -ShearSplink pipeline, which can handle multiplexed datasets. The main advantage -of the pipeline is that it directly tags insertions belonging to specific -samples, rather than us having to first demultiplex the sequence reads and then -having to analyze each sample individually. + pyim-align shearsplink + --reads ./reads.fastq.gz + --bowtie_index /path/to/index + --output_dir ./out + --transposon /path/to/transposon.fa + --linker /path/to/linker.fa -The pipeline takes the same arguments as the basic ShearSplink pipeline, but -adds two arguments ``--barcodes`` and ``--barcode_mapping``. 
These arguments -specify which barcode sequences have been used to index reads and allow us -to specify an (optional) mapping from barcodes to sample names. If no mapping -is provided, insertions are tagged with the name of the corresponding barcode. +This produces an ``insertions.txt`` file in the ``./out`` directory, +describing the identified insertions. -Multiplexed ShearSplink reads are expected to have the following structure:: +Merging/splitting datasets +-------------------------- - [Barcode][Transposon][Genomic][Linker] +The **pyim-merge** command can be used to merge different sets of insertions. +This is mainly useful for combining insertions from multiple samples or from +different sequencing datasets. The basic command is as follows: -The ``transposon``, ``genomic`` and ``linker`` elements are the same as for -the normal ShearSplink pipeline. The ``barcode`` sequence indicates from which -sample the read originated. Barcode sequences should correspond with a sequence -in the provided barcode file. +.. code-block:: bash -Nextera -~~~~~~~ + pyim-merge --insertions ./sample1/insertions.txt \ + ./sample2/insertions.txt \ + --output ./merged.txt -TODO +This command adds an additional ``sample`` column to the merged insertions +if this column is not yet present in the source files. By default, the used +sample names are derived from the names of the folders containing the +source files (``sample1`` and ``sample2`` in this example). These names can be +overridden using the ``--sample_names`` parameter. -Merging/splitting datasets --------------------------- +Alternatively, the **pyim-split** command can be used to split a merged +insertion file (containing multiple samples) to obtain separate insertion +files for each sample. The basic command is as follows: .. code-block:: bash - pyim-merge --insertions ./out1/insertions.txt ./out2/insertions.txt \ - --output ./merged.txt + pyim-split --insertions ./merged.txt \ + --output_dir ./out + +A specific subset of samples can be extracted using the ``--samples`` argument. 
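Because insertion files are written as plain tab-separated tables, the same
split can also be performed directly with pandas when more control over naming
or filtering is needed. The snippet below is only a rough sketch of what
``pyim-split`` does (the output file names are illustrative) and assumes the
merged file contains the ``sample`` column added by ``pyim-merge``:

.. code-block:: python

    from pathlib import Path

    import pandas as pd

    # Insertion files are plain tab-separated tables.
    insertions = pd.read_csv('merged.txt', sep='\t')

    out_dir = Path('out')
    out_dir.mkdir(exist_ok=True)

    # Write one insertion file per sample, mirroring pyim-split.
    for sample, grp in insertions.groupby('sample'):
        out_path = out_dir / 'insertions.{}.txt'.format(sample)
        grp.to_csv(str(out_path), sep='\t', index=False)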
Annotating insertions --------------------- @@ -120,3 +82,4 @@ Annotating insertions Identifying CISs ---------------- +TODO diff --git a/environment.yml b/environment.yml index 9421dc9..fb3e546 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,6 @@ channels: - r - defaults - conda-forge - - jrderuiter dependencies: # Basic dependencies - python=3.5.* @@ -29,5 +28,9 @@ dependencies: - pytest-mock =1.1 - python-coveralls =2.9.* + - sphinx >=1.5 + - sphinx_rtd_theme + - pip: - pytest-helpers-namespace + - sphinx-autobuild diff --git a/setup.py b/setup.py index 209d91d..45c118b 100644 --- a/setup.py +++ b/setup.py @@ -4,31 +4,36 @@ from setuptools import setup, find_packages with open('README.rst') as readme_file: - readme = readme_file.read() + README = readme_file.read() with open('HISTORY.rst') as history_file: - history = history_file.read() + HISTORY = history_file.read() -requirements = ['pyfaidx>=0.4.8.1', 'intervaltree>=2.1', 'tqdm>=4.7', - 'toolz>=0.8', 'rpy2>=2.8.2', 'numpy', 'pandas>=0.18', - 'pysam>=0.9', 'natsort'] +REQUIREMENTS = [ + 'pyfaidx>=0.4.8.1', 'intervaltree>=2.1', 'tqdm>=4.7', 'toolz>=0.8', + 'rpy2>=2.8.2', 'numpy', 'pandas>=0.18', 'pysam>=0.9', 'natsort', 'cutadapt' +] -test_requirements = ['pytest', 'pytest-cov', 'pytest-mock', - 'pytest-helpers-namespace', 'python-coveralls'] +EXTRAS_REQUIRE = { + 'dev': [ + 'pytest', 'pytest-cov', 'pytest-mock', 'pytest-helpers-namespace', + 'python-coveralls', 'sphinx', 'sphinx-autobuild', 'sphinx_rtd_theme' + ] +} setup( name='pyim', version='0.2.0', description=('Tool for identifying transposon insertions ' 'from targeted DNA-sequencing data.'), - long_description=readme + '\n\n' + history, + long_description=README + '\n\n' + HISTORY, author='Julian de Ruiter', author_email='julianderuiter@gmail.com', url='https://github.com/jrderuiter/pyim', packages=find_packages('src'), package_dir={'': 'src'}, include_package_data=True, - install_requires=requirements, + install_requires=REQUIREMENTS, license='MIT license', zip_safe=False, keywords='pyim', @@ -41,15 +46,15 @@ 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', ], - extras_require={ - 'test': test_requirements - }, - entry_points={'console_scripts': [ - 'pyim-align = pyim.main.pyim_align:main', - 'pyim-demultiplex = pyim.main.pyim_demultiplex:main', - 'pyim-merge = pyim.main.pyim_merge:main', - 'pyim-cis = pyim.main.pyim_cis:main', - 'pyim-annotate = pyim.main.pyim_annotate:main', - 'pyim-bed = pyim.main.pyim_bed:main', - 'pyim-split = pyim.main.pyim_split:main' - ]}) + extras_require=EXTRAS_REQUIRE, + entry_points={ + 'console_scripts': [ + 'pyim-align = pyim.main.pyim_align:main', + 'pyim-demultiplex = pyim.main.pyim_demultiplex:main', + 'pyim-merge = pyim.main.pyim_merge:main', + 'pyim-cis = pyim.main.pyim_cis:main', + 'pyim-annotate = pyim.main.pyim_annotate:main', + 'pyim-bed = pyim.main.pyim_bed:main', + 'pyim-split = pyim.main.pyim_split:main' + ] + }) diff --git a/src/pyim/__init__.py b/src/pyim/__init__.py index 0a4ff68..ece0625 100644 --- a/src/pyim/__init__.py +++ b/src/pyim/__init__.py @@ -2,4 +2,4 @@ __author__ = 'Julian de Ruiter' __email__ = 'julianderuiter@gmail.com' -__version__ = '0.1.0' +__version__ = '0.2.0' diff --git a/src/pyim/align/pipelines/nextera.py b/src/pyim/align/pipelines/nextera.py index 8828a8e..a7215c2 100644 --- a/src/pyim/align/pipelines/nextera.py +++ b/src/pyim/align/pipelines/nextera.py @@ -17,7 +17,56 @@ class NexteraPipeline(Pipeline): - """Nextera-based transposon pipeline.""" + 
"""Nextera-based transposon pipeline. + + Analyzes paired-end sequence data that was prepared using a Nextera-based + protocol. Sequence reads are expected to have the following structure:: + + Mate 1: + [Genomic] + + Mate 2: + [Transposon][Genomic] + + Here, ``transposon`` refers to the flanking part of the transposon sequence + and ``genomic`` refers to the genomic DNA located between the transposon + sequence and the used adapt sequence. Note that the adapter itself is not + sequenced and therefore not part of the reads. However, the end of Mate 1 + is considered to terminate at the adapter and as such represents the + breakpoint between the genomic DNA and the adapter. + + The pipeline essentially performs the following steps: + + - Mates are trimmed to remove the transposon sequence, dropping any + reads not containing the transposon. + - The remaining mates are trimmed to remove any sequences from + the Nextera transposase. + - The trimmed mates are aligned to the reference genome. + - The resulting alignment is used to identify insertions. + + Parameters + ---------- + transposon_path : Path + Path to the (flanking) transposon sequence (fasta). + bowtie_index_path : Path + Path to the bowtie index. + bowtie_options : Dict[str, Any] + Dictionary of extra options for Bowtie. + min_length : int + Minimum length for genomic reads to be kept for alignment. + min_support : int + Minimum support for insertions to be kept in the final output. + min_mapq : int + Minimum mapping quality of alignments to be used for + identifying insertions. + merge_distance : int + Maximum distance within which insertions are merged. Used to merge + insertions that occur within close vicinity, which is typically due + to slight variations in alignments. + threads : int + The number of threads to use for the alignment. + + """ def __init__(self, transposon_path, diff --git a/src/pyim/align/pipelines/shear_splink.py b/src/pyim/align/pipelines/shear_splink.py index b0ab69d..24cd751 100644 --- a/src/pyim/align/pipelines/shear_splink.py +++ b/src/pyim/align/pipelines/shear_splink.py @@ -21,7 +21,73 @@ class ShearSplinkPipeline(Pipeline): - """ShearSplink pipeline.""" + """ShearSplink pipeline. + + Analyzes (single-end) sequencing data that was prepared using the + ShearSplink protocol. Sequence reads are expected to have the following + structure:: + + [Transposon][Genomic][Linker] + + Here, ``transposon`` refers to the flanking part of the transposon + sequence, ``linker`` to the flanking linker sequence and ``genomic`` + to the genomic DNA located in between (which varies per insertion). + The linker sequence is optional and may be omitted if the linker is not + included in sequencing. + + The pipeline essentially performs the following steps: + + - If contaminants are provided, sequence reads are filtered + (using Cutadapt) for the contaminant sequences. + - The remaining reads are trimmed to remove the transposon and + linker sequences, leaving only genomic sequences. Reads without + the transposon/linker sequences are dropped, as we cannot be certain + of their origin. (Note that the linker is optional and is only + trimmed if a linker is given). + - The genomic reads are aligned to the reference genome. + - The resulting alignment is used to identify insertions. + + Note that this pipeline does **NOT** support multiplexed datasets (which is + the default output of the ShearSplink protocol). For multiplexed datasets, + use the ``MultiplexedShearSplinkPipeline``. 
+ + Parameters + ---------- + transposon_path : Path + Path to the (flanking) transposon sequence (fasta). + bowtie_index_path : Path + Path to the bowtie index. + linker_path : Path + Path to the linker sequence (fasta). + contaminant_path : Path + Path to file containing contaminant sequences (fasta). If provided, + sequences are filtered for these sequences before extracting genomic + sequences for alignment. + min_length : int + Minimum length for genomic reads to be kept for alignment. + min_support : int + Minimum support for insertions to be kept in the final output. + min_mapq : int + Minimum mapping quality of alignments to be used for + identifying insertions. + merge_distance : int + Maximum distance within which insertions are merged. Used to merge + insertions that occur within close vicinity, which is typically due + to slight variations in alignments. + bowtie_options : Dict[str, Any] + Dictionary of extra options for Bowtie. + min_overlaps : Dict[str, int] + Minimum overlap required to recognize the transposon, linker and + contaminant sequences (see Cutadapt's documentation for more + information). Keys of the dictionary indicate to which sequence the + overlap corresponds and should be one of the following: ``linker``, + ``transposon`` or ``contaminant``. + error_rates : Dict[str, float] + Maximum error rate to use when recognizing transposon, linker and + contaminant sequences (see Cutadapt's documentation for more + information). Keys should be the same as for ``min_overlaps``. + + """ def __init__(self, transposon_path, @@ -57,26 +123,109 @@ def __init__(self, def configure_args(cls, parser): cls._setup_base_args(parser, paired=False) - parser.add_argument('--transposon', type=Path, required=True) - parser.add_argument('--bowtie_index', type=Path, required=True) - - parser.add_argument('--contaminants', type=Path, default=None) - parser.add_argument('--linker', type=Path, default=None) - - parser.add_argument('--min_length', type=int, default=15) - parser.add_argument('--min_support', type=int, default=2) - parser.add_argument('--min_mapq', type=int, default=23) - parser.add_argument('--merge_distance', type=int, default=None) - - parser.add_argument('--local', default=False, action='store_true') - - parser.add_argument('--contaminant_error', default=0.1, type=float) - parser.add_argument('--transposon_error', default=0.1, type=float) - parser.add_argument('--linker_error', default=0.1, type=float) - - parser.add_argument('--contaminant_overlap', default=3, type=int) - parser.add_argument('--transposon_overlap', default=3, type=int) - parser.add_argument('--linker_overlap', default=3, type=int) + parser.description = 'ShearSplink pipeline' + + # Paths to various sequences. + seq_options = parser.add_argument_group('Sequences') + + seq_options.add_argument( + '--transposon', + type=Path, + required=True, + help='Fasta file containing the transposon sequence.') + + seq_options.add_argument( + '--contaminants', + type=Path, + default=None, + help='Fasta file containing contaminant sequences.') + + seq_options.add_argument( + '--linker', + type=Path, + default=None, + help='Fasta file containing the linker sequence.') + + # Trimming options (used for cutadapt).
+ trim_options = parser.add_argument_group('Trimming') + + trim_options.add_argument( + '--min_length', + type=int, + default=15, + help='Minimum length for (trimmed) genomic sequences.') + + trim_options.add_argument( + '--contaminant_error', + default=0.1, + type=float, + help='Maximum error rate for matching contaminants.') + + trim_options.add_argument( + '--contaminant_overlap', + default=3, + type=int, + help='Minimum overlap for matching contaminants.') + + trim_options.add_argument( + '--transposon_error', + default=0.1, + type=float, + help='Maximum error rate for matching the transposon.') + + trim_options.add_argument( + '--transposon_overlap', + default=3, + type=int, + help='Minimum overlap for matching the transposon.') + + trim_options.add_argument( + '--linker_error', + default=0.1, + type=float, + help='Maximum error rate for matching the linker.') + + trim_options.add_argument( + '--linker_overlap', + default=3, + type=int, + help='Minimum overlap for matching the linker.') + + align_options = parser.add_argument_group('Alignment') + + align_options.add_argument( + '--bowtie_index', + type=Path, + required=True, + help='Bowtie2 index to use for alignment.') + + align_options.add_argument( + '--local', + default=False, + action='store_true', + help='Use local alignment.') + + ins_options = parser.add_argument_group('Insertions') + + ins_options.add_argument( + '--min_mapq', + type=int, + default=23, + help=('Minimum mapping quality for reads ' + 'used to identify insertions.')) + + ins_options.add_argument( + '--merge_distance', + type=int, + default=None, + help=('Distance within which insertions (from same ' + 'sample) are merged.')) + + ins_options.add_argument( + '--min_support', + type=int, + default=2, + help='Minimum support for insertions.') @classmethod def _extract_args(cls, args): @@ -141,6 +290,8 @@ def run(self, read_path, output_dir, read2_path=None): ins_frame.to_csv(str(insertion_path), sep='\t', index=False) def _extract_genomic(self, read_path, output_dir, logger): + """Extracts the genomic part of sequence reads.""" + # Log parameters if logger is not None: logger.info('Extracting genomic sequences') @@ -228,6 +379,8 @@ def _extract_genomic(self, read_path, output_dir, logger): return genomic_path def _align(self, read_path, output_dir, logger): + """Aligns genomic reads to the reference genome using Bowtie.""" + # Log parameters if logger is not None: logger.info('Aligning to reference') @@ -252,6 +405,7 @@ def _align(self, read_path, output_dir, logger): def _process_alignment(aln): + """Analyzes an alignment to determine the transposon/linker breakpoints.""" ref = aln.reference_name if aln.is_reverse: @@ -267,7 +421,64 @@ def _process_alignment(aln): class MultiplexedShearSplinkPipeline(ShearSplinkPipeline): - """ShearSplink pipeline with multiplexed reads.""" + """ShearSplink pipeline supporting multiplexed reads. + + Analyzes multiplexed (single-end) sequencing data that was prepared using + the ShearSplink protocol. Sequence reads are expected to have the following + structure:: + + [Barcode][Transposon][Genomic][Linker] + + Here, the ``transposon``, ``genomic`` and ``linker`` sequences are the + same as for the ``ShearSplinkPipeline``. The ``barcode`` sequence is an + index that indicates which sample the read originated from. + + Barcode sequences should be provided using the ``barcode_path`` argument. + The optional ``barcode_mapping`` argument can be used to map barcodes to + sample names.
+ + Parameters + ---------- + transposon_path : Path + Path to the (flanking) transposon sequence (fasta). + bowtie_index_path : Path + Path to the bowtie index. + barcode_path : Path + Path to barcode sequences (fasta). + barcode_mapping : Path + Path to a tsv file specifying a mapping from barcodes to sample names. + Should contain ``sample`` and ``barcode`` columns. + linker_path : Path + Path to the linker sequence (fasta). + contaminant_path : Path + Path to file containing contaminant sequences (fasta). If provided, + sequences are filtered for these sequences before extracting genomic + sequences for alignment. + min_length : int + Minimum length for genomic reads to be kept for alignment. + min_support : int + Minimum support for insertions to be kept in the final output. + min_mapq : int + Minimum mapping quality of alignments to be used for + identifying insertions. + merge_distance : int + Maximum distance within which insertions are merged. Used to merge + insertions that occur within close vicinity, which is typically due + to slight variations in alignments. + bowtie_options : Dict[str, Any] + Dictionary of extra options for Bowtie. + min_overlaps : Dict[str, int] + Minimum overlap required to recognize the transposon, linker and + contaminant sequences (see Cutadapt's documentation for more + information). Keys of the dictionary indicate to which sequence the + overlap corresponds and should be one of the following: ``linker``, + ``transposon`` or ``contaminant``. + error_rates : Dict[str, float] + Maximum error rate to use when recognizing transposon, linker and + contaminant sequences (see Cutadapt's documentation for more + information). Keys should be the same as for ``min_overlaps``. + + """ def __init__(self, transposon_path, diff --git a/src/pyim/main/pyim_align.py b/src/pyim/main/pyim_align.py index 98ab05a..87f6986 100644 --- a/src/pyim/main/pyim_align.py +++ b/src/pyim/main/pyim_align.py @@ -1,3 +1,12 @@ +"""Script for the pyim-align command. + +The align command is responsible for extracting genomic reads from the +sequencing data, aligning these reads to the reference genome and extracting +insertion sites from these alignments. The command provides access to several +distinct pipelines, which perform these tasks for different types +of sequencing data. +""" + import argparse import logging diff --git a/src/pyim/main/pyim_merge.py b/src/pyim/main/pyim_merge.py index aa5d426..7e676e6 100644 --- a/src/pyim/main/pyim_merge.py +++ b/src/pyim/main/pyim_merge.py @@ -25,7 +25,7 @@ def main(): # Read and merge frames. if args.sample_names is None: - sample_names = [fp.stem for fp in args.insertions] + sample_names = [fp.parent.stem for fp in args.insertions] else: sample_names = args.sample_names From 6f068b035a0d4d342ffbfade5d1e0a790b33694f Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 10 May 2017 20:50:55 +0200 Subject: [PATCH 097/100] Remove external/cimpl. --- external/cimpl | 1 - 1 file changed, 1 deletion(-) delete mode 160000 external/cimpl diff --git a/external/cimpl b/external/cimpl deleted file mode 160000 index c4a6f8f..0000000 --- a/external/cimpl +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c4a6f8fa3eec85956ea72724abfb9405fe7d8d51 From a7a00c0196760c02246c936c6210d2766f26c2d5 Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 10 May 2017 20:53:14 +0200 Subject: [PATCH 098/100] Fix dev installation.
--- .travis.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 065135e..f9fbca8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,12 +24,11 @@ install: - source activate test # Install dependencies. - - conda install -c r -c bioconda r - - pip install .[test] + - conda install -c r -c bioconda r r-cimpl + - pip install .[dev] - # Install test dependencies. - -script: py.test --cov pyim --cov-report term-missing +script: + py.test tests --cov pyim --cov-report term-missing #after_success: # - coveralls From 1c88a17b4ac58c3245c9f3ba0737b4342270456d Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 10 May 2017 21:01:39 +0200 Subject: [PATCH 099/100] Add bumpversion config. --- .bumpversion.cfg | 12 ++++++++++++ setup.py | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 .bumpversion.cfg diff --git a/.bumpversion.cfg b/.bumpversion.cfg new file mode 100644 index 0000000..cc32225 --- /dev/null +++ b/.bumpversion.cfg @@ -0,0 +1,12 @@ +[bumpversion] +current_version = 0.2.0 + +[bumpversion:file:setup.py] + +[bumpversion:file:conda/meta.yaml] +search = version: {current_version} +replace = version: {new_version} + +[bumpversion:file:src/pyim/__init__.py] +search = __version__ = '{current_version}' +replace = __version__ = '{new_version}' diff --git a/setup.py b/setup.py index 45c118b..425cb90 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,8 @@ EXTRAS_REQUIRE = { 'dev': [ 'pytest', 'pytest-cov', 'pytest-mock', 'pytest-helpers-namespace', - 'python-coveralls', 'sphinx', 'sphinx-autobuild', 'sphinx_rtd_theme' + 'python-coveralls', 'sphinx', 'sphinx-autobuild', 'sphinx_rtd_theme', + 'bumpversion' ] } @@ -42,7 +43,6 @@ 'Intended Audience :: Science/Research', 'License :: OSI Approved :: MIT License', 'Natural Language :: English', - 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', ], From f805209076476e5a99c1ce2793e79e831ac0db6b Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Wed, 10 May 2017 21:08:21 +0200 Subject: [PATCH 100/100] Updated installation docs. --- docs/installation.rst | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/docs/installation.rst b/docs/installation.rst index e1c0844..75e6b8c 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -21,21 +21,6 @@ package, should be loadable in the default R installation. Using pip --------- -Stable release -~~~~~~~~~~~~~~ - -To install PyIM, run this command in your terminal: - -.. code-block:: console - - $ pip install pyim - -If you don't have `pip`_ installed, this `Python installation guide`_ can guide -you through the process. - -.. _pip: https://pip.pypa.io -.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ - From sources ~~~~~~~~~~~~ @@ -46,13 +31,12 @@ You can either clone the public repository: .. code-block:: console $ git clone git://github.com/jrderuiter/pyim - $ git checkout master Or download the `tarball`_: .. code-block:: console - $ curl -OL https://github.com/jrderuiter/pyim/tarball/master + $ curl -OL https://github.com/jrderuiter/pyim/tarball/develop Once you have a copy of the source, you can install it with: