diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..894a44c --- /dev/null +++ b/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..69f733f --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +The MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the 
following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + diff --git a/README.md b/README.md index 3e77309..6e50f21 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ git clone https://github.com/WGS-TB/PythonPRINCE.git To check PRINCE is installed properly run ``` -python Prince.py -to test_output.txt -tf sample_targets.txt +prince -to test_output.txt -tf sample_targets.txt ``` The output file should contain two rows with 24 random real numbers. @@ -48,7 +48,7 @@ Once you have your target file you can run PRINCE. Specify a target output file (eg. output.txt) with -to. If the file doesn't exist PRINCE will create one. ``` -python Prince.py -tf samples.txt -to output.txt +prince -tf samples.txt -to output.txt ``` Each line in output.txt will correspond to the predicted VNTR copy numbers for the corresponding sample in your target file. @@ -67,14 +67,14 @@ Once you have your altered genomes you can create simulated reads using your pre Create a separate training file for each copy number with the paths to all your genomes with that many copies at each VNTR region. Specify your training output file. 
``` -python Prince.py -bf training_samples_cn_1.txt -bo training_output.txt -cn 1 -python Prince.py -bf training_samples_cn_2.txt -bo training_output.txt -cn 2 -python Prince.py -bf training_samples_cn_3.txt -bo training_output.txt -cn 3 -python Prince.py -bf training_samples_cn_4.txt -bo training_output.txt -cn 4 +prince -bf training_samples_cn_1.txt -bo training_output.txt -cn 1 +prince -bf training_samples_cn_2.txt -bo training_output.txt -cn 2 +prince -bf training_samples_cn_3.txt -bo training_output.txt -cn 3 +prince -bf training_samples_cn_4.txt -bo training_output.txt -cn 4 ``` To use your new training data on your queries specifiy the training output file. ``` -python Prince.py -tf samples.txt -to output.txt -bo training_output.txt +prince -tf samples.txt -to output.txt -bo training_output.txt ``` ## Built With diff --git a/Prince.py b/bin/prince old mode 100644 new mode 100755 similarity index 77% rename from Prince.py rename to bin/prince index d560cd6..9383cdc --- a/Prince.py +++ b/bin/prince @@ -1,21 +1,23 @@ -from Bio import SeqIO -from Kmer_Generator import kmerGenerator -from boost import run_boosts -from query_sample import test_target +#!/usr/bin/env python + import argparse import warnings +from Bio import SeqIO +from prince.kmer_generator import kmerGenerator +from prince.boost import run_boosts +from prince.query_sample import test_target + DEFAULT_K = 9 DEFAULT_BOOST_OUTPUT = "training_data.txt" def main(): - parser = argparse.ArgumentParser(description='Prince Options.') parser.add_argument('-bo', '--boost_output', default=DEFAULT_BOOST_OUTPUT, - help="output file for training data / training data used to predict copy numbers for queries") + help="output file for training data / training data used to predict copy numbers for queries") parser.add_argument('-to', '--target_output', default="results/predictions.csv", - help="output file for query copy number predictions") + help="output file for query copy number predictions") 
parser.add_argument('-tmp','--templates', default="templates.fasta", help="VNTR templates. Default is for M.TB") parser.add_argument('-tf', '--target_file', default=None, @@ -31,7 +33,7 @@ def main(): #Safety check: if prince_options.k != DEFAULT_K and prince_options.boost_output == DEFAULT_BOOST_OUTPUT: - warnings.warn("Warning: Target kmer size does not equal training settings. May lead to inaccurate predictions.") + warnings.warn("Warning: Target kmer size does not equal training settings. May lead to inaccurate predictions.") #Template data initialized @@ -49,4 +51,4 @@ def main(): test_target(prince_options, templates, templateNames,templateKmers) if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/Plots.ipynb b/docs/Plots.ipynb similarity index 100% rename from Plots.ipynb rename to docs/Plots.ipynb diff --git a/Testing_of_related_methods.ipynb b/docs/Testing_of_related_methods.ipynb similarity index 100% rename from Testing_of_related_methods.ipynb rename to docs/Testing_of_related_methods.ipynb diff --git a/prince/__init__.py b/prince/__init__.py new file mode 100644 index 0000000..c57bfd5 --- /dev/null +++ b/prince/__init__.py @@ -0,0 +1 @@ +__version__ = '0.0.0' diff --git a/boost.py b/prince/boost.py similarity index 93% rename from boost.py rename to prince/boost.py index cf4d098..7d7eaa3 100644 --- a/boost.py +++ b/prince/boost.py @@ -1,4 +1,4 @@ -from match_score import compute_match_score +from prince.match_score import compute_match_score def run_boosts(opts,templates,templateNames,templateKmers): with open(opts.boosting_file) as file: diff --git a/COARSE_filtering.py b/prince/coarse_filtering.py similarity index 100% rename from COARSE_filtering.py rename to prince/coarse_filtering.py diff --git a/FINE_filtering.py b/prince/fine_filtering.py similarity index 100% rename from FINE_filtering.py rename to prince/fine_filtering.py diff --git a/Kmer_Generator.py b/prince/kmer_generator.py similarity index 100% rename 
from Kmer_Generator.py rename to prince/kmer_generator.py diff --git a/match_score.py b/prince/match_score.py similarity index 93% rename from match_score.py rename to prince/match_score.py index 57b3415..7317eac 100644 --- a/match_score.py +++ b/prince/match_score.py @@ -1,6 +1,6 @@ from Bio import SeqIO -from COARSE_filtering import coarse_filtering -from FINE_filtering import fine_filtering +from prince.coarse_filtering import coarse_filtering +from prince.fine_filtering import fine_filtering from itertools import chain def check_file_exists(itr8tr): @@ -39,4 +39,4 @@ def compute_match_score(genome, templates, templateKmers, kmerLength): #Normalize score by adjusting for coverage matchScore = [t/coverage for t in matchScore] - return matchScore \ No newline at end of file + return matchScore diff --git a/predict.py b/prince/predict.py similarity index 100% rename from predict.py rename to prince/predict.py diff --git a/query_sample.py b/prince/query_sample.py similarity index 90% rename from query_sample.py rename to prince/query_sample.py index 3bc15f1..b27b5f1 100644 --- a/query_sample.py +++ b/prince/query_sample.py @@ -1,6 +1,6 @@ from math import sqrt -from predict import get_data, get_equations, get_copy_number -from match_score import compute_match_score +from prince.predict import get_data, get_equations, get_copy_number +from prince.match_score import compute_match_score import time def test_target(opts, templates,templateNames, templateKmers): diff --git a/medium_test1.fq b/prince/tests/data/medium_test1.fq similarity index 100% rename from medium_test1.fq rename to prince/tests/data/medium_test1.fq diff --git a/medium_test2.fq b/prince/tests/data/medium_test2.fq similarity index 100% rename from medium_test2.fq rename to prince/tests/data/medium_test2.fq diff --git a/small_test1.fastq b/prince/tests/data/small_test1.fastq similarity index 100% rename from small_test1.fastq rename to prince/tests/data/small_test1.fastq diff --git a/small_test2.fastq 
b/prince/tests/data/small_test2.fastq similarity index 100% rename from small_test2.fastq rename to prince/tests/data/small_test2.fastq diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6660df3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +biopython +numpy +scipy diff --git a/sample_targets.txt b/sample_targets.txt index c1f407e..2d06dc6 100644 --- a/sample_targets.txt +++ b/sample_targets.txt @@ -1,2 +1,2 @@ -small_test -medium_test +prince/tests/data/small_test +prince/tests/data/medium_test diff --git a/Inject_repeats.py b/scripts/Inject_repeats.py similarity index 100% rename from Inject_repeats.py rename to scripts/Inject_repeats.py diff --git a/incomplete_repeats.py b/scripts/incomplete_repeats.py similarity index 100% rename from incomplete_repeats.py rename to scripts/incomplete_repeats.py diff --git a/plots.py b/scripts/plots.py similarity index 100% rename from plots.py rename to scripts/plots.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..c026133 --- /dev/null +++ b/setup.py @@ -0,0 +1,36 @@ +# Import setup() from setuptools: install_requires, tests_require, test_suite and +# include_package_data are setuptools options, and distutils is deprecated (PEP 632). +from setuptools import find_packages, setup + +from prince import __version__ + +classifiers = """ +Development Status :: 4 - Beta +Environment :: Console +License :: OSI Approved :: MIT License +Intended Audience :: Science/Research +Topic :: Scientific/Engineering +Topic :: Scientific/Engineering :: Bio-Informatics +Programming Language :: Python :: 2.7 +Operating System :: POSIX :: Linux +""".strip().split('\n') + +setup(name='prince', + version=__version__, + description='PRINCE estimates Variable Number Tandem Repeats (VNTR) copy number from raw next generation sequencing (NGS) data.', + author='Julius Booth, Margaryta Vityaz, Merhdad Mansouri, Leonid Chindelevitch', + author_email='', + url='https://github.com/WGS-TB/PythonPRINCE', + license='MIT', + classifiers=classifiers, + install_requires=[ + 'biopython', + 'scipy', + 'numpy' + ], +
test_suite='nose.collector', + tests_require=['nose'], + packages=find_packages(), + include_package_data=True, + scripts=['bin/prince'] +)