diff --git a/.gitignore b/.gitignore index 6e3bdf9..290f917 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,6 @@ pyplink/version.py .coveragerc htmlcov +.ipynb_checkpoints + build diff --git a/.travis.yml b/.travis.yml index e335212..6da901f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,4 +3,5 @@ python: - "2.7" - "3.3" - "3.4" + - "3.5" script: "python setup.py test" diff --git a/README.mkd b/README.mkd index 80b6197..963532e 100644 --- a/README.mkd +++ b/README.mkd @@ -2,9 +2,9 @@ [![PyPI version](https://badge.fury.io/py/pyplink.svg)](http://badge.fury.io/py/pyplink) -# pyplink - Module to read binary files from Plink +# pyplink - Module to process Plink's binary files -`PyPlink` is a Python module to read binary Plink files. +`PyPlink` is a Python module to read and write Plink's binary files. ## Dependencies @@ -13,7 +13,8 @@ The tool requires a standard [Python](http://python.org/) installation (2.7 or 3.4) with the following modules: 1. [numpy](http://www.numpy.org/) version 1.8.2 or latest -1. [pandas](http://pandas.pydata.org/) version 0.14.1 or latest +2. [pandas](http://pandas.pydata.org/) version 0.14.1 or latest +3. [six](https://pythonhosted.org/six/) version 1.9.0 or latest The tool has been tested on *Linux* only, but should work on *MacOS* and *Windows* operating systems as well. @@ -34,8 +35,8 @@ conda install pyplink -c http://statgen.org/wp-content/uploads/Softwares/pyplink ``` It is possible to add the channel to conda's configuration, so that the -`-c http://statgen.org/...` can be omitted to update or install. Only perform -the following command: +`-c http://statgen.org/...` can be omitted to update or install the package. +To add the channel, perform the following command: ```bash conda config --add channels http://statgen.org/wp-content/uploads/Softwares/pyplink @@ -61,146 +62,6 @@ conda update pyplink -c http://statgen.org/wp-content/uploads/Softwares/pyplink ``` -## Example - -This example describe how to work with the `pyplink` module. - - -### Data description - -```python ->>> from pyplink import PyPlink ->>> pedfile = PyPlink("prefix") ->>> pedfile.get_nb_samples() -10656 ->>> pedfile.get_nb_markers() -141701 ->>> samples = pedfile.get_fam() ->>> samples.head() - fid iid father mother gender status -0 Sample_1 Sample_1 0 0 2 -9 -1 Sample_2 Sample_2 0 0 1 -9 -2 Sample_3 Sample_3 0 0 2 -9 -3 Sample_4 Sample_4 0 0 1 -9 -4 Sample_5 Sample_5 0 0 2 -9 ->>> all_markers = pedfile.get_bim() ->>> all_markers.head() - chrom pos a1 a2 -snp -exm2268640 1 762320 A G -exm47 1 865628 A G -exm53 1 865665 A G -exm55 1 865694 A G -exm56 1 865700 A G -``` - - -### Iterating over all markers - -Cycling through genotypes as `-1`, `0`, `1` and `2` values, where `-1` is -unknown, `0` is homozygous (major allele), `1` is heterozygous and `2` is -homozygous (minor allele). - -```python ->>> for marker_id, genotypes in pedfile: -... print(marker_id, genotypes) -... break -... -exm2268640 [0 0 0 ..., 0 0 0] ->>> for marker_id, genotypes in pedfile.iter_geno(): -... print(marker_id, genotypes) -... break -... -exm2268640 [0 0 0 ..., 0 0 0] -``` - -Cycling through genotypes as `A`, `C`, `G`, `T` values. - -```python ->>> for marker_id, genotypes in pedfile.iter_acgt_geno(): -... print(marker_id, genotypes) -... break -... -exm2268640 ['GG' 'GG' 'GG' ..., 'GG' 'GG' 'GG'] -``` - - -### Iterating over selected markers - -Cycling through genotypes as `-1`, `0`, `1` and `2` values. - -```python ->>> markers = ["exm47", "exm2253575", "exm269"] ->>> for marker_id, genotypes in pedfile.iter_geno_marker(markers): -... print(marker_id, genotypes) -... -exm47 [0 0 0 ..., 0 0 0] -exm2253575 [1 1 1 ..., 0 0 2] -exm269 [0 0 0 ..., 0 1 0] -``` - -Cycling through genotypes as `A`, `C`, `G`, `T` values. - -```python ->>> for marker_id, genotypes in pedfile.iter_acgt_geno_marker(markers): -... print(marker_id, genotypes) -... -exm47 ['GG' 'GG' 'GG' ..., 'GG' 'GG' 'GG'] -exm2253575 ['GA' 'GA' 'GA' ..., 'AA' 'AA' 'GG'] -exm269 ['GG' 'GG' 'GG' ..., 'GG' 'AG' 'GG'] -``` - - -### Getting a single marker - -To get the genotypes (as `-1`, `0`, `1` and `2` values) of a single marker: -```python ->>> pedfile.get_geno_marker("exm47") -[0 0 0 ..., 0 0 0] -``` - -To get the genotypes (as `A`, `C`, `G`, `T` values) of a single marker: -```python ->>> pedfile.get_acgt_geno_marker("exm47") -['GG' 'GG' 'GG' ..., 'GG' 'GG' 'GG'] -``` - - -### Misc example - -To get all markers on the Y chromosomes for the males. - -```python ->>> y_markers = all_markers[all_markers.chrom == 23].index.values ->>> males = samples.gender == 1 ->>> for marker_id, genotypes in pedfile.iter_geno_marker(y_markers): -... male_genotypes = genotypes[males.values] -... print("{:,d} total genotypes".format(len(genotypes))) -... print("{:,d} genotypes for {:,d} " -... "males".format(len(male_genotypes), males.sum())) -... break -... -10,656 total genotypes -6,297 genotypes for 6,297 males -``` - -To count the minor allele frequency. - -```python ->>> from collections import Counter ->>> markers = ["exm47", "exm2253575", "exm269"] ->>> for marker_id, genotypes in pedfile.iter_geno_marker(markers): -... geno_counter = Counter(genotypes) -... nb_alleles = sum(geno_counter[i] * 2 for i in range(3)) -... maf = (geno_counter[2] * 2 + geno_counter[1]) / nb_alleles -... print(marker_id, maf) -... -exm47 0.0070389488503050214 -exm2253575 0.3461719116956318 -exm269 0.05987799155326138 -``` - - ## Testing To test the module, just perform the following command: @@ -208,9 +69,16 @@ To test the module, just perform the following command: ```python >>> import pyplink >>> pyplink.test() -................. +....................... ---------------------------------------------------------------------- -Ran 17 tests in 0.060s +Ran 23 tests in 0.149s OK ``` + + +## Example + +The following +[notebook](http://nbviewer.ipython.org/github/lemieuxl/pyplink/blob/binary_write/demo/PyPlink%20Demo.ipynb) +contains a demonstration (for both Python 2 and 3) of the `PyPlink` module. diff --git a/conda_build.sh b/conda_build.sh new file mode 100755 index 0000000..bf5d9c3 --- /dev/null +++ b/conda_build.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Creating a directory for the skeleton +mkdir -p skeleton +pushd skeleton + +# Creating the skeleton +conda skeleton pypi pyplink + +# The different python versions and platforms +python_versions="2.7 3.3 3.4 3.5" +platforms="linux-32 linux-64 osx-64 win-32 win-64" + +# Building +for python_version in $python_versions +do + # Building + conda build --python $python_version pyplink &> log.txt + filename=$(egrep "^# [$] binstar upload \S+$" log.txt | cut -d " " -f 5) + + # Converting + for platform in $platforms + do + conda convert -p $platform $filename -o ../conda_dist + done +done + +popd +rm -rf skeleton + +# Indexing +pushd conda_dist +conda index * +popd diff --git a/demo/PyPlink Demo.ipynb b/demo/PyPlink Demo.ipynb new file mode 100644 index 0000000..e1b9694 --- /dev/null +++ b/demo/PyPlink Demo.ipynb @@ -0,0 +1,1231 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from __future__ import print_function, division" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# `PyPlink`\n", + "\n", + "`PyPlink` is a Python module to read and write binary Plink files. Here are small examples for `PyPlink`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from pyplink import PyPlink" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of contents\n", + "\n", + "* [**Reading binary pedfile**](#Reading-binary-pedfile)\n", + " * [Getting the demo data](#Getting-the-demo-data)\n", + " * [Reading the binary file](#Reading-the-binary-file)\n", + " * [Getting dataset information](#Getting-dataset-information)\n", + " * [Iterating over all markers](#Iterating-over-all-markers)\n", + " * [*Additive format*](#iterating_over_all_additive)\n", + " * [*Nucleotide format*](#iterating_over_all_nuc)\n", + " * [Iterating over selected markers](#Iterating-over-selected-markers)\n", + " * [*Additive format*](#iterating_over_selected_additive)\n", + " * [*Nucleotide format*](#iterating_over_selected_nuc)\n", + " * [Extracting a single marker](#Extracting-a-single-marker)\n", + " * [*Additive format*](#extracting_additive)\n", + " * [*Nucleotide format*](#extracting_nuc)\n", + " * [Misc example](#Misc-example)\n", + " * [*Extracting a subset of markers and samples*](#Extracting-a-subset-of-markers-and-samples)\n", + " * [*Counting the allele frequency of markers*](#Counting-the-allele-frequency-of-markers)\n", + "\n", + "\n", + "* [**Writing binary pedfile**](#Writing-binary-pedfile)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reading binary pedfile\n", + "\n", + "### Getting the demo data\n", + "\n", + "The [`Plink`](http://pngu.mgh.harvard.edu/~purcell/plink/) softwares provides a testing dataset on the [resources page](http://pngu.mgh.harvard.edu/~purcell/plink/res.shtml). It contains the 270 samples from the HapMap project (release 23) on build GRCh36/hg18." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import zipfile\n", + "try:\n", + " from urllib.request import urlretrieve\n", + "except ImportError:\n", + " from urllib import urlretrieve\n", + "\n", + "# Downloading the demo data from Plink webset\n", + "urlretrieve(\n", + " \"http://pngu.mgh.harvard.edu/~purcell/plink/dist/hapmap_r23a.zip\",\n", + " \"hapmap_r23a.zip\",\n", + ")\n", + "\n", + "# Extracting the archive content\n", + "with zipfile.ZipFile(\"hapmap_r23a.zip\", \"r\") as z:\n", + " z.extractall(\".\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reading the binary file\n", + "\n", + "To read a binary file, `PyPlink` only requires the prefix of the files." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pedfile = PyPlink(\"hapmap_r23a\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Getting dataset information" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "270 samples and 4,098,136 markers\n" + ] + } + ], + "source": [ + "print(\"{:,d} samples and {:,d} markers\".format(\n", + " pedfile.get_nb_samples(),\n", + " pedfile.get_nb_markers(),\n", + "))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fidiidfathermothergenderstatus
0NA18524NA18524001-9
1NA18526NA18526002-9
2NA18529NA18529002-9
3NA18532NA18532002-9
4NA18537NA18537002-9
\n", + "
" + ], + "text/plain": [ + " fid iid father mother gender status\n", + "0 NA18524 NA18524 0 0 1 -9\n", + "1 NA18526 NA18526 0 0 2 -9\n", + "2 NA18529 NA18529 0 0 2 -9\n", + "3 NA18532 NA18532 0 0 2 -9\n", + "4 NA18537 NA18537 0 0 2 -9" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_samples = pedfile.get_fam()\n", + "all_samples.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromposcma1a2
snp
rs1039974914516200C
rs294942014525700T
rs40303031724340AG
rs403030017251500C
rs385595217768900A
\n", + "
" + ], + "text/plain": [ + " chrom pos cm a1 a2\n", + "snp \n", + "rs10399749 1 45162 0 0 C\n", + "rs2949420 1 45257 0 0 T\n", + "rs4030303 1 72434 0 A G\n", + "rs4030300 1 72515 0 0 C\n", + "rs3855952 1 77689 0 0 A" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_markers = pedfile.get_bim()\n", + "all_markers.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Iterating over all markers\n", + "\n", + "\n", + "#### Additive format\n", + "\n", + "Cycling through genotypes as `-1`, `0`, `1` and `2` values, where `-1` is unknown, `0` is homozygous (major allele), `1` is heterozygous, and `2` is homozygous (minor allele)." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rs10399749\n", + "[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0\n", + " 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 -1\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 -1 0 0 0\n", + " 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 -1 0 0 0 0 0 0 0\n", + " -1 0 -1 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 -1 0 -1 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n" + ] + } + ], + "source": [ + "for marker_id, genotypes in pedfile:\n", + " print(marker_id)\n", + " print(genotypes)\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rs10399749\n", + "[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0\n", + " 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 -1\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 -1 0 0 0\n", + " 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 -1 0 0 0 0 0 0 0\n", + " -1 0 -1 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 -1 0 -1 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n" + ] + } + ], + "source": [ + "for marker_id, genotypes in pedfile.iter_geno():\n", + " print(marker_id)\n", + " print(genotypes)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "#### Nucleotide format\n", + "\n", + "Cycling through genotypes as `A`, `C`, `G` and `T` values (where `00` is unknown)." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false, + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rs10399749\n", + "['CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n", + " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n", + " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC'\n", + " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n", + " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n", + " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n", + " 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC'\n", + " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n", + " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC'\n", + " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n", + " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00'\n", + " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC'\n", + " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n", + " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n", + " 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n", + " '00' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' '00' 'CC' 'CC'\n", + " 'CC' 'CC' 'CC' '00' 'CC' '00' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC'\n", + " 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC' 'CC']\n" + ] + } + ], + "source": [ + "for marker_id, genotypes in pedfile.iter_acgt_geno():\n", + " print(marker_id)\n", + " print(genotypes)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Iterating over selected markers\n", + "\n", + "\n", + "#### Additive format\n", + "\n", + "Cycling through genotypes as `-1`, `0`, `1` and `2` values, where `-1` is unknown, `0` is homozygous (major allele), `1` is heterozygous, and `2` is homozygous (minor allele)." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false, + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rs7092431\n", + "[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "\n", + "rs9943770\n", + "[ 0 0 0 2 0 2 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0\n", + " 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1\n", + " 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0\n", + " 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0\n", + " 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 1 -1 0 0 0 1 2 1 1 0 1 1 2 0 0 0 1 0 0 0 0 1 1 0 2\n", + " 1 1 1 0 1 1 -1 1 0 0 1 0 0 2 0 1 2 0 1 1 1 2 0 1 0\n", + " 0 0 0 0 0 0 2 1 2 1 1 2 1 1 1 1 0 1 1 2 1 0 -1 0 1\n", + " 1 1 0 1 0 1 -1 2 1 0 0 0 1 1 1 2 0 0 1 1]\n", + "\n", + "rs1587483\n", + "[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 1 0 0 0 0 0 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "\n" + ] + } + ], + "source": [ + "markers = [\"rs7092431\", \"rs9943770\", \"rs1587483\"]\n", + "for marker_id, genotypes in pedfile.iter_geno_marker(markers):\n", + " print(marker_id)\n", + " print(genotypes, end=\"\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "#### Nucleotide format\n", + "\n", + "Cycling through genotypes as `A`, `C`, `G` and `T` values (where `00` is unknown)." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false, + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rs7092431\n", + "['GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' '00'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG']\n", + "\n", + "rs9943770\n", + "['AA' 'AA' 'AA' 'GG' 'AA' 'GG' 'AA' 'GA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'\n", + " 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'\n", + " 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'\n", + " 'GA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'GA' 'AA' 'AA' 'GA' 'GA' 'GA' 'AA' 'AA'\n", + " 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA'\n", + " 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA'\n", + " 'AA' 'GA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'\n", + " 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA'\n", + " 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'\n", + " 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA'\n", + " 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA'\n", + " 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GA' '00' 'AA' 'AA' 'AA'\n", + " 'GA' 'GG' 'GA' 'GA' 'AA' 'GA' 'GA' 'GG' 'AA' 'AA' 'AA' 'GA' 'AA' 'AA' 'AA'\n", + " 'AA' 'GA' 'GA' 'AA' 'GG' 'GA' 'GA' 'GA' 'AA' 'GA' 'GA' '00' 'GA' 'AA' 'AA'\n", + " 'GA' 'AA' 'AA' 'GG' 'AA' 'GA' 'GG' 'AA' 'GA' 'GA' 'GA' 'GG' 'AA' 'GA' 'AA'\n", + " 'AA' 'AA' 'AA' 'AA' 'AA' 'AA' 'GG' 'GA' 'GG' 'GA' 'GA' 'GG' 'GA' 'GA' 'GA'\n", + " 'GA' 'AA' 'GA' 'GA' 'GG' 'GA' 'AA' '00' 'AA' 'GA' 'GA' 'GA' 'AA' 'GA' 'AA'\n", + " 'GA' '00' 'GG' 'GA' 'AA' 'AA' 'AA' 'GA' 'GA' 'GA' 'GG' 'AA' 'AA' 'GA' 'GA']\n", + "\n", + "rs1587483\n", + "['GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' '00' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' '00' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'CG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' '00' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG'\n", + " 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG' 'GG']\n", + "\n" + ] + } + ], + "source": [ + "markers = [\"rs7092431\", \"rs9943770\", \"rs1587483\"]\n", + "for marker_id, genotypes in pedfile.iter_acgt_geno_marker(markers):\n", + " print(marker_id)\n", + " print(genotypes, end=\"\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extracting a single marker\n", + "\n", + "\n", + "#### Additive format\n", + "\n", + "Cycling through genotypes as `-1`, `0`, `1` and `2` values, where `-1` is unknown, `0` is homozygous (major allele), `1` is heterozygous, and `2` is homozygous (minor allele)." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0, 0, 0, 0, 0, -1, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n", + " -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n", + " -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n", + " -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n", + " -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n", + " -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n", + " -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n", + " -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n", + " -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n", + " -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n", + " -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], dtype=int8)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pedfile.get_geno_marker(\"rs7619974\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "#### Nucleotide format\n", + "\n", + "Cycling through genotypes as `A`, `C`, `G` and `T` values (where `00` is unknown)." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['CC', 'CC', 'CC', 'CC', 'CC', '00', 'CC', 'CC', '00', '00', 'CC',\n", + " 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',\n", + " 'CC', 'CC', '00', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',\n", + " '00', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',\n", + " 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',\n", + " 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',\n", + " 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',\n", + " 'CC', '00', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC', 'CC',\n", + " 'CC', 'CC', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00', '00', '00', '00', '00', '00',\n", + " '00', '00', '00', '00', '00', '00'], \n", + " dtype=' Note that `PyPlink` only writes the `BED` file. The user is required to create the `FAM` and `BIM` files." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# The genotypes for 3 markers and 10 samples\n", + "all_genotypes = [\n", + " [0, 0, 0, 1, 0, 0, -1, 2, 1, 0],\n", + " [0, 0, 1, 1, 0, 0, 0, 1, 2, 0],\n", + " [0, 0, 0, 0, 1, 1, 0, 0, 0, 1],\n", + "]\n", + "\n", + "# Writing the BED file using PyPlink\n", + "with PyPlink(\"test_output\", \"w\") as pedfile:\n", + " for genotypes in all_genotypes:\n", + " pedfile.write_marker(genotypes)\n", + "\n", + "# Writing a dummy FAM file\n", + "with open(\"test_output.fam\", \"w\") as fam_file:\n", + " for i in range(10):\n", + " print(\"family_{}\".format(i+1), \"sample_{}\".format(i+1), \"0\", \"0\", \"0\", \"-9\",\n", + " sep=\" \", file=fam_file)\n", + "\n", + "# Writing a dummy BIM file\n", + "with open(\"test_output.bim\", \"w\") as bim_file:\n", + " for i in range(3):\n", + " print(\"1\", \"marker_{}\".format(i+1), \"0\", i+1, \"A\", \"T\",\n", + " sep=\"\\t\", file=bim_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Checking the content of the newly created binary files\n", + "pedfile = PyPlink(\"test_output\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fidiidfathermothergenderstatus
0family_1sample_1000-9
1family_2sample_2000-9
2family_3sample_3000-9
3family_4sample_4000-9
4family_5sample_5000-9
5family_6sample_6000-9
6family_7sample_7000-9
7family_8sample_8000-9
8family_9sample_9000-9
9family_10sample_10000-9
\n", + "
" + ], + "text/plain": [ + " fid iid father mother gender status\n", + "0 family_1 sample_1 0 0 0 -9\n", + "1 family_2 sample_2 0 0 0 -9\n", + "2 family_3 sample_3 0 0 0 -9\n", + "3 family_4 sample_4 0 0 0 -9\n", + "4 family_5 sample_5 0 0 0 -9\n", + "5 family_6 sample_6 0 0 0 -9\n", + "6 family_7 sample_7 0 0 0 -9\n", + "7 family_8 sample_8 0 0 0 -9\n", + "8 family_9 sample_9 0 0 0 -9\n", + "9 family_10 sample_10 0 0 0 -9" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pedfile.get_fam()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chromposcma1a2
snp
marker_1110AT
marker_2120AT
marker_3130AT
\n", + "
" + ], + "text/plain": [ + " chrom pos cm a1 a2\n", + "snp \n", + "marker_1 1 1 0 A T\n", + "marker_2 1 2 0 A T\n", + "marker_3 1 3 0 A T" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pedfile.get_bim()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "marker_1 [ 0 0 0 1 0 0 -1 2 1 0]\n", + "marker_2 [0 0 1 1 0 0 0 1 2 0]\n", + "marker_3 [0 0 0 0 1 1 0 0 0 1]\n" + ] + } + ], + "source": [ + "for marker, genotypes in pedfile:\n", + " print(marker, genotypes)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The newly created binary files are compatible with Plink." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "@----------------------------------------------------------@\n", + "| PLINK! | v1.07 | 10/Aug/2009 |\n", + "|----------------------------------------------------------|\n", + "| (C) 2009 Shaun Purcell, GNU General Public License, v2 |\n", + "|----------------------------------------------------------|\n", + "| For documentation, citation & bug-report instructions: |\n", + "| http://pngu.mgh.harvard.edu/purcell/plink/ |\n", + "@----------------------------------------------------------@\n", + "\n", + "Skipping web check... [ --noweb ] \n", + "Writing this text to log file [ plink.log ]\n", + "Analysis started: Tue Nov 3 11:12:07 2015\n", + "\n", + "Options in effect:\n", + "\t--noweb\n", + "\t--bfile test_output\n", + "\t--freq\n", + "\n", + "Reading map (extended format) from [ test_output.bim ] \n", + "3 markers to be included from [ test_output.bim ]\n", + "Reading pedigree information from [ test_output.fam ] \n", + "10 individuals read from [ test_output.fam ] \n", + "0 individuals with nonmissing phenotypes\n", + "Assuming a disease phenotype (1=unaff, 2=aff, 0=miss)\n", + "Missing phenotype value is also -9\n", + "0 cases, 0 controls and 10 missing\n", + "0 males, 0 females, and 10 of unspecified sex\n", + "Warning, found 10 individuals with ambiguous sex codes\n", + "These individuals will be set to missing ( or use --allow-no-sex )\n", + "Writing list of these individuals to [ plink.nosex ]\n", + "Reading genotype bitfile from [ test_output.bed ] \n", + "Detected that binary PED file is v1.00 SNP-major mode\n", + "Before frequency and genotyping pruning, there are 3 SNPs\n", + "10 founders and 0 non-founders found\n", + "Writing allele frequencies (founders-only) to [ plink.frq ] \n", + "\n", + "Analysis finished: Tue Nov 3 11:12:10 2015\n", + "\n" + ] + } + ], + "source": [ + "from subprocess import Popen, PIPE\n", + "\n", + "# Computing frequencies\n", + "proc = Popen([\"plink\", \"--noweb\", \"--bfile\", \"test_output\", \"--freq\"],\n", + " stdout=PIPE, stderr=PIPE)\n", + "outs, errs = proc.communicate()\n", + "print(outs.decode(), end=\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " CHR SNP A1 A2 MAF NCHROBS\n", + " 1 marker_1 A T 0.2222 18\n", + " 1 marker_2 A T 0.25 20\n", + " 1 marker_3 A T 0.15 20\n" + ] + } + ], + "source": [ + "with open(\"plink.frq\", \"r\") as i_file:\n", + " print(i_file.read(), end=\"\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/pyplink/pyplink.py b/pyplink/pyplink.py index 16acf5b..be6db36 100644 --- a/pyplink/pyplink.py +++ b/pyplink/pyplink.py @@ -27,6 +27,11 @@ import os +try: + from itertools import zip_longest as zip_longest +except ImportError: + from itertools import izip_longest as zip_longest + import numpy as np import pandas as pd @@ -43,9 +48,10 @@ # The recoding values _geno_recode = {1: -1, # Unknown genotype - 2: 1, # Heterozygous genotype - 0: 2, # Homozygous A1 - 3: 0} # Homozygous A2 + 2: 1, # Heterozygous genotype + 0: 2, # Homozygous A1 + 3: 0} # Homozygous A2 +_byte_recode = dict(value[::-1] for value in _geno_recode.items()) class PyPlink(object): @@ -60,40 +66,58 @@ class PyPlink(object): dtype=np.int8, ) - def __init__(self, i_prefix): + def __init__(self, i_prefix, mode="r"): """Initializes a new PyPlink object. - :param i_prefix: the prefix of the binary Plink files - :type i_prefix: str + Args: + i_prefix (str): the prefix of the binary Plink files + mode (str): the open mode for the binary Plink file + nb_samples (int): the number of samples - Reads binary Plink files (BED, BIM and FAM) and create the objects. + Reads or write binary Plink files (BED, BIM and FAM). """ - # These are the name of the input files + # The mode + self._mode = mode + + # These are the name of the files self.bed_filename = "{}.bed".format(i_prefix) self.bim_filename = "{}.bim".format(i_prefix) self.fam_filename = "{}.fam".format(i_prefix) - # Checking that all the files exists (otherwise, there's nothing to do) - for filename in (self.bed_filename, self.bim_filename, - self.fam_filename): - if not os.path.isfile(filename): - raise IOError("No such file: '{}'".format(filename)) + if self._mode == "r": + # Checking that all the files exists (otherwise, there's nothing to + # do) + for filename in (self.bed_filename, self.bim_filename, + self.fam_filename): + if not os.path.isfile(filename): + raise IOError("No such file: '{}'".format(filename)) + + # Setting BIM and FAM to None + self._bim = None + self._fam = None + + # Reading the input files + self._read_bim() + self._read_fam() + self._read_bed() - # Setting BIM and FAM to None - self._bim = None - self._fam = None + # Where we're at + self._n = 0 - # Reading the input files - self._read_bim() - self._read_fam() - self._read_bed() + elif self._mode == "w": + # The dummy number of samples and bytes + self._nb_samples = None - # Where we're at - self._n = 0 + # Opening the output BED file + self._bed_file = open(self.bed_filename, "wb") + self._write_bed_header() + + else: + raise ValueError("invalid mode: '{}'".format(self._mode)) def __repr__(self): - """The __repr__ function.""" + """The representation of the PyPlink object.""" return "PyPlink({:,d} samples; {:,d} markers)".format( self.get_nb_samples(), self.get_nb_markers(), @@ -107,6 +131,16 @@ def __next__(self): """The __next__ function.""" return self.next() + def __enter__(self): + """Entering the context manager.""" + return self + + def __exit__(self, *args): + """Exiting the context manager.""" + if self._mode == "w": + # Closing the BED file + self._bed_file.close() + def next(self): """The next function.""" if self._n < self._nb_markers: @@ -129,17 +163,16 @@ def seek(self, n): def _read_bim(self): """Reads the BIM file.""" - # The original BIM columns - original_bim_cols = ["chrom", "snp", "cm", "pos", "a1", "a2"] - # Reading the BIM file and setting the values - bim = pd.read_csv(self.bim_filename, sep="\t", - names=original_bim_cols) + bim = pd.read_csv(self.bim_filename, delim_whitespace=True, + names=["chrom", "snp", "cm", "pos", "a1", "a2"]) - # The 'snp' should always be strings + # The 'snp', 'a1' and 'a2' columns should always be strings bim["snp"] = bim["snp"].astype(str) + bim["a1"] = bim["a1"].astype(str) + bim["a2"] = bim["a2"].astype(str) - bim = bim.set_index("snp", drop=False) + bim = bim.set_index("snp") bim["i"] = range(len(bim)) bim[2] = bim.a1 * 2 # Original '0' bim[1] = bim.a1 + bim.a2 # Original '2' @@ -168,16 +201,16 @@ def get_nb_markers(self): def _read_fam(self): """Reads the FAM file.""" - # The original FAM columns - self.original_fam_cols = ["fid", "iid", "father", "mother", "gender", - "status"] # Reading the FAM file and setting the values - fam = pd.read_csv(self.fam_filename, sep=" ", - names=self.original_fam_cols) + fam = pd.read_csv(self.fam_filename, delim_whitespace=True, + names=["fid", "iid", "father", "mother", "gender", + "status"]) - # 'fid' and 'iid' should always be strings (more logical that way) + # The sample IDs should always be strings (more logical that way) fam["fid"] = fam["fid"].astype(str) fam["iid"] = fam["iid"].astype(str) + fam["father"] = fam["father"].astype(str) + fam["mother"] = fam["mother"].astype(str) fam["byte"] = [ int(np.ceil((1 + 1) / 4.0)) - 1 for i in range(len(fam)) @@ -229,6 +262,11 @@ def _read_bed(self): # Saving the data in the object self._bed = data + def _write_bed_header(self): + """Writes the BED first 3 bytes.""" + # Writing the first three bytes + self._bed_file.write(bytearray((108, 27, 1))) + def iter_geno(self): """Iterates over genotypes.""" for i in range(len(self._bed)): @@ -307,3 +345,29 @@ def get_acgt_geno_marker(self, marker): geno = self._geno_values[self._bed[i]].flatten(order="C") return self._allele_encoding[i][geno[:self._nb_samples]] + + def write_marker(self, genotypes): + """Write genotypes to binary file.""" + # Initializing the number of samples if required + if self._nb_samples is None: + self._nb_samples = len(genotypes) + + # Checking the expected number of samples + if self._nb_samples != len(genotypes): + raise ValueError("{:,d} samples expected, got {:,d}".format( + self._nb_samples, + len(genotypes), + )) + + # Writing to file + byte_array = [ + g[0] | (g[1] << 2) | (g[2] << 4) | (g[3] << 6) for g in + self._grouper((_byte_recode[geno] for geno in genotypes), 4) + ] + self._bed_file.write(bytearray(byte_array)) + + @staticmethod + def _grouper(iterable, n, fillvalue=0): + """Collect data into fixed-length chunks or blocks.""" + args = [iter(iterable)] * n + return zip_longest(fillvalue=fillvalue, *args) diff --git a/pyplink/tests/test_pyplink.py b/pyplink/tests/test_pyplink.py index d7a2842..978a7ca 100644 --- a/pyplink/tests/test_pyplink.py +++ b/pyplink/tests/test_pyplink.py @@ -23,6 +23,8 @@ # THE SOFTWARE. +from __future__ import print_function + import os import random import shutil @@ -519,7 +521,95 @@ def test_get_acgt_geno_marker(self): # Asking for an unknown marker should raise an ValueError with self.assertRaises(ValueError) as cm: self.pedfile.get_acgt_geno_marker("dummy_marker") - self.assertEqual( - "dummy_marker: marker not in BIM", - str(cm.exception), - ) + self.assertEqual("dummy_marker: marker not in BIM", str(cm.exception)) + + def test_get_context_read_mode(self): + """Tests the PyPlink object as context manager.""" + with PyPlink(self.prefix) as genotypes: + self.assertEqual(3, len(genotypes.get_fam().head(n=3))) + + def test_invalid_mode(self): + """Tests invalid mode when PyPlink as context manager.""" + with self.assertRaises(ValueError) as cm: + with PyPlink(self.prefix, "u") as genotypes: + pass + self.assertEqual("invalid mode: 'u'", str(cm.exception)) + + def test_write_binary(self): + """Tests writing a Plink binary file.""" + # The expected genotypes + expected_genotypes = [ + np.array([0, 0, 0, 1, 0, 1, 2], dtype=int), + np.array([0, 0, 0, 0, -1, 0, 1], dtype=int), + np.array([0, -1, -1, 2, 0, 0, 0], dtype=int), + ] + + # The prefix + test_prefix = os.path.join(self.tmp_dir, "test_write") + + # Writing the binary file + with PyPlink(test_prefix, "w") as plink: + for genotypes in expected_genotypes: + plink.write_marker(genotypes) + + # Checking the file exists + self.assertTrue(os.path.isfile(test_prefix + ".bed")) + + # Writing the FAM file + with open(test_prefix + ".fam", "w") as o_file: + for i in range(7): + print("f{}".format(i+1), "s{}".format(i+1), "0", "0", + random.choice((1, 2)), "-9", sep=" ", file=o_file) + + # Writing the BIM file + with open(test_prefix + ".bim", "w") as o_file: + for i in range(3): + print(i+1, "m{}".format(i+1), "0", i+1, "T", "A", sep="\t", + file=o_file) + + # Reading the written binary file + plink = PyPlink(test_prefix) + for i, (marker, genotypes) in enumerate(plink): + self.assertEqual("m{}".format(i+1), marker) + self.assertTrue((expected_genotypes[i] == genotypes).all()) + + def test_write_binary_error(self): + """Tests writing a binary file, with different number of sample.""" + # The expected genotypes + expected_genotypes = [ + np.array([0, 0, 0, 1, 0, 1, 2], dtype=int), + np.array([0, 0, 0, 0, -1, 0], dtype=int), + np.array([0, -1, -1, 2, 0, 0, 0], dtype=int), + ] + + # The prefix + test_prefix = os.path.join(self.tmp_dir, "test_write") + + # Writing the binary file + with self.assertRaises(ValueError) as cm: + with PyPlink(test_prefix, "w") as plink: + for genotypes in expected_genotypes: + plink.write_marker(genotypes) + self.assertEqual("7 samples expected, got 6", str(cm.exception)) + + def test_grouper_padding(self): + """Tests the _grouper function (when padding is required).""" + expected_chunks = [ + (0, 1, 2), + (3, 4, 5), + (6, 7, 8), + (9, 0, 0), + ] + observed_chunks = PyPlink._grouper(range(10), 3) + for expected, observed in zip(expected_chunks, observed_chunks): + self.assertEqual(expected, observed) + + def test_grouper_no_padding(self): + """Tests the _grouper function (when padding is not required).""" + expected_chunks = [ + (0, 1, 2, 3, 4), + (5, 6, 7, 8, 9), + ] + observed_chunks = PyPlink._grouper(range(10), 5) + for expected, observed in zip(expected_chunks, observed_chunks): + self.assertEqual(expected, observed) diff --git a/setup.py b/setup.py index 4455fd5..990aef8 100644 --- a/setup.py +++ b/setup.py @@ -7,9 +7,7 @@ # - python setup.py bdist_wheel --universal # How to build for conda (do both with 2.7 and 3.4) -# - python setup.py bdist_conda -# - conda convert -p all /PATH/TO/FILE -o conda_dist -# - cd conda_dist && conda index * +# - bash conda_build.sh import os @@ -17,8 +15,8 @@ MAJOR = 1 -MINOR = 0 -MICRO = 2 +MINOR = 1 +MICRO = 0 VERSION = "{}.{}.{}".format(MAJOR, MINOR, MICRO)