Skip to content

Commit

Permalink
Merge pull request #83 from CSBiology/developer
Browse files Browse the repository at this point in the history
Sync master with developer for 1.0.03
  • Loading branch information
kMutagene authored Feb 26, 2020
2 parents 4126c85 + 670f49d commit af41feb
Show file tree
Hide file tree
Showing 29 changed files with 2,567 additions and 460 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,4 @@ docsrc/tools/FSharp.Formatting.svclog
docs
/temp
/pkg
.ionide
1 change: 1 addition & 0 deletions BioFSharp.sln
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "content", "content", "{8E6D
docsrc\content\AminoProperties.fsx = docsrc\content\AminoProperties.fsx
docsrc\content\BioCollections.fsx = docsrc\content\BioCollections.fsx
docsrc\content\BioContainers.fsx = docsrc\content\BioContainers.fsx
docsrc\content\BioContainersDesignGuide.fsx = docsrc\content\BioContainersDesignGuide.fsx
docsrc\content\BioDB.fsx = docsrc\content\BioDB.fsx
docsrc\content\BioID.fsx = docsrc\content\BioID.fsx
docsrc\content\BioItem.fsx = docsrc\content\BioItem.fsx
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ BioFSharp is an open source bioinformatics and computational biology toolbox wri


|Branch|Linux Mono (Xenial)|Linux .Net Core only (Bionic Beaver)|Windows|
|---|---|---|
|---|---|---|---|
| master | [![Build Status](https://travis-ci.com/CSBiology/BioFSharp.svg?branch=master)](https://travis-ci.com/CSBiology/BioFSharp) | [![Build status](https://ci.appveyor.com/api/projects/status/9a5r4aklmmbykobk/branch/master?svg=true)](https://ci.appveyor.com/project/kMutagene/biofsharp/branch/master) | [![Build status](https://ci.appveyor.com/api/projects/status/9a5r4aklmmbykobk/branch/master?svg=true)](https://ci.appveyor.com/project/kMutagene/biofsharp/branch/master) |
| developer | [![Build Status](https://travis-ci.com/CSBiology/BioFSharp.svg?branch=developer)](https://travis-ci.com/CSBiology/BioFSharp) | [![Build status](https://ci.appveyor.com/api/projects/status/9a5r4aklmmbykobk/branch/developer?svg=true)](https://ci.appveyor.com/project/kMutagene/biofsharp/branch/developer) |[![Build status](https://ci.appveyor.com/api/projects/status/9a5r4aklmmbykobk/branch/developer?svg=true)](https://ci.appveyor.com/project/kMutagene/biofsharp/branch/developer) |

Expand Down
10 changes: 10 additions & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
#### 1.0.03 - Wednesday, February 26, 2020
* **BioFSharp.Stats:**
* Massively improved SAILENT characterization speed for [preprocessing of large datasets](https://github.com/CSBiology/BioFSharp/pull/82)
* **BioFSharp.BioContainers:**
* [Improved DSL for Blast biocontainer API](https://github.com/CSBiology/BioFSharp/pull/83/commits/8e463f8cbc87797261520519b876b836d0b55bde)
* **BioFSharp.IO:**
* [Refactored SOFT Parser](https://github.com/CSBiology/BioFSharp/pull/83/commits/da0ba0cfa8807fad2032be66054125bd12f732c2) and [improved its namespace encapsulation](https://github.com/CSBiology/BioFSharp/pull/83/commits/eb389a908e7cee66f01616e62ef65df20fc88c6c).
* [Add prettyPrinters for the SOFT GSE/GPL type](https://github.com/CSBiology/BioFSharp/pull/83/commits/615dcbfd30632d6fdf7a70edae9f5227ce250937)


#### 1.0.02 - Wednesday, February 19, 2020
* **BioFSharp.BioDB:**
* Fix FaTool OData model URL
Expand Down
2 changes: 1 addition & 1 deletion build.fsx
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ Target.create "GitReleaseNuget" (fun _ ->
let tempNugetDir = "temp/nuget"
Shell.cleanDir tempNugetDir |> ignore
Git.Repository.cloneSingleBranch "" (gitHome + "/" + gitName + ".git") "nuget" tempNugetDir
let files = Directory.EnumerateFiles bin
let files = Directory.EnumerateFiles pkgDir
Shell.copy tempNugetDir files
Git.Staging.stageAll tempNugetDir
Git.Commit.exec tempNugetDir (sprintf "Update git nuget packages for version %s" release.NugetVersion)
Expand Down
300 changes: 300 additions & 0 deletions docsrc/content/BioContainersDesignGuide.fsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,300 @@
(*** hide ***)
#I @"../../bin/BioFSharp/net47/"
#I @"../../bin/BioFSharp.BioDB/net45/"
#I @"../../bin/BioFSharp.ImgP/net47"
#I @"../../bin/BioFSharp.IO/net47/"
#I @"../../bin/BioFSharp.Parallel/net47/"
#I @"../../bin/BioFSharp.Stats/net47/"
#I @"../../bin/BioFSharp.Vis/net47/"
#r @"../../lib/Formatting/FSharp.Plotly.dll"
#I @"../../bin/BioFSharp.BioContainers/net47/"
#r @"C:\Users\kevin\source\repos\CSBiology\BioFSharp\packages\Docker.DotNet\lib\netstandard2.0\Docker.DotNet.dll"
#r "BioFSharp.dll"
#r "BioFSharp.BioContainers.dll"

open BioFSharp.BioContainers
open BioFSharp.BioContainers.BioContainer
(**
Designing F# APIs for biocontainers
===================================
This page is a design suggestion for common coding practice for all biocontainer APIs used in this library.
Do not take it as ultima ratio, but as a guideline that is up for community discussion.
Biocontainer targets
--------------------
If you want to create an F# API for a bioinformatic tool, make sure to check if the tool is already containerized
in the [BioContainers registry](https://biocontainers.pro/#/registry) or [repository](https://github.com/BioContainers/containers). If not, make sure to create your own image according to the [BioContainers standards](https://biocontainers-edu.biocontainers.pro/en/latest/what_is_biocontainers.html).
Suggested BioContainer API design workflow
------------------------------------------
- **Pull or build the target container.**
- **Run the help command in the container**
That way you can check that the container is working and you get the full list of commands with additional information.
You can alternatively get the description of commands from the documentation of the containerized tool.
- **Create a DSL for the command line arguments as follows:**
- **Create functions to run commands in the container**
Here is an example walkthrough for the `makeblastdb` tool. The documentation used to create the DSL can be found [here](https://www.ncbi.nlm.nih.gov/books/NBK279684/)
The relevant arguments are:
![MakeBlastDBParams](img/MakeBlastDBParams.png)
###Use Discriminated Union types for all types in the domain.
The top level type should be named "[toolname]Params"
For arguments with various input ranges, choose a fitting type at will, e.g. `String` for a title, `int` for number of cores, etc. or use the type annotations from the tool's docs when provided
*)

/// Command line argument DSL for the NCBI makeblastdb tool.
type MakeBlastDbParams =
    /// Name of the input file or database.
    | Input of string

(**
###For flag type arguments, use typeless union labels
*)

/// Command line argument DSL for the NCBI makeblastdb tool.
type MakeBlastDbParams =
    /// Name of the input file or database.
    | Input of string
    /// Flag argument (carries no value): parse bar delimited sequence
    /// identifiers (e.g., gi|129295) in FASTA input.
    | ParseSeqIds

(**
###For arguments that can only have a defined set of values, use a dedicated discriminated union type
*)

/// The input file types accepted by makeblastdb.
type MakeBlastDBInputType =
    /// fasta: FASTA file(s)
    | Fasta
    /// blastdb: BLAST database(s)
    | Blastdb
    /// asn1_txt: Seq-entries in text ASN.1 format
    | Asn1Txt
    /// asn1_bin: Seq-entries in binary ASN.1 format
    | ASN1Bin

/// Command line argument DSL for the NCBI makeblastdb tool.
type MakeBlastDbParams =
    /// Name of the input file or database.
    | Input of string
    /// Type of the input file; restricted to the cases of MakeBlastDBInputType.
    | InputType of MakeBlastDBInputType
    /// Flag argument (carries no value): parse bar delimited sequence
    /// identifiers (e.g., gi|129295) in FASTA input.
    | ParseSeqIds


(**
###For each discriminated union type you have, create converter functions:
Sub union types:
- make: a function that returns the argument converted from DSL to string. This function can be used for creating commands usable in any local shell.
- makeWith: a function that returns the argument converted from DSL to string, using the `MountInfo.containerPathOf` function.
This function creates commands that are usable in a container with mounted directories and ensures access for the tool to your mounted input/output files. You only need this function for types that contain paths.
Top level union type:
- makeCmd: a function that returns the argument converted from DSL to string, preceded with the option indicator (e.g. --, -). This function can be used for creating commands usable in any local shell.
- makeCmdWith: a function that returns the argument converted from DSL to string, preceded with the option indicator (e.g. --, -), using the `MountInfo.containerPathOf` function.
This function creates commands that are usable in a container with mounted directories and ensures access for the tool to your mounted input/output files. You only need this function for types that contain paths.
When containing subunions, the converter for the top level union should call the converter for the subunion for argument string creation.
All return values of the top level union should be wrapped in a list.
*)

/// The input file types accepted by makeblastdb.
type MakeBlastDBInputType =
    /// fasta: FASTA file(s)
    | Fasta
    /// blastdb: BLAST database(s)
    | Blastdb
    /// asn1_txt: Seq-entries in text ASN.1 format
    | Asn1Txt
    /// asn1_bin: Seq-entries in binary ASN.1 format
    | ASN1Bin

    /// Converts an input type to the string value used on the command line.
    static member make =
        fun inputType ->
            match inputType with
            | Fasta -> "fasta"
            | Blastdb -> "blastdb"
            | Asn1Txt -> "asn1_txt"
            | ASN1Bin -> "asn1_bin"

/// Command line argument DSL for the NCBI makeblastdb tool.
type MakeBlastDbParams =
    /// Name of the input file or database.
    | Input of string
    /// Type of the input file; restricted to the cases of MakeBlastDBInputType.
    | InputType of MakeBlastDBInputType
    /// Flag argument (carries no value): parse bar delimited sequence
    /// identifiers (e.g., gi|129295) in FASTA input.
    | ParseSeqIds

    /// Converts a parameter to its command line tokens: the option name
    /// followed by its value, if it has one. Paths are passed through unchanged.
    static member makeCmd =
        fun param ->
            match param with
            | Input path -> [ "-in"; path ]
            | InputType it -> [ "-input_type"; MakeBlastDBInputType.make it ]
            | ParseSeqIds -> [ "-parse_seqids" ]

    /// Converts a parameter to its command line tokens like makeCmd, but runs
    /// every path through MountInfo.containerPathOf so it is adjusted for
    /// container localization.
    static member makeCmdWith (m: MountInfo) =
        // path adjusted for the mount described by m
        let inContainer path = MountInfo.containerPathOf m path

        fun param ->
            match param with
            | Input path -> [ "-in"; inContainer path ]
            | InputType it -> [ "-input_type"; MakeBlastDBInputType.make it ]
            | ParseSeqIds -> [ "-parse_seqids" ]


(**
###Create functions for in-container command execution
Always create a synchronous and asynchronous version:
*)

open BioFSharp.BioContainers
open BioFSharp.BioContainers.BioContainer

/// Runs makeblastdb with the given parameters in the given biocontainer context (asynchronous).
/// The parameters are printed to stdout before the returned computation is built.
let runMakeBlastDBAsync (bcContext:BioContainer.BcContext) (opt:MakeBlastDbParams list) =

    // one token list per parameter, with paths adjusted for the mount of the context
    let cmds = List.map (MakeBlastDbParams.makeCmdWith bcContext.Mount) opt

    // the complete command: the running command (most of the time the tool name)
    // followed by the tokens of all parameters
    let tp = "makeblastdb" :: List.concat cmds

    printfn "Starting process makeblastdb\r\nparameters:"
    for op in cmds do
        printfn "\t%s" (String.concat " " op)

    // execute the command in the container.
    // execAsync does not return the result, use execReturn for that.
    async { return! BioContainer.execAsync bcContext tp }

/// Synchronous version of runMakeBlastDBAsync: runs the asynchronous
/// computation and waits for it to finish.
let runMakeBlastDB (bcContext:BioContainer.BcContext) (opt:MakeBlastDbParams list) =
    Async.RunSynchronously (runMakeBlastDBAsync bcContext opt)


(**
Full example
------------
Here is the full DSL for makeblastdb:
*)


open BioFSharp.BioContainers
open BioFSharp.BioContainers.BioContainer

/// The input file types accepted by makeblastdb.
type MakeBlastDBInputType =
    /// fasta: FASTA file(s)
    | Fasta
    /// blastdb: BLAST database(s)
    | Blastdb
    /// asn1_txt: Seq-entries in text ASN.1 format
    | Asn1Txt
    /// asn1_bin: Seq-entries in binary ASN.1 format
    | ASN1Bin

    /// Converts an input type to the string value used on the command line.
    static member make =
        fun inputType ->
            match inputType with
            | Fasta -> "fasta"
            | Blastdb -> "blastdb"
            | Asn1Txt -> "asn1_txt"
            | ASN1Bin -> "asn1_bin"

/// Molecule type of the input; on the command line this is either nucl or prot.
type DbType =
    | Protein
    | Nucleotide

    /// Converts a molecule type to the string value used on the command line.
    static member make =
        fun dbType ->
            match dbType with
            | Protein -> "prot"
            | Nucleotide -> "nucl"

/// Command line argument DSL for the NCBI makeblastdb tool.
type MakeBlastDbParams =
    /// Name of the input file or database.
    | Input of string
    /// Type of the input file; restricted to the cases of MakeBlastDBInputType.
    | InputType of MakeBlastDBInputType
    /// Molecule type of the input, values can be nucl or prot.
    | DbType of DbType
    /// Title for the BLAST database. If not set, the input file name will be used.
    | Title of string
    /// Parse bar delimited sequence identifiers (e.g., gi|129295) in FASTA input.
    | ParseSeqIds
    /// Create an index of sequence hash values.
    | HashIndex
    /// Input files containing masking data as produced by NCBI masking applications
    /// (e.g. dustmasker, segmasker, windowmasker). Handed to the tool as a comma-separated list.
    | MaskData of string list
    /// Name of the BLAST database to be created. The input file name is used if none is
    /// provided. This field is required if the input consists of multiple files.
    | Output of string
    /// Maximum file size to use for the BLAST database. 4GB is the maximum supported by
    /// the database structure.
    | MaxFileSize of string
    /// Taxonomy ID to assign to all sequences.
    | TaxId of int
    /// File with two columns mapping sequence ID to the taxonomy ID.
    /// The first column is the sequence ID represented as one of:
    ///
    /// 1. fasta with accessions (e.g., emb|X17276.1|)
    ///
    /// 2. fasta with GI (e.g., gi|4)
    ///
    /// 3. GI as a bare number (e.g., 4)
    ///
    /// 4. A local ID. The local ID must be prefixed with "lcl" (e.g., lcl|4).
    ///
    /// The second column should be the NCBI taxonomy ID (e.g., 9606 for human).
    | TaxIdMapFile of string
    /// Program log file (default is stderr).
    | Logfile of string

    /// Converts a parameter to its command line tokens: the option name
    /// followed by its value, if it has one. Paths are passed through unchanged.
    static member makeCmd =
        fun param ->
            match param with
            | Input path -> [ "-in"; path ]
            | InputType it -> [ "-input_type"; MakeBlastDBInputType.make it ]
            | DbType dbt -> [ "-dbtype"; DbType.make dbt ]
            | Title t -> [ "-title"; t ]
            | ParseSeqIds -> [ "-parse_seqids" ]
            | HashIndex -> [ "-hash_index" ]
            | MaskData paths -> [ "-mask_data"; String.concat "," paths ]
            | Output path -> [ "-out"; path ]
            | MaxFileSize fs -> [ "-max_file_size"; fs ]
            | TaxId tid -> [ "-taxid"; sprintf "%i" tid ]
            | TaxIdMapFile path -> [ "-taxid_map"; path ]
            | Logfile path -> [ "-logfile"; path ]

    /// Converts a parameter to its command line tokens like makeCmd, but runs
    /// every path through MountInfo.containerPathOf so it is adjusted for
    /// container localization.
    static member makeCmdWith (m: MountInfo) =
        // path adjusted for the mount described by m
        let inContainer path = MountInfo.containerPathOf m path

        fun param ->
            match param with
            | Input path -> [ "-in"; inContainer path ]
            | InputType it -> [ "-input_type"; MakeBlastDBInputType.make it ]
            | DbType dbt -> [ "-dbtype"; DbType.make dbt ]
            | Title t -> [ "-title"; t ]
            | ParseSeqIds -> [ "-parse_seqids" ]
            | HashIndex -> [ "-hash_index" ]
            | MaskData paths -> [ "-mask_data"; paths |> List.map inContainer |> String.concat "," ]
            | Output path -> [ "-out"; inContainer path ]
            | MaxFileSize fs -> [ "-max_file_size"; fs ]
            | TaxId tid -> [ "-taxid"; sprintf "%i" tid ]
            | TaxIdMapFile path -> [ "-taxid_map"; inContainer path ]
            | Logfile path -> [ "-logfile"; inContainer path ]

/// Runs makeblastdb with the given parameters in the given biocontainer context (asynchronous).
/// The parameters are printed to stdout before the returned computation is built.
let runMakeBlastDBAsync (bcContext:BioContainer.BcContext) (opt:MakeBlastDbParams list) =

    // one token list per parameter, with paths adjusted for the mount of the context
    let cmds = List.map (MakeBlastDbParams.makeCmdWith bcContext.Mount) opt

    // the complete command: the tool name followed by the tokens of all parameters
    let tp = "makeblastdb" :: List.concat cmds

    printfn "Starting process makeblastdb\r\nparameters:"
    for op in cmds do
        printfn "\t%s" (String.concat " " op)

    // execute the command in the container
    async { return! BioContainer.execAsync bcContext tp }

/// Synchronous version of runMakeBlastDBAsync: runs the asynchronous
/// computation and waits for it to finish.
let runMakeBlastDB (bcContext:BioContainer.BcContext) (opt:MakeBlastDbParams list) =
    Async.RunSynchronously (runMakeBlastDBAsync bcContext opt)
Loading

0 comments on commit af41feb

Please sign in to comment.