diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 0000000..e69de29
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..72f1eba
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,93 @@
+# libicu66 and libssl1.1 (installed below) are specific to one Ubuntu release, so the
+# base image must not float on :latest. Override BASE_TAG to move to another base.
+# NOTE(review): ubuntu-20.04 is assumed to be a published tag of this image - confirm,
+# or pin a digest for fully reproducible builds.
+ARG BASE_TAG=ubuntu-20.04
+FROM jupyter/base-notebook:${BASE_TAG}
+
+# Account the notebook server runs as; both values can be overridden with --build-arg.
+ARG NB_USER=fsdocs-user
+ARG NB_UID=1000
+ENV USER=${NB_USER} \
+    NB_UID=${NB_UID} \
+    HOME=/home/${NB_USER}
+
+WORKDIR ${HOME}
+
+USER root
+
+ENV \
+    # Enable detection of running in a container
+    DOTNET_RUNNING_IN_CONTAINER=true \
+    # Enable correct mode for dotnet watch (only mode supported in a container)
+    DOTNET_USE_POLLING_FILE_WATCHER=true \
+    # Skip extraction of XML docs - generally not useful within an image/container - helps performance
+    NUGET_XMLDOC_MODE=skip \
+    # Opt out of telemetry until after we install jupyter when building the image, this prevents caching of machine id
+    DOTNET_INTERACTIVE_CLI_TELEMETRY_OPTOUT=true
+
+# Install curl (used below to download the SDK) and the .NET CLI dependencies.
+# "apt-get update" and "apt-get install" share one RUN so a cached, stale package
+# index is never reused, and the index is deleted in the same layer.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        libc6 \
+        libgcc1 \
+        libgssapi-krb5-2 \
+        libicu66 \
+        libssl1.1 \
+        libstdc++6 \
+        zlib1g \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install .NET Core SDK
+
+# When updating the SDK version, the sha512 value a few lines down must also be updated.
+ENV DOTNET_SDK_VERSION=5.0.101
+
+# -f makes curl fail on an HTTP error instead of saving the error page as the archive.
+RUN curl -fSL --output dotnet.tar.gz "https://dotnetcli.azureedge.net/dotnet/Sdk/${DOTNET_SDK_VERSION}/dotnet-sdk-${DOTNET_SDK_VERSION}-linux-x64.tar.gz" \
+    && dotnet_sha512='398d88099d765b8f5b920a3a2607c2d2d8a946786c1a3e51e73af1e663f0ee770b2b624a630b1bec1ceed43628ea8bc97963ba6c870d42bec064bde1cd1c9edb' \
+    && echo "$dotnet_sha512  dotnet.tar.gz" | sha512sum -c - \
+    && mkdir -p /usr/share/dotnet \
+    && tar -ozxf dotnet.tar.gz -C /usr/share/dotnet \
+    && rm dotnet.tar.gz \
+    && ln -s /usr/share/dotnet/dotnet /usr/bin/dotnet \
+    # Trigger first run experience by running arbitrary cmd
+    && dotnet help
+
+# Copy notebooks
+COPY ./ ${HOME}/notebooks/
+
+# Copy package sources
+COPY ./NuGet.config ${HOME}/nuget.config
+
+# Hand everything created under HOME so far (as root) to the notebook user.
+RUN chown -R ${NB_UID} ${HOME}
+
+# Switch by UID, not by name: nothing in this file creates an account called
+# ${NB_USER}, so "USER ${USER}" only works when the base image happens to have one.
+USER ${NB_UID}
+
+# Install nteract
+RUN pip install --no-cache-dir nteract_on_jupyter
+
+# Install latest build from master branch of Microsoft.DotNet.Interactive
+RUN dotnet tool install -g Microsoft.dotnet-interactive --add-source "https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-tools/nuget/v3/index.json"
+
+# Latest stable from nuget.org
+#RUN dotnet tool install -g Microsoft.dotnet-interactive --add-source "https://api.nuget.org/v3/index.json"
+
+ENV PATH="${PATH}:${HOME}/.dotnet/tools"
+RUN echo "$PATH"
+
+# Install kernel specs
+RUN dotnet interactive jupyter install
+
+# Enable telemetry once we install jupyter for the image
+ENV DOTNET_INTERACTIVE_CLI_TELEMETRY_OPTOUT=false
+
+# Set root to notebooks
+WORKDIR ${HOME}/notebooks/
diff --git a/NuGet.config b/NuGet.config
new file mode 100644
index 0000000..cf1ace5
--- /dev/null
+++ b/NuGet.config
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/content/fsdocs-custom.css b/content/fsdocs-custom.css
new file mode 100644
index 0000000..c32465d
--- /dev/null
+++ b/content/fsdocs-custom.css
@@ -0,0 +1,5 @@
+
+/*--------------------------------------------------------------------------
+  Customize your CSS here
+/*--------------------------------------------------------------------------*/
+
diff --git a/content/fsdocs-default.css b/content/fsdocs-default.css
new file mode 100644
index 0000000..68b1313
--- /dev/null
+++ b/content/fsdocs-default.css
@@ -0,0 +1,605 @@
+@import url('https://fonts.googleapis.com/css2?family=Hind+Vadodara&family=Roboto+Mono&display=swap');
+/*--------------------------------------------------------------------------
+  Formatting for page & standard document content
+/*--------------------------------------------------------------------------*/
+
+body {
+    font-family: 'Hind Vadodara', sans-serif;
+    /* padding-top: 0px;
+    padding-bottom: 40px;
+*/
+}
+
+blockquote {
+    margin: 0 1em 0 0.25em;
+    margin-top: 0px;
+    margin-right: 1em;
+    margin-bottom: 0px;
+    margin-left: 0.25em;
+    padding: 0 .75em 0 1em;
+    border-left: 1px solid #777;
+    border-right: 0px solid #777;
+}
+
+/* Format the heading - nicer spacing etc. */
+.masthead {
+    overflow: hidden;
+}
+
+.masthead .muted a {
+    text-decoration: none;
+    color: #999999;
+}
+
+.masthead ul, .masthead li {
+    margin-bottom: 0px;
+}
+
+.masthead .nav li {
+    margin-top: 15px;
+    font-size: 110%;
+}
+
+.masthead h3 {
+    margin-top: 15px;
+    margin-bottom: 5px;
+    font-size: 170%;
+}
+
+/*--------------------------------------------------------------------------
+  Formatting fsdocs-content
+/*--------------------------------------------------------------------------*/
+
+/* Change font sizes for headings etc. */
+#fsdocs-content h1 {
+    margin: 30px 0px 15px 0px;
+    /* font-weight: 400; */
+    font-size: 2rem;
+    letter-spacing: 1.78px;
+    line-height: 2.5rem;
+    font-weight: 400;
+}
+
+#fsdocs-content h2 {
+    font-size: 1.6rem;
+    margin: 20px 0px 10px 0px;
+    font-weight: 400;
+}
+
+#fsdocs-content h3 {
+    font-size: 1.2rem;
+    margin: 15px 0px 10px 0px;
+    font-weight: 400;
+}
+
+#fsdocs-content hr {
+    margin: 0px 0px 20px 0px;
+}
+
+#fsdocs-content li {
+    font-size: 1.0rem;
+    line-height: 1.375rem;
+    letter-spacing: 0.01px;
+    font-weight: 500;
+    margin: 0px 0px 15px 0px;
+}
+
+#fsdocs-content p {
+    font-size: 1.0rem;
+    line-height: 1.375rem;
+    letter-spacing: 0.01px;
+    font-weight: 500;
+    color: #262626;
+}
+
+#fsdocs-content a {
+    color: #4974D1;
+}
+/* remove the default bootstrap bold on dt elements */
+#fsdocs-content dt {
+    font-weight: normal;
+}
+
+
+
+/*--------------------------------------------------------------------------
+  Formatting tables in fsdocs-content, using docs.microsoft.com tables
+/*--------------------------------------------------------------------------*/
+
+#fsdocs-content .table {
+    table-layout: auto;
+    width: 100%;
+    font-size: 0.875rem;
+}
+
+#fsdocs-content .table caption {
+    font-size: 0.8rem;
+    font-weight: 600;
+    letter-spacing: 2px;
+    text-transform: uppercase;
+    padding: 1.125rem;
+    border-width: 0 0 1px;
+    border-style: solid;
+    border-color: #e3e3e3;
+    text-align: right;
+}
+
+#fsdocs-content .table td,
+#fsdocs-content .table th {
+    display: table-cell;
+    word-wrap: break-word;
+    padding: 0.75rem 1rem 0.75rem 0rem;
+    line-height: 1.5;
+    vertical-align: top;
+    border-top: 1px solid #e3e3e3;
+    border-right: 0;
+    border-left: 0;
+    border-bottom: 0;
+    border-style: solid;
+}
+
+/* suppress the top line on inner lists such as tables of exceptions */
+#fsdocs-content .table .fsdocs-exception-list td,
+#fsdocs-content .table .fsdocs-exception-list th {
+    border-top: 0
+}
+
+#fsdocs-content .table td p:first-child,
+#fsdocs-content .table th p:first-child {
+    margin-top: 0;
+}
+
+#fsdocs-content .table td.nowrap,
+#fsdocs-content .table th.nowrap {
+    white-space: nowrap;
+}
+
+#fsdocs-content .table td.is-narrow,
+#fsdocs-content .table th.is-narrow {
+    width: 15%;
+}
+
+#fsdocs-content .table th:not([scope='row']) {
+    border-top: 0;
+    border-bottom: 1px;
+}
+
+#fsdocs-content .table > caption + thead > tr:first-child > td,
+#fsdocs-content .table > colgroup + thead > tr:first-child > td,
+#fsdocs-content .table > thead:first-child > tr:first-child > td {
+    border-top: 0;
+}
+
+#fsdocs-content .table.table-striped > tbody > tr:nth-of-type(odd) {
+    background-color: var(--box-shadow-light);
+}
+
+#fsdocs-content .table.min {
+    width: unset;
+}
+
+#fsdocs-content .table.is-left-aligned td:first-child,
+#fsdocs-content .table.is-left-aligned th:first-child {
+    padding-left: 0;
+}
+
+#fsdocs-content .table.is-left-aligned td:first-child a,
+#fsdocs-content .table.is-left-aligned th:first-child a {
+    outline-offset: -0.125rem;
+}
+
+@media screen and (max-width: 767px), screen and (min-resolution: 120dpi) and (max-width: 767.9px) {
+    #fsdocs-content .table.is-stacked-mobile td:nth-child(1) {
+        display: block;
+        width: 100%;
+        padding: 1rem 0;
+    }
+
+    #fsdocs-content .table.is-stacked-mobile td:not(:nth-child(1)) {
+        display: block;
+        border-width: 0;
+        padding: 0 0 1rem;
+    }
+}
+
+#fsdocs-content .table.has-inner-borders th,
+#fsdocs-content .table.has-inner-borders td {
+    border-right: 1px solid #e3e3e3;
+}
+
+#fsdocs-content .table.has-inner-borders th:last-child,
+#fsdocs-content .table.has-inner-borders td:last-child {
+    border-right: none;
+}
+
+.fsdocs-entity-list .fsdocs-entity-name {
+    width: 25%;
+    font-weight: bold;
+}
+
+.fsdocs-member-list .fsdocs-member-usage {
+    width: 35%;
+}
+
+/*--------------------------------------------------------------------------
+  Formatting xmldoc sections in fsdocs-content
+/*--------------------------------------------------------------------------*/
+
+.fsdocs-xmldoc, .fsdocs-entity-xmldoc, .fsdocs-member-xmldoc {
+    font-size: 1.0rem;
+    line-height: 1.375rem;
+    letter-spacing: 0.01px;
+    font-weight: 500;
+    color: #262626;
+}
+
+.fsdocs-xmldoc h1 {
+    font-size: 1.2rem;
+    margin: 10px 0px 0px 0px;
+}
+
+.fsdocs-xmldoc h2 {
+    font-size: 1.2rem;
+    margin: 10px 0px 0px 0px;
+}
+
+.fsdocs-xmldoc h3 {
+    font-size: 1.1rem;
+    margin: 10px 0px 0px 0px;
+}
+
+/* #fsdocs-nav .searchbox {
+    margin-top: 30px;
+    margin-bottom: 30px;
+} */
+
+#fsdocs-nav img.logo{
+    width:90%;
+    /* height:140px; */
+    /* margin:10px 0px 0px 20px; */
+    margin-top:40px;
+    border-style:none;
+}
+
+#fsdocs-nav input{
+    /* margin-left: 20px; */
+    margin-right: 20px;
+    margin-top: 20px;
+    margin-bottom: 20px;
+    width: 93%;
+    -webkit-border-radius: 0;
+    border-radius: 0;
+}
+
+#fsdocs-nav {
+    /* margin-left: -5px; */
+    /* width: 90%; */
+    font-size:0.95rem;
+}
+
+#fsdocs-nav li.nav-header{
+    /* margin-left: -5px; */
+    /* width: 90%; */
+    padding-left: 0;
+    color: #262626;
+    text-transform: none;
+    font-size:16px;
+    margin-top: 9px;
+    font-weight: bold;
+}
+
+#fsdocs-nav a{
+    padding-left: 0;
+    color: #6c6c6d;
+    /* margin-left: 5px; */
+    /* width: 90%; */
+}
+
+/*--------------------------------------------------------------------------
+  Formatting pre and code sections in fsdocs-content (code highlighting is
+  further below)
+/*--------------------------------------------------------------------------*/
+
+#fsdocs-content code {
+    /* font-size: 0.83rem; */
+    font: 0.85rem 'Roboto Mono', monospace;
+    background-color: #f7f7f900;
+    border: 0px;
+    padding: 0px;
+    /* word-wrap: break-word; */
+    /* white-space: pre; */
+}
+
+/* omitted */
+#fsdocs-content span.omitted {
+    background: #3c4e52;
+    border-radius: 5px;
+    color: #808080;
+    padding: 0px 0px 1px 0px;
+}
+
+#fsdocs-content pre .fssnip code {
+    font: 0.86rem 'Roboto Mono', monospace;
+}
+
+#fsdocs-content table.pre,
+#fsdocs-content pre.fssnip,
+#fsdocs-content pre {
+    line-height: 13pt;
+    border: 0px solid #d8d8d8;
+    border-top: 0px solid #e3e3e3;
+    border-collapse: separate;
+    white-space: pre;
+    font: 0.86rem 'Roboto Mono', monospace;
+    width: 100%;
+    margin: 10px 0px 20px 0px;
+    background-color: #f3f4f7;
+    padding: 10px;
+    border-radius: 5px;
+    color: #8e0e2b;
+    max-width: none;
+    box-sizing: border-box;
+}
+
+#fsdocs-content pre.fssnip code {
+    font: 0.86rem 'Roboto Mono', monospace;
+    font-weight: 600;
+}
+
+#fsdocs-content table.pre {
+    background-color: #fff7ed;
+}
+
+#fsdocs-content table.pre pre {
+    padding: 0px;
+    margin: 0px;
+    border-radius: 0px;
+    width: 100%;
+    background-color: #fff7ed;
+    color: #837b79;
+}
+
+#fsdocs-content table.pre td {
+    padding: 0px;
+    white-space: normal;
+    margin: 0px;
+    width: 100%;
+}
+
+#fsdocs-content table.pre td.lines {
+    width: 30px;
+}
+
+
+#fsdocs-content pre {
+    word-wrap: inherit;
+}
+
+.fsdocs-example-header {
+    font-size: 1.0rem;
+    line-height: 1.375rem;
+    letter-spacing: 0.01px;
+    font-weight: 700;
+    color: #262626;
+}
+
+/*--------------------------------------------------------------------------
+  Formatting github source links
+/*--------------------------------------------------------------------------*/
+
+.fsdocs-source-link {
+    float: right;
+    text-decoration: none;
+}
+
+.fsdocs-source-link img {
+    border-style: none;
+    margin-left: 10px;
+    width: auto;
+    height: 1.4em;
+}
+
+.fsdocs-source-link .hover {
+    display: none;
+}
+
+.fsdocs-source-link:hover .hover {
+    display: block;
+}
+
+.fsdocs-source-link .normal {
+    display: block;
+}
+
+.fsdocs-source-link:hover .normal {
+    display: none;
+}
+
+/*--------------------------------------------------------------------------
+  Formatting logo
+/*--------------------------------------------------------------------------*/
+
+#fsdocs-logo {
+    width:140px;
+    height:140px;
+    margin:10px 0px 0px 0px;
+    border-style:none;
+}
+
+/*--------------------------------------------------------------------------
+
+/*--------------------------------------------------------------------------*/
+
+#fsdocs-content table.pre pre {
+    padding: 0px;
+    margin: 0px;
+    border: none;
+}
+
+/*--------------------------------------------------------------------------
+  Remove formatting from links
+/*--------------------------------------------------------------------------*/
+
+#fsdocs-content h1 a,
+#fsdocs-content h1 a:hover,
+#fsdocs-content h1 a:focus,
+#fsdocs-content h2 a,
+#fsdocs-content h2 a:hover,
+#fsdocs-content h2 a:focus,
+#fsdocs-content h3 a,
+#fsdocs-content h3 a:hover,
+#fsdocs-content h3 a:focus,
+#fsdocs-content h4 a,
+#fsdocs-content h4 a:hover,
+#fsdocs-content h4 a:focus,
+#fsdocs-content h5 a,
+#fsdocs-content h5 a:hover,
+#fsdocs-content h5 a:focus,
+#fsdocs-content h6 a,
+#fsdocs-content h6 a:hover,
+#fsdocs-content h6 a:focus {
+    color: #262626;
+    text-decoration: none;
+    text-decoration-style: none;
+    /* outline: none */
+}
+
+/*--------------------------------------------------------------------------
+  Formatting for F# code snippets
+/*--------------------------------------------------------------------------*/
+
+.fsdocs-param-name,
+.fsdocs-return-name,
+.fsdocs-param {
+    font-weight: 900;
+    font-size: 0.85rem;
+    font-family: 'Roboto Mono', monospace;
+}
+/* strings --- and styles for other string related formats */
+#fsdocs-content span.s {
+    color: #dd1144;
+}
+/* printf formatters */
+#fsdocs-content span.pf {
+    color: #E0C57F;
+}
+/* escaped chars */
+#fsdocs-content span.e {
+    color: #EA8675;
+}
+
+/* identifiers --- and styles for more specific identifier types */
+#fsdocs-content span.id {
+    color: #262626;
+}
+/* module */
+#fsdocs-content span.m {
+    color: #009999;
+}
+/* reference type */
+#fsdocs-content span.rt {
+    color: #4974D1;
+}
+/* value type */
+#fsdocs-content span.vt {
+    color: #43AEC6;
+}
+/* interface */
+#fsdocs-content span.if {
+    color: #43AEC6;
+}
+/* type argument */
+#fsdocs-content span.ta {
+    color: #43AEC6;
+}
+/* disposable */
+#fsdocs-content span.d {
+    color: #43AEC6;
+}
+/* property */
+#fsdocs-content span.prop {
+    color: #43AEC6;
+}
+/* punctuation */
+#fsdocs-content span.p {
+    color: #43AEC6;
+}
+#fsdocs-content span.pn {
+    color: #262626;
+}
+/* function */
+#fsdocs-content span.f {
+    color: #e1e1e1;
+}
+#fsdocs-content span.fn {
+    color: #990000;
+}
+/* active pattern */
+#fsdocs-content span.pat {
+    color: #4ec9b0;
+}
+/* union case */
+#fsdocs-content span.u {
+    color: #4ec9b0;
+}
+/* enumeration (NOTE: same selector as "escaped chars" above; this later rule wins) */
+#fsdocs-content span.e {
+    color: #4ec9b0;
+}
+/* keywords */
+#fsdocs-content span.k {
+    color: #b68015;
+    /* font-weight: bold; */
+}
+/* comment */
+#fsdocs-content span.c {
+    color: #808080;
+    font-weight: 400;
+    font-style: italic;
+}
+/* operators */
+#fsdocs-content span.o {
+    color: #af75c1;
+}
+/* numbers */
+#fsdocs-content span.n {
+    color: #009999;
+}
+/* line number */
+#fsdocs-content span.l {
+    color: #80b0b0;
+}
+/* mutable var or ref cell */
+#fsdocs-content span.v {
+    color: #d1d1d1;
+    font-weight: bold;
+}
+/* inactive code */
+#fsdocs-content span.inactive {
+    color: #808080;
+}
+/* preprocessor */
+#fsdocs-content span.prep {
+    color: #af75c1;
+}
+/* fsi output */
+#fsdocs-content span.fsi {
+    color: #808080;
+}
+
+/* tool tip */
+div.fsdocs-tip {
+    background: #475b5f;
+    border-radius: 4px;
+    font: 0.85rem 'Roboto Mono', monospace;
+    padding: 6px 8px 6px 8px;
+    display: none;
+    color: #d1d1d1;
+    pointer-events: none;
+}
+
+div.fsdocs-tip code {
+    color: #d1d1d1;
+    font: 0.85rem 'Roboto Mono', monospace;
+}
+
diff --git a/content/fsdocs-search.js b/content/fsdocs-search.js
new file mode 100644
index 0000000..3d543cf
--- /dev/null
+++ b/content/fsdocs-search.js
@@ -0,0 +1,84 @@
+var lunrIndex, pagesIndex;
+
+function endsWith(str, suffix) {
+    return str.indexOf(suffix, str.length - suffix.length) !== -1;
+}
+
+// Initialize lunrjs using our generated
index file +function initLunr() { + if (!endsWith(fsdocs_search_baseurl,"/")){ + fsdocs_search_baseurl = fsdocs_search_baseurl+'/' + }; + + // First retrieve the index file + $.getJSON(fsdocs_search_baseurl +"index.json") + .done(function(index) { + pagesIndex = index; + // Set up lunrjs by declaring the fields we use + // Also provide their boost level for the ranking + lunrIndex = lunr(function() { + this.ref("uri"); + this.field('title', { + boost: 15 + }); + this.field('tags', { + boost: 10 + }); + this.field("content", { + boost: 5 + }); + + this.pipeline.remove(lunr.stemmer); + this.searchPipeline.remove(lunr.stemmer); + + // Feed lunr with each file and let lunr actually index them + pagesIndex.forEach(function(page) { + this.add(page); + }, this); + }) + }) + .fail(function(jqxhr, textStatus, error) { + var err = textStatus + ", " + error; + console.error("Error getting Hugo index file:", err); + }); +} + +/** + * Trigger a search in lunr and transform the result + * + * @param {String} query + * @return {Array} results + */ +function search(queryTerm) { + // Find the item in our index corresponding to the lunr one to have more info + return lunrIndex.search(queryTerm+"^100"+" "+queryTerm+"*^10"+" "+"*"+queryTerm+"^10"+" "+queryTerm+"~2^1").map(function(result) { + return pagesIndex.filter(function(page) { + return page.uri === result.ref; + })[0]; + }); +} + +// Let's get started +initLunr(); + +$( document ).ready(function() { + var searchList = new autoComplete({ + /* selector for the search box element */ + minChars: 1, + selector: $("#search-by").get(0), + /* source is the callback to perform the search */ + source: function(term, response) { + response(search(term)); + }, + /* renderItem displays individual search results */ + renderItem: function(item, search) { + search = search.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); + var re = new RegExp("(" + search.split(' ').join('|') + ")", "gi"); + return '
' + item.title.replace(re, "$1") + '
'; + }, + /* onSelect callback fires when a search suggestion is chosen */ + onSelect: function(e, term, item) { + location.href = item.getAttribute('data-uri'); + } + }); +}); diff --git a/content/fsdocs-tips.js b/content/fsdocs-tips.js new file mode 100644 index 0000000..bcd04cb --- /dev/null +++ b/content/fsdocs-tips.js @@ -0,0 +1,54 @@ +var currentTip = null; +var currentTipElement = null; + +function hideTip(evt, name, unique) { + var el = document.getElementById(name); + el.style.display = "none"; + currentTip = null; +} + +function findPos(obj) { + // no idea why, but it behaves differently in webbrowser component + if (window.location.search == "?inapp") + return [obj.offsetLeft + 10, obj.offsetTop + 30]; + + var curleft = 0; + var curtop = obj.offsetHeight; + while (obj) { + curleft += obj.offsetLeft; + curtop += obj.offsetTop; + obj = obj.offsetParent; + }; + return [curleft, curtop]; +} + +function hideUsingEsc(e) { + if (!e) { e = event; } + hideTip(e, currentTipElement, currentTip); +} + +function showTip(evt, name, unique, owner) { + document.onkeydown = hideUsingEsc; + if (currentTip == unique) return; + currentTip = unique; + currentTipElement = name; + + var pos = findPos(owner ? owner : (evt.srcElement ? evt.srcElement : evt.target)); + var posx = pos[0]; + var posy = pos[1]; + + var el = document.getElementById(name); + var parent = (document.documentElement == null) ? 
document.body : document.documentElement; + el.style.position = "absolute"; + el.style.left = posx + "px"; + el.style.top = posy + "px"; + el.style.display = "block"; +} +function Clipboard_CopyTo(value) { + var tempInput = document.createElement("input"); + tempInput.value = value; + document.body.appendChild(tempInput); + tempInput.select(); + document.execCommand("copy"); + document.body.removeChild(tempInput); +} \ No newline at end of file diff --git a/content/img/copy-md-hover.png b/content/img/copy-md-hover.png new file mode 100644 index 0000000..b14e941 Binary files /dev/null and b/content/img/copy-md-hover.png differ diff --git a/content/img/copy-md.png b/content/img/copy-md.png new file mode 100644 index 0000000..72de738 Binary files /dev/null and b/content/img/copy-md.png differ diff --git a/content/img/copy-xml-hover.png b/content/img/copy-xml-hover.png new file mode 100644 index 0000000..60fea16 Binary files /dev/null and b/content/img/copy-xml-hover.png differ diff --git a/content/img/copy-xml.png b/content/img/copy-xml.png new file mode 100644 index 0000000..e5606b9 Binary files /dev/null and b/content/img/copy-xml.png differ diff --git a/content/img/github-hover.png b/content/img/github-hover.png new file mode 100644 index 0000000..65971d4 Binary files /dev/null and b/content/img/github-hover.png differ diff --git a/content/img/github.png b/content/img/github.png new file mode 100644 index 0000000..ff34f35 Binary files /dev/null and b/content/img/github.png differ diff --git a/content/navbar-fixed-left.css b/content/navbar-fixed-left.css new file mode 100644 index 0000000..2de6255 --- /dev/null +++ b/content/navbar-fixed-left.css @@ -0,0 +1,77 @@ +body { + padding-top: 90px; +} + +@media (min-width: 768px) { + body { + padding-top: 0; + } +} + +@media (min-width: 768px) { + body { + margin-left: 252px; + } +} +.navbar { + overflow-y: auto; + overflow-x: hidden; + box-shadow: none; +} +.navbar.fixed-left { + position: fixed; + top: 0; + left: 0; + 
right: 0; + z-index: 1030; +} +.navbar-nav .nav-link { + padding-top: 0.3rem; + padding-bottom: 0.3rem; +} + +@media (min-width: 768px) { + .navbar.fixed-left { + bottom: 0; + width: 252px; + flex-flow: column nowrap; + align-items: flex-start; + } + + .navbar.fixed-left .navbar-collapse { + flex-grow: 0; + flex-direction: column; + width: 100%; + } + + .navbar.fixed-left .navbar-collapse .navbar-nav { + flex-direction: column; + width: 100%; + } + + .navbar.fixed-left .navbar-collapse .navbar-nav .nav-item { + width: 100%; + } + + .navbar.fixed-left .navbar-collapse .navbar-nav .nav-item .dropdown-menu { + top: 0; + } +} + +@media (min-width: 768px) { + .navbar.fixed-left { + right: auto; + } + + .navbar.fixed-left .navbar-nav .nav-item .dropdown-toggle:after { + border-top: 0.3em solid transparent; + border-left: 0.3em solid; + border-bottom: 0.3em solid transparent; + border-right: none; + vertical-align: baseline; + } + + .navbar.fixed-left .navbar-nav .nav-item .dropdown-menu { + left: 100%; + } +} diff --git a/content/navbar-fixed-right.css b/content/navbar-fixed-right.css new file mode 100644 index 0000000..ad6cef8 --- /dev/null +++ b/content/navbar-fixed-right.css @@ -0,0 +1,78 @@ +body { + padding-top: 90px; +} + +@media (min-width: 768px) { + body { + padding-top: 0; + } +} + +@media (min-width: 768px) { + body { + margin-right: 252px; + } +} + +.navbar { + overflow-y: auto; + overflow-x: hidden; + box-shadow: none; +} +.navbar.fixed-right { + position: fixed; + top: 0; + left: 0; + right: 0; + z-index: 1030; +} +.navbar-nav .nav-link { + padding-top: 0.3rem; + padding-bottom: 0.3rem; +} +@media (min-width: 768px) { + .navbar.fixed-right { + bottom: 0; + width: 252px; + flex-flow: column nowrap; + align-items: flex-start; + } + + .navbar.fixed-right .navbar-collapse { + flex-grow: 0; + flex-direction: column; + width: 100%; + } + + .navbar.fixed-right .navbar-collapse .navbar-nav { + flex-direction: column; + width: 100%; + } + + .navbar.fixed-right 
.navbar-collapse .navbar-nav .nav-item { + width: 100%; + } + + .navbar.fixed-right .navbar-collapse .navbar-nav .nav-item .dropdown-menu { + top: 0; + } +} + +@media (min-width: 768px) { + .navbar.fixed-right { + left: auto; + } + + .navbar.fixed-right .navbar-nav .nav-item .dropdown-toggle:after { + border-top: 0.3em solid transparent; + border-left: none; + border-bottom: 0.3em solid transparent; + border-right: 0.3em solid; + vertical-align: baseline; + } + + .navbar.fixed-right .navbar-nav .nav-item .dropdown-menu { + left: auto; + right: 100%; + } +} diff --git a/images/GenBankProvider.gif b/images/GenBankProvider.gif new file mode 100644 index 0000000..b05f6f6 Binary files /dev/null and b/images/GenBankProvider.gif differ diff --git a/images/GenBank_Info.gif b/images/GenBank_Info.gif new file mode 100644 index 0000000..c331f10 Binary files /dev/null and b/images/GenBank_Info.gif differ diff --git a/images/RefSeq_Info.gif b/images/RefSeq_Info.gif new file mode 100644 index 0000000..391865e Binary files /dev/null and b/images/RefSeq_Info.gif differ diff --git a/images/badge-notebook.svg b/images/badge-notebook.svg new file mode 100644 index 0000000..a001b54 --- /dev/null +++ b/images/badge-notebook.svg @@ -0,0 +1 @@ +Download notebookDownload notebook \ No newline at end of file diff --git a/images/badge-script.svg b/images/badge-script.svg new file mode 100644 index 0000000..90c93eb --- /dev/null +++ b/images/badge-script.svg @@ -0,0 +1 @@ +Download scriptDownload script \ No newline at end of file diff --git a/index.fsx b/index.fsx new file mode 100644 index 0000000..a71fc2b --- /dev/null +++ b/index.fsx @@ -0,0 +1,40 @@ +(** +# BioProviders: Simplifying Access to Bioinformatic Datasets + +The BioProviders package provides tools and functionality to simplify accessing and manipulating bioinformatic data. 
+The [.NET Bio](https://github.com/dotnetbio/bio) and [BioFSharp](https://github.com/CSBiology/BioFSharp) libraries +are used to parse and format the data provided by this package. + +BioProviders is available through [NuGet](https://nuget.org/packages/BioProviders). +[![NuGet Status](//img.shields.io/nuget/v/BioProviders.svg?style=flat)](https://www.nuget.org/packages/BioProviders/) + +## Type Providers + +*) +
+
+
+
+ +
+
+
+
+(** +BioProviders implements Type Providers for accessing bioinformatic datasets. These Type Providers allow remote access +to data sources (e.g., GenBank) and type-safe representations of their data (e.g., GenBank Flat File). + +* [GenBank Type Provider](library/GenBankProvider.html) - access to GenBank data using the `GenBankProvider<..>` type. + +* [RefSeq Type Provider](library/RefSeqProvider.html) - access to RefSeq data using the `RefSeqProvider<..>` type. + +## Contributing and Copyright + +The project is hosted on [GitHub](https://github.com/AlexKenna/BioProviders) where you can +[report issues](https://github.com/AlexKenna/BioProviders/issues), fork the project and submit pull requests. + +The library is available under the OSI-approved MIT license. For more information see the +[License file](https://github.com/AlexKenna/BioProviders/blob/main/LICENSE.md) in the GitHub repository. + +*) + diff --git a/index.html b/index.html new file mode 100644 index 0000000..4fd79e9 --- /dev/null +++ b/index.html @@ -0,0 +1,131 @@ + + + + + + BioProviders: Simplifying Access to Bioinformatic Datasets + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

BioProviders

+
+
+
+

BioProviders: Simplifying Access to Bioinformatic Datasets

+

The BioProviders package provides tools and functionality to simplify accessing and manipulating bioinformatic data. +The .NET Bio and BioFSharp libraries +are used to parse and format the data provided by this package.

+

BioProviders is available through NuGet. +NuGet Status

+

Type Providers

+
+
+
+
+ +
+
+
+
+

BioProviders implements Type Providers for accessing bioinformatic datasets. These Type Providers allow remote access +to data sources (e.g., GenBank) and type-safe representations of their data (e.g., GenBank Flat File).

+ +

Contributing and Copyright

+

The project is hosted on GitHub where you can +report issues, fork the project and submit pull requests.

+

The library is available under the OSI-approved MIT license. For more information see the +License file in the GitHub repository.

+ + +
+ + + + + + + + +
+ + + \ No newline at end of file diff --git a/index.ipynb b/index.ipynb new file mode 100644 index 0000000..0c050f6 --- /dev/null +++ b/index.ipynb @@ -0,0 +1,70 @@ + + { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["# BioProviders: Simplifying Access to Bioinformatic Datasets\n", +"\n", +"The BioProviders package provides tools and functionality to simplify accessing and manipulating bioinformatic data.\n", +"The [.NET Bio](https://github.com/dotnetbio/bio) and [BioFSharp](https://github.com/CSBiology/BioFSharp) libraries\n", +"are used to parse and format the data provided by this package.\n", +"\n", +"BioProviders is available through [NuGet](https://nuget.org/packages/BioProviders).\n", +"[![NuGet Status](//img.shields.io/nuget/v/BioProviders.svg?style=flat)](https://www.nuget.org/packages/BioProviders/)\n", +"\n", +"## Type Providers\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": null, "outputs": [], + "source": ["\u003cdiv class=\"container-fluid\" style=\"margin:15px 0px 15px 0px;\"\u003e\n", +" \u003cdiv class=\"row-fluid\"\u003e\n", +" \u003cdiv class=\"span1\"\u003e\u003c/div\u003e\n", +" \u003cdiv class=\"span10\" id=\"anim-holder\"\u003e\n", +" \u003ca id=\"lnk\" href=\"images/GenBankProvider.gif\"\u003e\u003cimg id=\"anim\" src=\"images/GenBankProvider.gif\" /\u003e\u003c/a\u003e\n", +" \u003c/div\u003e\n", +" \u003cdiv class=\"span1\"\u003e\u003c/div\u003e\n", +" \u003c/div\u003e\n", +"\u003c/div\u003e\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["BioProviders implements Type Providers for accessing bioinformatic datasets. 
These Type Providers allow remote access\n", +"to data sources (e.g., GenBank) and type-safe representations of their data (e.g., GenBank Flat File).\n", +"\n", +"* [GenBank Type Provider](library/GenBankProvider.html) - access to GenBank data using the `GenBankProvider\u003c..\u003e` type.\n", +"\n", +"* [RefSeq Type Provider](library/RefSeqProvider.html) - access to RefSeq data using the `RefSeqProvider\u003c..\u003e` type.\n", +"\n", +"## Contributing and Copyright\n", +"\n", +"The project is hosted on [GitHub](https://github.com/AlexKenna/BioProviders) where you can\n", +"[report issues](https://github.com/AlexKenna/BioProviders/issues), fork the project and submit pull requests.\n", +"\n", +"The library is available under the OSI-approved MIT license. For more information see the\n", +"[License file](https://github.com/AlexKenna/BioProviders/blob/main/LICENSE.md) in the GitHub repository.\n", +"\n"] + }], + "metadata": { + "kernelspec": {"display_name": ".NET (F#)", "language": "F#", "name": ".net-fsharp"}, + "langauge_info": { + "file_extension": ".fs", + "mimetype": "text/x-fsharp", + "name": "C#", + "pygments_lexer": "fsharp", + "version": "4.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 + } + + diff --git a/index.json b/index.json new file mode 100644 index 0000000..f51dc6f --- /dev/null +++ b/index.json @@ -0,0 +1 @@ +[{"uri":"https://fsprojects.github.io/BioProviders/index.html","title":"BioProviders: Simplifying Access to Bioinformatic Datasets\r\n","content":"BioProviders: Simplifying Access to Bioinformatic Datasets\r\n================================\r\n\r\nThe BioProviders package provides tools and functionality to simplify accessing and manipulating bioinformatic data.\r\nThe [.NET Bio](https://github.com/dotnetbio/bio) and [BioFSharp](https://github.com/CSBiology/BioFSharp) libraries\r\nare used to parse and format the data provided by this package.\r\n\r\nBioProviders is available through [NuGet](https://nuget.org/packages/BioProviders). 
\r\n[![NuGet Status](//img.shields.io/nuget/v/BioProviders.svg?style=flat)](https://www.nuget.org/packages/BioProviders/)\r\n\r\n\r\n## Type Providers\r\n\r\n\u003Cdiv class=\u0022container-fluid\u0022 style=\u0022margin:15px 0px 15px 0px;\u0022\u003E\r\n \u003Cdiv class=\u0022row-fluid\u0022\u003E\r\n \u003Cdiv class=\u0022span1\u0022\u003E\u003C/div\u003E\r\n \u003Cdiv class=\u0022span10\u0022 id=\u0022anim-holder\u0022\u003E\r\n \u003Ca id=\u0022lnk\u0022 href=\u0022images/GenBankProvider.gif\u0022\u003E\u003Cimg id=\u0022anim\u0022 src=\u0022images/GenBankProvider.gif\u0022 /\u003E\u003C/a\u003E\r\n \u003C/div\u003E\r\n \u003Cdiv class=\u0022span1\u0022\u003E\u003C/div\u003E\r\n \u003C/div\u003E\r\n\u003C/div\u003E\r\n\r\nBioProviders implements Type Providers for accessing bioinformatic datasets. These Type Providers allow remote access\r\nto data sources (e.g., GenBank) and type-safe representations of their data (e.g., GenBank Flat File).\r\n\r\n* [GenBank Type Provider](library/GenBankProvider.html) - access to GenBank data using the \u0060GenBankProvider\u003C..\u003E\u0060 type.\r\n* [RefSeq Type Provider](library/RefSeqProvider.html) - access to RefSeq data using the \u0060RefSeqProvider\u003C..\u003E\u0060 type.\r\n\r\n\r\n## Contributing and Copyright\r\n\r\nThe project is hosted on [GitHub](https://github.com/AlexKenna/BioProviders) where you can \r\n[report issues](https://github.com/AlexKenna/BioProviders/issues), fork the project and submit pull requests.\r\n\r\nThe library is available under the OSI-approved MIT license. 
For more information see the \r\n[License file](https://github.com/AlexKenna/BioProviders/blob/main/LICENSE.md) in the GitHub repository."},{"uri":"https://fsprojects.github.io/BioProviders/library/GenBankProvider.html","title":"GenBank Type Provider\r\n","content":"(**\n---\ncategory: Type Providers\ncategoryindex: 1\nindex: 1\n---\n*)\n\n(**\n\n[![Script](../images/badge-script.svg)]({{root}}/{{fsdocs-source-basename}}.fsx)\u0026emsp;\n[![Notebook](../images/badge-notebook.svg)]({{root}}/{{fsdocs-source-basename}}.ipynb)\n\n# GenBank Type Provider\n\nThis article describes how to use the GenBank Type Provider to remotely access genomic data stored in the \n[GenBank](https://www.ncbi.nlm.nih.gov/genbank/) database. This Type Provider collects and parses the genomic data\nfor a specified organism and generates a static type containing its metadata and sequence. \n\nThe GenBank Type Provider uses [.NET Bio](https://github.com/dotnetbio/bio) to parse the GenBank data files\nand [BioFSharp](https://github.com/CSBiology/BioFSharp) to provide utilities for manipulating genomic sequences.\n\n\u003Cbr /\u003E\n## Loading BioProviders Package\n\nTo load the GenBank Type Provider, a script can use the NuGet syntax to reference the BioProviders package, shown below.\n\nYou can optionally include the BioFSharp package. While it\u0027s not required to use the basic BioProviders functions, it can be used to explore the metadata of the provided types, as shown in a later example.\n*)\n\n#r \u0022nuget: BioProviders\u0022\n#r \u0022nuget: BioFSharp\u0022\n\n(** If creating an F# library or application, BioProviders can be added as a package reference. You can use your IDE for this, or use the \u0060\u0060\u0060dotnet add package BioProviders\u0060\u0060\u0060 command in your project folder from the command line.\n\nBioProviders can then be used in your script or code by using an open command. Opening its dependencies should not be required. 
(BioFSharp is loaded for future examples.)\n*)\n\nopen BioProviders\nopen BioFSharp\n\n(**\n\u003Cbr /\u003E\n## GenBankProvider Example\n\nThe GenBank Type Provider will be demonstrated for [this GenBank assembly](https://www.ncbi.nlm.nih.gov/nuccore/CP012411) \nof the *Candidatus Carsonella ruddii* species. To create a typed representation of the assembly, two pieces of information\nmust be given to the Type Provider:\n\n* Species name\n* GenBank assembly accession\n\nFor this example, the species name is \u0022Candidatus Carsonella ruddii\u0022 and the GenBank assembly accession is \u0022GCA_001274515.1\u0022.\nTo find this information:\n\n* Visit https://www.ncbi.nlm.nih.gov/datasets/\n* Search for the name of the species\n* Select to view all genones of the species\n\nYou can then select the assembly\u0027s GenBank (as well as RefSeq) accession from the list that appears.\n\n\u003Cdiv class=\u0022container-fluid\u0022 style=\u0022margin:15px 0px 15px 0px;\u0022\u003E\n \u003Cdiv class=\u0022row-fluid\u0022\u003E\n \u003Cdiv class=\u0022span1\u0022\u003E\u003C/div\u003E\n \u003Cdiv class=\u0022span10\u0022 id=\u0022anim-holder\u0022\u003E\n \u003Ca id=\u0022lnk\u0022 href=\u0022../images/GenBank_Info.gif\u0022\u003E\u003Cimg id=\u0022anim\u0022 src=\u0022../images/GenBank_Info.gif\u0022 /\u003E\u003C/a\u003E\n \u003C/div\u003E\n \u003Cdiv class=\u0022span1\u0022\u003E\u003C/div\u003E\n \u003C/div\u003E\n\u003C/div\u003E\n\nPassing this information to the Type Provider generates the Assembly Type. The genomic data can then be extracted from the \nAssembly Type by invoking the Genome method. 
This is demonstrated below.\n*)\n\n// Define species name and GenBank assembly accession.\nlet [\u003CLiteral\u003E] Species = \u0022Candidatus Carsonella ruddii\u0022\nlet [\u003CLiteral\u003E] Accession = \u0022GCA_001274515.1\u0022\n\n// Create GenBank assembly type.\ntype Ruddii = GenBankProvider\u003CSpecies, Accession\u003E\n\n// Extract statically-typed genome data.\nlet genome = Ruddii.Genome()\n\n(**\n\u003Cbr /\u003E\n\u003Cbr /\u003E\n### Metadata\n\nEach genome is accompanied by metadata describing the organism and sequence recorded in the assembly. This metadata can\nbe extracted using the Metadata field of the Genome Type created previously. The Metadata type is largely based on that\nprovided by [.NET Bio](http://dotnetbio.github.io/Help/html/319bf2e6-4fcf-9f93-586f-fc7ffcf04a83.htm), with modifications\nmade to be more idiomatic with F#.\n\nBelow is an example of how the raw metadata type can be retrieved and displayed:\n\n*)\n\n// Extract the metadata.\nlet metadata = genome.Metadata\n\n// Display the metadata type.\nprintf \u0022%A\u0022 metadata\n\n(*** include-output ***)\n\n(** \nThe metadata type consists of many fields, though not all fields of the metadata exist for all assemblies. Therefore, they are provided as option types, on which a match expression can be used. Below are examples of accessing fields from the example assembly.\n \u003Cbr /\u003E\n \u003Cbr /\u003E\n \u2705 Example - Accessing a field that is provided. \n*)\n\n// Print definition if exists.\nmatch metadata.Definition with\n| Some definition -\u003E printf \u0022%s\u0022 definition\n| None -\u003E printf \u0022No definition provided.\u0022\n\n(*** include-output ***)\n\n(** \n \u003Cbr /\u003E \n \u274C Example - Accessing a field that is not provided. 
\n*)\n\n// Print database source if exists.\nmatch metadata.DbSource with\n| Some dbsource -\u003E printf \u0022%s\u0022 dbsource\n| None -\u003E printf \u0022No database source provided.\u0022\n\n(*** include-output ***)\n\n(**\n\u003Cbr /\u003E\n\u003Cbr /\u003E\n### Sequence\n\nThe genomic sequence for the organism can be extracted using the Sequence field of the Genome Type created previously.\nThis field provides a BioFSharp [BioSeq](https://csbiology.github.io/BioFSharp/reference/biofsharp-bioseq.html) containing\na series of [Nucleotides](https://csbiology.github.io/BioFSharp//reference/biofsharp-nucleotides-nucleotide.html). More\ncan be read about BioFSharp containers [here](https://csbiology.github.io/BioFSharp//BioCollections.html). \n\nAn example of accessing and manipulating the GenBankProvider genomic sequence using BioFSharp is provided below:\n*)\n\n// Extract the BioFSharp BioSeq.\nlet sequence = genome.Sequence\n\n// Display the sequence type.\nprintf \u0022%A\u0022 sequence\n\n(*** include-output ***)\n\n// Take the complement, then transcribe and translate the coding strand.\nsequence\n|\u003E BioSeq.complement\n|\u003E BioSeq.transcribeCodingStrand\n|\u003E BioSeq.translate 0\n\n(*** include-it ***)\n\n\n(**\n\u003Cbr /\u003E\n## Wildcard Operators\n\nWildcard operators are supported in both the Species and Accession provided to the GenBankProvider. By using asterisks \u0022\\*\u0022\nat the end of a Species or Accession name, species or accessions starting with the provided pattern will be matched. 
\n\nFor example, we can get all *Staphylococcus* species starting with the letter \u0027c\u0027 and assembly accesions starting with\n\u0027GCA_01\u0027:\n*)\n\n// Define species name and GenBank assembly accession using wildcards.\nlet [\u003CLiteral\u003E] SpeciesPattern = \u0022Staphylococcus c*\u0022\nlet [\u003CLiteral\u003E] AccessionPattern = \u0022GCA_01*\u0022\n\n// Create GenBank type containing all species matching the species pattern.\ntype SpeciesCollection = GenBankProvider\u003CSpeciesPattern, AccessionPattern\u003E\n\n// Select the species types.\ntype Capitis = SpeciesCollection.\u0060\u0060Staphylococcus capitis\u0060\u0060\ntype Cohnii = SpeciesCollection.\u0060\u0060Staphylococcus cohnii\u0060\u0060\n\n// Select assemblies.\ntype Assembly1 = Capitis.\u0060\u0060GCA_012926605.1\u0060\u0060\ntype Assembly2 = Capitis.\u0060\u0060GCA_015645205.1\u0060\u0060\ntype Assembly3 = Cohnii.\u0060\u0060GCA_013349225.1\u0060\u0060\ntype Assembly4 = Cohnii.\u0060\u0060GCA_014884245.1\u0060\u0060\n\n// Extract statically-typed genome data.\nlet data = Assembly1.Genome()\n\n// Show the assembly\u0027s definition.\nmatch data.Metadata.Definition with\n| Some definition -\u003E printf \u0022%s\u0022 definition\n| None -\u003E printf \u0022No definition provided.\u0022\n\n(*** include-output ***)\n\n(**\nThe Accession parameter can also be omitted from the GenBankProvider. In this case, all assemblies for the given species will\nbe matched. 
For example:\n*)\n\n// Define species name.\nlet [\u003CLiteral\u003E] SpeciesName = \u0022Staphylococcus lugdunensis\u0022\n\n// Create GenBank type containing all assemblies for the species.\ntype Assemblies = GenBankProvider\u003CSpeciesName\u003E\n\n// Select assemblies.\ntype Assembly = Assemblies.\u0060\u0060GCA_001546615.1\u0060\u0060\n\n// Show the assembly\u0027s primary accession.\nmatch (Assembly.Genome()).Metadata.Accession with\n| Some accession -\u003E match accession.Primary with\n | Some primary -\u003E printf \u0022%s\u0022 primary\n | None -\u003E printf \u0022No primary accession provided.\u0022\n| None -\u003E printf \u0022No accession provided.\u0022\n\n(*** include-output ***)"},{"uri":"https://fsprojects.github.io/BioProviders/library/RefSeqProvider.html","title":"RefSeq Type Provider\r\n","content":"(**\n---\ncategory: Type Providers\ncategoryindex: 1\nindex: 2\n---\n*)\n\n(**\n\n[![Script](../images/badge-script.svg)]({{root}}/{{fsdocs-source-basename}}.fsx)\u0026emsp;\n[![Notebook](../images/badge-notebook.svg)]({{root}}/{{fsdocs-source-basename}}.ipynb)\n\n# RefSeq Type Provider\n\nThis article describes how to use the RefSeq Type Provider to remotely access genomic data stored in the \n[RefSeq](https://www.ncbi.nlm.nih.gov/genbank/) database. This Type Provider collects and parses the genomic data\nfor a specified organism and generates a static type containing its metadata and sequence. \n\nThe RefSeq Type Provider uses [.NET Bio](https://github.com/dotnetbio/bio) to parse the RefSeq data files\nand [BioFSharp](https://github.com/CSBiology/BioFSharp) to provide utilities for manipulating genomic sequences.\n\n\u003Cbr /\u003E\n## Loading BioProviders Package\n\nTo load the RefSeq Type Provider, a script can use the NuGet syntax to reference the BioProviders package, shown below.\n\nYou can optionally include the BioFSharp package. 
While it\u0027s not required to use the basic BioProviders functions, it can be used to explore the metadata of the provided types, as shown in a later example.\n*)\n\n#r \u0022nuget: BioProviders\u0022\n#r \u0022nuget: BioFSharp\u0022\n\n(** If creating an F# library or application, BioProviders can be added as a package reference. You can use your IDE for this, or use the \u0060\u0060\u0060dotnet add package BioProviders\u0060\u0060\u0060 command in your project folder from the command line.\n\nBioProviders can then be used in your script or code by using an open command. Opening its dependencies should not be required. (BioFSharp is loaded for future examples.)\n*)\n\nopen BioProviders\nopen BioFSharp\n\n(**\n\u003Cbr /\u003E\n## RefSeqProvider Example\n\nThe RefSeq Type Provider will be demonstrated for [this RefSeq assembly](https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_001224225.1/) \nof the *Staphylococcus borealis* species. To create a typed representation of the assembly, two pieces of information\nmust be given to the Type Provider:\n\n* Species name\n* RefSeq assembly accession\n\nFor this example, the species name is \u0022Staphylococcus borealis\u0022 and the RefSeq assembly accession is \u0022GCF_001224225.1\u0022.\nTo find this information:\n\n* Visit https://www.ncbi.nlm.nih.gov/datasets/\n* Search for the name of the species\n* Select to view all genones of the species\n\nYou can then select the assembly\u0027s RefSeq (as well as GenBank) accession from the list that appears.\n\n\u003Cdiv class=\u0022container-fluid\u0022 style=\u0022margin:15px 0px 15px 0px;\u0022\u003E\n \u003Cdiv class=\u0022row-fluid\u0022\u003E\n \u003Cdiv class=\u0022span1\u0022\u003E\u003C/div\u003E\n \u003Cdiv class=\u0022span10\u0022 id=\u0022anim-holder\u0022\u003E\n \u003Ca id=\u0022lnk\u0022 href=\u0022../images/RefSeq_Info.gif\u0022\u003E\u003Cimg id=\u0022anim\u0022 src=\u0022../images/RefSeq_Info.gif\u0022 /\u003E\u003C/a\u003E\n \u003C/div\u003E\n \u003Cdiv 
class=\u0022span1\u0022\u003E\u003C/div\u003E\n \u003C/div\u003E\n\u003C/div\u003E\n\nPassing this information to the Type Provider generates the Assembly Type. The genomic data can then be extracted from the \nAssembly Type by invoking the Genome method. This is demonstrated below.\n*)\n\n// Define species name and RefSeq assembly accession.\nlet [\u003CLiteral\u003E] Species = \u0022Staphylococcus borealis\u0022\nlet [\u003CLiteral\u003E] Accession = \u0022GCF_001224225.1\u0022\n\n// Create RefSeq assembly type.\ntype Borealis = RefSeqProvider\u003CSpecies, Accession\u003E\n\n// Extract statically-typed genome data.\nlet genome = Borealis.Genome()\n\n(**\n\u003Cbr /\u003E\n\u003Cbr /\u003E\n### Metadata\n\nEach genome is accompanied by metadata describing the organism and sequence recorded in the assembly. This metadata can\nbe extracted using the Metadata field of the Genome Type created previously. The Metadata type is largely based on that\nprovided by [.NET Bio](http://dotnetbio.github.io/Help/html/319bf2e6-4fcf-9f93-586f-fc7ffcf04a83.htm), with modifications\nmade to be more idiomatic with F#.\n\nBelow is an example of how the raw metadata type can be retrieved and displayed:\n\n*)\n\n// Extract the metadata.\nlet metadata = genome.Metadata\n\n// Display the metadata type.\nprintf \u0022%A\u0022 metadata\n\n(*** include-output ***)\n\n(** \nThe metadata type consists of many fields, though not all fields of the metadata exist for all assemblies. Therefore, they are provided as option types, on which a match expression can be used. Below are examples of accessing fields from the example assembly.\n \u003Cbr /\u003E\n \u003Cbr /\u003E\n \u2705 Example - Accessing a field that is provided. 
\n*)\n\n// Print definition if exists.\nmatch metadata.Definition with\n| Some definition -\u003E printf \u0022%s\u0022 definition\n| None -\u003E printf \u0022No definition provided.\u0022\n\n(*** include-output ***)\n\n(** \n \u003Cbr /\u003E \n \u274C Example - Accessing a field that is not provided. \n*)\n\n// Print database source if exists.\nmatch metadata.DbSource with\n| Some dbsource -\u003E printf \u0022%s\u0022 dbsource\n| None -\u003E printf \u0022No database source provided.\u0022\n\n(*** include-output ***)\n\n(**\n\u003Cbr /\u003E\n\u003Cbr /\u003E\n### Sequence\n\nThe genomic sequence for the organism can be extracted using the Sequence field of the Genome Type created previously.\nThis field provides a BioFSharp [BioSeq](https://csbiology.github.io/BioFSharp/reference/biofsharp-bioseq.html) containing\na series of [Nucleotides](https://csbiology.github.io/BioFSharp//reference/biofsharp-nucleotides-nucleotide.html). More\ncan be read about BioFSharp containers [here](https://csbiology.github.io/BioFSharp//BioCollections.html). \n\nAn example of accessing and manipulating the RefSeqProvider genomic sequence using BioFSharp is provided below:\n*)\n\n// Extract the BioFSharp BioSeq.\nlet sequence = genome.Sequence\n\n// Display the sequence type.\nprintf \u0022%A\u0022 sequence\n\n(*** include-output ***)\n\n// Take the complement, then transcribe and translate the coding strand.\nsequence\n|\u003E BioSeq.complement\n|\u003E BioSeq.transcribeCodingStrand\n|\u003E BioSeq.translate 0\n\n(*** include-it ***)\n\n\n(**\n\u003Cbr /\u003E\n## Wildcard Operators\n\nWildcard operators are supported in both the Species and Accession provided to the RefSeqProvider. By using asterisks \u0022\\*\u0022\nat the end of a Species or Accession name, species or accessions starting with the provided pattern will be matched. 
\n\nFor example, we can get all *Staphylococcus* species starting with the letter \u0027c\u0027 and assembly accesions starting with\n\u0027GCF_01\u0027:\n*)\n\n// Define species name and RefSeq assembly accession using wildcards.\nlet [\u003CLiteral\u003E] SpeciesPattern = \u0022Staphylococcus c*\u0022\nlet [\u003CLiteral\u003E] AccessionPattern = \u0022GCF_01*\u0022\n\n// Create RefSeq type containing all species matching the species pattern.\ntype SpeciesCollection = RefSeqProvider\u003CSpeciesPattern, AccessionPattern\u003E\n\n// Select the species types.\ntype Capitis = SpeciesCollection.\u0060\u0060Staphylococcus capitis\u0060\u0060\ntype Cohnii = SpeciesCollection.\u0060\u0060Staphylococcus cohnii\u0060\u0060\n\n// Select assemblies.\ntype Assembly1 = Capitis.\u0060\u0060GCF_012926605.1\u0060\u0060\ntype Assembly2 = Capitis.\u0060\u0060GCF_012926635.1\u0060\u0060\ntype Assembly3 = Cohnii.\u0060\u0060GCF_013602215.1\u0060\u0060\ntype Assembly4 = Cohnii.\u0060\u0060GCF_013602265.1\u0060\u0060\n\n// Extract statically-typed genome data.\nlet data = Assembly1.Genome()\n\n// Show the assembly\u0027s definition.\nmatch data.Metadata.Definition with\n| Some definition -\u003E printf \u0022%s\u0022 definition\n| None -\u003E printf \u0022No definition provided.\u0022\n\n(*** include-output ***)\n\n(**\nThe Accession parameter can also be omitted from the RefSeqProvider. In this case, all assemblies for the given species will\nbe matched. 
For example:\n*)\n\n// Define species name.\nlet [\u003CLiteral\u003E] SpeciesName = \u0022Staphylococcus lugdunensis\u0022\n\n// Create RefSeq type containing all assemblies for the species.\ntype Assemblies = RefSeqProvider\u003CSpeciesName\u003E\n\n// Select assemblies.\ntype Assembly = Assemblies.\u0060\u0060GCF_001546615.1\u0060\u0060\n\n// Show the assembly\u0027s primary accession.\nmatch (Assembly.Genome()).Metadata.Accession with\n| Some accession -\u003E match accession.Primary with\n | Some primary -\u003E printf \u0022%s\u0022 primary\n | None -\u003E printf \u0022No primary accession provided.\u0022\n| None -\u003E printf \u0022No accession provided.\u0022\n\n(*** include-output ***)"}] \ No newline at end of file diff --git a/library/GenBankProvider.fsx b/library/GenBankProvider.fsx new file mode 100644 index 0000000..3bea683 --- /dev/null +++ b/library/GenBankProvider.fsx @@ -0,0 +1,293 @@ +(** +[![Script](../images/badge-script.svg)](https://fsprojects.github.io/BioProviders//library/GenBankProvider.fsx)  +[![Notebook](../images/badge-notebook.svg)](https://fsprojects.github.io/BioProviders//library/GenBankProvider.ipynb) + +# GenBank Type Provider + +This article describes how to use the GenBank Type Provider to remotely access genomic data stored in the +[GenBank](https://www.ncbi.nlm.nih.gov/genbank/) database. This Type Provider collects and parses the genomic data +for a specified organism and generates a static type containing its metadata and sequence. + +The GenBank Type Provider uses [.NET Bio](https://github.com/dotnetbio/bio) to parse the GenBank data files +and [BioFSharp](https://github.com/CSBiology/BioFSharp) to provide utilities for manipulating genomic sequences. + +*) +
+(** +## Loading BioProviders Package + +To load the GenBank Type Provider, a script can use the NuGet syntax to reference the BioProviders package, shown below. + +You can optionally include the BioFSharp package. While it's not required to use the basic BioProviders functions, it can be used to explore the metadata of the provided types, as shown in a later example. + +*) +#r "nuget: BioProviders" +#r "nuget: BioFSharp" +(** +If creating an F# library or application, BioProviders can be added as a package reference. You can use your IDE for this, or use the `dotnet add package BioProviders` command in your project folder from the command line. + +BioProviders can then be used in your script or code by using an open command. Opening its dependencies should not be required. (BioFSharp is loaded for future examples.) + +*) +open BioProviders +open BioFSharp +
+(** +## GenBankProvider Example + +The GenBank Type Provider will be demonstrated for [this GenBank assembly](https://www.ncbi.nlm.nih.gov/nuccore/CP012411) +of the **Candidatus Carsonella ruddii** species. To create a typed representation of the assembly, two pieces of information +must be given to the Type Provider: + +* Species name + +* GenBank assembly accession + +For this example, the species name is "Candidatus Carsonella ruddii" and the GenBank assembly accession is "GCA_001274515.1". +To find this information: + +* Visit [https://www.ncbi.nlm.nih.gov/datasets/](https://www.ncbi.nlm.nih.gov/datasets/) + +* Search for the name of the species + +* Select to view all genomes of the species + +You can then select the assembly's GenBank (as well as RefSeq) accession from the list that appears. + +*) +
+
+
+
+ +
+
+
+
+(** +Passing this information to the Type Provider generates the Assembly Type. The genomic data can then be extracted from the +Assembly Type by invoking the Genome method. This is demonstrated below. + +*) +// Define species name and GenBank assembly accession. +// Both are marked [<Literal>] because type provider static arguments must be compile-time constants. +let [<Literal>] Species = "Candidatus Carsonella ruddii" +let [<Literal>] Accession = "GCA_001274515.1" + +// Create GenBank assembly type by passing the two literals as static arguments. +type Ruddii = GenBankProvider<Species, Accession> + +// Extract statically-typed genome data. +let genome = Ruddii.Genome() +
+
+(** +### Metadata + +Each genome is accompanied by metadata describing the organism and sequence recorded in the assembly. This metadata can +be extracted using the Metadata field of the Genome Type created previously. The Metadata type is largely based on that +provided by [.NET Bio](http://dotnetbio.github.io/Help/html/319bf2e6-4fcf-9f93-586f-fc7ffcf04a83.htm), with modifications +made to be more idiomatic with F#. + +Below is an example of how the raw metadata type can be retrieved and displayed: + +*) +// Extract the metadata. +let metadata = genome.Metadata + +// Display the metadata type. +printf "%A" metadata(* output: +{ Locus = Some { Date = Some 8/26/2015 12:00:00 AM + DivisionCode = Some BCT + MoleculeType = Some DNA + Name = Some "CP012411" + SequenceLength = 174018 + SequenceType = Some "bp" + Strand = None + StrandTopology = Some Circular } + Definition = Some "Candidatus Carsonella ruddii strain YCCR, complete genome." + Accession = Some { Primary = Some "CP012411" + Secondary = None } + Version = Some { Accession = Some "CP012411" + CompoundAccession = Some "CP012411.1" + GiNumber = None + Version = Some "1" } + DbLinks = Some [{ Numbers = Some [" PRJNA292590"] + Type = Some BioProject }; { Numbers = Some [" SAMN03999419"] + Type = None }] + DbSource = None + Keywords = Some "." + Primary = None + Source = + Some + { CommonName = Some "Candidatus Carsonella ruddii" + Organism = + Some + { ClassLevels = + Some + "Bacteria; Pseudomonadota; Gammaproteobacteria; Oceanospirillales; Halomonadaceae; Zymobacter group; Candidatus Carsonella." + Genus = Some "Candidatus Carsonella" + Species = Some "Candidatus Carsonella ruddii" } } + References = + Some + [{ Authors = Some "Wu,F., Deng,X., Liang,G., Cen,Y. and Chen,J." 
+ Consortiums = None + Journal = Some "Unpublished" + Location = Some "bases 1 to 174018" + Medline = None + Number = 1 + PubMed = None + Remarks = None + Title = + Some + "Whole Genome Sequence of 'Candidatus Carsonella ruddii' from Diaphorina citri in Guangdong, China" }; + { Authors = Some "Wu,F." + Consortiums = None + Journal = + Some + "Submitted (20-AUG-2015) San Joaquin Valley Agricultural Sciences Center, Usda-Ars, 9611 South Riverbend Avenue, Parlier, CA 93648, USA" + Location = Some "bases 1 to 174018" + Medline = None + Number = 2 + PubMed = None + Remarks = None + Title = Some "Direct Submission" }] + Comments = + Some + ["Annotation was added by the NCBI Prokaryotic Genome Annotation +Pipeline (released 2013). Information about the Pipeline can be +found here: http://www.ncbi.nlm.nih.gov/genome/annotation_prok/ + +##Genome-Assembly-Data-START## +Assembly Method :: CLC Genomics Workbench v. 7.5 +Genome Coverage :: 85.24x +Sequencing Technology :: Illumina MiSeq +##Genome-Assembly-Data-END## + +##Genome-Annotation-Data-START## +Annotation Provider :: NCBI +Annotation Date :: 08/20/2015 13:42:07 +Annotation Pipeline :: NCBI Prokaryotic Genome Annotation +Pipeline +Annotation Method :: Best-placed reference protein set; +GeneMarkS+ +Annotation Software revision :: 2.10 +Features Annotated :: Gene; CDS; rRNA; tRNA; ncRNA; +repeat_region +Genes :: 224 +CDS :: 168 +Pseudo Genes :: 31 +rRNAs :: 1, 1, 1 (5S, 16S, 23S) +complete rRNAs :: 1, 1, 1 (5S, 16S, 23S) +partial rRNAs :: +tRNAs :: 22 +ncRNA :: 0 +Frameshifted Genes :: 0 +##Genome-Annotation-Data-END##"] + Contig = None + Segment = None + Origin = None }*) +(** +The metadata type consists of many fields, though not all fields of the metadata exist for all assemblies. Therefore, they are provided as option types, on which a match expression can be used. Below are examples of accessing fields from the example assembly. +
+
+✅ Example - Accessing a field that is provided. + +*) +// Print definition if exists. +match metadata.Definition with +| Some definition -> printf "%s" definition +| None -> printf "No definition provided."(* output: +Candidatus Carsonella ruddii strain YCCR, complete genome.*) +(** +
+❌ Example - Accessing a field that is not provided. + +*) +// Print database source if exists. +match metadata.DbSource with +| Some dbsource -> printf "%s" dbsource +| None -> printf "No database source provided."(* output: +No database source provided.*) +
+
+(** +### Sequence + +The genomic sequence for the organism can be extracted using the Sequence field of the Genome Type created previously. +This field provides a BioFSharp [BioSeq](https://csbiology.github.io/BioFSharp/reference/biofsharp-bioseq.html) containing +a series of [Nucleotides](https://csbiology.github.io/BioFSharp//reference/biofsharp-nucleotides-nucleotide.html). More +can be read about BioFSharp containers [here](https://csbiology.github.io/BioFSharp//BioCollections.html). + +An example of accessing and manipulating the GenBankProvider genomic sequence using BioFSharp is provided below: + +*) +// Extract the BioFSharp BioSeq. +let sequence = genome.Sequence + +// Display the sequence type. +printf "%A" sequence(* output: +seq [A; T; G; A; ...]*) +// Take the complement, then transcribe and translate the coding strand. +sequence +|> BioSeq.complement +|> BioSeq.transcribeCodingStrand +|> BioSeq.translate 0(* output: +seq [Tyr; Phe; Leu; Ter; ...]*) +
+(** +## Wildcard Operators + +Wildcard operators are supported in both the Species and Accession provided to the GenBankProvider. By using asterisks "*" +at the end of a Species or Accession name, species or accessions starting with the provided pattern will be matched. + +For example, we can get all **Staphylococcus** species starting with the letter 'c' and assembly accessions starting with +'GCA_01': + +*) +// Define species name and GenBank assembly accession using wildcards. +// Both are marked [<Literal>] because type provider static arguments must be compile-time constants. +let [<Literal>] SpeciesPattern = "Staphylococcus c*" +let [<Literal>] AccessionPattern = "GCA_01*" + +// Create GenBank type containing all species matching the species pattern. +type SpeciesCollection = GenBankProvider<SpeciesPattern, AccessionPattern> + +// Select the species types. +type Capitis = SpeciesCollection.``Staphylococcus capitis`` +type Cohnii = SpeciesCollection.``Staphylococcus cohnii`` + +// Select assemblies. +type Assembly1 = Capitis.``GCA_012926605.1`` +type Assembly2 = Capitis.``GCA_015645205.1`` +type Assembly3 = Cohnii.``GCA_013349225.1`` +type Assembly4 = Cohnii.``GCA_014884245.1`` + +// Extract statically-typed genome data. +let data = Assembly1.Genome() + +// Show the assembly's definition. +match data.Metadata.Definition with +| Some definition -> printf "%s" definition +| None -> printf "No definition provided."(* output: +Staphylococcus capitis strain 18-857 NODE_1, whole genome shotgun sequence.*) +(** +The Accession parameter can also be omitted from the GenBankProvider. In this case, all assemblies for the given species will +be matched. For example: + +*) +// Define species name. +let [<Literal>] SpeciesName = "Staphylococcus lugdunensis" + +// Create GenBank type containing all assemblies for the species (only the species is passed as a static argument). +type Assemblies = GenBankProvider<SpeciesName> + +// Select assemblies. +type Assembly = Assemblies.``GCA_001546615.1`` + +// Show the assembly's primary accession. 
+match (Assembly.Genome()).Metadata.Accession with +| Some accession -> match accession.Primary with + | Some primary -> printf "%s" primary + | None -> printf "No primary accession provided." +| None -> printf "No accession provided."(* output: +KQ957361*) + diff --git a/library/GenBankProvider.html b/library/GenBankProvider.html new file mode 100644 index 0000000..1ea86d4 --- /dev/null +++ b/library/GenBankProvider.html @@ -0,0 +1,424 @@ + + + + + + GenBank Type Provider + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +

Script  +Notebook

+

GenBank Type Provider

+

This article describes how to use the GenBank Type Provider to remotely access genomic data stored in the +GenBank database. This Type Provider collects and parses the genomic data +for a specified organism and generates a static type containing its metadata and sequence.

+

The GenBank Type Provider uses .NET Bio to parse the GenBank data files +and BioFSharp to provide utilities for manipulating genomic sequences.

+
+

Loading BioProviders Package

+

To load the GenBank Type Provider, a script can use the NuGet syntax to reference the BioProviders package, shown below.

+

You can optionally include the BioFSharp package. While it's not required to use the basic BioProviders functions, it can be used to explore the metadata of the provided types, as shown in a later example.

+
#r "nuget: BioProviders"
+#r "nuget: BioFSharp"
+
+

If creating an F# library or application, BioProviders can be added as a package reference. You can use your IDE for this, or use the dotnet add package BioProviders command in your project folder from the command line.

+

BioProviders can then be used in your script or code by using an open command. Opening its dependencies should not be required. (BioFSharp is loaded for future examples.)

+
open BioProviders
+open BioFSharp
+
+
+

GenBankProvider Example

+

The GenBank Type Provider will be demonstrated for this GenBank assembly +of the Candidatus Carsonella ruddii species. To create a typed representation of the assembly, two pieces of information +must be given to the Type Provider:

+
    +
  • Species name
  • +
  • GenBank assembly accession
  • +
+

For this example, the species name is "Candidatus Carsonella ruddii" and the GenBank assembly accession is "GCA_001274515.1". +To find this information:

+ +

You can then select the assembly's GenBank (as well as RefSeq) accession from the list that appears.

+
+
+
+
+ +
+
+
+
+

Passing this information to the Type Provider generates the Assembly Type. The genomic data can then be extracted from the +Assembly Type by invoking the Genome method. This is demonstrated below.

+
// Define species name and GenBank assembly accession.
+let [<Literal>] Species = "Candidatus Carsonella ruddii"
+let [<Literal>] Accession = "GCA_001274515.1"
+
+// Create GenBank assembly type.
+type Ruddii = GenBankProvider<Species, Accession>
+
+// Extract statically-typed genome data.
+let genome = Ruddii.Genome()
+
+
+
+

Metadata

+

Each genome is accompanied by metadata describing the organism and sequence recorded in the assembly. This metadata can +be extracted using the Metadata field of the Genome Type created previously. The Metadata type is largely based on that +provided by .NET Bio, with modifications +made to be more idiomatic in F#.

+

Below is an example of how the raw metadata type can be retrieved and displayed:

+
// Extract the metadata.
+let metadata = genome.Metadata
+
+// Display the metadata type.
+printf "%A" metadata
+
+
{ Locus = Some { Date = Some 8/26/2015 12:00:00 AM
+                 DivisionCode = Some BCT
+                 MoleculeType = Some DNA
+                 Name = Some "CP012411"
+                 SequenceLength = 174018
+                 SequenceType = Some "bp"
+                 Strand = None
+                 StrandTopology = Some Circular }
+  Definition = Some "Candidatus Carsonella ruddii strain YCCR, complete genome."
+  Accession = Some { Primary = Some "CP012411"
+                     Secondary = None }
+  Version = Some { Accession = Some "CP012411"
+                   CompoundAccession = Some "CP012411.1"
+                   GiNumber = None
+                   Version = Some "1" }
+  DbLinks = Some [{ Numbers = Some [" PRJNA292590"]
+                    Type = Some BioProject }; { Numbers = Some [" SAMN03999419"]
+                                                Type = None }]
+  DbSource = None
+  Keywords = Some "."
+  Primary = None
+  Source =
+   Some
+     { CommonName = Some "Candidatus Carsonella ruddii"
+       Organism =
+        Some
+          { ClassLevels =
+             Some
+               "Bacteria; Pseudomonadota; Gammaproteobacteria; Oceanospirillales; Halomonadaceae; Zymobacter group; Candidatus Carsonella."
+            Genus = Some "Candidatus Carsonella"
+            Species = Some "Candidatus Carsonella ruddii" } }
+  References =
+   Some
+     [{ Authors = Some "Wu,F., Deng,X., Liang,G., Cen,Y. and Chen,J."
+        Consortiums = None
+        Journal = Some "Unpublished"
+        Location = Some "bases 1 to 174018"
+        Medline = None
+        Number = 1
+        PubMed = None
+        Remarks = None
+        Title =
+         Some
+           "Whole Genome Sequence of 'Candidatus Carsonella ruddii' from Diaphorina citri in Guangdong, China" };
+      { Authors = Some "Wu,F."
+        Consortiums = None
+        Journal =
+         Some
+           "Submitted (20-AUG-2015) San Joaquin Valley Agricultural Sciences Center, Usda-Ars, 9611 South Riverbend Avenue, Parlier, CA 93648, USA"
+        Location = Some "bases 1 to 174018"
+        Medline = None
+        Number = 2
+        PubMed = None
+        Remarks = None
+        Title = Some "Direct Submission" }]
+  Comments =
+   Some
+     ["Annotation was added by the NCBI Prokaryotic Genome Annotation
+Pipeline (released 2013). Information about the Pipeline can be
+found here: http://www.ncbi.nlm.nih.gov/genome/annotation_prok/
+
+##Genome-Assembly-Data-START##
+Assembly Method       :: CLC Genomics Workbench v. 7.5
+Genome Coverage       :: 85.24x
+Sequencing Technology :: Illumina MiSeq
+##Genome-Assembly-Data-END##
+
+##Genome-Annotation-Data-START##
+Annotation Provider          :: NCBI
+Annotation Date              :: 08/20/2015 13:42:07
+Annotation Pipeline          :: NCBI Prokaryotic Genome Annotation
+Pipeline
+Annotation Method            :: Best-placed reference protein set;
+GeneMarkS+
+Annotation Software revision :: 2.10
+Features Annotated           :: Gene; CDS; rRNA; tRNA; ncRNA;
+repeat_region
+Genes                        :: 224
+CDS                          :: 168
+Pseudo Genes                 :: 31
+rRNAs                        :: 1, 1, 1 (5S, 16S, 23S)
+complete rRNAs               :: 1, 1, 1 (5S, 16S, 23S)
+partial rRNAs                ::
+tRNAs                        :: 22
+ncRNA                        :: 0
+Frameshifted Genes           :: 0
+##Genome-Annotation-Data-END##"]
+  Contig = None
+  Segment = None
+  Origin = None }
+

The metadata type consists of many fields, though not all fields of the metadata exist for all assemblies. Therefore, they are provided as option types, on which a match expression can be used. Below are examples of accessing fields from the example assembly. +
+
+✅ Example - Accessing a field that is provided.

+
// Print definition if exists.
+match metadata.Definition with
+| Some definition -> printf "%s" definition
+| None -> printf "No definition provided."
+
+
Candidatus Carsonella ruddii strain YCCR, complete genome.
+


+❌ Example - Accessing a field that is not provided.

+
// Print database source if exists.
+match metadata.DbSource with
+| Some dbsource -> printf "%s" dbsource
+| None -> printf "No database source provided."
+
+
No database source provided.
+
+
+

Sequence

+

The genomic sequence for the organism can be extracted using the Sequence field of the Genome Type created previously. +This field provides a BioFSharp BioSeq containing +a series of Nucleotides. You can +read more about BioFSharp containers here.

+

An example of accessing and manipulating the GenBankProvider genomic sequence using BioFSharp is provided below:

+
// Extract the BioFSharp BioSeq.
+let sequence = genome.Sequence
+
+// Display the sequence type.
+printf "%A" sequence
+
+
seq [A; T; G; A; ...]
+
// Take the complement, then transcribe and translate the coding strand.
+sequence
+|> BioSeq.complement
+|> BioSeq.transcribeCodingStrand
+|> BioSeq.translate 0
+
+
seq [Tyr; Phe; Leu; Ter; ...]
+
+

Wildcard Operators

+

Wildcard operators are supported in both the Species and Accession provided to the GenBankProvider. Placing an asterisk "*" +at the end of a Species or Accession name matches all species or accessions starting with the provided pattern.

+

For example, we can get all Staphylococcus species starting with the letter 'c' and assembly accessions starting with +'GCA_01':

+
// Define species name and GenBank assembly accession using wildcards.
+let [<Literal>] SpeciesPattern = "Staphylococcus c*"
+let [<Literal>] AccessionPattern = "GCA_01*"
+
+// Create GenBank type containing all species matching the species pattern.
+type SpeciesCollection = GenBankProvider<SpeciesPattern, AccessionPattern>
+
+// Select the species types.
+type Capitis = SpeciesCollection.``Staphylococcus capitis``
+type Cohnii = SpeciesCollection.``Staphylococcus cohnii``
+
+// Select assemblies.
+type Assembly1 = Capitis.``GCA_012926605.1``
+type Assembly2 = Capitis.``GCA_015645205.1``
+type Assembly3 = Cohnii.``GCA_013349225.1``
+type Assembly4 = Cohnii.``GCA_014884245.1``
+
+// Extract statically-typed genome data.
+let data = Assembly1.Genome()
+
+// Show the assembly's definition.
+match data.Metadata.Definition with
+| Some definition -> printf "%s" definition
+| None -> printf "No definition provided."
+
+
Staphylococcus capitis strain 18-857 NODE_1, whole genome shotgun sequence.
+

The Accession parameter can also be omitted from the GenBankProvider. In this case, all assemblies for the given species will +be matched. For example:

+
// Define species name.
+let [<Literal>] SpeciesName = "Staphylococcus lugdunensis"
+
+// Create GenBank type containing all assemblies for the species.
+type Assemblies = GenBankProvider<SpeciesName>
+
+// Select assemblies.
+type Assembly = Assemblies.``GCA_001546615.1``
+
+// Show the assembly's primary accession.
+match (Assembly.Genome()).Metadata.Accession with
+| Some accession -> match accession.Primary with
+                    | Some primary -> printf "%s" primary
+                    | None -> printf "No primary accession provided."
+| None -> printf "No accession provided."
+
+
KQ957361
+ +
namespace BioProviders
+
namespace BioFSharp
+
Multiple items
type LiteralAttribute = + inherit Attribute + new: unit -> LiteralAttribute
<summary>Adding this attribute to a value causes it to be compiled as a CLI constant literal.</summary>
<category>Attributes</category>


--------------------
new: unit -> LiteralAttribute
+
[<Literal>] +val Species: string = "Candidatus Carsonella ruddii"
+
[<Literal>] +val Accession: string = "GCA_001274515.1"
+
type Ruddii = GenBankProvider<...>
+
type GenBankProvider
<summary>Typed representation of the NCBI FTP server, for GenBank data.</summary> + <param name="Species">The name of the species whose genome is being accessed (e.g. "Staphylococcus borealis"). Defaults to <c>""</c>.</param> + <param name="Accession">The accession of the genome assembly being accessed (e.g. "GCA_003042555.1"). Defaults to <c>""</c>.</param>
+
val genome: GenBankProvider<...>.Genome
+
type Genome = + inherit GenBankFlatFile + new: unit -> Genome + member Metadata: Metadata + member Sequence: IEnumerable<Nucleotide>
<summary>Typed representation of an Assembly's Genomic GenBank Flat File.</summary>
+
val metadata: Metadata.Metadata
+
Multiple items
GenBankFlatFile.GenBankFlatFile.Metadata: Metadata.Metadata

--------------------
property GenBankProvider<...>.Genome.Metadata: Metadata.Metadata with get
<summary>Typed representation of the Metadata of a Genomic GenBank Flat File.</summary>
+
val printf: format: Printf.TextWriterFormat<'T> -> 'T
<summary>Print to <c>stdout</c> using the given format.</summary>
<param name="format">The formatter.</param>
<returns>The formatted result.</returns>
<example>See <c>Printf.printf</c> (link: <see cref="M:Microsoft.FSharp.Core.PrintfModule.PrintFormat``1" />) for examples.</example>
+
Metadata.Metadata.Definition: string option
+
union case Option.Some: Value: 'T -> Option<'T>
<summary>The representation of "Value of type 'T"</summary>
<param name="Value">The input value.</param>
<returns>An option representing the value.</returns>
+
val definition: string
+
union case Option.None: Option<'T>
<summary>The representation of "No value"</summary>
+
Metadata.Metadata.DbSource: string option
+
val dbsource: string
+
val sequence: System.Collections.Generic.IEnumerable<Nucleotides.Nucleotide>
+
Multiple items
GenBankFlatFile.GenBankFlatFile.Sequence: BioSeq.BioSeq<Nucleotides.Nucleotide>

--------------------
property GenBankProvider<...>.Genome.Sequence: System.Collections.Generic.IEnumerable<Nucleotides.Nucleotide> with get
<summary>Typed representation of the Sequence of a Genomic GenBank Flat File.</summary>
+
module BioSeq + +from BioFSharp
+
val complement: nucs: seq<Nucleotides.Nucleotide> -> BioSeq.BioSeq<Nucleotides.Nucleotide>
+
val transcribeCodingStrand: nucs: seq<Nucleotides.Nucleotide> -> BioSeq.BioSeq<Nucleotides.Nucleotide>
+
val translate: nucleotideOffset: int -> rnaSeq: seq<Nucleotides.Nucleotide> -> BioSeq.BioSeq<AminoAcids.AminoAcid>
+
[<Literal>] +val SpeciesPattern: string = "Staphylococcus c*"
+
[<Literal>] +val AccessionPattern: string = "GCA_01*"
+
type SpeciesCollection = GenBankProvider<...>
+
type Capitis = GenBankProvider<...>.Staphylococcus capitis
+
type Cohnii = GenBankProvider<...>.Staphylococcus cohnii
+
type Assembly1 = GenBankProvider<...>.Staphylococcus capitis.GCA_012926605.1
+
type Assembly2 = GenBankProvider<...>.Staphylococcus capitis.GCA_015645205.1
+
type Assembly3 = GenBankProvider<...>.Staphylococcus cohnii.GCA_013349225.1
+
type Assembly4 = GenBankProvider<...>.Staphylococcus cohnii.GCA_014884245.1
+
val data: GenBankProvider<...>.Staphylococcus capitis.GCA_012926605.1.Genome
+
Multiple items
GenBankFlatFile.GenBankFlatFile.Metadata: Metadata.Metadata

--------------------
property GenBankProvider<...>.Staphylococcus capitis.GCA_012926605.1.Genome.Metadata: Metadata.Metadata with get
<summary>Typed representation of the Metadata of a Genomic GenBank Flat File.</summary>
+
[<Literal>] +val SpeciesName: string = "Staphylococcus lugdunensis"
+
type Assemblies = GenBankProvider<...>
+
type Assembly = GenBankProvider<...>.GCA_001546615.1
+
module Metadata + +from BioProviders
+
type Accession = + { + Primary: string option + Secondary: string list option + }
<summary> + Identifier assigned to each GenBank sequence record. + </summary>
+
val accession: Metadata.Accession
+
Metadata.Accession.Primary: string option
+
val primary: string
+ +
+ + + + + + + + +
+ + + \ No newline at end of file diff --git a/library/GenBankProvider.ipynb b/library/GenBankProvider.ipynb new file mode 100644 index 0000000..ad8d622 --- /dev/null +++ b/library/GenBankProvider.ipynb @@ -0,0 +1,527 @@ + + { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["[![Script](../images/badge-script.svg)](https://fsprojects.github.io/BioProviders//library/GenBankProvider.fsx)\u0026emsp;\n", +"[![Notebook](../images/badge-notebook.svg)](https://fsprojects.github.io/BioProviders//library/GenBankProvider.ipynb)\n", +"\n", +"# GenBank Type Provider\n", +"\n", +"This article describes how to use the GenBank Type Provider to remotely access genomic data stored in the\n", +"[GenBank](https://www.ncbi.nlm.nih.gov/genbank/) database. This Type Provider collects and parses the genomic data\n", +"for a specified organism and generates a static type containing its metadata and sequence.\n", +"\n", +"The GenBank Type Provider uses [.NET Bio](https://github.com/dotnetbio/bio) to parse the GenBank data files\n", +"and [BioFSharp](https://github.com/CSBiology/BioFSharp) to provide utilities for manipulating genomic sequences.\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": null, "outputs": [], + "source": ["\u003cbr /\u003e\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["## Loading BioProviders Package\n", +"\n", +"To load the GenBank Type Provider, a script can use the NuGet syntax to reference the BioProviders package, shown below.\n", +"\n", +"You can optionally include the BioFSharp package. 
While it\u0027s not required to use the basic BioProviders functions, it can be used to explore the metadata of the provided types, as shown in a later example.\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 1, "outputs": [], + "source": ["#r \"nuget: BioProviders\"\n", +"#r \"nuget: BioFSharp\"\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["If creating an F# library or application, BioProviders can be added as a package reference. You can use your IDE for this, or use the `dotnet add package BioProviders` command in your project folder from the command line.\n", +"\n", +"BioProviders can then be used in your script or code by using an open command. Opening its dependencies should not be required. (BioFSharp is loaded for future examples.)\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 2, "outputs": [], + "source": ["open BioProviders\n", +"open BioFSharp\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": null, "outputs": [], + "source": ["\u003cbr /\u003e\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["## GenBankProvider Example\n", +"\n", +"The GenBank Type Provider will be demonstrated for [this GenBank assembly](https://www.ncbi.nlm.nih.gov/nuccore/CP012411)\n", +"of the **Candidatus Carsonella ruddii** species. 
To create a typed representation of the assembly, two pieces of information\n", +"must be given to the Type Provider:\n", +"\n", +"* Species name\n", +"\n", +"* GenBank assembly accession\n", +"\n", +"For this example, the species name is \"Candidatus Carsonella ruddii\" and the GenBank assembly accession is \"GCA_001274515.1\".\n", +"To find this information:\n", +"\n", +"* Visit [https://www.ncbi.nlm.nih.gov/datasets/](https://www.ncbi.nlm.nih.gov/datasets/)\n", +"\n", +"* Search for the name of the species\n", +"\n", +"* Select to view all genomes of the species\n", +"\n", +"You can then select the assembly\u0027s GenBank (as well as RefSeq) accession from the list that appears.\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": null, "outputs": [], + "source": ["\u003cdiv class=\"container-fluid\" style=\"margin:15px 0px 15px 0px;\"\u003e\n", +" \u003cdiv class=\"row-fluid\"\u003e\n", +" \u003cdiv class=\"span1\"\u003e\u003c/div\u003e\n", +" \u003cdiv class=\"span10\" id=\"anim-holder\"\u003e\n", +" \u003ca id=\"lnk\" href=\"../images/GenBank_Info.gif\"\u003e\u003cimg id=\"anim\" src=\"../images/GenBank_Info.gif\" /\u003e\u003c/a\u003e\n", +" \u003c/div\u003e\n", +" \u003cdiv class=\"span1\"\u003e\u003c/div\u003e\n", +" \u003c/div\u003e\n", +"\u003c/div\u003e\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["Passing this information to the Type Provider generates the Assembly Type. The genomic data can then be extracted from the\n", +"Assembly Type by invoking the Genome method. 
This is demonstrated below.\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 3, "outputs": [], + "source": ["// Define species name and GenBank assembly accession.\n", +"let [\u003cLiteral\u003e] Species = \"Candidatus Carsonella ruddii\"\n", +"let [\u003cLiteral\u003e] Accession = \"GCA_001274515.1\"\n", +"\n", +"// Create GenBank assembly type.\n", +"type Ruddii = GenBankProvider\u003cSpecies, Accession\u003e\n", +"\n", +"// Extract statically-typed genome data.\n", +"let genome = Ruddii.Genome()\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": null, "outputs": [], + "source": ["\u003cbr /\u003e\n", +"\u003cbr /\u003e\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["### Metadata\n", +"\n", +"Each genome is accompanied by metadata describing the organism and sequence recorded in the assembly. This metadata can\n", +"be extracted using the Metadata field of the Genome Type created previously. The Metadata type is largely based on that\n", +"provided by [.NET Bio](http://dotnetbio.github.io/Help/html/319bf2e6-4fcf-9f93-586f-fc7ffcf04a83.htm), with modifications\n", +"made to be more idiomatic with F#.\n", +"\n", +"Below is an example of how the raw metadata type can be retrieved and displayed:\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 4, "outputs": [ + { + "data": { + "text/plain": ["{ Locus = Some { Date = Some 8/26/2015 12:00:00 AM", +" DivisionCode = Some BCT", +" MoleculeType = Some DNA", +" Name = Some \"CP012411\"", +" SequenceLength = 174018", +" SequenceType = Some \"bp\"", +" Strand = None", +" StrandTopology = Some Circular }", +" Definition = Some \"Candidatus Carsonella ruddii strain YCCR, complete genome.\"", +" Accession = Some { Primary = Some \"CP012411\"", +" Secondary = None }", +" Version = Some { Accession = Some \"CP012411\"", +" CompoundAccession = Some \"CP012411.1\"", +" GiNumber = None", +" Version = Some \"1\" 
}", +" DbLinks = Some [{ Numbers = Some [\" PRJNA292590\"]", +" Type = Some BioProject }; { Numbers = Some [\" SAMN03999419\"]", +" Type = None }]", +" DbSource = None", +" Keywords = Some \".\"", +" Primary = None", +" Source =", +" Some", +" { CommonName = Some \"Candidatus Carsonella ruddii\"", +" Organism =", +" Some", +" { ClassLevels =", +" Some", +" \"Bacteria; Pseudomonadota; Gammaproteobacteria; Oceanospirillales; Halomonadaceae; Zymobacter group; Candidatus Carsonella.\"", +" Genus = Some \"Candidatus Carsonella\"", +" Species = Some \"Candidatus Carsonella ruddii\" } }", +" References =", +" Some", +" [{ Authors = Some \"Wu,F., Deng,X., Liang,G., Cen,Y. and Chen,J.\"", +" Consortiums = None", +" Journal = Some \"Unpublished\"", +" Location = Some \"bases 1 to 174018\"", +" Medline = None", +" Number = 1", +" PubMed = None", +" Remarks = None", +" Title =", +" Some", +" \"Whole Genome Sequence of \u0027Candidatus Carsonella ruddii\u0027 from Diaphorina citri in Guangdong, China\" };", +" { Authors = Some \"Wu,F.\"", +" Consortiums = None", +" Journal =", +" Some", +" \"Submitted (20-AUG-2015) San Joaquin Valley Agricultural Sciences Center, Usda-Ars, 9611 South Riverbend Avenue, Parlier, CA 93648, USA\"", +" Location = Some \"bases 1 to 174018\"", +" Medline = None", +" Number = 2", +" PubMed = None", +" Remarks = None", +" Title = Some \"Direct Submission\" }]", +" Comments =", +" Some", +" [\"Annotation was added by the NCBI Prokaryotic Genome Annotation", +"", +"Pipeline (released 2013). Information about the Pipeline can be", +"", +"found here: http://www.ncbi.nlm.nih.gov/genome/annotation_prok/", +"", +"", +"", +"##Genome-Assembly-Data-START##", +"", +"Assembly Method :: CLC Genomics Workbench v. 
7.5", +"", +"Genome Coverage :: 85.24x", +"", +"Sequencing Technology :: Illumina MiSeq", +"", +"##Genome-Assembly-Data-END##", +"", +"", +"", +"##Genome-Annotation-Data-START##", +"", +"Annotation Provider :: NCBI", +"", +"Annotation Date :: 08/20/2015 13:42:07", +"", +"Annotation Pipeline :: NCBI Prokaryotic Genome Annotation", +"", +"Pipeline", +"", +"Annotation Method :: Best-placed reference protein set;", +"", +"GeneMarkS+", +"", +"Annotation Software revision :: 2.10", +"", +"Features Annotated :: Gene; CDS; rRNA; tRNA; ncRNA;", +"", +"repeat_region", +"", +"Genes :: 224", +"", +"CDS :: 168", +"", +"Pseudo Genes :: 31", +"", +"rRNAs :: 1, 1, 1 (5S, 16S, 23S)", +"", +"complete rRNAs :: 1, 1, 1 (5S, 16S, 23S)", +"", +"partial rRNAs ::", +"", +"tRNAs :: 22", +"", +"ncRNA :: 0", +"", +"Frameshifted Genes :: 0", +"", +"##Genome-Annotation-Data-END##\"]", +" Contig = None", +" Segment = None", +" Origin = None }"] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + }], + "source": ["// Extract the metadata.\n", +"let metadata = genome.Metadata\n", +"\n", +"// Display the metadata type.\n", +"printf \"%A\" metadata\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["The metadata type consists of many fields, though not all fields of the metadata exist for all assemblies. Therefore, they are provided as option types, on which a match expression can be used. 
Below are examples of accessing fields from the example assembly.\n", +"\u003cbr /\u003e\n", +"\u003cbr /\u003e\n", +"✅ Example - Accessing a field that is provided.\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 5, "outputs": [ + { + "data": { + "text/plain": ["Candidatus Carsonella ruddii strain YCCR, complete genome."] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }], + "source": ["// Print definition if exists.\n", +"match metadata.Definition with\n", +"| Some definition -\u003e printf \"%s\" definition\n", +"| None -\u003e printf \"No definition provided.\"\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["\u003cbr /\u003e\n", +"❌ Example - Accessing a field that is not provided.\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 6, "outputs": [ + { + "data": { + "text/plain": ["No database source provided."] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }], + "source": ["// Print database source if exists.\n", +"match metadata.DbSource with\n", +"| Some dbsource -\u003e printf \"%s\" dbsource\n", +"| None -\u003e printf \"No database source provided.\"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": null, "outputs": [], + "source": ["\u003cbr /\u003e\n", +"\u003cbr /\u003e\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["### Sequence\n", +"\n", +"The genomic sequence for the organism can be extracted using the Sequence field of the Genome Type created previously.\n", +"This field provides a BioFSharp [BioSeq](https://csbiology.github.io/BioFSharp/reference/biofsharp-bioseq.html) containing\n", +"a series of [Nucleotides](https://csbiology.github.io/BioFSharp//reference/biofsharp-nucleotides-nucleotide.html). 
More\n", +"can be read about BioFSharp containers [here](https://csbiology.github.io/BioFSharp//BioCollections.html).\n", +"\n", +"An example of accessing and manipulating the GenBankProvider genomic sequence using BioFSharp is provided below:\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 7, "outputs": [ + { + "data": { + "text/plain": ["seq [A; T; G; A; ...]"] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }], + "source": ["// Extract the BioFSharp BioSeq.\n", +"let sequence = genome.Sequence\n", +"\n", +"// Display the sequence type.\n", +"printf \"%A\" sequence\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 8, "outputs": [ + { + "data": { + "text/plain": ["seq [Tyr; Phe; Leu; Ter; ...]"] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }], + "source": ["// Take the complement, then transcribe and translate the coding strand.\n", +"sequence\n", +"|\u003e BioSeq.complement\n", +"|\u003e BioSeq.transcribeCodingStrand\n", +"|\u003e BioSeq.translate 0\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": null, "outputs": [], + "source": ["\u003cbr /\u003e\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["## Wildcard Operators\n", +"\n", +"Wildcard operators are supported in both the Species and Accession provided to the GenBankProvider. 
By using asterisks \"*\"\n", +"at the end of a Species or Accession name, species or accessions starting with the provided pattern will be matched.\n", +"\n", +"For example, we can get all **Staphylococcus** species starting with the letter \u0027c\u0027 and assembly accessions starting with\n", +"\u0027GCA_01\u0027:\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 9, "outputs": [ + { + "data": { + "text/plain": ["Staphylococcus capitis strain 18-857 NODE_1, whole genome shotgun sequence."] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }], + "source": ["// Define species name and GenBank assembly accession using wildcards.\n", +"let [\u003cLiteral\u003e] SpeciesPattern = \"Staphylococcus c*\"\n", +"let [\u003cLiteral\u003e] AccessionPattern = \"GCA_01*\"\n", +"\n", +"// Create GenBank type containing all species matching the species pattern.\n", +"type SpeciesCollection = GenBankProvider\u003cSpeciesPattern, AccessionPattern\u003e\n", +"\n", +"// Select the species types.\n", +"type Capitis = SpeciesCollection.``Staphylococcus capitis``\n", +"type Cohnii = SpeciesCollection.``Staphylococcus cohnii``\n", +"\n", +"// Select assemblies.\n", +"type Assembly1 = Capitis.``GCA_012926605.1``\n", +"type Assembly2 = Capitis.``GCA_015645205.1``\n", +"type Assembly3 = Cohnii.``GCA_013349225.1``\n", +"type Assembly4 = Cohnii.``GCA_014884245.1``\n", +"\n", +"// Extract statically-typed genome data.\n", +"let data = Assembly1.Genome()\n", +"\n", +"// Show the assembly\u0027s definition.\n", +"match data.Metadata.Definition with\n", +"| Some definition -\u003e printf \"%s\" definition\n", +"| None -\u003e printf \"No definition provided.\"\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["The Accession parameter can also be omitted from the GenBankProvider. In this case, all assemblies for the given species will\n", +"be matched. 
For example:\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 10, "outputs": [ + { + "data": { + "text/plain": ["KQ957361"] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }], + "source": ["// Define species name.\n", +"let [\u003cLiteral\u003e] SpeciesName = \"Staphylococcus lugdunensis\"\n", +"\n", +"// Create GenBank type containing all assemblies for the species.\n", +"type Assemblies = GenBankProvider\u003cSpeciesName\u003e\n", +"\n", +"// Select assemblies.\n", +"type Assembly = Assemblies.``GCA_001546615.1``\n", +"\n", +"// Show the assembly\u0027s primary accession.\n", +"match (Assembly.Genome()).Metadata.Accession with\n", +"| Some accession -\u003e match accession.Primary with\n", +" | Some primary -\u003e printf \"%s\" primary\n", +" | None -\u003e printf \"No primary accession provided.\"\n", +"| None -\u003e printf \"No accession provided.\"\n"] + }], + "metadata": { + "kernelspec": {"display_name": ".NET (F#)", "language": "F#", "name": ".net-fsharp"}, + "langauge_info": { + "file_extension": ".fs", + "mimetype": "text/x-fsharp", + "name": "C#", + "pygments_lexer": "fsharp", + "version": "4.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 + } + + diff --git a/library/RefSeqProvider.fsx b/library/RefSeqProvider.fsx new file mode 100644 index 0000000..67c4346 --- /dev/null +++ b/library/RefSeqProvider.fsx @@ -0,0 +1,289 @@ +(** +[![Script](../images/badge-script.svg)](https://fsprojects.github.io/BioProviders//library/RefSeqProvider.fsx)  +[![Notebook](../images/badge-notebook.svg)](https://fsprojects.github.io/BioProviders//library/RefSeqProvider.ipynb) + +# RefSeq Type Provider + +This article describes how to use the RefSeq Type Provider to remotely access genomic data stored in the +[RefSeq](https://www.ncbi.nlm.nih.gov/refseq/) database. 
This Type Provider collects and parses the genomic data +for a specified organism and generates a static type containing its metadata and sequence. + +The RefSeq Type Provider uses [.NET Bio](https://github.com/dotnetbio/bio) to parse the RefSeq data files +and [BioFSharp](https://github.com/CSBiology/BioFSharp) to provide utilities for manipulating genomic sequences. + +*) +
+(** +## Loading BioProviders Package + +To load the RefSeq Type Provider, a script can use the NuGet syntax to reference the BioProviders package, shown below. + +You can optionally include the BioFSharp package. While it's not required to use the basic BioProviders functions, it can be used to explore the metadata of the provided types, as shown in a later example. + +*) +#r "nuget: BioProviders" +#r "nuget: BioFSharp" +(** +If creating an F# library or application, BioProviders can be added as a package reference. You can use your IDE for this, or use the `dotnet add package BioProviders` command in your project folder from the command line. + +BioProviders can then be used in your script or code by using an open command. Opening its dependencies should not be required. (BioFSharp is loaded for future examples.) + +*) +open BioProviders +open BioFSharp +
+(** +## RefSeqProvider Example + +The RefSeq Type Provider will be demonstrated for [this RefSeq assembly](https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_001224225.1/) +of the **Staphylococcus borealis** species. To create a typed representation of the assembly, two pieces of information +must be given to the Type Provider: + +* Species name + +* RefSeq assembly accession + +For this example, the species name is "Staphylococcus borealis" and the RefSeq assembly accession is "GCF_001224225.1". +To find this information: + +* Visit [https://www.ncbi.nlm.nih.gov/datasets/](https://www.ncbi.nlm.nih.gov/datasets/) + +* Search for the name of the species + +* Select to view all genomes of the species + +You can then select the assembly's RefSeq (as well as GenBank) accession from the list that appears. + +*) +
+
+
+
+ +
+
+
+
+(** +Passing this information to the Type Provider generates the Assembly Type. The genomic data can then be extracted from the +Assembly Type by invoking the Genome method. This is demonstrated below. + +*) +// Define species name and RefSeq assembly accession. +let [] Species = "Staphylococcus borealis" +let [] Accession = "GCF_001224225.1" + +// Create RefSeq assembly type. +type Borealis = RefSeqProvider + +// Extract statically-typed genome data. +let genome = Borealis.Genome() +
+
+(** +### Metadata + +Each genome is accompanied by metadata describing the organism and sequence recorded in the assembly. This metadata can +be extracted using the Metadata field of the Genome Type created previously. The Metadata type is largely based on that +provided by [.NET Bio](http://dotnetbio.github.io/Help/html/319bf2e6-4fcf-9f93-586f-fc7ffcf04a83.htm), with modifications +made to be more idiomatic with F#. + +Below is an example of how the raw metadata type can be retrieved and displayed: + +*) +// Extract the metadata. +let metadata = genome.Metadata + +// Display the metadata type. +printf "%A" metadata(* output: +{ Locus = Some { Date = Some 4/27/2023 12:00:00 AM + DivisionCode = Some CON + MoleculeType = Some DNA + Name = Some "NZ_CUEE01000001" + SequenceLength = 563044 + SequenceType = Some "bp" + Strand = None + StrandTopology = Some Linear } + Definition = + Some "Staphylococcus borealis strain 51-48, whole genome shotgun sequence." + Accession = Some { Primary = Some "NZ_CUEE01000001" + Secondary = Some ["NZ_CUEE01000000"] } + Version = Some { Accession = Some "NZ_CUEE01000001" + CompoundAccession = Some "NZ_CUEE01000001.1" + GiNumber = None + Version = Some "1" } + DbLinks = + Some + [{ Numbers = Some [" PRJNA224116"] + Type = Some BioProject }; { Numbers = Some [" SAMEA1035138"] + Type = None }; + { Numbers = Some [" GCF_001224225.1"] + Type = None }] + DbSource = None + Keywords = Some "WGS; RefSeq." + Primary = None + Source = + Some + { CommonName = Some "Staphylococcus borealis" + Organism = + Some + { ClassLevels = + Some + "Bacteria; Bacillota; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus." + Genus = Some "Staphylococcus" + Species = Some "borealis" } } + References = + Some + [{ Authors = Some "Informatics,Pathogen." 
+ Consortiums = None + Journal = + Some + "Submitted (10-MAR-2015) SC, Wellcome Trust Sanger Institute, CB10 1SA, United Kingdom" + Location = None + Medline = None + Number = 1 + PubMed = None + Remarks = None + Title = Some "Direct Submission" }] + Comments = + Some + ["REFSEQ INFORMATION: The reference sequence is identical to +CUEE01000001.1. +The annotation was added by the NCBI Prokaryotic Genome Annotation +Pipeline (PGAP). Information about PGAP can be found here: +https://www.ncbi.nlm.nih.gov/genome/annotation_prok/ + +##Genome-Annotation-Data-START## +Annotation Provider :: NCBI RefSeq +Annotation Date :: 04/27/2023 01:28:26 +Annotation Pipeline :: NCBI Prokaryotic Genome +Annotation Pipeline (PGAP) +Annotation Method :: Best-placed reference protein +set; GeneMarkS-2+ +Annotation Software revision :: 6.5 +Features Annotated :: Gene; CDS; rRNA; tRNA; ncRNA +Genes (total) :: 2,650 +CDSs (total) :: 2,584 +Genes (coding) :: 2,507 +CDSs (with protein) :: 2,507 +Genes (RNA) :: 66 +rRNAs :: 2, 1, 1 (5S, 16S, 23S) +complete rRNAs :: 2, 1, 1 (5S, 16S, 23S) +tRNAs :: 58 +ncRNAs :: 4 +Pseudo Genes (total) :: 77 +CDSs (without protein) :: 77 +Pseudo Genes (ambiguous residues) :: 0 of 77 +Pseudo Genes (frameshifted) :: 29 of 77 +Pseudo Genes (incomplete) :: 49 of 77 +Pseudo Genes (internal stop) :: 29 of 77 +Pseudo Genes (multiple problems) :: 23 of 77 +##Genome-Annotation-Data-END##"] + Contig = Some "join(CUEE01000001.1:1..563044)" + Segment = None + Origin = None }*) +(** +The metadata type consists of many fields, though not all fields of the metadata exist for all assemblies. Therefore, they are provided as option types, on which a match expression can be used. Below are examples of accessing fields from the example assembly. +
+
+✅ Example - Accessing a field that is provided. + +*) +// Print definition if exists. +match metadata.Definition with +| Some definition -> printf "%s" definition +| None -> printf "No definition provided."(* output: +Staphylococcus borealis strain 51-48, whole genome shotgun sequence.*) +(** +
+❌ Example - Accessing a field that is not provided. + +*) +// Print database source if exists. +match metadata.DbSource with +| Some dbsource -> printf "%s" dbsource +| None -> printf "No database source provided."(* output: +No database source provided.*) +
+
+(** +### Sequence + +The genomic sequence for the organism can be extracted using the Sequence field of the Genome Type created previously. +This field provides a BioFSharp [BioSeq](https://csbiology.github.io/BioFSharp/reference/biofsharp-bioseq.html) containing +a series of [Nucleotides](https://csbiology.github.io/BioFSharp//reference/biofsharp-nucleotides-nucleotide.html). More +can be read about BioFSharp containers [here](https://csbiology.github.io/BioFSharp//BioCollections.html). + +An example of accessing and manipulating the RefSeqProvider genomic sequence using BioFSharp is provided below: + +*) +// Extract the BioFSharp BioSeq. +let sequence = genome.Sequence + +// Display the sequence type. +printf "%A" sequence(* output: +seq [C; A; G; G; ...]*) +// Take the complement, then transcribe and translate the coding strand. +sequence +|> BioSeq.complement +|> BioSeq.transcribeCodingStrand +|> BioSeq.translate 0(* output: +seq [Val; Leu; Val; Ter; ...]*) +
+(** +## Wildcard Operators + +Wildcard operators are supported in both the Species and Accession provided to the RefSeqProvider. By using asterisks "*" +at the end of a Species or Accession name, species or accessions starting with the provided pattern will be matched. + +For example, we can get all **Staphylococcus** species starting with the letter 'c' and assembly accessions starting with +'GCF_01': + +*) +// Define species name and RefSeq assembly accession using wildcards. +let [] SpeciesPattern = "Staphylococcus c*" +let [] AccessionPattern = "GCF_01*" + +// Create RefSeq type containing all species matching the species pattern. +type SpeciesCollection = RefSeqProvider + +// Select the species types. +type Capitis = SpeciesCollection.``Staphylococcus capitis`` +type Cohnii = SpeciesCollection.``Staphylococcus cohnii`` + +// Select assemblies. +type Assembly1 = Capitis.``GCF_012926605.1`` +type Assembly2 = Capitis.``GCF_012926635.1`` +type Assembly3 = Cohnii.``GCF_013602215.1`` +type Assembly4 = Cohnii.``GCF_013602265.1`` + +// Extract statically-typed genome data. +let data = Assembly1.Genome() + +// Show the assembly's definition. +match data.Metadata.Definition with +| Some definition -> printf "%s" definition +| None -> printf "No definition provided."(* output: +Staphylococcus capitis strain 18-857 NODE_1, whole genome shotgun sequence.*) +(** +The Accession parameter can also be omitted from the RefSeqProvider. In this case, all assemblies for the given species will +be matched. For example: + +*) +// Define species name. +let [] SpeciesName = "Staphylococcus lugdunensis" + +// Create RefSeq type containing all assemblies for the species. +type Assemblies = RefSeqProvider + +// Select assemblies. +type Assembly = Assemblies.``GCF_001546615.1`` + +// Show the assembly's primary accession. 
+match (Assembly.Genome()).Metadata.Accession with +| Some accession -> match accession.Primary with + | Some primary -> printf "%s" primary + | None -> printf "No primary accession provided." +| None -> printf "No accession provided."(* output: +NZ_KQ957361*) + diff --git a/library/RefSeqProvider.html b/library/RefSeqProvider.html new file mode 100644 index 0000000..33b660d --- /dev/null +++ b/library/RefSeqProvider.html @@ -0,0 +1,420 @@ + + + + + + RefSeq Type Provider + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +

Script  +Notebook

+

RefSeq Type Provider

+

This article describes how to use the RefSeq Type Provider to remotely access genomic data stored in the +RefSeq database. This Type Provider collects and parses the genomic data +for a specified organism and generates a static type containing its metadata and sequence.

+

The RefSeq Type Provider uses .NET Bio to parse the RefSeq data files +and BioFSharp to provide utilities for manipulating genomic sequences.

+
+

Loading BioProviders Package

+

To load the RefSeq Type Provider, a script can use the NuGet syntax to reference the BioProviders package, shown below.

+

You can optionally include the BioFSharp package. While it's not required to use the basic BioProviders functions, it can be used to explore the metadata of the provided types, as shown in a later example.

+
#r "nuget: BioProviders"
+#r "nuget: BioFSharp"
+
+

If creating an F# library or application, BioProviders can be added as a package reference. You can use your IDE for this, or use the dotnet add package BioProviders command in your project folder from the command line.

+

BioProviders can then be used in your script or code by using an open command. Opening its dependencies should not be required. (BioFSharp is loaded for future examples.)

+
open BioProviders
+open BioFSharp
+
+
+

RefSeqProvider Example

+

The RefSeq Type Provider will be demonstrated for this RefSeq assembly +of the Staphylococcus borealis species. To create a typed representation of the assembly, two pieces of information +must be given to the Type Provider:

+
    +
  • Species name
  • +
  • RefSeq assembly accession
  • +
+

For this example, the species name is "Staphylococcus borealis" and the RefSeq assembly accession is "GCF_001224225.1". +To find this information:

+ +

You can then select the assembly's RefSeq (as well as GenBank) accession from the list that appears.

+
+
+
+
+ +
+
+
+
+

Passing this information to the Type Provider generates the Assembly Type. The genomic data can then be extracted from the +Assembly Type by invoking the Genome method. This is demonstrated below.

+
// Define species name and RefSeq assembly accession.
+let [<Literal>] Species = "Staphylococcus borealis"
+let [<Literal>] Accession = "GCF_001224225.1"
+
+// Create RefSeq assembly type.
+type Borealis = RefSeqProvider<Species, Accession>
+
+// Extract statically-typed genome data.
+let genome = Borealis.Genome()
+
+
+
+

Metadata

+

Each genome is accompanied by metadata describing the organism and sequence recorded in the assembly. This metadata can +be extracted using the Metadata field of the Genome Type created previously. The Metadata type is largely based on that +provided by .NET Bio, with modifications +made to be more idiomatic with F#.

+

Below is an example of how the raw metadata type can be retrieved and displayed:

+
// Extract the metadata.
+let metadata = genome.Metadata
+
+// Display the metadata type.
+printf "%A" metadata
+
+
{ Locus = Some { Date = Some 4/27/2023 12:00:00 AM
+                 DivisionCode = Some CON
+                 MoleculeType = Some DNA
+                 Name = Some "NZ_CUEE01000001"
+                 SequenceLength = 563044
+                 SequenceType = Some "bp"
+                 Strand = None
+                 StrandTopology = Some Linear }
+  Definition =
+   Some "Staphylococcus borealis strain 51-48, whole genome shotgun sequence."
+  Accession = Some { Primary = Some "NZ_CUEE01000001"
+                     Secondary = Some ["NZ_CUEE01000000"] }
+  Version = Some { Accession = Some "NZ_CUEE01000001"
+                   CompoundAccession = Some "NZ_CUEE01000001.1"
+                   GiNumber = None
+                   Version = Some "1" }
+  DbLinks =
+   Some
+     [{ Numbers = Some [" PRJNA224116"]
+        Type = Some BioProject }; { Numbers = Some [" SAMEA1035138"]
+                                    Type = None };
+      { Numbers = Some [" GCF_001224225.1"]
+        Type = None }]
+  DbSource = None
+  Keywords = Some "WGS; RefSeq."
+  Primary = None
+  Source =
+   Some
+     { CommonName = Some "Staphylococcus borealis"
+       Organism =
+        Some
+          { ClassLevels =
+             Some
+               "Bacteria; Bacillota; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus."
+            Genus = Some "Staphylococcus"
+            Species = Some "borealis" } }
+  References =
+   Some
+     [{ Authors = Some "Informatics,Pathogen."
+        Consortiums = None
+        Journal =
+         Some
+           "Submitted (10-MAR-2015) SC, Wellcome Trust Sanger Institute, CB10 1SA, United Kingdom"
+        Location = None
+        Medline = None
+        Number = 1
+        PubMed = None
+        Remarks = None
+        Title = Some "Direct Submission" }]
+  Comments =
+   Some
+     ["REFSEQ INFORMATION: The reference sequence is identical to
+CUEE01000001.1.
+The annotation was added by the NCBI Prokaryotic Genome Annotation
+Pipeline (PGAP). Information about PGAP can be found here:
+https://www.ncbi.nlm.nih.gov/genome/annotation_prok/
+
+##Genome-Annotation-Data-START##
+Annotation Provider               :: NCBI RefSeq
+Annotation Date                   :: 04/27/2023 01:28:26
+Annotation Pipeline               :: NCBI Prokaryotic Genome
+Annotation Pipeline (PGAP)
+Annotation Method                 :: Best-placed reference protein
+set; GeneMarkS-2+
+Annotation Software revision      :: 6.5
+Features Annotated                :: Gene; CDS; rRNA; tRNA; ncRNA
+Genes (total)                     :: 2,650
+CDSs (total)                      :: 2,584
+Genes (coding)                    :: 2,507
+CDSs (with protein)               :: 2,507
+Genes (RNA)                       :: 66
+rRNAs                             :: 2, 1, 1 (5S, 16S, 23S)
+complete rRNAs                    :: 2, 1, 1 (5S, 16S, 23S)
+tRNAs                             :: 58
+ncRNAs                            :: 4
+Pseudo Genes (total)              :: 77
+CDSs (without protein)            :: 77
+Pseudo Genes (ambiguous residues) :: 0 of 77
+Pseudo Genes (frameshifted)       :: 29 of 77
+Pseudo Genes (incomplete)         :: 49 of 77
+Pseudo Genes (internal stop)      :: 29 of 77
+Pseudo Genes (multiple problems)  :: 23 of 77
+##Genome-Annotation-Data-END##"]
+  Contig = Some "join(CUEE01000001.1:1..563044)"
+  Segment = None
+  Origin = None }
+

The metadata type consists of many fields, though not all fields of the metadata exist for all assemblies. Therefore, they are provided as option types, on which a match expression can be used. Below are examples of accessing fields from the example assembly. +
+
+✅ Example - Accessing a field that is provided.

+
// Print definition if exists.
+match metadata.Definition with
+| Some definition -> printf "%s" definition
+| None -> printf "No definition provided."
+
+
Staphylococcus borealis strain 51-48, whole genome shotgun sequence.
+


+❌ Example - Accessing a field that is not provided.

+
// Print database source if exists.
+match metadata.DbSource with
+| Some dbsource -> printf "%s" dbsource
+| None -> printf "No database source provided."
+
+
No database source provided.
+
+
+

Sequence

+

The genomic sequence for the organism can be extracted using the Sequence field of the Genome Type created previously. +This field provides a BioFSharp BioSeq containing +a series of Nucleotides. More +can be read about BioFSharp containers here.

+

An example of accessing and manipulating the RefSeqProvider genomic sequence using BioFSharp is provided below:

+
// Extract the BioFSharp BioSeq.
+let sequence = genome.Sequence
+
+// Display the sequence type.
+printf "%A" sequence
+
+
seq [C; A; G; G; ...]
+
// Take the complement, then transcribe and translate the coding strand.
+sequence
+|> BioSeq.complement
+|> BioSeq.transcribeCodingStrand
+|> BioSeq.translate 0
+
+
seq [Val; Leu; Val; Ter; ...]
+
+

Wildcard Operators

+

Wildcard operators are supported in both the Species and Accession provided to the RefSeqProvider. By using asterisks "*" +at the end of a Species or Accession name, species or accessions starting with the provided pattern will be matched.

+

For example, we can get all Staphylococcus species starting with the letter 'c' and assembly accessions starting with +'GCF_01':

+
// Define species name and RefSeq assembly accession using wildcards.
+let [<Literal>] SpeciesPattern = "Staphylococcus c*"
+let [<Literal>] AccessionPattern = "GCF_01*"
+
+// Create RefSeq type containing all species matching the species pattern.
+type SpeciesCollection = RefSeqProvider<SpeciesPattern, AccessionPattern>
+
+// Select the species types.
+type Capitis = SpeciesCollection.``Staphylococcus capitis``
+type Cohnii = SpeciesCollection.``Staphylococcus cohnii``
+
+// Select assemblies.
+type Assembly1 = Capitis.``GCF_012926605.1``
+type Assembly2 = Capitis.``GCF_012926635.1``
+type Assembly3 = Cohnii.``GCF_013602215.1``
+type Assembly4 = Cohnii.``GCF_013602265.1``
+
+// Extract statically-typed genome data.
+let data = Assembly1.Genome()
+
+// Show the assembly's definition.
+match data.Metadata.Definition with
+| Some definition -> printf "%s" definition
+| None -> printf "No definition provided."
+
+
Staphylococcus capitis strain 18-857 NODE_1, whole genome shotgun sequence.
+

The Accession parameter can also be omitted from the RefSeqProvider. In this case, all assemblies for the given species will +be matched. For example:

+
// Define species name.
+let [<Literal>] SpeciesName = "Staphylococcus lugdunensis"
+
+// Create RefSeq type containing all assemblies for the species.
+type Assemblies = RefSeqProvider<SpeciesName>
+
+// Select assemblies.
+type Assembly = Assemblies.``GCF_001546615.1``
+
+// Show the assembly's primary accession.
+match (Assembly.Genome()).Metadata.Accession with
+| Some accession -> match accession.Primary with
+                    | Some primary -> printf "%s" primary
+                    | None -> printf "No primary accession provided."
+| None -> printf "No accession provided."
+
+
NZ_KQ957361
+ +
namespace BioProviders
+
namespace BioFSharp
+
Multiple items
type LiteralAttribute = + inherit Attribute + new: unit -> LiteralAttribute
<summary>Adding this attribute to a value causes it to be compiled as a CLI constant literal.</summary>
<category>Attributes</category>


--------------------
new: unit -> LiteralAttribute
+
[<Literal>] +val Species: string = "Staphylococcus borealis"
+
[<Literal>] +val Accession: string = "GCF_001224225.1"
+
type Borealis = RefSeqProvider<...>
+
type RefSeqProvider
<summary>Typed representation of the NCBI FTP server, for RefSeq data.</summary> + <param name="Species">The name of the species whose genome is being accessed (e.g. "Staphylococcus borealis"). Defaults to <c>""</c>.</param> + <param name="Accession">The accession of the genome assembly being accessed (e.g. "GCF_001224225.1"). Defaults to <c>""</c>.</param>
+
val genome: RefSeqProvider<...>.Genome
+
type Genome = + inherit GenBankFlatFile + new: unit -> Genome + member Metadata: Metadata + member Sequence: IEnumerable<Nucleotide>
<summary>Typed representation of an Assembly's Genomic GenBank Flat File.</summary>
+
val metadata: Metadata.Metadata
+
Multiple items
GenBankFlatFile.GenBankFlatFile.Metadata: Metadata.Metadata

--------------------
property RefSeqProvider<...>.Genome.Metadata: Metadata.Metadata with get
<summary>Typed representation of the Metadata of a Genomic GenBank Flat File.</summary>
+
val printf: format: Printf.TextWriterFormat<'T> -> 'T
<summary>Print to <c>stdout</c> using the given format.</summary>
<param name="format">The formatter.</param>
<returns>The formatted result.</returns>
<example>See <c>Printf.printf</c> (link: <see cref="M:Microsoft.FSharp.Core.PrintfModule.PrintFormat``1" />) for examples.</example>
+
Metadata.Metadata.Definition: string option
+
union case Option.Some: Value: 'T -> Option<'T>
<summary>The representation of "Value of type 'T"</summary>
<param name="Value">The input value.</param>
<returns>An option representing the value.</returns>
+
val definition: string
+
union case Option.None: Option<'T>
<summary>The representation of "No value"</summary>
+
Metadata.Metadata.DbSource: string option
+
val dbsource: string
+
val sequence: System.Collections.Generic.IEnumerable<Nucleotides.Nucleotide>
+
Multiple items
GenBankFlatFile.GenBankFlatFile.Sequence: BioSeq.BioSeq<Nucleotides.Nucleotide>

--------------------
property RefSeqProvider<...>.Genome.Sequence: System.Collections.Generic.IEnumerable<Nucleotides.Nucleotide> with get
<summary>Typed representation of the Sequence of a Genomic GenBank Flat File.</summary>
+
module BioSeq + +from BioFSharp
+
val complement: nucs: seq<Nucleotides.Nucleotide> -> BioSeq.BioSeq<Nucleotides.Nucleotide>
+
val transcribeCodingStrand: nucs: seq<Nucleotides.Nucleotide> -> BioSeq.BioSeq<Nucleotides.Nucleotide>
+
val translate: nucleotideOffset: int -> rnaSeq: seq<Nucleotides.Nucleotide> -> BioSeq.BioSeq<AminoAcids.AminoAcid>
+
[<Literal>] +val SpeciesPattern: string = "Staphylococcus c*"
+
[<Literal>] +val AccessionPattern: string = "GCF_01*"
+
type SpeciesCollection = RefSeqProvider<...>
+
type Capitis = RefSeqProvider<...>.Staphylococcus capitis
+
type Cohnii = RefSeqProvider<...>.Staphylococcus cohnii
+
type Assembly1 = RefSeqProvider<...>.Staphylococcus capitis.GCF_012926605.1
+
type Assembly2 = RefSeqProvider<...>.Staphylococcus capitis.GCF_012926635.1
+
type Assembly3 = RefSeqProvider<...>.Staphylococcus cohnii.GCF_013602215.1
+
type Assembly4 = RefSeqProvider<...>.Staphylococcus cohnii.GCF_013602265.1
+
val data: RefSeqProvider<...>.Staphylococcus capitis.GCF_012926605.1.Genome
+
Multiple items
GenBankFlatFile.GenBankFlatFile.Metadata: Metadata.Metadata

--------------------
property RefSeqProvider<...>.Staphylococcus capitis.GCF_012926605.1.Genome.Metadata: Metadata.Metadata with get
<summary>Typed representation of the Metadata of a Genomic GenBank Flat File.</summary>
+
[<Literal>] +val SpeciesName: string = "Staphylococcus lugdunensis"
+
type Assemblies = RefSeqProvider<...>
+
type Assembly = RefSeqProvider<...>.GCF_001546615.1
+
module Metadata + +from BioProviders
+
type Accession = + { + Primary: string option + Secondary: string list option + }
<summary> + Identifier assigned to each GenBank sequence record. + </summary>
+
val accession: Metadata.Accession
+
Metadata.Accession.Primary: string option
+
val primary: string
+ +
+ + + + + + + + +
+ + + \ No newline at end of file diff --git a/library/RefSeqProvider.ipynb b/library/RefSeqProvider.ipynb new file mode 100644 index 0000000..7416f57 --- /dev/null +++ b/library/RefSeqProvider.ipynb @@ -0,0 +1,525 @@ + + { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["[![Script](../images/badge-script.svg)](https://fsprojects.github.io/BioProviders//library/RefSeqProvider.fsx)\u0026emsp;\n", +"[![Notebook](../images/badge-notebook.svg)](https://fsprojects.github.io/BioProviders//library/RefSeqProvider.ipynb)\n", +"\n", +"# RefSeq Type Provider\n", +"\n", +"This article describes how to use the RefSeq Type Provider to remotely access genomic data stored in the\n", +"[RefSeq](https://www.ncbi.nlm.nih.gov/genbank/) database. This Type Provider collects and parses the genomic data\n", +"for a specified organism and generates a static type containing its metadata and sequence.\n", +"\n", +"The RefSeq Type Provider uses [.NET Bio](https://github.com/dotnetbio/bio) to parse the RefSeq data files\n", +"and [BioFSharp](https://github.com/CSBiology/BioFSharp) to provide utilities for manipulating genomic sequences.\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": null, "outputs": [], + "source": ["\u003cbr /\u003e\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["## Loading BioProviders Package\n", +"\n", +"To load the RefSeq Type Provider, a script can use the NuGet syntax to reference the BioProviders package, shown below.\n", +"\n", +"You can optionally include the BioFSharp package. 
While it\u0027s not required to use the basic BioProviders functions, it can be used to explore the metadata of the provided types, as shown in a later example.\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 1, "outputs": [], + "source": ["#r \"nuget: BioProviders\"\n", +"#r \"nuget: BioFSharp\"\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["If creating an F# library or application, BioProviders can be added as a package reference. You can use your IDE for this, or use the `dotnet add package BioProviders` command in your project folder from the command line.\n", +"\n", +"BioProviders can then be used in your script or code by using an open command. Opening its dependencies should not be required. (BioFSharp is loaded for future examples.)\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 2, "outputs": [], + "source": ["open BioProviders\n", +"open BioFSharp\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": null, "outputs": [], + "source": ["\u003cbr /\u003e\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["## RefSeqProvider Example\n", +"\n", +"The RefSeq Type Provider will be demonstrated for [this RefSeq assembly](https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_001224225.1/)\n", +"of the **Staphylococcus borealis** species. 
To create a typed representation of the assembly, two pieces of information\n", +"must be given to the Type Provider:\n", +"\n", +"* Species name\n", +"\n", +"* RefSeq assembly accession\n", +"\n", +"For this example, the species name is \"Staphylococcus borealis\" and the RefSeq assembly accession is \"GCF_001224225.1\".\n", +"To find this information:\n", +"\n", +"* Visit [https://www.ncbi.nlm.nih.gov/datasets/](https://www.ncbi.nlm.nih.gov/datasets/)\n", +"\n", +"* Search for the name of the species\n", +"\n", +"* Select to view all genomes of the species\n", +"\n", +"You can then select the assembly\u0027s RefSeq (as well as GenBank) accession from the list that appears.\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": null, "outputs": [], + "source": ["\u003cdiv class=\"container-fluid\" style=\"margin:15px 0px 15px 0px;\"\u003e\n", +" \u003cdiv class=\"row-fluid\"\u003e\n", +" \u003cdiv class=\"span1\"\u003e\u003c/div\u003e\n", +" \u003cdiv class=\"span10\" id=\"anim-holder\"\u003e\n", +" \u003ca id=\"lnk\" href=\"../images/RefSeq_Info.gif\"\u003e\u003cimg id=\"anim\" src=\"../images/RefSeq_Info.gif\" /\u003e\u003c/a\u003e\n", +" \u003c/div\u003e\n", +" \u003cdiv class=\"span1\"\u003e\u003c/div\u003e\n", +" \u003c/div\u003e\n", +"\u003c/div\u003e\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["Passing this information to the Type Provider generates the Assembly Type. The genomic data can then be extracted from the\n", +"Assembly Type by invoking the Genome method. 
This is demonstrated below.\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 3, "outputs": [], + "source": ["// Define species name and RefSeq assembly accession.\n", +"let [\u003cLiteral\u003e] Species = \"Staphylococcus borealis\"\n", +"let [\u003cLiteral\u003e] Accession = \"GCF_001224225.1\"\n", +"\n", +"// Create RefSeq assembly type.\n", +"type Borealis = RefSeqProvider\u003cSpecies, Accession\u003e\n", +"\n", +"// Extract statically-typed genome data.\n", +"let genome = Borealis.Genome()\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": null, "outputs": [], + "source": ["\u003cbr /\u003e\n", +"\u003cbr /\u003e\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["### Metadata\n", +"\n", +"Each genome is accompanied by metadata describing the organism and sequence recorded in the assembly. This metadata can\n", +"be extracted using the Metadata field of the Genome Type created previously. The Metadata type is largely based on that\n", +"provided by [.NET Bio](http://dotnetbio.github.io/Help/html/319bf2e6-4fcf-9f93-586f-fc7ffcf04a83.htm), with modifications\n", +"made to be more idiomatic with F#.\n", +"\n", +"Below is an example of how the raw metadata type can be retrieved and displayed:\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 4, "outputs": [ + { + "data": { + "text/plain": ["{ Locus = Some { Date = Some 4/27/2023 12:00:00 AM", +" DivisionCode = Some CON", +" MoleculeType = Some DNA", +" Name = Some \"NZ_CUEE01000001\"", +" SequenceLength = 563044", +" SequenceType = Some \"bp\"", +" Strand = None", +" StrandTopology = Some Linear }", +" Definition =", +" Some \"Staphylococcus borealis strain 51-48, whole genome shotgun sequence.\"", +" Accession = Some { Primary = Some \"NZ_CUEE01000001\"", +" Secondary = Some [\"NZ_CUEE01000000\"] }", +" Version = Some { Accession = Some \"NZ_CUEE01000001\"", +" CompoundAccession = Some 
\"NZ_CUEE01000001.1\"", +" GiNumber = None", +" Version = Some \"1\" }", +" DbLinks =", +" Some", +" [{ Numbers = Some [\" PRJNA224116\"]", +" Type = Some BioProject }; { Numbers = Some [\" SAMEA1035138\"]", +" Type = None };", +" { Numbers = Some [\" GCF_001224225.1\"]", +" Type = None }]", +" DbSource = None", +" Keywords = Some \"WGS; RefSeq.\"", +" Primary = None", +" Source =", +" Some", +" { CommonName = Some \"Staphylococcus borealis\"", +" Organism =", +" Some", +" { ClassLevels =", +" Some", +" \"Bacteria; Bacillota; Bacilli; Bacillales; Staphylococcaceae; Staphylococcus.\"", +" Genus = Some \"Staphylococcus\"", +" Species = Some \"borealis\" } }", +" References =", +" Some", +" [{ Authors = Some \"Informatics,Pathogen.\"", +" Consortiums = None", +" Journal =", +" Some", +" \"Submitted (10-MAR-2015) SC, Wellcome Trust Sanger Institute, CB10 1SA, United Kingdom\"", +" Location = None", +" Medline = None", +" Number = 1", +" PubMed = None", +" Remarks = None", +" Title = Some \"Direct Submission\" }]", +" Comments =", +" Some", +" [\"REFSEQ INFORMATION: The reference sequence is identical to", +"", +"CUEE01000001.1.", +"", +"The annotation was added by the NCBI Prokaryotic Genome Annotation", +"", +"Pipeline (PGAP). 
Information about PGAP can be found here:", +"", +"https://www.ncbi.nlm.nih.gov/genome/annotation_prok/", +"", +"", +"", +"##Genome-Annotation-Data-START##", +"", +"Annotation Provider :: NCBI RefSeq", +"", +"Annotation Date :: 04/27/2023 01:28:26", +"", +"Annotation Pipeline :: NCBI Prokaryotic Genome", +"", +"Annotation Pipeline (PGAP)", +"", +"Annotation Method :: Best-placed reference protein", +"", +"set; GeneMarkS-2+", +"", +"Annotation Software revision :: 6.5", +"", +"Features Annotated :: Gene; CDS; rRNA; tRNA; ncRNA", +"", +"Genes (total) :: 2,650", +"", +"CDSs (total) :: 2,584", +"", +"Genes (coding) :: 2,507", +"", +"CDSs (with protein) :: 2,507", +"", +"Genes (RNA) :: 66", +"", +"rRNAs :: 2, 1, 1 (5S, 16S, 23S)", +"", +"complete rRNAs :: 2, 1, 1 (5S, 16S, 23S)", +"", +"tRNAs :: 58", +"", +"ncRNAs :: 4", +"", +"Pseudo Genes (total) :: 77", +"", +"CDSs (without protein) :: 77", +"", +"Pseudo Genes (ambiguous residues) :: 0 of 77", +"", +"Pseudo Genes (frameshifted) :: 29 of 77", +"", +"Pseudo Genes (incomplete) :: 49 of 77", +"", +"Pseudo Genes (internal stop) :: 29 of 77", +"", +"Pseudo Genes (multiple problems) :: 23 of 77", +"", +"##Genome-Annotation-Data-END##\"]", +" Contig = Some \"join(CUEE01000001.1:1..563044)\"", +" Segment = None", +" Origin = None }"] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + }], + "source": ["// Extract the metadata.\n", +"let metadata = genome.Metadata\n", +"\n", +"// Display the metadata type.\n", +"printf \"%A\" metadata\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["The metadata type consists of many fields, though not all fields of the metadata exist for all assemblies. Therefore, they are provided as option types, on which a match expression can be used. 
Below are examples of accessing fields from the example assembly.\n", +"\u003cbr /\u003e\n", +"\u003cbr /\u003e\n", +"✅ Example - Accessing a field that is provided.\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 5, "outputs": [ + { + "data": { + "text/plain": ["Staphylococcus borealis strain 51-48, whole genome shotgun sequence."] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }], + "source": ["// Print definition if exists.\n", +"match metadata.Definition with\n", +"| Some definition -\u003e printf \"%s\" definition\n", +"| None -\u003e printf \"No definition provided.\"\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["\u003cbr /\u003e\n", +"❌ Example - Accessing a field that is not provided.\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 6, "outputs": [ + { + "data": { + "text/plain": ["No database source provided."] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }], + "source": ["// Print database source if exists.\n", +"match metadata.DbSource with\n", +"| Some dbsource -\u003e printf \"%s\" dbsource\n", +"| None -\u003e printf \"No database source provided.\"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": null, "outputs": [], + "source": ["\u003cbr /\u003e\n", +"\u003cbr /\u003e\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["### Sequence\n", +"\n", +"The genomic sequence for the organism can be extracted using the Sequence field of the Genome Type created previously.\n", +"This field provides a BioFSharp [BioSeq](https://csbiology.github.io/BioFSharp/reference/biofsharp-bioseq.html) containing\n", +"a series of [Nucleotides](https://csbiology.github.io/BioFSharp//reference/biofsharp-nucleotides-nucleotide.html). 
More\n", +"can be read about BioFSharp containers [here](https://csbiology.github.io/BioFSharp//BioCollections.html).\n", +"\n", +"An example of accessing and manipulating the RefSeqProvider genomic sequence using BioFSharp is provided below:\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 7, "outputs": [ + { + "data": { + "text/plain": ["seq [C; A; G; G; ...]"] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }], + "source": ["// Extract the BioFSharp BioSeq.\n", +"let sequence = genome.Sequence\n", +"\n", +"// Display the sequence type.\n", +"printf \"%A\" sequence\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 8, "outputs": [ + { + "data": { + "text/plain": ["seq [Val; Leu; Val; Ter; ...]"] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }], + "source": ["// Take the complement, then transcribe and translate the coding strand.\n", +"sequence\n", +"|\u003e BioSeq.complement\n", +"|\u003e BioSeq.transcribeCodingStrand\n", +"|\u003e BioSeq.translate 0\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": null, "outputs": [], + "source": ["\u003cbr /\u003e\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["## Wildcard Operators\n", +"\n", +"Wildcard operators are supported in both the Species and Accession provided to the RefSeqProvider. 
By using asterisks \"*\"\n", +"at the end of a Species or Accession name, species or accessions starting with the provided pattern will be matched.\n", +"\n", +"For example, we can get all **Staphylococcus** species starting with the letter \u0027c\u0027 and assembly accessions starting with\n", +"\u0027GCF_01\u0027:\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 9, "outputs": [ + { + "data": { + "text/plain": ["Staphylococcus capitis strain 18-857 NODE_1, whole genome shotgun sequence."] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }], + "source": ["// Define species name and RefSeq assembly accession using wildcards.\n", +"let [\u003cLiteral\u003e] SpeciesPattern = \"Staphylococcus c*\"\n", +"let [\u003cLiteral\u003e] AccessionPattern = \"GCF_01*\"\n", +"\n", +"// Create RefSeq type containing all species matching the species pattern.\n", +"type SpeciesCollection = RefSeqProvider\u003cSpeciesPattern, AccessionPattern\u003e\n", +"\n", +"// Select the species types.\n", +"type Capitis = SpeciesCollection.``Staphylococcus capitis``\n", +"type Cohnii = SpeciesCollection.``Staphylococcus cohnii``\n", +"\n", +"// Select assemblies.\n", +"type Assembly1 = Capitis.``GCF_012926605.1``\n", +"type Assembly2 = Capitis.``GCF_012926635.1``\n", +"type Assembly3 = Cohnii.``GCF_013602215.1``\n", +"type Assembly4 = Cohnii.``GCF_013602265.1``\n", +"\n", +"// Extract statically-typed genome data.\n", +"let data = Assembly1.Genome()\n", +"\n", +"// Show the assembly\u0027s definition.\n", +"match data.Metadata.Definition with\n", +"| Some definition -\u003e printf \"%s\" definition\n", +"| None -\u003e printf \"No definition provided.\"\n"] + } +, + { + "cell_type": "markdown", + "metadata": {}, + + "source": ["The Accession parameter can also be omitted from the RefSeqProvider. In this case, all assemblies for the given species will\n", +"be matched. 
For example:\n", +"\n"] + } +, + { + "cell_type": "code", + "metadata": {}, + "execution_count": 10, "outputs": [ + { + "data": { + "text/plain": ["NZ_KQ957361"] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }], + "source": ["// Define species name.\n", +"let [\u003cLiteral\u003e] SpeciesName = \"Staphylococcus lugdunensis\"\n", +"\n", +"// Create RefSeq type containing all assemblies for the species.\n", +"type Assemblies = RefSeqProvider\u003cSpeciesName\u003e\n", +"\n", +"// Select assemblies.\n", +"type Assembly = Assemblies.``GCF_001546615.1``\n", +"\n", +"// Show the assembly\u0027s primary accession.\n", +"match (Assembly.Genome()).Metadata.Accession with\n", +"| Some accession -\u003e match accession.Primary with\n", +" | Some primary -\u003e printf \"%s\" primary\n", +" | None -\u003e printf \"No primary accession provided.\"\n", +"| None -\u003e printf \"No accession provided.\"\n"] + }], + "metadata": { + "kernelspec": {"display_name": ".NET (F#)", "language": "F#", "name": ".net-fsharp"}, + "langauge_info": { + "file_extension": ".fs", + "mimetype": "text/x-fsharp", + "name": "C#", + "pygments_lexer": "fsharp", + "version": "4.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 + } + + diff --git a/reference/index.html b/reference/index.html new file mode 100644 index 0000000..c9b20dd --- /dev/null +++ b/reference/index.html @@ -0,0 +1,123 @@ + + + + + + BioProviders (API Reference) + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+
+

+ API Reference +

+

+ Available Namespaces: +

+ + + + + + + + +
+ Namespace + + Description +
+
+ +
+ + + + + + + + +
+ + + \ No newline at end of file