diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..9fcfca43 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,3 @@ +{"image":"mcr.microsoft.com/devcontainers/universal:2", +"postCreateCommand": "pip3 install --user -r requirements.txt && python -m mkdocs serve" +} \ No newline at end of file diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..77a82861 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,40 @@ + +# Description + + +❓**What**: + +🧠**Why?**: + +👨‍💻**How?**: + +# Checklist: +Have checked for the following: +- [ ] The website still builds correctly, and you can view it using `mkdocs serve`. +- [ ] There are no new "warnings" from mkdocs +- [ ] Does your page follow the [page template](https://nhsdigital.github.io/rap-community-of-practice/example_RAP_CoP_page/) (or [here in Markdown](https://github.com/NHSDigital/rap-community-of-practice/blob/main/docs/example_RAP_CoP_page.md))? (**need to make a new one specific to NHSE Data Science**) +- [ ] Spelling errors +- [ ] Consistent capitalization +- [ ] Consistent numbers +- [ ] Material features incorrectly implemented: search for code blocks and markers (e.g. !!!). 
+- [ ] Code snippets don't work +- [ ] Images not working +- [ ] Links not working + +## Where it was tested + +- Github Codespaces - 2-core, 4GB RAM, 32GB hard drive +- devcontainer.json describes further settings \ No newline at end of file diff --git a/.github/workflows/pages-build-deployment.yml b/.github/workflows/pages-build-deployment.yml new file mode 100644 index 00000000..6bd8ad73 --- /dev/null +++ b/.github/workflows/pages-build-deployment.yml @@ -0,0 +1,36 @@ +name: Website Deployment + +# Controls when the workflow will run +on: + # Triggers the workflow on push events but only for the "main" branch and in the docs directory + push: + branches: + - 'main' + #paths: + # - 'docs/**' + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + # This workflow contains a single job called "deploy" + deploy: + # The type of runner that the job will run on + runs-on: ubuntu-latest + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - name: "Checkout Code" + uses: actions/checkout@v3 + + - name: "Setup Python" + uses: actions/setup-python@v2 + with: + python-version: 3.x + + - name: "Install Python Packages" + run: pip install -r requirements.txt + + - name: "Run mkdocs to build website" + run: mkdocs gh-deploy --force --clean --verbose diff --git a/.gitignore b/.gitignore index 68bc17f9..5bce3257 100644 --- a/.gitignore +++ b/.gitignore @@ -157,4 +157,10 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/ +.idea/ + +.DS_Store + +# VSCode + +.vscode/ diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..aaa69ca5 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,91 @@ +# Code of Conduct for NHS Digital + +Contributors to repositories hosted in NHS Digital are expected to +follow the Contributor Covenant Code of +Conduct, and those working within Government are also expected to follow the Civil Service Code. + +## Civil Service Code + +- The [Civil Service Code](https://www.gov.uk/government/publications/civil-service-code/the-civil-service-code). + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behaviour that contributes to a positive environment for our +community include: + +- Demonstrating empathy and kindness toward other people +- Being respectful of differing opinions, viewpoints, and experiences +- Giving and gracefully accepting constructive feedback +- Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +- Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behaviour include: + +- The use of sexualized language or imagery, and sexual attention or + advances of any kind +- Trolling, insulting or derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or email + address, without their explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behaviour and will take appropriate and fair corrective action in +response to any behaviour that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behaviour may be +reported to the community leaders responsible for enforcement at +datascience@nhs.net. +All complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +[version 2.0](https://www.contributor-covenant.org/version/2/0/code_of_conduct.html) and the `govcookiecutter` [Code of Conduct](https://github.com/best-practice-and-impact/govcookiecutter/blob/main/CODE_OF_CONDUCT.md). + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the [FAQ](https://www.contributor-covenant.org/faq). Translations are available [here](https://www.contributor-covenant.org/translations). diff --git a/CONTRIBUTE.md b/CONTRIBUTE.md new file mode 100644 index 00000000..1ed57485 --- /dev/null +++ b/CONTRIBUTE.md @@ -0,0 +1,77 @@ +# Contribute + +Hi there! We're thrilled that you'd like to contribute to this landing page repository. Your help is essential for keeping it great. + +## Creating an issue + +If you think of something worth including, improving, or want to contribute, please [raise an issue on GitHub](https://github.com/NHSDigital/rap-community-of-practice/issues). + +## Submitting a pull request + +If you want to contribute to our resources: + +1. 
[Fork][fork] or clone the repository +2. Configure and install the dependencies if you want to run the page on your machine; otherwise, no setup is needed. +3. Create a new branch: `git checkout -b my-branch-name` +4. Make your change +5. Check how your change looks on our website by hosting the website locally (follow [the steps below](#contribute-to-rap-community-of-practice-website) on how to do this) +6. Push to your fork and [submit a pull request][pr] + +Your pull request will then be reviewed. You may receive some feedback and suggested changes before it can be approved and your pull request merged. + +To increase the likelihood of your pull request being accepted: + +- If you are making visual changes, include a screenshot of what the affected element looks like, both before and after. +- Follow the [style guide][style]. +- Keep your change as focussed as possible. If there are multiple changes you would like to make that are not dependent upon each other, consider submitting them as separate pull requests. +- Write [good commit messages](http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html). + +## Contribute to RAP Community of Practice Website + +### Installing MkDocs + +Run the commands (or follow the MkDocs documentation to locally pip install MkDocs): + +```bash + # environment.yml + + conda env create -f environment.yml + conda activate rap-cop-pages + + --- + + # requirements.txt + + ## using pip + pip install -r requirements.txt + + ## using Conda + conda create --name rap-cop-pages --file requirements.txt +``` + +### Hosting + +To host the website locally to view the live changes, run the command: + +```bash + mkdocs serve +``` + +### Editing the contents + +To add a new file to the repository and website, you can add the file as you would normally and then update 'nav' in mkdocs.yml to include the file within the nested list. 
Don't forget to check that the links, images, headings, and contents are all working correctly on both the website and in the GitHub repo. + +All of the files accessed via the website are nested within the 'docs' folder. + +The website currently uses the [Material for MkDocs](https://squidfunk.github.io/mkdocs-material/getting-started/) theme. This sets the layout, colour, font, search bar, header, footer, navigation bar and contents. You can follow the documentation to make any changes (e.g. change the [colour scheme](https://squidfunk.github.io/mkdocs-material/setup/changing-the-colors/)) as it is simple to use and also easy to overwrite. There is a separate stylesheet, [extra.css](./docs/stylesheets/extra.css), which is used to overwrite the colours, fonts and some of the sizing for some elements. +Here is a good [cheat sheet](https://yakworks.github.io/docmark/cheat-sheet/) for what features can be used in MkDocs and also interesting features in [Material for MkDocs](https://squidfunk.github.io/mkdocs-material/reference/). 
+ +## Resources + +- [Contributing to Projects](https://docs.github.com/en/get-started/quickstart/contributing-to-projects) +- [Using Pull Requests](https://help.github.com/articles/using-pull-requests/) +- [GitHub Help](https://help.github.com) + +[fork]: https://github.com/pages-themes/slate/fork +[pr]: https://github.com/pages-themes/slate/compare +[style]: http://ben.balter.com/jekyll-style-guide/ diff --git a/LICENCE b/LICENCE new file mode 100644 index 00000000..913e9c42 --- /dev/null +++ b/LICENCE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Crown Copyright NHS Digital + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index f0e43c82..6bb9d047 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,27 @@ -# datascience -Website for the data science team within NHS England. 
+# [NHS England Data Science](https://NHSDigital.github.io/data-science/) +![CI](https://github.com/NHSDigital/data_science_site/actions/workflows/pages-build-deployment.yml/badge.svg "CI badge indicating passing or failing status") +[![Release Version](https://img.shields.io/github/v/release/nhsdigital/data_science_site "Release version")](https://github.com/NHSDigital/data_science_site/releases) +[![MkDocs Material](https://img.shields.io/badge/style-MkDocs%20Material-darkblue "Markdown Style: MkDocs")](https://squidfunk.github.io/mkdocs-material/reference/) +[![licence: MIT](https://img.shields.io/badge/Licence-MIT-yellow.svg)](https://opensource.org/licenses/MIT "MIT License") +[![licence: OGL3](https://img.shields.io/badge/Licence-OGL3-darkgrey "licence: Open Government Licence 3")](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/) + + +> **This material is maintained by the [NHS England Data Science team](mailto:datascience@nhs.net)**. +> +> See our other work here: [NHS Digital Analytical Services](https://github.com/NHSDigital/data-analytics-services). + +
+ +**Welcome to the landing page for the NHS England Data Science Team.** + +Visit our [website](https://NHSDigital.github.io/data-science/) for more information about our work! + + +## Licence + +Unless stated otherwise, the codebase is released under the [MIT Licence][2]. This covers both the codebase and any sample code in the documentation. + +HTML and Markdown documentation is © Crown copyright and available under the terms of the [Open Government 3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/) licence. + +[1]: ./CONTRIBUTE.md +[2]: ./LICENCE diff --git a/docs/about.md b/docs/about.md new file mode 100644 index 00000000..43f91664 --- /dev/null +++ b/docs/about.md @@ -0,0 +1,8 @@ +--- +hide: + - navigation +--- + +# Data Science in NHS England + +> This page could describe a bit about our team \ No newline at end of file diff --git a/docs/articles/.pages b/docs/articles/.pages new file mode 100644 index 00000000..a6449d74 --- /dev/null +++ b/docs/articles/.pages @@ -0,0 +1 @@ +# this page can be used to arrange the other pages and apply formatting to the nav etc. 
diff --git a/docs/articles/index.md b/docs/articles/index.md new file mode 100644 index 00000000..73c85550 --- /dev/null +++ b/docs/articles/index.md @@ -0,0 +1,3 @@ +# Blog + +> A team blog if we wish to make one \ No newline at end of file diff --git a/docs/codebases.md b/docs/codebases.md new file mode 100644 index 00000000..4fd0069e --- /dev/null +++ b/docs/codebases.md @@ -0,0 +1,6 @@ +--- +hide: + - navigation +--- + +# Codebases \ No newline at end of file diff --git a/docs/images/NHS Digital logo_WEB_LEFT.svg b/docs/images/NHS Digital logo_WEB_LEFT.svg new file mode 100644 index 00000000..1f095f70 --- /dev/null +++ b/docs/images/NHS Digital logo_WEB_LEFT.svg @@ -0,0 +1 @@ +NHS Digital logo_WEB_LEFT \ No newline at end of file diff --git a/docs/images/NHS-Digital-logo_LEFT-WHITE-235x183.png b/docs/images/NHS-Digital-logo_LEFT-WHITE-235x183.png new file mode 100644 index 00000000..24a9c9b7 Binary files /dev/null and b/docs/images/NHS-Digital-logo_LEFT-WHITE-235x183.png differ diff --git a/docs/images/NHS-Digital-logo_WEB_LEFT-235x183.png b/docs/images/NHS-Digital-logo_WEB_LEFT-235x183.png new file mode 100644 index 00000000..1536b030 Binary files /dev/null and b/docs/images/NHS-Digital-logo_WEB_LEFT-235x183.png differ diff --git a/docs/images/Staff_hot_desking_in_the_HUB_01.jpeg b/docs/images/Staff_hot_desking_in_the_HUB_01.jpeg new file mode 100644 index 00000000..3d210a51 Binary files /dev/null and b/docs/images/Staff_hot_desking_in_the_HUB_01.jpeg differ diff --git a/docs/images/cartoon hospital.jpeg b/docs/images/cartoon hospital.jpeg new file mode 100644 index 00000000..c6be47cd Binary files /dev/null and b/docs/images/cartoon hospital.jpeg differ diff --git a/docs/images/favicon/favicon.ico b/docs/images/favicon/favicon.ico new file mode 100644 index 00000000..ee9b7116 Binary files /dev/null and b/docs/images/favicon/favicon.ico differ diff --git a/docs/images/logo/nhs-blue-on-white.jpg b/docs/images/logo/nhs-blue-on-white.jpg new file mode 100644 index 
00000000..b95c15c7 Binary files /dev/null and b/docs/images/logo/nhs-blue-on-white.jpg differ diff --git a/docs/images/logo/nhs-logo.png b/docs/images/logo/nhs-logo.png new file mode 100644 index 00000000..15a17282 Binary files /dev/null and b/docs/images/logo/nhs-logo.png differ diff --git a/docs/images/logo/nhs-white-on-blue.jpg b/docs/images/logo/nhs-white-on-blue.jpg new file mode 100644 index 00000000..fa0fc8f4 Binary files /dev/null and b/docs/images/logo/nhs-white-on-blue.jpg differ diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..f4c0d99f --- /dev/null +++ b/docs/index.md @@ -0,0 +1,8 @@ +--- +hide: + - navigation + - toc + - footer + +template: home.html +--- diff --git a/docs/javascripts/mathjax.js b/docs/javascripts/mathjax.js new file mode 100644 index 00000000..c189c6a7 --- /dev/null +++ b/docs/javascripts/mathjax.js @@ -0,0 +1 @@ +// needed to silence a warning \ No newline at end of file diff --git a/docs/meta_page.md b/docs/meta_page.md new file mode 100644 index 00000000..0f19aeb0 --- /dev/null +++ b/docs/meta_page.md @@ -0,0 +1,13 @@ +--- +hide: + - navigation +--- + +# Meta Page + + +## Contribution + +This uses: +- mkdocs-material +- https://github.com/lukasgeiter/mkdocs-awesome-pages-plugin \ No newline at end of file diff --git a/docs/our_work/.pages b/docs/our_work/.pages new file mode 100644 index 00000000..a6449d74 --- /dev/null +++ b/docs/our_work/.pages @@ -0,0 +1 @@ +# this page can be used to arrange the other pages and apply formatting to the nav etc. 
diff --git a/docs/our_work/Publications.md b/docs/our_work/Publications.md new file mode 100644 index 00000000..6b099616 --- /dev/null +++ b/docs/our_work/Publications.md @@ -0,0 +1,48 @@ +--- +layout: base +title: Connected Publications +permalink: publications.html +--- + +# {{page.title}} + +List of pre-releases and publications connected to our work + +[4] [https://doi.org/10.1101/2023.08.31.23294903](https://doi.org/10.1101/2023.08.31.23294903) - (Pre-Print) + +**Representing Multimorbid Disease Progressions using directed hypergraphs** + +**Jamie Burke**, Ashley Akbari, Rowena Bailey, **Kevin Fasusi**, Ronan A. Lyons, **Jonathan Pearson**, James Rafferty, and **Daniel Schofield** + +*To introduce directed hypergraphs as a novel tool for assessing the temporal relationships between coincident diseases, addressing the need for a more accurate +representation of multimorbidity and leveraging the growing availability of electronic healthcare databases and improved computational resources.* + +--- + +[3] [https://doi.org/10.1016/j.epidem.2022.100662](https://doi.org/10.1016/j.epidem.2022.100662) + +**Large-scale calibration and simulation of COVID-19 epidemiologic scenarios to support healthcare planning** + +**Nick Groves-Kirkby**, Ewan Wakeman, Seema Patel, Robert Hinch, Tineke Poot, **Jonathan Pearson**, Lily Tang, Edward Kendall, Ming Tang, Kim Moore, Scott Stevenson, Bryn Mathias, Ilya Feige, Simon Nakach, Laura Stevenson, Paul O'Dwyer, William Probert, Jasmina Panovska-Griffiths, Christophe Fraser + +*... We adapted an agent-based model of COVID-19 to inform planning and decision-making within a healthcare setting, and created a software framework that automates processes for calibrating the model parameters to health data and allows the model to be run at national population scale on National Health Service (NHS) infrastructure. ... 
These simulations were used to support operational planning in the NHS in England, and we present the example of the use of these simulations in projecting future clinical demand during the rollout of the national COVID-19 vaccination programme. ...* + +--- + +[2] [https://doi.org/10.1101/2023.01.25.23284428](https://doi.org/10.1101/2023.01.25.23284428) + +**Primary care coding activity related to the use of online consultation systems or remote consulting: an analysis of 53 million peoples’ health records using OpenSAFELY** + +**Martina Fonseca**, Brian MacKenna, Amir Mehrkar, The OpenSAFELY Collaborative, Caroline E Walters, George Hickman, **Jonathan Pearson**, Louis Fisher, Peter Inglesby, Seb Bacon, Simon Davy, William Hulme, Ben Goldacre, Ofra Koffman, Minal Bakhai + +*We aimed to explore general practice coding activity associated with the use of online consultation systems in terms of trends, COVID-19 effect, variation and quality.* + +--- + +[1] [https://doi.org/10.21203/rs.3.rs-2226531/v1](https://doi.org/10.21203/rs.3.rs-2226531/v1) + +**Assessing the value of integrating national longitudinal shopping data into respiratory disease forecasting models** + +**Elizabeth Dolan**, James Goulding, Harry Marshall, Gavin Smith, Gavin Ling, Laila Tata + +*... We investigated the value of integrating sales of non-prescription medications commonly bought for managing respiratory symptoms, to improve forecasting of weekly registered deaths from respiratory disease at local levels across England, by using over 2 billion transactions logged by a UK high street retailer from March 2016 to March 2020. We report the results from the novel AI explainability variable importance tool Model Class Reliance implemented on the PADRUS model. 
...* diff --git a/docs/our_work/c245_synpath.md b/docs/our_work/c245_synpath.md new file mode 100644 index 00000000..3a77cf60 --- /dev/null +++ b/docs/our_work/c245_synpath.md @@ -0,0 +1,38 @@ +--- +layout: base +title: Building the Foundations for a Generic Patient Simulator +permalink: c245_synpath.html +--- + + +# {{page.title}} + +> | "Developing an agent-based simulation for generating synthetic patient pathways and scenario modelling for healthcare specific implementations." + +

+ Overview of data model +

+

+ Figure 1: Overview of the Synpath data model +

+ +A data model (“Patient Agent”) was developed for fake patients to be defined in the simulation. The patient is then assigned a health record (conditions, medications, ...) with optional additional attributes. + +Interacting this data model over time with an environment layer (representation of the physical and abstract health system components that the patient can interact with (e.g., GP practice, multidisciplinary team meeting)) creates a patient record with appointment times, updates to health status, and changes in medications prescribed. + +## Results + +Foundations were built for the data model and environment. + +During the development it became clear that a key feature to be included for healthcare agent simulations in the NHS is the distinction between active and passive agents with regard to the choice of the next environment interaction point. The spatial location of services was less important than in a typical agent-based simulation as the timescales made these considerations redundant. + +Efficient object communication and concurrency were also highlighted as needing significant further development. 
+ +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | [Github](https://github.com/nhsx/SynPath) | +| Case Study | Awaiting Sign-Off | +| Technical report | [Here](https://github.com/nhsx/SynPath/blob/master/reports/REDACTED_C245%20ABM%20Patient%20Pathways_Final%20Report_V3_28042021.cleaned.pdf) | + +|:-|:-|:-| +|||| \ No newline at end of file diff --git a/docs/our_work/c250_nhscorpus.md b/docs/our_work/c250_nhscorpus.md new file mode 100644 index 00000000..c05e2130 --- /dev/null +++ b/docs/our_work/c250_nhscorpus.md @@ -0,0 +1,32 @@ +--- +layout: base +title: Considerations for Building a Language Corpus with a Focus on the NHS +permalink: c250_nhscorpus.html +--- + +# {{page.title}} +> | "What are the available tools that could be used to build an NHS-focussed collection of texts which could help developers build better NLP tools for the healthcare system." + +

+ Ingest, Enrich, Share +

+

+ Figure 1: Open source tools used in each functional setting +

+ +We aimed to explore how to build an Open, Representative, Extensible and Useful set of tools to curate, enrich and share sources of healthcare text data in an appropriate manner. + +## Results + +Whilst a tool stack was developed which achieved many of our objectives, the key learning points were around the knowledge gaps which need to be addressed at both a data and tooling level before bringing these data together becomes achievable. + + +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | [Github](https://github.com/nhsx/language-corpus-tools) | +| Case Study | n/a | +| Blog | [Here](https://nhsx.github.io/AnalyticsUnit/languagecorpusdiscovery.html) | + +|:-|:-|:-| +||| + diff --git a/docs/our_work/c338_poud.md b/docs/our_work/c338_poud.md new file mode 100644 index 00000000..b88032c4 --- /dev/null +++ b/docs/our_work/c338_poud.md @@ -0,0 +1,65 @@ +--- +layout: base +title: How to Assess the Privacy of Unstructured Data +permalink: c338_poud.html +--- + +# {{page.title}} +> | "What are the privacy considerations that need to be addressed when dealing with unstructured healthcare text data " + +

+ +

+

+ Figure 1: Figure 4 from Al-Fedaghi, Sabah. (2012). Experimentation with Personal Identifiable Information. Showing an example PII sphere from different perspectives (compound, singleton and multitude personal identifiable information) +

+ +Unstructured data (e.g. text, image, audio) makes up a significant quantity of NHS data but is comparatively underused as an evidence source for analysis. This is often due to the privacy concerns restricting the sharing and use of these data. + +To our knowledge, there are currently no tools on the market that allow the NHS to robustly ascertain the level of privacy of unstructured data. To have confidence when commissioning tooling for anonymisation purposes the NHS needs an understanding of what private content health-related text data can contain. The tooling put in place to protect the privacy of these data needs to be able to assess the content, evaluate the risk associated with the content, and demonstrate that the tooling functionality has dealt with any privacy concerns appropriately. + +During this time the three main activities were a literature review, bringing a range of experts and voices together into a workshop, and writing the associated report summarising our understanding of the problem. + +## Results + +The main output specified was for a list of key qualities that could feed a tool specification in the future. The qualities the report highlighted were: + +**Structuring and data handling** + +* Ability to flag and identify with the range of possible data issues prior to deidentification (misspellings, medical terms, acronyms) +* Connection with a clinical vocabulary in order to match and assist word identification to assist structuring of the data. +* Ability to flag the data variables required for anonymisation to assist in the risk analysis and disclosure control process +* Ability to deal with unstructured, semi-structured and structured data +* Ability to deal with different formats of free-text data e.g. 
medical notes, patient feedback, survey responses, research papers + +**Tool Use & Validation** + +* Ease of manual manipulation in order to react to the level of anonymisation required and the key variables to be maintained for data utility +* Automated auditing of the flagged terms, any data manipulation and tool manipulation that has taken place +* Ability to demonstrate quality and anonymisation level before and after each stage of the de-identification process for the QC process. +* Ability to apply manual QC at each step along the process QC (human and automated) or the requirement of human authorisation to move to the next step +* Clarity around the tool limitations +* Need to align with the Information Governance (IG) process + +**Context** + +* Ability to tune into a domain extracting and utilising the appropriate medical dictionary. Clarity around individual versus population +* Ability to define level of anonymisation + +**Flexibility** + +* Ability to adapt the anonymisation functionality to the risk level assessed +* Flexibility within the tool programming to adapt to the utility required and hence the purpose of the output data aligning the appropriate level of de-identification +* Incorporated regular updating and reaction to current “threats” + + +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | n/a | +| Case Study | Awaiting Sign-Off | +| Technical report | [Here](https://github.com/nhsx/PrivacyFingerprint/blob/main/reports/PrivacyOfUnstructuredDataReport_Nov2022.pdf) | + +|:-|:-|:-| +||| + + diff --git a/docs/our_work/c339_sas.md b/docs/our_work/c339_sas.md new file mode 100644 index 00000000..cf24f472 --- /dev/null +++ b/docs/our_work/c339_sas.md @@ -0,0 +1,30 @@ +--- +layout: base +title: Creating a Generic Adversarial Attack for any Synthetic Dataset +permalink: c339_sas.html +--- + +# {{page.title}} +> | "Can the privacy of a generated dataset be assessed through downstream adversarial attacks to highlight the risk 
of reidentification?" + +

+ +

+

+ Figure 1: Attack diagrams for the currently incorporated scenarios. Scenario 1: Access to the synthetic dataset and a description of the generative model’s architecture and training procedure. Scenario 2: Access to a black box model that can provide unlimited synthetic data, with data realistic of the training distribution gathered by the attacker, which may be an example synthetic dataset released by the researchers. +

+ +An extensible code was developed to apply a suite of adversarial attacks to synthetically generated single table tabular data in order to assess the likely success of attacks and act as a privacy indicator for the dataset. Using this information then informs the generation and information governance process to ensure the safety of our data. + +## Results + +The code-base was successfully developed with code injection points for extensibility. Unfortunately, as the code could be used as an active attack on a dataset, we have decided not to make the codebase public but are instead aiming to both extend the number of attacks and incorporate the code in our synthetic generation process. + +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | restricted | +| Case Study | Awaiting Sign-Off | +| Technical report | [Blog](https://nhsx.github.io/AnalyticsUnit/SynthAdvSuite.html) | + +|:-|:-|:-| +||| diff --git a/docs/our_work/c399_privfinger.md b/docs/our_work/c399_privfinger.md new file mode 100644 index 00000000..3b514e58 --- /dev/null +++ b/docs/our_work/c399_privfinger.md @@ -0,0 +1,41 @@ +--- +layout: base +title: Building a Tool to Assess the Privacy Risk of Text Data +permalink: c399_privfinger.html +--- + +# {{page.title}} +> | "Can we generate usable privacy scores for text data to support understanding of privacy concerns and the anonymisation process?" + +

+ +

+

+ Figure 1: The methodology works in the following way: generate structured data for an individual record, transform this into unstructured medical notes, encode identifiers through named entity recognition, evaluate privacy, perform deidentification, repeat process.

+ +This work was undertaken as an external commission aiming to build a pipeline of components which firstly generated unstructured medical notes using a structured output from [Synthea:tm:](https://github.com/synthetichealth/synthea) and then running these through [GPT-3.5](https://platform.openai.com/docs/models/gpt-3-5) models to transform these into human readable notes. + +These notes were then processed using named entity recognition to extract pre-defined identifiers and store these in a structured form. The algorithm [pycorrect match](https://github.com/computationalprivacy/pycorrectmatch) was then implemented to give a privacy risk score of reidentification from the identifiers. + +[Shap](https://github.com/slundberg/shap) analysis was then conducted to understand which components of an individual record and of the dataset as a whole had the highest risk of privacy leakage. + +This pipeline could then be run before and after a deidentification process has taken place to understand the impact of the process on the risk score and to generate confidence that the dataset has been appropriately processed for use. + +## Results + +During the 10 week project the end-to-end code was developed, tested and delivered. However, key components are dependent on commercial offerings and only the first (of three) levels of identifiers was tested in the setup. + +Future work needs to replace some components with open source versions and a large number of experiments needs to be investigated to understand the limitations and where further development would be useful. + +This is an ongoing piece of work. 
+ + +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | Coming Soon | +| Case Study | Coming | +| Technical report | Coming Soon | + +|:-|:-|:-|:-| +||||| diff --git a/docs/our_work/ds218_rap_community_of_practice.md b/docs/our_work/ds218_rap_community_of_practice.md new file mode 100644 index 00000000..a9b85765 --- /dev/null +++ b/docs/our_work/ds218_rap_community_of_practice.md @@ -0,0 +1,30 @@ +--- +layout: base +title: NHS Reproducible Analytical Pipelines (RAP) Champion Squad +permalink: ds218_rap_community_of_practice.html +--- + +# {{page.title}} +> | "Our roving squad of RAP champions have helped a number of teams not only transform their pipelines, but also taught them how to train others and produced guidance which is used by many other organisations." + +

+ An image displaying the front page of the NHS RAP Community of Practice website. +

+

+ Figure 1: The front page of the RAP Community of Practice website. +

+ +Our Squad of RAP Champions has supported the rollout of RAP across the Analytics area within NHS England. This has involved taking existing guidance on RAP found elsewhere, and interpreting it in the local context of NHSE, making guidance specific to our systems and the problems faced by our analysts. We also put together a program for how to learn RAP, transform pipelines (through the "[thin slice approach](https://nhsdigital.github.io/rap-community-of-practice/our_RAP_service/thin-slice-strategy/)") and [become a RAP champion yourself](https://nhsdigital.github.io/rap-community-of-practice/our_RAP_service/building_team_capability/). + +## Results + +In the past two years we've helped start up two other squads of RAP champions within NHS England, and greatly increased the number of publications with published code (it was zero, and now is everything here: NHSDigital/data-analytics-services#rap-publication-repositories). The teams who have implemented RAP have found their code is easier to understand, reuse, and often a lot faster, with one pipeline going from taking two analysts 2 weeks, to running in just 40 minutes. + +We also got through to the finals of HSJ Digital Awards - Replicating Digital Best Practice Award. + +Our guidance is widely used by other organisations within the sector, and we've had great success rolling out RAP within NHSE. + +| Output | Link | +| ---- | ---- | +| NHS RAP Community of Practice Website | [Website](https://nhsdigital.github.io/rap-community-of-practice/) | +| NHS Digital Data Services - Analytics Service Repo | [Github Repo](https://github.com/NHSDigital/data-analytics-services) | diff --git a/docs/our_work/index.md b/docs/our_work/index.md new file mode 100644 index 00000000..fd6bb2db --- /dev/null +++ b/docs/our_work/index.md @@ -0,0 +1,3 @@ +# Our Work + +> Pages describing the kinds of work we do, incl. 
current, future and past projects \ No newline at end of file diff --git a/docs/our_work/p11_synpathdiabetes.md b/docs/our_work/p11_synpathdiabetes.md new file mode 100644 index 00000000..6c048298 --- /dev/null +++ b/docs/our_work/p11_synpathdiabetes.md @@ -0,0 +1,32 @@ +--- +layout: base +title: Applying our SynPath Simulator to a Diabetes Pathway +permalink: p11_synpathdiabetes.html +--- + +# {{page.title}} +> | "Exploration work into incorporating learning into a pathway simulator for diabetes. This work has fed our current SynPathGo project to create synthetic patient pathways and a foundation for agent based modelling in the NHS." + +

+ +

+

+ Figure 1: Table of learning algorithms considered for the simulation intelligence layer

+ +Using the SynPath framework we created a diabetes simulation for 800 patients. These patients could interact within a fictional local area with hospitals providing outpatient and inpatient services, GP practices and community healthcare services. + +## Results + +The project showed how to develop a set of environments, interactions and patients from academic literature, policy, and clinical resources. The model currently runs a simulation that prints outputs of patient records into the console. + +Future collaboration around validation and how to apply learning algorithms are being pursued. + +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | [Github](https://github.com/nhsx/SynPath_Diabetes) | +| Case Study | Awaiting Sign-Off | +| Technical report | [Here](https://github.com/nhsx/SynPath_Diabetes/blob/main/t2dm/reports/Technical%20Report%20(SynPath%20Diabetes)%20v1.pdf) | + +|:-|:-|:-| +|||| diff --git a/docs/our_work/p12_synthvae.md b/docs/our_work/p12_synthvae.md new file mode 100644 index 00000000..a4cbb40c --- /dev/null +++ b/docs/our_work/p12_synthvae.md @@ -0,0 +1,35 @@ +--- +layout: base +title: Investigating Differential Privacy in a Variational AutoEncoder for Synthetic Data Generation +permalink: p12_synthvae.html +--- + +# {{page.title}} +> | "The initial creation of a variational autoencoder with differential privacy for generating single table tabular gaussian data. This work demonstrated the feasibility of this approach for healthcare and fed into further interactions of the code base." + +

+ +

+

+ Figure 1: Correlation plots highlighting the difference between the variable relationships in the real and synthetic data across four models. +

+ +This project investigates the potential suitability of Variational Autoencoders (VAEs) as a synthetic data generation tool in the context of the NHS. To effectively address this direction, this work focussed on four key aspects: quality, privacy, ease of use, and interpretability. + +We evaluate the performance of the VAE approach alongside five alternative methods available in July/August 2021, namely Gaussian Copula, CTGAN, CopulaGAN, SDV’s TVAE and Independent (a model which assumes independence across variables). Evaluating this set of models provides context to the performance of the VAE with respect to both basic (e.g. Independent) and complex (e.g. CTGAN) approaches. + +We then tested how the metrics and visualisations changed when differential privacy was incorporated into the variational autoencoder as a function of differential levels of privacy (increased privacy budget). +## Results + +We found that a variational autoencoder could indeed generate medium to high fidelity synthetic data for a single tabular table with numerical and categorical gaussian variables. + +As the privacy budget increases, we see the quality decrease as expected. However, the level of privacy increase associated with increasing the privacy budget appears not to have a direct correlation. This warrants further work as this might be down to implementation, the metrics being used for evaluation or may point to a feature of the VAE not incorporating the differential privacy correctly. 
+ +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | [Github](https://github.com/nhsx/SynthVAE) | +| Case Study | Awaiting Sign-Off | +| Technical report | [Here](https://github.com/nhsx/SynthVAE/blob/main/reports/report.pdf) | + +|:-|:-|:-| +|||| diff --git a/docs/our_work/p14_mcr.md b/docs/our_work/p14_mcr.md new file mode 100644 index 00000000..8ccb4db4 --- /dev/null +++ b/docs/our_work/p14_mcr.md @@ -0,0 +1,43 @@ +--- +layout: base +title: Using Model Class Reliance to Understand the Impact of Commercial Data on Predictions +permalink: p14_mcr.html +--- + +# {{page.title}} +> | "How to assess the value that commercial sales data of over-the-counter prescriptions has on respiratory death predictions" + +

+ +

+

+ Figure 1: Schematic of the difference between other variable importance tools and the Model Class Reliance approach to explaining the value of a single input variable in a prediction

+ +The primary aim of the project was to apply the novel variable importance technique, [model class reliance](https://papers.nips.cc/paper/2020/hash/fd512441a1a791770a6fa573d688bff5-Abstract.html), to machine learning models which could predict registered respiratory deaths in the UK. The objective was to assess the value of commercial health data in healthcare predictions compared to other available datasets. +## Results + +In order to apply MCR, a set of optimal models had to be created which can successfully make the required predictions. The project managed to achieve this outcome with the machine learning model PADRUS (Predicting the amount of deaths by respiratory disease using sales). PADRUS is a random forest regressor which makes accurate weekly predictions of respiratory deaths in 314 local authorities across England 17 days in advance. The models’ features are created from the following dataset types: +* week number, +* commercial sales, +* weather, +* indices of multiple deprivation, +* age and population, +* demographics, +* housing, and +* land use. + +MCR was applied to PADRUS showing the highest and lowest impact variables had on predictions across all instances of the model. Grouped MCR was also employed in order for variables to be evaluated in concert as a collection of features created from a dataset type. + +The MCR results implied model instances of PADRUS were using variables in different ways to achieve the same predictive results, and suggested where variables could be interchangeable or critical to predictions. + +The addition of commercial data show a significant increase in predictive power. Further results are closed whilst a publication is being reviewed. 
+ +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | [Github](https://github.com/nhsx/commercial-data-healthcare-predictions) | +| Case Study | Awaiting Sign-Off | +| Technical report | [Here](https://github.com/nhsx/commercial-data-healthcare-predictions/blob/main/report/NHSX%20Report_ValueofCommercialProductSalesDatainHealthcarePrediction_V2.pdf) | + +|:-|:-|:-| +|||| diff --git a/docs/our_work/p21_synthvae.md b/docs/our_work/p21_synthvae.md new file mode 100644 index 00000000..12103fee --- /dev/null +++ b/docs/our_work/p21_synthvae.md @@ -0,0 +1,35 @@ +--- +layout: base +title: Developing our SynthVAE code +permalink: p21_synthvae2.html +--- + +# {{page.title}} +> | "Improving our variational autoencoder to consider fairness and to run on non-gaussian distributions" + +

+ +

+

+ Figure 1: Directed Acyclic Graph (DAG) of a network of variables highlighting their relationships. The highlighted blue I node is the variable of interest whilst the other highlighted blue node is the sensitive variable that may be inducing bias.

+ +Continuation of the previous development of our variational autoencoder (VAE) to correct for an error discovered since the last project finished. This error appears when trying to generate data for continuous variables which follow non-Gaussian distributions. Previously, standard scaling had been used to normalise these variables which was causing the non-gaussian variables to be synthesised poorly. This was replaced with a Gaussian mixture model from the RDT python library to scale and transform these variables into ones with a Gaussian distribution. + +The second phase of this work focussed on understanding the different ways of measuring and implementing fairness within the synthetic data. +## Results + +The gaussian mixture model was able to cope with non-gaussian variables thus extending the range of datasets which we can generate from greatly. Additional hyper-parameter tuning and general coding improvements have increased the reusability and performance of the code. + +Regarding fairness, there are many metrics to choose from and to make the situation more complex, not all metrics are compatible with one another, i.e. you might be able to satisfy an equal odds metric for a group but not an equal prediction for the same group. This means that the level of fairness required is project specific and has to be re-evaluated depending on the research needs. + +Further work will explore the adaption of directed acyclic graphs to control for fairness and the impact this has on quality and privacy. 
+ +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | [Github](https://github.com/nhsx/SynthVAE) | +| Case Study | Awaiting Sign-Off | +| Technical report | [Here](https://github.com/nhsx/SynthVAE/blob/main/reports/NHSXSynthVAE%20(2).pdf) | + +|:-|:-|:-| +|||| diff --git a/docs/our_work/p22_txtrayalign.md b/docs/our_work/p22_txtrayalign.md new file mode 100644 index 00000000..58fc5ccd --- /dev/null +++ b/docs/our_work/p22_txtrayalign.md @@ -0,0 +1,41 @@ +--- +layout: base +title: TxtRayAlign +permalink: p22_txtrayalign.html +--- + +# {{page.title}} +> | "Generating descriptive text from X-Ray images using contrastive learning on multi-modal data" + +

+ +

+

+ Figure 1: A contrastive retrieval mechanism. A query image is encoded and compared with the embeddings of a corpus of reference reports. The report with the greatest cosine similarity in the shared embedding space is returned as the output. +

+ +TxtRayAlign exploits contrastive training to learn similarities between text and images, allowing a retrieval-based mechanism to find reports that are “similar” to an image. + +## Results + +We observe that even the best performing model (ResNet50-DeCLUTR) only retrieves anything of relevance for 62% of queries. The retrieved sentences tend to contain findings that are not relevant for the query, as indicated by the relatively poor precision. Further, the query image contains findings that are only poorly covered by the retrieved sentences, as indicated by the low recall. + +

+ +

+

+ Figure 2: Two example reports generated by ResNet50-DeCLUTR (trained on 5%). Highlighted text corresponds to matches of the CheXpert sentence label between the ground truth and generated report. Ground truth report partially redacted for privacy. +

+ +The results of our investigation indicate that this approach can help generate reasonably grammatical and clinically meaningful sentences, yet falls short in achieving this with sufficient accuracy. While improvements to the model could be made, our findings are corroborated by others in literature. Besides improving +performance, future work could develop other applications of TxtRayAlign for other downstream tasks, such as image-to-image or text-to-image retrieval. + + +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | [Github](https://github.com/nhsx/txt-ray-align) | +| Case Study | Awaiting Sign-Off | +| Technical report | [Here](https://github.com/nhsx/txt-ray-align/blob/main/report/TxtRayAlign_Report_DZ.pdf) | + +|:-|:-|:-| +||| diff --git a/docs/our_work/p23_stm.md b/docs/our_work/p23_stm.md new file mode 100644 index 00000000..1feed8d6 --- /dev/null +++ b/docs/our_work/p23_stm.md @@ -0,0 +1,30 @@ +--- +layout: base +title: Text Analysis using Structural Topic Modelling +permalink: p23_stm.html +--- + +# {{page.title}} +> | "Using metadata to support better topic modelling of free text responses" + +

+ +

+

+ Figure 1: Example visualisations using toLDAvis (Top) and stminsights (Bottom) +

+ +The development of an R code for investigating the topics found in free text survey data using a technique that monitors both the content of the responses but also the metadata (e.g. when the response was made, which organisation the response relates to) in order to support the construction of these topics. + +## Results + +The code base has been developed as an open reusable code and being used internally for topic modelling of survey responses. + +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | [Github](https://github.com/nhsx/stm-survey-text) | +| Case Study | Awaiting Sign-Off | +| Technical report | [Here](https://github.com/nhsx/stm-survey-text/blob/main/reports/report_stm.pdf) | + +|:-|:-|:-| +||| diff --git a/docs/our_work/p24_lime.md b/docs/our_work/p24_lime.md new file mode 100644 index 00000000..70aff43f --- /dev/null +++ b/docs/our_work/p24_lime.md @@ -0,0 +1,30 @@ +--- +layout: base +title: Investigating Superpixels in LIME for Explaining Predictions of Facial Images +permalink: p24_lime.html +--- + +# {{page.title}} +> | "Investigating explainability algorithms for granular healthcare images" + +

+ +

+

+ Figure 1: Output from the LIME pipeline for a Rosacea face. 3(a) is an example image from the rff-300 dataset; 3(b) is an illustration of 73 superpixels found in the image; 3(c) is an illustration of random perturbations of the superpixels; 3(d) shows the top features used in training to create the best performing predictions.

+ +A work experience project investigating the application of a [Local Interpretable Model-agnostic Explanations (LIME)](https://arxiv.org/abs/1602.04938) technique to an image classification task around identifying Rosacea. A binary classification model was trained on the normal and Rosacea faces to generate the LIME explanation for Rosacea faces. Secondly, the fine-tuned model was integrated into the LIME pipeline to generate explanations based on the crucial features on which predictions were made in the classification model. Hence the experimentations helped in understanding the features the classification model took. + +## Results + +Image pre-processing (through contrast enhancement) improved the LIME by increasing the number of superpixels for the given input image so that some broad features such as discolouration and rashes were identified. However, the coarseness of the superpixels generated through the pipeline was not sufficient to pick up the features which would discern between different types of Rosacea. It may require domain/ imaging modality-specific pre-processing tasks to enhance the quality of explanation by improving the distinctiveness of the features that may help pick the right number of superpixels. 
+ +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | [Github](https://github.com/nhsx/LIME-XAI-Facial-Disease-Classification) | +| Case Study | n/a | +| Technical report | [Here](https://github.com/nhsx/LIME-XAI-Facial-Disease-Classification/blob/main/reports/report_AM.pdf) | + +|:-|:-|:-| +||| diff --git a/docs/our_work/p31_txtrayalign2.md b/docs/our_work/p31_txtrayalign2.md new file mode 100644 index 00000000..4d029db7 --- /dev/null +++ b/docs/our_work/p31_txtrayalign2.md @@ -0,0 +1,42 @@ +--- +layout: base +title: Adding a Clinical Focus to Evaluating Multi-Modal Data Representations +permalink: p31_txtrayalign2.html +--- + +# {{page.title}} +> | "How to validate synthetic text generated from images for healthcare applications." + +

+ +

+

+ Figure 1: Proposed clinical workflow applications of ML to radiology - using the CXR workflow as an example. [CDSS = clinical decision support system, CXR = Chest x-ray, EHR = Electronic health record, PACS = Picture archiving and communication system]

+ +The use of Natural Language Generation (NLG) for the auto generation of radiology reports has the potential to provide multiple radiology workflow applications. Free text reports pose a challenging task from which to compare NLG outputs due to the "ambiguity, syntax, synonymy, medical abbreviations", use of negation, reference to "out of reach" information, linking of associated findings, and overall individual variation in reporting style seen between different +radiologists. + +A series of evaluation techniques was tested, combining components from machine translation metrics (Bleu, Rouge, Meteor), Clinical metrics (CheXpert, Mirqi), and a suggested Clinical Scoring System. + +## Results + +Combinations of evaluation metrics were tested against three experiments which were trained on single sentence descriptions, multiple sentence descriptions and including the "impression and findings" section of the reports alongside the multiple sentence descriptions. + +The metrics used to evaluate the performance of models for clinical tasks require further refinement to ensure clinical accuracy is captured. The effect on model performance from adapting model training as well as performance on external dataset was also conducted. + +Three potential uses for NLG models in the clinical radiological workflow highlighted in this work include; +* use as a safety-net for radiologists to auto-fill positive findings if not included in the report by the radiologist, +* provide preliminary reports for acute CXRs to support junior doctors interpreting scans on the wards in the first instance whilst awaiting the radiologists +report communicating critical findings, +* automate follow up oncology scans, e.g. CT, reporting to provide a faster indication if a malignancy has progressed / quantifying response to therapy. 
+ + +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | [Github](https://github.com/nhsx/txt-ray-align) | +| Case Study | Awaiting Sign-Off | +| Technical report | [Here](https://github.com/nhsx/txt-ray-align/blob/main/report/TxtRayAlign_Report2_SH.pdf) | + +|:-|:-|:-| +||| diff --git a/docs/our_work/p32_phmdiabetes.md b/docs/our_work/p32_phmdiabetes.md new file mode 100644 index 00000000..453d398f --- /dev/null +++ b/docs/our_work/p32_phmdiabetes.md @@ -0,0 +1,34 @@ +--- +layout: base +title: Inequalities in Diabetes from PHM Data +permalink: p32_phmdiabetes.html +--- + +# {{page.title}} +> | "How to extract inequalities information from linked population health management data" + +

+ +

+

+ Figure 1: Workflow in ESNEFT tools to process the data into Lower Super Output Area granularity for mapping and analysis +

+ +A collaboration with East Suffolk and North Essex foundation trust (ESNEFT) to apply a suite of data science techniques to a large population health data including both primary and secondary care data. The aim of the project was to identify inequalities in diabetes care whilst making reusable code bases which can now be applied for different conditions and in different organisations. + +A small collection of code bases have been created to support the analysis of inequalities in diabetes services for a single locality based on linked population health data. + +## Results + +The project was able to both deliver new insights around drivers for inequalities as well as reproducible analytical pipelines in the code-bases. This means the code and approach can be reused for other diseases or could be adapted for different localities. + + +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | [DNA Risk Predict](https://github.com/nhsx/dna-risk-predict) & [Diabetes Inequalities](https://github.com/nhsx/p24-pvt-diabetes-inequal) & [Morbidity Network](https://github.com/nhsx/morbidity_network_analysis)| +| Case Study | Awaiting Sign-Off | +| Technical report | [Here](https://github.com/nhsx/ESNEFT_diabetes_StephenRicher/blob/main/stephen-richer-report.pdf) | +| Project Slides | [Here](https://github.com/nhsx/ESNEFT_diabetes_StephenRicher/blob/main/stephen-richer-slides.pdf) | + +|:-|:-|:-| +|||| diff --git a/docs/our_work/p33_patientsafetylms.md b/docs/our_work/p33_patientsafetylms.md new file mode 100644 index 00000000..f248c433 --- /dev/null +++ b/docs/our_work/p33_patientsafetylms.md @@ -0,0 +1,31 @@ +--- +layout: base +title: Investigating Applying and Evaluating a Language Model to Patient Safety Data +permalink: p33_patientsafetylms.html +--- + +# {{page.title}} +> | "What are the most suitable models and workflows for representing an NHS text dataset?" + +

+ +

+

+ Figure 1: Taken from DeCLUTR: Deep Contrastive Learning for Unsupervised Textual Representations - arXiv:2006.03659 +

+ +In collaboration with the NHS England patient safety data team, we present an exploration of a selection of different language model pretraining and finetuning objectives with patient safety incident reports as the domain of interest, followed by a discussion of a number of methods for probing and evaluating these new models, and their respective embedding spaces. + +## Results + +Results showed that the models trained on the patient safety incident reports using either the Masked Language Model (MLM) objective, or the MLM plus contrastive loss objective, appeared to have a superior performance on the presented pseudo-tasks when compared to their general domain equivalent. Whilst the performance in the frozen setting did not match that of the full fine-tuned setting, we have not performed a thorough investigation, for instance we could look to utilising larger base models. Further there are other examples of promising approaches which can better utilise frozen language models at scale, such as prompt learning and parameter efficient fine-tuning. + + +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | [Github](https://github.com/nhsx/ELM4PSIR) | +| Case Study | Awaiting Sign-Off | +| Technical report | [Here](https://github.com/nhsx/ELM4PSIR/blob/main/reports/ELM4PSIR_NT_v1.1.pdf) | + +|:-|:-|:-| +|| diff --git a/docs/our_work/p34_hypergraphs.md b/docs/our_work/p34_hypergraphs.md new file mode 100644 index 00000000..89d5e68f --- /dev/null +++ b/docs/our_work/p34_hypergraphs.md @@ -0,0 +1,31 @@ +--- +layout: base +title: Exploring Hypergraphs as a Technique for Understanding Impact of Co-Morbidities +permalink: p34_hypergraphs.html +--- + +# {{page.title}} +> | "In collaboration with Swansea University and the SAIL databank, this work focused on the generation of hypergraphs for investigating the individual and joint impact of comorbidities on a patient pathway. 
This work will feed into two future projects to continue the creation of directed hypergraphs and then apply graph neural networks to demonstrate the process of extracting useful insights from these data. " + +

+ +

+

+ Figure 1: +

+ + + +## Results + + + +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | Coming Soon | +| Case Study | Awaiting Sign-Off | +| Technical report | Coming Soon | + +|||| +|:-|:-|:-| +|||| diff --git a/docs/our_work/p42_hypergraphs2.md b/docs/our_work/p42_hypergraphs2.md new file mode 100644 index 00000000..4bb6122f --- /dev/null +++ b/docs/our_work/p42_hypergraphs2.md @@ -0,0 +1,31 @@ +--- +layout: base +title: Including Mortality in our Implementation of Hypergraphs +permalink: p42_hypergraphs2.html +--- + +# {{page.title}} +> | "A continuation of the previous work on hypergraphs that can extract the impact of predecessor and successor diseases on disease progression pathways. This work is envisaged to include an implicit relationship to demographics and consider the impact of mortality." + +

+ +

+

+ Figure 1: +

+ + + +## Results + + + +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | Coming Soon | +| Case Study | Awaiting Sign-Off | +| Technical report | Coming Soon | + +|||| +|:-|:-|:-| +|||| diff --git a/docs/our_work/p43_medcat.md b/docs/our_work/p43_medcat.md new file mode 100644 index 00000000..345ebdd8 --- /dev/null +++ b/docs/our_work/p43_medcat.md @@ -0,0 +1,31 @@ +--- +layout: base +title: Enriching Clinical Coding for Neurology Pathways using MedCAT +permalink: p43_medcat.html +--- + +# {{page.title}} +> | "In collaboration with Lancaster teaching hospital and the University of Lancaster we aim to apply MedCat (an automated named entity recognition with linkage algorithm) to neurology letters to identify related SNOMED CT coding." + +

+ +

+

+ Figure 1: +

+ + + +## Results + + + +| Output | Link | +| ---- | ---- | +| Open Source Code & Documentation | Coming Soon | +| Case Study | Awaiting Sign-Off | +| Technical report | Coming Soon | + +|||| +|:-|:-|:-| +|||| diff --git a/docs/outputs/.pages b/docs/outputs/.pages new file mode 100644 index 00000000..a6449d74 --- /dev/null +++ b/docs/outputs/.pages @@ -0,0 +1 @@ +# this page can be used to arrange the other pages and apply formatting to the nav etc. diff --git a/docs/outputs/index.md b/docs/outputs/index.md new file mode 100644 index 00000000..4aa9e613 --- /dev/null +++ b/docs/outputs/index.md @@ -0,0 +1,3 @@ +# Outputs + +> Pages describing any of our outputs, e.g. articles, research papers we've contributed to, systems, websites, etc... \ No newline at end of file diff --git a/docs/playbooks.md b/docs/playbooks.md new file mode 100644 index 00000000..b5a7cb76 --- /dev/null +++ b/docs/playbooks.md @@ -0,0 +1,8 @@ +--- +hide: + - navigation +--- + +# Playbooks + +* [RAP Playbook](https://nhsengland.github.io/Health-RAP-Playbook-Alpha/) \ No newline at end of file diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 00000000..c2be2146 --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,97 @@ +:root>* { + --md-primary-fg-color: #005eb8; + --md-default-bg-color: #f8f8f8; + -moz-osx-font-smoothing: grayscale; + -webkit-font-smoothing: antialiased; + background-color: #f8f8f8; + line-height: 1.5; + margin: 0; + min-height: 100%; + color: black; + --md-footer-bg-color: #f8f8f8; + --md-footer-fg-color: black; + --md-footer-fg-color--light: black; +} + +.md-typeset h1 { + color: black; + font-size: 2em; + line-height: 1.3; + margin: 0 0 1.25em; +} + +.md-footer { + background-color: #f6f8f8 !important; +} + +.md-copyright { + color: black; +} + +.md-logo-nhs { + height: auto; +} + +.md-header__button.md-logo-nhs :is(img, svg) { + width: 85px !important; + height: auto !important; +} + +code { + background: #e5f1fa 
!important; +} + +.md-ellipsis { + font-weight: bold; +} + +.footer { + display: flex; + padding: 30px 30px 20px 30px; + color: #2f2f2f; + background-color: #f6f8f8; + border-top: 3px solid #005eb8; +} + +.footer>* { + flex: 1 100%; +} + +.footer a { + color: #005eb8; +} + +.footer_left p { + font-size: 15px; + font-weight: 400; + margin: 1vh; +} + +.legal { + color: #999; + font-size: 14px; + justify-content: right; + text-align: right; + align-items: right; +} + +@media screen and (min-width: 24.375em) { + .legal .legal__links { + margin-left: auto; + } +} + +table { + background-color: white !important; +} + +:target:before { + content: ""; + display: block; + height: 40px; + margin: -40px 0 0; +} + +html { + scroll-behavior: smooth; +} diff --git a/docs/tools/.pages b/docs/tools/.pages new file mode 100644 index 00000000..a6449d74 --- /dev/null +++ b/docs/tools/.pages @@ -0,0 +1 @@ +# this page can be used to arrange the other pages and apply formatting to the nav etc. diff --git a/docs/tools/index.md b/docs/tools/index.md new file mode 100644 index 00000000..e69de29b diff --git a/docs/useful_links.md b/docs/useful_links.md new file mode 100644 index 00000000..f05b27c2 --- /dev/null +++ b/docs/useful_links.md @@ -0,0 +1,8 @@ +--- +hide: + - navigation +--- + +# Useful links + +> any useful links to other pages, sites, etc. \ No newline at end of file diff --git a/docs/what_is_data_science/.pages b/docs/what_is_data_science/.pages new file mode 100644 index 00000000..a6449d74 --- /dev/null +++ b/docs/what_is_data_science/.pages @@ -0,0 +1 @@ +# this page can be used to arrange the other pages and apply formatting to the nav etc. 
diff --git a/docs/what_is_data_science/Benefits of Data Science in the NHS.md b/docs/what_is_data_science/Benefits of Data Science in the NHS.md new file mode 100644 index 00000000..e69de29b diff --git a/docs/what_is_data_science/How you can learn Data Science.md b/docs/what_is_data_science/How you can learn Data Science.md new file mode 100644 index 00000000..e69de29b diff --git a/docs/what_is_data_science/index.md b/docs/what_is_data_science/index.md new file mode 100644 index 00000000..613cd4c8 --- /dev/null +++ b/docs/what_is_data_science/index.md @@ -0,0 +1,3 @@ +# What is Data Science? + +> A page describing what data science is. \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..26405320 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,94 @@ +site_name: NHS Data Science +site_url: https://nhsengland.github.io/datascience +repo_url: https://github.com/NHSEngland/datascience +repo_name: NHS England Data Science +edit_uri: edit/main/docs/ +plugins: + - git-revision-date-localized: + fallback_to_build_date: true + - search + - awesome-pages + - mkdocs-jupyter: + include_source: True + theme: dark +nav: + - Home: index.md + - What is Data Science?: + - ... | what_is_data_science/*.md + - Articles: + - ... | articles/*.md + - Case Studies / Projects: + - ... | our_work/*.md + - Tools: + - ... 
| tools/*.md + - Playbooks: playbooks.md + - Codebases: codebases.md + - About the team: about.md + - Meta Page: meta_page.md + - Useful links: useful_links.md +theme: + name: material + custom_dir: overrides + palette: + scheme: default + primary: indigo + font: + text: Arial + logo: images/logo/nhs-blue-on-white.jpg + favicon: images/favicon/favicon.ico + features: + - search.share + - content.code.annotate + - content.tabs.link + - navigation.tabs + - navigation.tabs.sticky + - navigation.indexes + - navigation.path + - navigation.top + icon: + admonition: + : material/alert +extra_css: + - stylesheets/extra.css +markdown_extensions: + - pymdownx.tasklist: + # clickable_checkbox: true + custom_checkbox: true + - tables + - attr_list + - pymdownx.emoji: + emoji_index: !!python/name:materialx.emoji.twemoji + emoji_generator: !!python/name:materialx.emoji.to_svg + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format + - pymdownx.tabbed: + alternate_style: true + - pymdownx.arithmatex: + generic: true + - admonition + - pymdownx.details + - pymdownx.critic + - pymdownx.caret + - pymdownx.keys + - pymdownx.mark + - pymdownx.tilde +extra: + social: + - icon: fontawesome/brands/github + link: https://github.com/NHSDigital/data_science_site + name: NHS Digital Analytical Services + generator: false +copyright: Copyright © 2023 Crown Copyright (NHS England) +extra_javascript: + - javascripts/mathjax.js + - https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6 + - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js +watch: + - overrides diff --git a/overrides/home.html b/overrides/home.html new file mode 100644 index 00000000..5bff8279 --- /dev/null +++ b/overrides/home.html @@ -0,0 +1,384 @@ +{% extends "main.html" %} {% block tabs %} {{ super() }} + + + + +
+
+
+
+ +
+ +
+

NHS Data Science

+ +

+ The latest techniques, applied to real business problems for the benefit of the patient. +

+ + + What is Data Science? + + + + See how our work has benefited the NHS + + + + How can I learn Data Science? +
+
+
+
+ + + + + + + + + + + +{% endblock %} {% block content %}{% endblock %} {% block footer %} {{ super() +}} {% endblock %} diff --git a/overrides/main.html b/overrides/main.html new file mode 100644 index 00000000..f223d92c --- /dev/null +++ b/overrides/main.html @@ -0,0 +1,11 @@ +{% extends "base.html" %} + +{% block content %} +{% if page.nb_url %} + + {% include ".icons/material/download.svg" %} + +{% endif %} + +{{ super() }} +{% endblock content %} \ No newline at end of file diff --git a/overrides/partials/footer.html b/overrides/partials/footer.html new file mode 100644 index 00000000..c9c1391d --- /dev/null +++ b/overrides/partials/footer.html @@ -0,0 +1,83 @@ + + + + + + + + diff --git a/overrides/partials/header.html b/overrides/partials/header.html new file mode 100644 index 00000000..c86999a8 --- /dev/null +++ b/overrides/partials/header.html @@ -0,0 +1,158 @@ + + + +{% set class = "md-header" %} +{% if "navigation.tabs.sticky" in features %} + {% set class = class ~ " md-header--lifted" %} +{% endif %} + + +
+ + + + {% if "navigation.tabs.sticky" in features %} + {% if "navigation.tabs" in features %} + {% include "partials/tabs.html" %} + {% endif %} + {% endif %} +
\ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..0fc3a6e2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,32 @@ +Babel +click +colorama +ghp-import +gitdb +GitPython +importlib-metadata +Jinja2 +Markdown +MarkupSafe +mergedeep +mkdocs +mkdocs-git-revision-date-localized-plugin +mkdocs-material +mkdocs-material-extensions +mkdocs-jupyter +mkdocs-awesome-pages-plugin +packaging +pyparsing +Pygments +pymdown-extensions +python-dateutil +pytz +PyYAML +pyyaml_env_tag +setuptools +six +smmap +typing_extensions +watchdog +wheel +zipp \ No newline at end of file