diff --git a/.gitignore b/.gitignore index dfdafb3810..cc93144f7d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,10 @@ mimic-iv-1.0 scrap *.gzip *.csv.gz -*summary*.txt \ No newline at end of file +*summary*.txt +venv +__pycache__ +raw_data +preproc_data +data +*.csv \ No newline at end of file diff --git a/README copy.md b/README copy.md new file mode 100644 index 0000000000..1101cdf712 --- /dev/null +++ b/README copy.md @@ -0,0 +1,95 @@ +# MIMIC-IV +**MIMIC-IV data pipeline** is an end-to-end pipeline that offers a configurable framework to prepare MIMIC-IV data for the downstream tasks. +The pipeline cleans the raw data by removing outliers and allowing users to impute missing entries. +It also provides options for the clinical grouping of medical features using standard coding systems for dimensionality reduction. +All of these options are customizable for the users, allowing them to generate a personalized patient cohort. +The customization steps can be recorded for the reproducibility of the overall framework. +The pipeline produces a smooth time-series dataset by binning the sequential data into equal-length time intervals and allowing for filtering of the time-series length according to the user's preferences. +Besides the data processing modules, our pipeline also includes two additional modules for modeling and evaluation. +For modeling, the pipeline includes several commonly used sequential models for performing prediction tasks. +The evaluation module offers a series of standard methods for evaluating the performance of the created models. +This module also includes options for reporting individual and group fairness measures. + +##### Citing MIMIC-IV Data Pipeline: +MIMIC-IV Data Pipeline is available on [ML4H](https://proceedings.mlr.press/v193/gupta22a/gupta22a.pdf). +If you use MIMIC-IV Data Pipeline, we would appreciate citations to the following paper. 
+ +``` +@InProceedings{gupta2022extensive, + title = {{An Extensive Data Processing Pipeline for MIMIC-IV}}, + author = {Gupta, Mehak and Gallamoza, Brennan and Cutrona, Nicolas and Dhakal, Pranjal and Poulain, Raphael and Beheshti, Rahmatollah}, + booktitle = {Proceedings of the 2nd Machine Learning for Health symposium}, + pages = {311--325}, + year = {2022}, + volume = {193}, + series = {Proceedings of Machine Learning Research}, + month = {28 Nov}, + publisher = {PMLR}, + url = {https://proceedings.mlr.press/v193/gupta22a.html} +} +``` + +## Table of Contents: +- [Steps to download MIMIC-IV dataset for the pipeline](#Steps-to-download-MIMIC-IV-dataset-for-the-pipeline) +- [Repository Structure](#Repository-Structure) +- [How to use the pipeline?](#How-to-use-the-pipeline) + +### Steps to download MIMIC-IV dataset for the pipeline + +Go to https://physionet.org/content/mimiciv/1.0/ + +Follow instructions to get access to MIMIC-IV dataset. + +Download the files using your terminal: wget -r -N -c -np --user mehakg --ask-password https://physionet.org/files/mimiciv/1.0/ + +### Repository Structure + +- **mainPipeline.ipynb** + is the main file to interact with the pipeline. It provides step-by-step options to extract and pre-process cohorts. +- **./data** + consists of all data files stored during pre-processing + - **./cohort** + consists of files saved during cohort extraction + - **./features** + consists of files containing features data for all selected features. + - **./summary** + consists of summary files for all features. + It also consists of file with list of variables in all features and can be used for feature selection. + - **./dict** + consists of dictionary structured files for all features obtained after time-series representation + - **./output** + consists of output files saved after training and testing of model. These files are used during evaluation. +- **./mimic-iv-1.0** + consists of files downloaded from MIMIC-IV website. 
+- **./saved_models** + consists of models saved during training. +- **./preprocessing** + - **./day_intervals_preproc** + - **day_intervals_cohort.py** file is used to extract samples, labels and demographic data for cohorts. + - **disease_cohort.py** is used to filter samples based on diagnoses codes at time of admission + - **./hosp_module_preproc** + - **feature_selection_hosp.py** is used to extract, clean and summarize selected features for non-ICU data. + - **feature_selection_icu.py** is used to extract, clean and summarize selected features for ICU data. +- **./model** + - **train.py** + consists of code to create batches of data according to batch_size and create, train and test different models. + - **Mimic_model.py** + consists of different model architectures. + - **evaluation.py** + consists of class to perform evaluation of results obtained from models. + This class can be instantiated separately for use as standalone module. + - **fairness.py** + consists of code to perform fairness evaluation. + It can also be used as standalone module. + - **parameters.py** + consists of list of hyperparameters to be defined for model training. + - **callibrate_output** + consists of code to calibrate model output. + It can also be used as standalone module. + +### How to use the pipeline? +- After downloading the repo, open **mainPipeline.ipynb**. +- **mainPipeline.ipynb** contains sequential code blocks to extract, preprocess, model and train MIMIC-IV EHR data. +- Follow each code block and read instructions given just before each code block to run code block. +- Follow the exact file paths and filenames given in instructions for each code block to run the pipeline. +- For evaluation module, clear instructions are provided on how to use it as a standalone module. 
diff --git a/README.md b/README.md index 1101cdf712..c195fffbd9 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,25 @@ Download the files using your terminal: wget -r -N -c -np --user mehakg --ask-pa ### How to use the pipeline? - After downloading the repo, open **mainPipeline.ipynb**. - **mainPipeline.ipynb**, contains sequential code blocks to extract, preprocess, model and train MIMIC-IV EHR data. -- Follow each code bloack and read intructions given just before each code block to run code block. +- Follow each code block and read intructions given just before each code block to run code block. - Follow the exact file paths and filenames given in instructions for each code block to run the pipeline. - For evaluation module, clear instructions are provided on how to use it as a standalone module. + +### Pipeline details + +#### Cohort extraction +Options: +- use icu data + + +#### Feature extraction + +#### Feature preprocessing + +##### Preprocessing + +##### Summary + +##### Selection + +##### Event Cleaning \ No newline at end of file diff --git a/_old_requirements.txt b/_old_requirements.txt new file mode 100644 index 0000000000..58020deb8b --- /dev/null +++ b/_old_requirements.txt @@ -0,0 +1,9 @@ +import_ipynb==0.1.3 +ipywidgets==7.5.1 +Jinja2==2.11.2 +matplotlib==3.2.2 +numpy==1.18.5 +pandas==1.0.5 +scikit_learn==1.0.2 +torch==1.6.0 +tqdm==4.47.0 diff --git a/mainPipeline.ipynb b/mainPipeline.ipynb index 4573ff0821..8ceb7f0a98 100644 --- a/mainPipeline.ipynb +++ b/mainPipeline.ipynb @@ -2,8 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 157, - "id": "available-albany", + "execution_count": 1, + "id": "4b80a7cf-e155-48b2-840e-4cc5101d6984", "metadata": {}, "outputs": [], "source": [ @@ -11,9 +11,16 @@ "import sys\n", "from pathlib import Path\n", "import os\n", - "import importlib\n", - "\n", - "\n", + "import importlib" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "available-albany", + "metadata": {}, + "outputs": 
[], + "source": [ "module_path='preprocessing/day_intervals_preproc'\n", "if module_path not in sys.path:\n", " sys.path.append(module_path)\n", @@ -70,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 3, "id": "nutritional-chicago", "metadata": {}, "outputs": [], @@ -173,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "id": "structured-dimension", "metadata": { "tags": [ @@ -196,7 +203,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "71ef24fe5a444ebb8191a30f09302791", + "model_id": "73670930477c4c639717c622d624b049", "version_major": 2, "version_minor": 0 }, @@ -217,7 +224,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5d7ae412aa5f4976a0c630efa469f694", + "model_id": "17de961b447c4005aac3dcbef993546a", "version_major": 2, "version_minor": 0 }, @@ -273,38 +280,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "broke-spirituality", "metadata": {}, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f3e3ebb6ac2b44919e40f658a661a986", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "RadioButtons(options=('Length of Stay ge 3', 'Length of Stay ge 7', 'Custom'), value='Length of Stay ge 3')" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0ef6884738014f2c840370f7b264656c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(Label(value='Length of stay ge (in days)', layout=Layout(width='180px')), IntSlider(value=3, co…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -316,7 +295,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5056acfb82184e948c1e3079a6ffd597", + "model_id": "c21fd5c8416f49c2822bce9081e011f2", "version_major": 2, 
"version_minor": 0 }, @@ -337,7 +316,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0bd8a7d96b9a41a5bdfe3c5506e3b058", + "model_id": "20eb962a8b80459bae218ffda503ea74", "version_major": 2, "version_minor": 0 }, @@ -395,7 +374,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "republican-freight", "metadata": {}, "outputs": [ @@ -403,16 +382,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "===========MIMIC-IV v1.0============\n", - "EXTRACTING FOR: | ICU | LENGTH OF STAY | 3 |\n", - "[ LOS LABELS FINISHED ]\n", + "===========MIMIC-IV v2.0============\n", + "EXTRACTING FOR: | ICU | MORTALITY | 0 |\n", + "[ MORTALITY LABELS FINISHED ]\n", "[ COHORT SUCCESSFULLY SAVED ]\n", "[ SUMMARY SUCCESSFULLY SAVED ]\n", - "Length of Stay FOR ICU DATA\n", - "# Admission Records: 76540\n", - "# Patients: 53150\n", - "# Positive cases: 24397\n", - "# Negative cases: 52143\n" + "Mortality FOR ICU DATA\n", + "# Admission Records: 140\n", + "# Patients: 100\n", + "# Positive cases: 10\n", + "# Negative cases: 130\n" ] } ], @@ -471,14 +450,41 @@ " version_path=\"mimiciv/1.0\"\n", " cohort_output = day_intervals_cohort.extract_data(radio_input1.value,label,time,icd_code, root_dir,disease_label)\n", "elif version.value=='Version 2':\n", - " version_path=\"mimiciv/2.0\"\n", + " version_path=\"mimiciv\"\n", " cohort_output = day_intervals_cohort_v2.extract_data(radio_input1.value,label,time,icd_code, root_dir,disease_label)" ] }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('ICU',\n", + " 'Mortality',\n", + " 0,\n", + " 'No Disease Filter',\n", + " 'd:\\\\Work\\\\Repos\\\\MIMIC-IV-Data-Pipeline',\n", + " '')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "radio_input1.value,label,time,icd_code, root_dir,disease_label" + ] + }, { "cell_type": "markdown", "id": 
"interstate-stadium", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "## 2. FEATURE SELECTION\n", "Features available for ICU data -\n", @@ -501,7 +507,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "raised-olympus", "metadata": {}, "outputs": [ @@ -516,7 +522,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c7a96185f866472ba7d6f415fc1c7e7f", + "model_id": "d91444a4a11144a4abb5fc2e9281ee42", "version_major": 2, "version_minor": 0 }, @@ -530,7 +536,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "116cd5d5738745c1a7c4c30a3affdf10", + "model_id": "f919bf191891416d94039f9076a4a7f4", "version_major": 2, "version_minor": 0 }, @@ -544,7 +550,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "11fe8d7e1ad84f23a61e81b0c0fb53e5", + "model_id": "97ff8f2f6fd04245980b5362c537b399", "version_major": 2, "version_minor": 0 }, @@ -558,7 +564,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "bce5452627164365b27b6ea2e2eab349", + "model_id": "6dff2879bcc94b58b646bf29af7b7ded", "version_major": 2, "version_minor": 0 }, @@ -572,7 +578,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "02b89f4a21684c70a40564454d2cce67", + "model_id": "f459aff3d4794b85bd95a7669e81fdf0", "version_major": 2, "version_minor": 0 }, @@ -620,7 +626,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "native-covering", "metadata": { "scrolled": true @@ -631,31 +637,17 @@ "output_type": "stream", "text": [ "[EXTRACTING DIAGNOSIS DATA]\n", - "# unique ICD-9 codes 6686\n", - "# unique ICD-10 codes 10120\n", - "# unique ICD-10 codes (After converting ICD-9 to ICD-10) 10414\n", - "# unique ICD-10 codes (After clinical gruping ICD-10 codes) 1522\n", - "# Admissions: 76504\n", - "Total rows 1362068\n", + "# unique ICD-9 codes 539\n", + "# unique ICD-10 codes 508\n", + "# 
unique ICD-10 codes (After converting ICD-9 to ICD-10) 689\n", + "# unique ICD-10 codes (After clinical gruping ICD-10 codes) 388\n", + "# Admissions: 140\n", + "Total rows 2647\n", "[SUCCESSFULLY SAVED DIAGNOSIS DATA]\n", "[EXTRACTING OUPTPUT EVENTS DATA]\n", - "# Unique Events: 71\n", - "# Admissions: 74364\n", - "Total rows 4457381\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\r", - "0it [00:00, ?it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "# Unique Events: 39\n", + "# Admissions: 137\n", + "Total rows 9362\n", "[SUCCESSFULLY SAVED OUPTPUT EVENTS DATA]\n", "[EXTRACTING CHART EVENTS DATA]\n" ] @@ -664,26 +656,26 @@ "name": "stderr", "output_type": "stream", "text": [ - "33it [06:39, 12.12s/it]\n" + "1it [00:00, 1.11it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "# Unique Events: 454\n", - "# Admissions: 76529\n", - "Total rows 81030530\n", + "# Unique Events: 298\n", + "# Admissions: 140\n", + "Total rows 162571\n", "[SUCCESSFULLY SAVED CHART EVENTS DATA]\n", "[EXTRACTING PROCEDURES DATA]\n", - "# Unique Events: 157\n", - "# Admissions: 76041\n", - "Total rows 713377\n", + "# Unique Events: 82\n", + "# Admissions: 138\n", + "Total rows 1435\n", "[SUCCESSFULLY SAVED PROCEDURES DATA]\n", "[EXTRACTING MEDICATIONS DATA]\n", - "# of unique type of drug: 196\n", - "# Admissions: 72118\n", - "# Total rows 5078987\n", + "# of unique type of drug: 76\n", + "# Admissions: 136\n", + "# Total rows 11038\n", "[SUCCESSFULLY SAVED MEDICATIONS DATA]\n" ] } @@ -704,10 +696,21 @@ " feature_nonicu(cohort_output, version_path,diag_flag,lab_flag,proc_flag,med_flag)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cohort_output, version_path,diag_flag,out_flag,chart_flag,proc_flag,med_flag" + ] + }, { "cell_type": "markdown", "id": "aboriginal-upset", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, 
"source": [ "## 3. CLINICAL GROUPING\n", "Below you will have option to clinically group diagnosis and medications.\n", @@ -720,7 +723,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 14, "id": "partial-manhattan", "metadata": {}, "outputs": [ @@ -734,7 +737,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1c9195ae736744d7a79960cbfe87b95d", + "model_id": "f08060562dcd42f69f99300fc590dedc", "version_major": 2, "version_minor": 0 }, @@ -778,7 +781,34 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('cohort_icu_mortality_0_',\n", + " True,\n", + " 'Convert ICD-9 to ICD-10 and group ICD-10 codes',\n", + " False,\n", + " False,\n", + " False,\n", + " 0,\n", + " 0)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cohort_output, diag_flag, group_diag,False,False,False,0,0" + ] + }, + { + "cell_type": "code", + "execution_count": 16, "id": "descending-symphony", "metadata": {}, "outputs": [ @@ -787,7 +817,7 @@ "output_type": "stream", "text": [ "[PROCESSING DIAGNOSIS DATA]\n", - "Total number of rows 1289600\n", + "Total number of rows 2504\n", "[SUCCESSFULLY SAVED DIAGNOSIS DATA]\n" ] } @@ -830,7 +860,27 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(True, True, True, True, True)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(diag_flag,proc_flag,med_flag,out_flag,chart_flag)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, "id": "thick-residence", "metadata": {}, "outputs": [ @@ -839,19 +889,6 @@ "output_type": "stream", "text": [ "[GENERATING FEATURE SUMMARY]\n", - " subject_id hadm_id starttime stoptime drug_name \\\n", - "0 17868682 22726960 2160-01-07 10:00:00 2160-01-07 
16:00:00 aspirin \n", - "1 17067646 20845642 2159-02-23 10:00:00 2159-02-26 23:00:00 aspirin \n", - "2 17067646 25358552 2159-08-08 10:00:00 2159-08-13 18:00:00 aspirin \n", - "3 13359788 27483342 2143-11-22 19:00:00 2143-11-23 19:00:00 aspirin \n", - "4 15346117 20604717 2195-01-21 10:00:00 2195-01-24 21:00:00 aspirin \n", - "\n", - " start_hours_from_admit stop_hours_from_admit dose_val_rx \n", - "0 -1 days +22:00:00.000000000 0 days 04:00:00.000000000 81 \n", - "1 0 days 12:49:00.000000000 4 days 01:49:00.000000000 81 \n", - "2 -1 days +13:54:00.000000000 4 days 21:54:00.000000000 81 \n", - "3 0 days 02:59:00.000000000 1 days 02:59:00.000000000 81 \n", - "4 -1 days +13:23:00.000000000 3 days 00:23:00.000000000 81 \n", "[SUCCESSFULLY SAVED FEATURE SUMMARY]\n" ] } @@ -866,7 +903,9 @@ { "cell_type": "markdown", "id": "northern-architecture", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "## 5. Feature Selection\n", "\n", @@ -881,7 +920,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 20, "id": "immediate-seafood", "metadata": {}, "outputs": [ @@ -896,7 +935,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3d5b89b25a7d41a38b994eecc9550f69", + "model_id": "831fcd09d5a24893b587cd2cc4947d56", "version_major": 2, "version_minor": 0 }, @@ -918,7 +957,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c97cc3c6313246d68cefd652eca49cfa", + "model_id": "8a6c9e9e2cfc4b63962ab1dab3662e88", "version_major": 2, "version_minor": 0 }, @@ -940,7 +979,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7c1d627a06e648ada8d44f90f7ae4f04", + "model_id": "85eed2e5a1834402a772379a7e388faa", "version_major": 2, "version_minor": 0 }, @@ -962,7 +1001,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2b5a02b857cd4266a63bdbab772d050e", + "model_id": "dc4cf06db0a649f5b928a754d85b661c", "version_major": 2, 
"version_minor": 0 }, @@ -984,7 +1023,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "09f56b608e144b88bd975b743a7cd075", + "model_id": "8af1bced7fc745409b7db3728ccc3c5e", "version_major": 2, "version_minor": 0 }, @@ -1047,10 +1086,32 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 21, "id": "perceived-python", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[FEATURE SELECTION DIAGNOSIS DATA]\n", + "Total number of rows 2504\n", + "[SUCCESSFULLY SAVED DIAGNOSIS DATA]\n", + "[FEATURE SELECTION MEDICATIONS DATA]\n", + "Total number of rows 11038\n", + "[SUCCESSFULLY SAVED MEDICATIONS DATA]\n", + "[FEATURE SELECTION PROCEDURES DATA]\n", + "Total number of rows 1435\n", + "[SUCCESSFULLY SAVED PROCEDURES DATA]\n", + "[FEATURE SELECTION OUTPUT EVENTS DATA]\n", + "Total number of rows 9362\n", + "[SUCCESSFULLY SAVED OUTPUT EVENTS DATA]\n", + "[FEATURE SELECTION CHART EVENTS DATA]\n", + "Total number of rows 162571\n", + "[SUCCESSFULLY SAVED CHART EVENTS DATA]\n" + ] + } + ], "source": [ "select_diag=False\n", "select_med=False\n", @@ -1083,10 +1144,42 @@ " features_selection_hosp(cohort_output, diag_flag,proc_flag,med_flag,lab_flag,select_diag,select_med,select_proc,select_lab)" ] }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('cohort_icu_mortality_0_',\n", + " True,\n", + " True,\n", + " True,\n", + " True,\n", + " True,\n", + " True,\n", + " True,\n", + " True,\n", + " True,\n", + " True)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(cohort_output, diag_flag,proc_flag,med_flag,out_flag, chart_flag,select_diag,select_med,select_proc,select_out,select_chart)" + ] + }, { "cell_type": "markdown", "id": "comfortable-director", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, 
"source": [ "## 6. CLEANING OF FEATURES\n", "Below you will have option to to clean lab and chart events by performing outlier removal and unit conversion.\n", @@ -1098,7 +1191,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "moderate-forum", "metadata": {}, "outputs": [ @@ -1112,7 +1205,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "222126e0df6d44f88fe98d2909d23129", + "model_id": "7edefbfc026d4eae9c732f1d82a70ce3", "version_major": 2, "version_minor": 0 }, @@ -1126,7 +1219,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5fc77b72dfae4b9d9e9fb66e0a4a6234", + "model_id": "778ebe54223d4e0b8e0e0c60ceb02555", "version_major": 2, "version_minor": 0 }, @@ -1140,7 +1233,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "cbd8497ab8684c258d21dd3c5c965aed", + "model_id": "dd1977db5d754e78859692220234400d", "version_major": 2, "version_minor": 0 }, @@ -1214,7 +1307,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 24, "id": "impossible-mailman", "metadata": {}, "outputs": [ @@ -1223,7 +1316,7 @@ "output_type": "stream", "text": [ "[PROCESSING CHART EVENTS DATA]\n", - "Total number of rows 4892842\n", + "Total number of rows 162571\n", "[SUCCESSFULLY SAVED CHART EVENTS DATA]\n" ] } @@ -1246,10 +1339,32 @@ " preprocess_features_hosp(cohort_output, False,False,False,lab_flag,False,False,False,clean_lab,impute_outlier,thresh,left_thresh)" ] }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('cohort_icu_mortality_0_', False, False, True, True, True, 98, 0)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(cohort_output, False, False,chart_flag,clean_chart,impute_outlier_chart,thresh,left_thresh)" + ] + }, { "cell_type": "markdown", "id": "independent-academy", - "metadata": {}, + "metadata": { 
+ "jp-MarkdownHeadingCollapsed": true + }, "source": [ "## 7. Time-Series Representation\n", "In this section, please choose how you want to process and represent time-series data.\n", @@ -1274,7 +1389,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 26, "id": "mechanical-three", "metadata": {}, "outputs": [ @@ -1289,12 +1404,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0ff4a416c46e4b0e8c44438d7194b616", + "model_id": "cf03952437504aa6b000af62ca5542aa", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "RadioButtons(index=1, options=('First 12 hours', 'First 24 hours', 'Custom'), value='First 24 hours')" + "RadioButtons(options=('First 72 hours', 'First 48 hours', 'First 24 hours', 'Custom'), value='First 72 hours')" ] }, "metadata": {}, @@ -1303,7 +1418,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "24c44d3c094e4ee095172e3721728436", + "model_id": "49077fdbb6ce49f5a2f910dc856bbe03", "version_major": 2, "version_minor": 0 }, @@ -1324,7 +1439,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "40be412e210240c49a62c25414254b3e", + "model_id": "1be8ec6f9848424cae38b4fee674df65", "version_major": 2, "version_minor": 0 }, @@ -1338,7 +1453,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f190f26d47fc4cc7ae489e0677fb63f4", + "model_id": "07db34eba6c94bad96c31b65cfb2a516", "version_major": 2, "version_minor": 0 }, @@ -1359,7 +1474,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "466fa54671a3463c96a0d2405dc39bc6", + "model_id": "e6d9d12a47494e72852aff8ca363fc76", "version_major": 2, "version_minor": 0 }, @@ -1370,6 +1485,41 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "If you have choosen mortality prediction task, then what prediction window length you want to keep?\n" + ] + }, + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "8f76d3ae14044a11a71d59a5839bed9c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(options=('2 hours', '4 hours', '6 hours', '8 hours', 'Custom'), value='2 hours')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "235fdea0b4cb40cbb85feed9aca459e3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(Label(value='Prediction window (in hours)', layout=Layout(width='180px')), IntSlider(value=2, m…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stdout", "output_type": "stream", @@ -1454,7 +1604,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 27, "id": "indie-appendix", "metadata": {}, "outputs": [ @@ -1465,21 +1615,7 @@ "[ READ COHORT ]\n", "[ ======READING DIAGNOSIS ]\n", "[ ======READING PROCEDURES ]\n", - "[ ======READING OUT EVENTS ]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\r", - "0it [00:00, ?it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "[ ======READING OUT EVENTS ]\n", "[ ======READING CHART EVENTS ]\n" ] }, @@ -1487,7 +1623,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "17it [08:07, 28.66s/it]\n" + "1it [00:01, 1.10s/it]\n" ] }, { @@ -1496,7 +1632,7 @@ "text": [ "[ ======READING MEDICATIONS ]\n", "[ READ ALL FEATURES ]\n", - "include_time 24\n", + "include_time 72\n", "[ PROCESSED TIME SERIES TO EQUAL LENGTH ]\n" ] }, @@ -1504,7 +1640,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 24/24 [00:20<00:00, 1.15it/s]\n" + "100%|██████████| 72/72 [00:00<00:00, 79.20it/s]\n" ] }, { @@ -1513,14 +1649,14 @@ "text": [ "bucket 1\n", "[ PROCESSED TIME SERIES TO EQUAL TIME INTERVAL ]\n", - "24\n" + "72\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 
60494/60494 [10:16:42<00:00, 1.63it/s] \n" + "100%|██████████| 54/54 [00:09<00:00, 5.58it/s]" ] }, { @@ -1529,6 +1665,13 @@ "text": [ "[ SUCCESSFULLY SAVED DATA DICTIONARIES ]\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] } ], "source": [ @@ -1560,10 +1703,44 @@ " gen=data_generation.Generator(cohort_output,data_mort,data_admn,data_los,diag_flag,lab_flag,proc_flag,med_flag,impute,include,bucket,predW)" ] }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('cohort_icu_mortality_0_',\n", + " True,\n", + " False,\n", + " False,\n", + " True,\n", + " True,\n", + " True,\n", + " True,\n", + " True,\n", + " False,\n", + " 72,\n", + " 1,\n", + " 2)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(cohort_output,data_mort,data_admn,data_los,diag_flag,proc_flag,out_flag,chart_flag,med_flag,impute,include,bucket,predW)" + ] + }, { "cell_type": "markdown", "id": "lined-reset", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "## 8. 
Machine Learning Models\n", "\n", @@ -1577,7 +1754,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 25, "id": "consolidated-former", "metadata": {}, "outputs": [ @@ -1591,7 +1768,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "dc736b979c044a84a3348b8d17c936b3", + "model_id": "9fd49d0d536d4afea1b8635a59f5eac3", "version_major": 2, "version_minor": 0 }, @@ -1612,7 +1789,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f8f67b6fe390466cb8565f91909d9825", + "model_id": "41a7bab9c71d4b9fb888e44a09aabde2", "version_major": 2, "version_minor": 0 }, @@ -1633,7 +1810,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c9c36f7c64774ed6a0d0591056a39fc8", + "model_id": "3c0600abf96745beb66d2cd62f979afb", "version_major": 2, "version_minor": 0 }, @@ -1654,7 +1831,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6006248cbfe84600974598218b4dd64a", + "model_id": "983178aa7dc84480a1c622890bc24241", "version_major": 2, "version_minor": 0 }, @@ -1683,7 +1860,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 26, "id": "promising-miller", "metadata": { "scrolled": true @@ -1693,59 +1870,43 @@ "name": "stdout", "output_type": "stream", "text": [ - "Total Samples 500\n", - "Positive Samples 233\n", + "Total Samples 117\n", + "Positive Samples 20\n", "=============OVERSAMPLING===============\n", - "Total Samples 534\n", - "Positive Samples 267\n", + "Total Samples 194\n", + "Positive Samples 97\n", "=================== 0 FOLD=====================\n", - "train_hids 424\n", - "X_df (424, 21560)\n", - "y_df (424,)\n", - "(424, 21560)\n", - "(424,)\n", - "test_hids 106\n", - "X_df (106, 21560)\n", - "y_df (106,)\n", - "(106, 21560)\n", - "(106,)\n", - "===============MODEL TRAINING===============\n", - "BCE Loss: 1.38\n", - "AU-ROC: 0.79\n", - "AU-PRC: 0.85\n", - "AU-PRC Baaseline: 0.55\n", - "Accuracy: 0.75\n", - "Precision: 
0.82\n", - "Recall: 0.71\n", - "Specificity: 0.81\n", - "NPV: 0.70\n", - "ECE: 0.17\n", - "MCE: 0.35\n" + "train_hids 152\n", + "X_df (152, 9824)\n", + "y_df (152,)\n", + "(152, 9824)\n", + "(152,)\n", + "test_hids 38\n", + "X_df (38, 9824)\n", + "y_df (38,)\n" ] }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" + "ename": "ValueError", + "evalue": "y contains previously unseen labels: 'BLACK/CAPE VERDEAN'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\venv\\Lib\\site-packages\\sklearn\\utils\\_encode.py:225\u001b[0m, in \u001b[0;36m_encode\u001b[1;34m(values, uniques, check_unknown)\u001b[0m\n\u001b[0;32m 224\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 225\u001b[0m \u001b[39mreturn\u001b[39;00m _map_to_integer(values, uniques)\n\u001b[0;32m 226\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mKeyError\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\venv\\Lib\\site-packages\\sklearn\\utils\\_encode.py:165\u001b[0m, in \u001b[0;36m_map_to_integer\u001b[1;34m(values, uniques)\u001b[0m\n\u001b[0;32m 164\u001b[0m table \u001b[39m=\u001b[39m _nandict({val: i \u001b[39mfor\u001b[39;00m i, val \u001b[39min\u001b[39;00m \u001b[39menumerate\u001b[39m(uniques)})\n\u001b[1;32m--> 165\u001b[0m \u001b[39mreturn\u001b[39;00m np\u001b[39m.\u001b[39marray([table[v] \u001b[39mfor\u001b[39;49;00m v \u001b[39min\u001b[39;49;00m values])\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\venv\\Lib\\site-packages\\sklearn\\utils\\_encode.py:165\u001b[0m, in \u001b[0;36m\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 164\u001b[0m table \u001b[39m=\u001b[39m _nandict({val: i \u001b[39mfor\u001b[39;00m i, val \u001b[39min\u001b[39;00m \u001b[39menumerate\u001b[39m(uniques)})\n\u001b[1;32m--> 165\u001b[0m \u001b[39mreturn\u001b[39;00m np\u001b[39m.\u001b[39marray([table[v] \u001b[39mfor\u001b[39;00m v \u001b[39min\u001b[39;00m values])\n", + "File 
\u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\venv\\Lib\\site-packages\\sklearn\\utils\\_encode.py:159\u001b[0m, in \u001b[0;36m_nandict.__missing__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 158\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mnan_value\n\u001b[1;32m--> 159\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mKeyError\u001b[39;00m(key)\n", + "\u001b[1;31mKeyError\u001b[0m: 'BLACK/CAPE VERDEAN'", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\mainPipeline.ipynb Cell 33\u001b[0m line \u001b[0;36m7\n\u001b[0;32m 5\u001b[0m \u001b[39melif\u001b[39;00m radio_input7\u001b[39m.\u001b[39mvalue\u001b[39m==\u001b[39m\u001b[39m'\u001b[39m\u001b[39m10-fold CV\u001b[39m\u001b[39m'\u001b[39m:\n\u001b[0;32m 6\u001b[0m cv\u001b[39m=\u001b[39m\u001b[39mint\u001b[39m(\u001b[39m10\u001b[39m)\n\u001b[1;32m----> 7\u001b[0m ml\u001b[39m=\u001b[39mml_models\u001b[39m.\u001b[39;49mML_models(data_icu,cv,radio_input5\u001b[39m.\u001b[39;49mvalue,concat\u001b[39m=\u001b[39;49mradio_input6\u001b[39m.\u001b[39;49mvalue\u001b[39m==\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mConactenate\u001b[39;49m\u001b[39m'\u001b[39;49m,oversampling\u001b[39m=\u001b[39;49mradio_input8\u001b[39m.\u001b[39;49mvalue\u001b[39m==\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mTrue\u001b[39;49m\u001b[39m'\u001b[39;49m)\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\model\\ml_models.py:42\u001b[0m, in \u001b[0;36mML_models.__init__\u001b[1;34m(self, data_icu, k_fold, model_type, concat, oversampling)\u001b[0m\n\u001b[0;32m 40\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moversampling\u001b[39m=\u001b[39moversampling\n\u001b[0;32m 41\u001b[0m 
\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mloss\u001b[39m=\u001b[39mevaluation\u001b[39m.\u001b[39mLoss(\u001b[39m'\u001b[39m\u001b[39mcpu\u001b[39m\u001b[39m'\u001b[39m,\u001b[39mTrue\u001b[39;00m,\u001b[39mTrue\u001b[39;00m,\u001b[39mTrue\u001b[39;00m,\u001b[39mTrue\u001b[39;00m,\u001b[39mTrue\u001b[39;00m,\u001b[39mTrue\u001b[39;00m,\u001b[39mTrue\u001b[39;00m,\u001b[39mTrue\u001b[39;00m,\u001b[39mTrue\u001b[39;00m,\u001b[39mTrue\u001b[39;00m,\u001b[39mTrue\u001b[39;00m)\n\u001b[1;32m---> 42\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mml_train()\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\model\\ml_models.py:124\u001b[0m, in \u001b[0;36mML_models.ml_train\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 122\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtest_data\u001b[39m=\u001b[39mX_test\u001b[39m.\u001b[39mcopy(deep\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m 123\u001b[0m X_test[\u001b[39m'\u001b[39m\u001b[39mgender\u001b[39m\u001b[39m'\u001b[39m]\u001b[39m=\u001b[39mgen_encoder\u001b[39m.\u001b[39mtransform(X_test[\u001b[39m'\u001b[39m\u001b[39mgender\u001b[39m\u001b[39m'\u001b[39m])\n\u001b[1;32m--> 124\u001b[0m X_test[\u001b[39m'\u001b[39m\u001b[39methnicity\u001b[39m\u001b[39m'\u001b[39m]\u001b[39m=\u001b[39meth_encoder\u001b[39m.\u001b[39;49mtransform(X_test[\u001b[39m'\u001b[39;49m\u001b[39methnicity\u001b[39;49m\u001b[39m'\u001b[39;49m])\n\u001b[0;32m 125\u001b[0m X_test[\u001b[39m'\u001b[39m\u001b[39minsurance\u001b[39m\u001b[39m'\u001b[39m]\u001b[39m=\u001b[39mins_encoder\u001b[39m.\u001b[39mtransform(X_test[\u001b[39m'\u001b[39m\u001b[39minsurance\u001b[39m\u001b[39m'\u001b[39m])\n\u001b[0;32m 126\u001b[0m \u001b[39m#X_test['Age']=age_encoder.transform(X_test['Age'])\u001b[39;00m\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\venv\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:137\u001b[0m, in \u001b[0;36mLabelEncoder.transform\u001b[1;34m(self, 
y)\u001b[0m\n\u001b[0;32m 134\u001b[0m \u001b[39mif\u001b[39;00m _num_samples(y) \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[0;32m 135\u001b[0m \u001b[39mreturn\u001b[39;00m np\u001b[39m.\u001b[39marray([])\n\u001b[1;32m--> 137\u001b[0m \u001b[39mreturn\u001b[39;00m _encode(y, uniques\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mclasses_)\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\venv\\Lib\\site-packages\\sklearn\\utils\\_encode.py:227\u001b[0m, in \u001b[0;36m_encode\u001b[1;34m(values, uniques, check_unknown)\u001b[0m\n\u001b[0;32m 225\u001b[0m \u001b[39mreturn\u001b[39;00m _map_to_integer(values, uniques)\n\u001b[0;32m 226\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mKeyError\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m--> 227\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my contains previously unseen labels: \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mstr\u001b[39m(e)\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 228\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 229\u001b[0m \u001b[39mif\u001b[39;00m check_unknown:\n", + "\u001b[1;31mValueError\u001b[0m: y contains previously unseen labels: 'BLACK/CAPE VERDEAN'" + ] } ], "source": [ @@ -1761,7 +1922,9 @@ { "cell_type": "markdown", "id": "ordinary-chancellor", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "## 9. 
Deep Learning Models\n", "- Time-series LSTM and Time-series CNN which will only use time-series events like medications, charts, labs, output events to train model.\n", @@ -1777,14 +1940,14 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 27, "id": "operational-pride", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "d162f67f625a457c8a9801aa324f5be3", + "model_id": "4c5bf3139bfc4a888740b17daf69fc66", "version_major": 2, "version_minor": 0 }, @@ -1805,7 +1968,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0a23df702c704ed5a63944d897653f56", + "model_id": "71a6547b8f1d49e4b542d6af61e5498b", "version_major": 2, "version_minor": 0 }, @@ -1826,7 +1989,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f8f5b62f5a8c4641ab60a4cd150455c7", + "model_id": "f68b900bb6894d528a725a338cfb1807", "version_major": 2, "version_minor": 0 }, @@ -1851,7 +2014,7 @@ }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 28, "id": "golden-stewart", "metadata": { "scrolled": true @@ -1862,33 +2025,36 @@ "output_type": "stream", "text": [ "===============MODEL TRAINING===============\n", - "Total Samples 500\n", - "Positive Samples 233\n", + "Total Samples 117\n", + "Positive Samples 20\n", + "=============OVERSAMPLING===============\n", + "Total Samples 194\n", + "Positive Samples 97\n", "[ MODEL CREATED ]\n", "LSTMBase(\n", " (med): ValEmbed(\n", - " (codeEmbed): BatchNorm1d(163, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (fc): Linear(in_features=163, out_features=152, bias=True)\n", + " (codeEmbed): BatchNorm1d(54, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (fc): Linear(in_features=54, out_features=152, bias=True)\n", " )\n", " (proc): CodeEmbed(\n", - " (codeEmbed): Embedding(157, 52)\n", - " (fc): Linear(in_features=8164, out_features=152, bias=True)\n", + " (codeEmbed): 
Embedding(61, 52)\n", + " (fc): Linear(in_features=3172, out_features=152, bias=True)\n", " )\n", " (out): CodeEmbed(\n", - " (codeEmbed): Embedding(70, 52)\n", - " (fc): Linear(in_features=3640, out_features=152, bias=True)\n", + " (codeEmbed): Embedding(29, 52)\n", + " (fc): Linear(in_features=1508, out_features=152, bias=True)\n", " )\n", " (chart): ValEmbed(\n", - " (codeEmbed): BatchNorm1d(446, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", - " (fc): Linear(in_features=446, out_features=152, bias=True)\n", + " (codeEmbed): BatchNorm1d(250, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", + " (fc): Linear(in_features=250, out_features=152, bias=True)\n", " )\n", " (cond): StatEmbed(\n", - " (codeEmbed): Embedding(1492, 52)\n", - " (fc): Linear(in_features=77584, out_features=152, bias=True)\n", + " (codeEmbed): Embedding(364, 52)\n", + " (fc): Linear(in_features=18928, out_features=152, bias=True)\n", " )\n", - " (ethEmbed): Embedding(9, 152, padding_idx=0)\n", + " (ethEmbed): Embedding(13, 152, padding_idx=0)\n", " (genderEmbed): Embedding(3, 152, padding_idx=0)\n", - " (ageEmbed): Embedding(74, 152, padding_idx=0)\n", + " (ageEmbed): Embedding(51, 152, padding_idx=0)\n", " (insEmbed): Embedding(4, 152, padding_idx=0)\n", " (embedfc): Linear(in_features=1368, out_features=152, bias=True)\n", " (rnn): LSTM(152, 256, num_layers=2, batch_first=True)\n", @@ -1896,100 +2062,28 @@ " (fc2): Linear(in_features=128, out_features=1, bias=True)\n", ")\n", "=================== 0 FOLD=====================\n", - "======= EPOCH 0.0 ========\n", - "BCE Loss: 1.30\n", - "AU-ROC: 0.68\n", - "AU-PRC: 0.63\n", - "AU-PRC Baaseline: 0.47\n", - "Accuracy: 0.64\n", - "Precision: 0.60\n", - "Recall: 0.70\n", - "Specificity: 0.58\n", - "NPV: 0.69\n", - "ECE: 0.07\n", - "MCE: 0.17\n", - "======= VALIDATION ========\n", - "BCE Loss: 1.26\n", - "AU-ROC: 0.72\n", - "AU-PRC: 0.77\n", - "AU-PRC Baaseline: 0.53\n", - "Accuracy: 0.70\n", - "Precision: 
0.74\n", - "Recall: 0.67\n", - "Specificity: 0.74\n", - "NPV: 0.67\n", - "ECE: 0.14\n", - "MCE: 0.44\n", - "Validation results improved\n", - "Updating Model\n", - "======= EPOCH 1.0 ========\n" + "======= EPOCH 0.0 ========\n" ] }, { - "ename": "KeyboardInterrupt", - "evalue": "", + "ename": "ValueError", + "evalue": "y_true takes value in {} and pos_label is not specified: either make y_true take value in {0, 1} or {-1, 1} or pass pos_label explicitly.", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mdata_icu\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[0mmodel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdl_train\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDL_models\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_icu\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mdiag_flag\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mproc_flag\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mout_flag\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mchart_flag\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mmed_flag\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mradio_input6\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mcv\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0moversampling\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mradio_input8\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m==\u001b[0m\u001b[1;34m'True'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'attn_icu_read'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u
001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdl_train\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDL_models\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_icu\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mdiag_flag\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mproc_flag\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mmed_flag\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlab_flag\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mradio_input6\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mcv\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0moversampling\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mradio_input8\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[1;33m==\u001b[0m\u001b[1;34m'True'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'attn_icu_read'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Desktop\\MIMIC-IV-Data-Pipeline\\model\\dl_train.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, data_icu, diag_flag, proc_flag, out_flag, chart_flag, med_flag, lab_flag, model_type, k_fold, oversampling, model_name, train)\u001b[0m\n\u001b[0;32m 81\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtrain\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 82\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"===============MODEL TRAINING===============\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 83\u001b[1;33m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdl_train\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 84\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 85\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Desktop\\MIMIC-IV-Data-Pipeline\\model\\dl_train.py\u001b[0m in \u001b[0;36mdl_train\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 162\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"======= EPOCH {:.1f} ========\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mepoch\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 163\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mnbatch\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain_hids\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m/\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 164\u001b[1;33m 
\u001b[0mmeds\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mchart\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mout\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mproc\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlab\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mstat_train\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mdemo_train\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mY_train\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetXY\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain_hids\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mnbatch\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnbatch\u001b[0m\u001b[1;33m+\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mlabels\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 165\u001b[0m \u001b[1;31m# print(chart.shape)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 166\u001b[0m \u001b[1;31m# print(meds.shape)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Desktop\\MIMIC-IV-Data-Pipeline\\model\\dl_train.py\u001b[0m in \u001b[0;36mgetXY\u001b[1;34m(self, ids, labels)\u001b[0m\n\u001b[0;32m 315\u001b[0m \u001b[1;31m# print(\"key\",key)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 316\u001b[0m \u001b[1;31m# print(\"keys[key]\",keys[key])\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 317\u001b[1;33m 
\u001b[0mdyn_temp\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdyn\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mkeys\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 318\u001b[0m \u001b[0mdyn_temp\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdyn_temp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_numpy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 319\u001b[0m \u001b[0mdyn_temp\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtensor\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdyn_temp\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\envs\\DSRA\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 2773\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_unique\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2774\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnlevels\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2775\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_getitem_multilevel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2776\u001b[0m \u001b[1;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_item_cache\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2777\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\envs\\DSRA\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36m_getitem_multilevel\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 2847\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_getitem_multilevel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2848\u001b[0m \u001b[1;31m# self.columns is a MultiIndex\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2849\u001b[1;33m \u001b[0mloc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2850\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mslice\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mSeries\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mIndex\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2851\u001b[0m \u001b[0mnew_columns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - 
"\u001b[1;32m~\\Anaconda3\\envs\\DSRA\\lib\\site-packages\\pandas\\core\\indexes\\multi.py\u001b[0m in \u001b[0;36mget_loc\u001b[1;34m(self, key, method)\u001b[0m\n\u001b[0;32m 2651\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mtuple\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2652\u001b[0m \u001b[1;31m# not including list here breaks some indexing, xref #30892\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2653\u001b[1;33m \u001b[0mloc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_level_indexer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2654\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0m_maybe_to_slice\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2655\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\envs\\DSRA\\lib\\site-packages\\pandas\\core\\indexes\\multi.py\u001b[0m in \u001b[0;36m_get_level_indexer\u001b[1;34m(self, key, level, indexer)\u001b[0m\n\u001b[0;32m 2922\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mlevel\u001b[0m \u001b[1;33m>\u001b[0m \u001b[1;36m0\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlexsort_depth\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2923\u001b[0m \u001b[1;31m# Desired level is not 
sorted\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2924\u001b[1;33m \u001b[0mlocs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlevel_codes\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mcode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbool\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2925\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mlocs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0many\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2926\u001b[0m \u001b[1;31m# The label is present in self.levels[level] but unused:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\mainPipeline.ipynb Cell 36\u001b[0m line \u001b[0;36m9\n\u001b[0;32m 6\u001b[0m cv\u001b[39m=\u001b[39m\u001b[39mint\u001b[39m(\u001b[39m10\u001b[39m)\n\u001b[0;32m 8\u001b[0m \u001b[39mif\u001b[39;00m data_icu:\n\u001b[1;32m----> 9\u001b[0m 
model\u001b[39m=\u001b[39mdl_train\u001b[39m.\u001b[39;49mDL_models(data_icu,diag_flag,proc_flag,out_flag,chart_flag,med_flag,\u001b[39mFalse\u001b[39;49;00m,radio_input6\u001b[39m.\u001b[39;49mvalue,cv,oversampling\u001b[39m=\u001b[39;49mradio_input8\u001b[39m.\u001b[39;49mvalue\u001b[39m==\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mTrue\u001b[39;49m\u001b[39m'\u001b[39;49m,model_name\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mattn_icu_read\u001b[39;49m\u001b[39m'\u001b[39;49m,train\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[0;32m 10\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 11\u001b[0m model\u001b[39m=\u001b[39mdl_train\u001b[39m.\u001b[39mDL_models(data_icu,diag_flag,proc_flag,\u001b[39mFalse\u001b[39;00m,\u001b[39mFalse\u001b[39;00m,med_flag,lab_flag,radio_input6\u001b[39m.\u001b[39mvalue,cv,oversampling\u001b[39m=\u001b[39mradio_input8\u001b[39m.\u001b[39mvalue\u001b[39m==\u001b[39m\u001b[39m'\u001b[39m\u001b[39mTrue\u001b[39m\u001b[39m'\u001b[39m,model_name\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mattn_icu_read\u001b[39m\u001b[39m'\u001b[39m,train\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\model\\dl_train.py:83\u001b[0m, in \u001b[0;36mDL_models.__init__\u001b[1;34m(self, data_icu, diag_flag, proc_flag, out_flag, chart_flag, med_flag, lab_flag, model_type, k_fold, oversampling, model_name, train)\u001b[0m\n\u001b[0;32m 81\u001b[0m \u001b[39mif\u001b[39;00m train:\n\u001b[0;32m 82\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39m===============MODEL TRAINING===============\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m---> 83\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mdl_train()\n\u001b[0;32m 85\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 86\u001b[0m 
\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mnet\u001b[39m=\u001b[39mtorch\u001b[39m.\u001b[39mload(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39msave_path)\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\model\\dl_train.py:180\u001b[0m, in \u001b[0;36mDL_models.dl_train\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 176\u001b[0m train_logits\u001b[39m.\u001b[39mextend(logits\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mcpu()\u001b[39m.\u001b[39mnumpy())\n\u001b[0;32m 178\u001b[0m \u001b[39m#print(train_prob)\u001b[39;00m\n\u001b[0;32m 179\u001b[0m \u001b[39m#print(train_truth)\u001b[39;00m\n\u001b[1;32m--> 180\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mloss(torch\u001b[39m.\u001b[39;49mtensor(train_prob),torch\u001b[39m.\u001b[39;49mtensor(train_truth),torch\u001b[39m.\u001b[39;49mtensor(train_logits),\u001b[39mFalse\u001b[39;49;00m,\u001b[39mFalse\u001b[39;49;00m)\n\u001b[0;32m 181\u001b[0m val_loss\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel_val(val_hids)\n\u001b[0;32m 182\u001b[0m \u001b[39m#print(\"Updating Model\")\u001b[39;00m\n\u001b[0;32m 183\u001b[0m \u001b[39m#T.save(self.net,self.save_path)\u001b[39;00m\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\venv\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1516\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_compiled_call_impl(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs) \u001b[39m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1517\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m-> 1518\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_call_impl(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", + "File 
\u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\venv\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1522\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1523\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1524\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1525\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1526\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1527\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[0;32m 1529\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 1530\u001b[0m result \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\model\\evaluation.py:98\u001b[0m, in \u001b[0;36mLoss.forward\u001b[1;34m(self, prob, labels, logits, train, standalone)\u001b[0m\n\u001b[0;32m 94\u001b[0m prob \u001b[39m=\u001b[39m prob\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mcpu()\u001b[39m.\u001b[39mnumpy()\n\u001b[0;32m 95\u001b[0m \u001b[39mif\u001b[39;00m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mauroc):\n\u001b[0;32m 96\u001b[0m \u001b[39m# print(labels)\u001b[39;00m\n\u001b[0;32m 97\u001b[0m \u001b[39m# print(prob)\u001b[39;00m\n\u001b[1;32m---> 98\u001b[0m fpr, tpr, 
threshholds \u001b[39m=\u001b[39m metrics\u001b[39m.\u001b[39;49mroc_curve(labels, prob)\n\u001b[0;32m 99\u001b[0m auc \u001b[39m=\u001b[39m metrics\u001b[39m.\u001b[39mauc(fpr, tpr)\n\u001b[0;32m 100\u001b[0m \u001b[39mif\u001b[39;00m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39maurocPlot):\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\venv\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:214\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 208\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 209\u001b[0m \u001b[39mwith\u001b[39;00m config_context(\n\u001b[0;32m 210\u001b[0m skip_parameter_validation\u001b[39m=\u001b[39m(\n\u001b[0;32m 211\u001b[0m prefer_skip_nested_validation \u001b[39mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 212\u001b[0m )\n\u001b[0;32m 213\u001b[0m ):\n\u001b[1;32m--> 214\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[0;32m 215\u001b[0m \u001b[39mexcept\u001b[39;00m InvalidParameterError \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m 216\u001b[0m \u001b[39m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[39m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[39m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 219\u001b[0m \u001b[39m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 220\u001b[0m msg \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39msub(\n\u001b[0;32m 221\u001b[0m \u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mparameter of \u001b[39m\u001b[39m\\\u001b[39m\u001b[39mw+ must be\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mparameter of 
\u001b[39m\u001b[39m{\u001b[39;00mfunc\u001b[39m.\u001b[39m\u001b[39m__qualname__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m must be\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m 223\u001b[0m \u001b[39mstr\u001b[39m(e),\n\u001b[0;32m 224\u001b[0m )\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\venv\\Lib\\site-packages\\sklearn\\metrics\\_ranking.py:1095\u001b[0m, in \u001b[0;36mroc_curve\u001b[1;34m(y_true, y_score, pos_label, sample_weight, drop_intermediate)\u001b[0m\n\u001b[0;32m 993\u001b[0m \u001b[39m@validate_params\u001b[39m(\n\u001b[0;32m 994\u001b[0m {\n\u001b[0;32m 995\u001b[0m \u001b[39m\"\u001b[39m\u001b[39my_true\u001b[39m\u001b[39m\"\u001b[39m: [\u001b[39m\"\u001b[39m\u001b[39marray-like\u001b[39m\u001b[39m\"\u001b[39m],\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1004\u001b[0m y_true, y_score, \u001b[39m*\u001b[39m, pos_label\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, sample_weight\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, drop_intermediate\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m\n\u001b[0;32m 1005\u001b[0m ):\n\u001b[0;32m 1006\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Compute Receiver operating characteristic (ROC).\u001b[39;00m\n\u001b[0;32m 1007\u001b[0m \n\u001b[0;32m 1008\u001b[0m \u001b[39m Note: this implementation is restricted to the binary classification task.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1093\u001b[0m \u001b[39m array([ inf, 0.8 , 0.4 , 0.35, 0.1 ])\u001b[39;00m\n\u001b[0;32m 1094\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m-> 1095\u001b[0m fps, tps, thresholds \u001b[39m=\u001b[39m _binary_clf_curve(\n\u001b[0;32m 1096\u001b[0m y_true, y_score, pos_label\u001b[39m=\u001b[39;49mpos_label, sample_weight\u001b[39m=\u001b[39;49msample_weight\n\u001b[0;32m 1097\u001b[0m )\n\u001b[0;32m 1099\u001b[0m \u001b[39m# Attempt to drop thresholds corresponding to points in between and\u001b[39;00m\n\u001b[0;32m 1100\u001b[0m \u001b[39m# collinear with other points. 
These are always suboptimal and do not\u001b[39;00m\n\u001b[0;32m 1101\u001b[0m \u001b[39m# appear on a plotted ROC curve (and thus do not affect the AUC).\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1106\u001b[0m \u001b[39m# but does not drop more complicated cases like fps = [1, 3, 7],\u001b[39;00m\n\u001b[0;32m 1107\u001b[0m \u001b[39m# tps = [1, 2, 4]; there is no harm in keeping too many thresholds.\u001b[39;00m\n\u001b[0;32m 1108\u001b[0m \u001b[39mif\u001b[39;00m drop_intermediate \u001b[39mand\u001b[39;00m \u001b[39mlen\u001b[39m(fps) \u001b[39m>\u001b[39m \u001b[39m2\u001b[39m:\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\venv\\Lib\\site-packages\\sklearn\\metrics\\_ranking.py:821\u001b[0m, in \u001b[0;36m_binary_clf_curve\u001b[1;34m(y_true, y_score, pos_label, sample_weight)\u001b[0m\n\u001b[0;32m 818\u001b[0m y_score \u001b[39m=\u001b[39m y_score[nonzero_weight_mask]\n\u001b[0;32m 819\u001b[0m sample_weight \u001b[39m=\u001b[39m sample_weight[nonzero_weight_mask]\n\u001b[1;32m--> 821\u001b[0m pos_label \u001b[39m=\u001b[39m _check_pos_label_consistency(pos_label, y_true)\n\u001b[0;32m 823\u001b[0m \u001b[39m# make y_true a boolean vector\u001b[39;00m\n\u001b[0;32m 824\u001b[0m y_true \u001b[39m=\u001b[39m y_true \u001b[39m==\u001b[39m pos_label\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\venv\\Lib\\site-packages\\sklearn\\utils\\validation.py:2245\u001b[0m, in \u001b[0;36m_check_pos_label_consistency\u001b[1;34m(pos_label, y_true)\u001b[0m\n\u001b[0;32m 2234\u001b[0m \u001b[39mif\u001b[39;00m pos_label \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m (\n\u001b[0;32m 2235\u001b[0m classes\u001b[39m.\u001b[39mdtype\u001b[39m.\u001b[39mkind \u001b[39min\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mOUS\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 2236\u001b[0m \u001b[39mor\u001b[39;00m \u001b[39mnot\u001b[39;00m (\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 2242\u001b[0m 
)\n\u001b[0;32m 2243\u001b[0m ):\n\u001b[0;32m 2244\u001b[0m classes_repr \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m, \u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mjoin([\u001b[39mrepr\u001b[39m(c) \u001b[39mfor\u001b[39;00m c \u001b[39min\u001b[39;00m classes\u001b[39m.\u001b[39mtolist()])\n\u001b[1;32m-> 2245\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 2246\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my_true takes value in \u001b[39m\u001b[39m{{\u001b[39;00m\u001b[39m{\u001b[39;00mclasses_repr\u001b[39m}\u001b[39;00m\u001b[39m}}\u001b[39;00m\u001b[39m and pos_label is not \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 2247\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mspecified: either make y_true take value in \u001b[39m\u001b[39m{\u001b[39m\u001b[39m0, 1} or \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 2248\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m{\u001b[39m\u001b[39m-1, 1} or pass pos_label explicitly.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 2249\u001b[0m )\n\u001b[0;32m 2250\u001b[0m \u001b[39melif\u001b[39;00m pos_label \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 2251\u001b[0m pos_label \u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", + "\u001b[1;31mValueError\u001b[0m: y_true takes value in {} and pos_label is not specified: either make y_true take value in {0, 1} or {-1, 1} or pass pos_label explicitly." ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" } ], "source": [ @@ -2009,7 +2103,9 @@ { "cell_type": "markdown", "id": "driven-factor", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "## 10. Running BEHRT\n", "Below we integrate the implementation of BEHRT in our pipeline.\n", @@ -2028,10 +2124,85 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "aggressive-break", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "STARTING READING FILES.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 117/117 [00:03<00:00, 38.60it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FINISHED READING FILES. \n", + "\n", + "STARTING TOKENIZATION.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 117/117 [00:05<00:00, 19.54it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FINISHED TOKENIZATION. 
\n", + "\n", + "FINAL COHORT STATISTICS: \n", + "7 Positive samples.\n", + "73 Negative samples.\n", + "\n", + "43 Female samples.\n", + "37 Male samples.\n", + "\n", + "56 WHITE samples.\n", + "5 UNKNOWN samples.\n", + "11 BLACK/AFRICAN AMERICAN samples.\n", + "0 BLACK/CAPE VERDEAN samples.\n", + "3 OTHER samples.\n", + "1 PORTUGUESE samples.\n", + "0 HISPANIC/LATINO - PUERTO RICAN samples.\n", + "1 WHITE - BRAZILIAN samples.\n", + "0 HISPANIC OR LATINO samples.\n", + "2 UNABLE TO OBTAIN samples.\n", + "0 WHITE - OTHER EUROPEAN samples.\n", + "1 HISPANIC/LATINO - SALVADORAN samples.\n", + "\n", + "\n", + "48 Other samples.\n", + "27 Medicare samples.\n", + "5 Medicaid samples.\n" + ] + }, + { + "ename": "UnboundLocalError", + "evalue": "cannot access local variable 'train' where it is not associated with a value", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mUnboundLocalError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\mainPipeline.ipynb Cell 38\u001b[0m line \u001b[0;36m8\n\u001b[0;32m 5\u001b[0m token\u001b[39m=\u001b[39mtokenization\u001b[39m.\u001b[39mBEHRT_models(data_icu,diag_flag,proc_flag,\u001b[39mFalse\u001b[39;00m,\u001b[39mFalse\u001b[39;00m,med_flag,lab_flag)\n\u001b[0;32m 6\u001b[0m tokenized_src, tokenized_age, tokenized_gender, tokenized_ethni, tokenized_ins, tokenized_labels\u001b[39m=\u001b[39mtoken\u001b[39m.\u001b[39mtokenize()\n\u001b[1;32m----> 8\u001b[0m behrt_train\u001b[39m.\u001b[39;49mtrain_behrt(tokenized_src, tokenized_age, tokenized_gender, tokenized_ethni, tokenized_ins, tokenized_labels)\n", + "File \u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\model\\behrt_train.py:133\u001b[0m, in \u001b[0;36mtrain_behrt.__init__\u001b[1;34m(self, src, age, sex, ethni, ins, target_data)\u001b[0m\n\u001b[0;32m 130\u001b[0m ValDset \u001b[39m=\u001b[39m DataLoader(val_data, 
max_len\u001b[39m=\u001b[39mtrain_params[\u001b[39m'\u001b[39m\u001b[39mmax_len_seq\u001b[39m\u001b[39m'\u001b[39m], code\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mcode\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m 131\u001b[0m valload \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mutils\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mDataLoader(dataset\u001b[39m=\u001b[39mValDset, batch_size\u001b[39m=\u001b[39mtrain_params[\u001b[39m'\u001b[39m\u001b[39mbatch_size\u001b[39m\u001b[39m'\u001b[39m], shuffle\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[1;32m--> 133\u001b[0m train_loss, val_loss \u001b[39m=\u001b[39m train(trainload, valload, train_params[\u001b[39m'\u001b[39m\u001b[39mdevice\u001b[39m\u001b[39m'\u001b[39m])\n\u001b[0;32m 135\u001b[0m behrt\u001b[39m.\u001b[39mload_state_dict(torch\u001b[39m.\u001b[39mload(\u001b[39m\"\u001b[39m\u001b[39m./saved_models/checkpoint/behrt\u001b[39m\u001b[39m\"\u001b[39m, map_location\u001b[39m=\u001b[39mtrain_params[\u001b[39m'\u001b[39m\u001b[39mdevice\u001b[39m\u001b[39m'\u001b[39m]))\n\u001b[0;32m 136\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mLoading succesfull\u001b[39m\u001b[39m\"\u001b[39m)\n", + "\u001b[1;31mUnboundLocalError\u001b[0m: cannot access local variable 'train' where it is not associated with a value" + ] + } + ], "source": [ "if data_icu:\n", " token=tokenization.BEHRT_models(data_icu,diag_flag,proc_flag,out_flag,chart_flag,med_flag,False)\n", @@ -2066,10 +2237,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "id": "streaming-integration", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'device' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + 
"\u001b[1;32md:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\mainPipeline.ipynb Cell 40\u001b[0m line \u001b[0;36m4\n\u001b[0;32m 2\u001b[0m device\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mcuda:0\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m 3\u001b[0m \u001b[39m#device='cpu'\u001b[39;00m\n\u001b[1;32m----> 4\u001b[0m loss\u001b[39m=\u001b[39mevaluation\u001b[39m.\u001b[39mLoss(device,acc\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,ppv\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,sensi\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,tnr\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,npv\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,auroc\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,aurocPlot\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,auprc\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,auprcPlot\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,callb\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,callbPlot\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m 5\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39m./data/output/outputDict\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m'\u001b[39m) \u001b[39mas\u001b[39;00m fp:\n\u001b[0;32m 6\u001b[0m outputDict\u001b[39m=\u001b[39mpickle\u001b[39m.\u001b[39mload(fp)\n", + "\u001b[1;31mNameError\u001b[0m: name 'device' is not defined" + ] + } + ], "source": [ "if torch.cuda.is_available():\n", " device='cuda:0'\n", @@ -2107,10 +2290,151 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "id": "civilian-direction", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 sensitive_attributegrouptptnfpfntprtnrfprfnrprnraccuracy
0ethnicityBLACK/AFRICAN AMERICAN0100nan1.0000000.000000nan0.0000001.0000001.000000
1ethnicityWHITE01030.0000001.0000000.0000001.0000000.0000001.0000000.250000
2genderF02030.0000001.0000000.0000001.0000000.0000001.0000000.400000
3age_binned40-5000030.000000nannan1.0000000.0000001.0000000.000000
4age_binned70-800100nan1.0000000.000000nan0.0000001.0000001.000000
5age_binned80-900100nan1.0000000.000000nan0.0000001.0000001.000000
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "fairness.fairness_evaluation(inputFile='outputDict',outputFile='fairnessReport')" ] @@ -2134,10 +2458,81 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "secure-flavor", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BEFORE CALLIBRATION\n", + "BCE Loss: 1.40\n", + "AU-ROC: 0.50\n", + "AU-PRC: 0.80\n", + "AU-PRC Baaseline: 0.60\n", + "Accuracy: 0.40\n", + "Precision: 0.00\n", + "Recall: 0.00\n", + "Specificity: 1.00\n", + "NPV: 0.40\n", + "ECE: 0.15\n", + "MCE: 0.15\n", + "AFTER CALLIBRATION\n", + "BCE Loss: 1.39\n", + "AU-ROC: 0.50\n", + "AU-PRC: 0.80\n", + "AU-PRC Baaseline: 0.60\n", + "Accuracy: 0.40\n", + "Precision: 0.00\n", + "Recall: 0.00\n", + "Specificity: 1.00\n", + "NPV: 0.40\n", + "ECE: 0.10\n", + "MCE: 0.10\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "callibrate_output.callibrate(inputFile='outputDict',outputFile='callibratedResults')" ] @@ -2168,7 +2563,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.0" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/mimic_pipeline.ipynb b/mimic_pipeline.ipynb new file mode 100644 index 0000000000..779ed2b896 --- /dev/null +++ b/mimic_pipeline.ipynb @@ -0,0 +1,1647 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import ipywidgets as widgets\n", + "from pathlib import Path\n", + "\n", + "from pipeline.cohort_extractor import CohortExtractor\n", + "from pipeline.prediction_task import TargetType, PredictionTask, DiseaseCode\n", + "from pipeline.features_extractor import FeatureExtractor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Welcome to MIMIC-IV Project" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "Path(\"raw_data\").mkdir(parents=True, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This repository explains the steps to download and clean MIMIC-IV dataset for analysis.\n", + "The repository is compatible with MIMIC-IV v2.0\n", + "\n", + "Please go to:\n", + "- https://physionet.org/content/mimiciv/2.0/ \n", + "\n", + "Follow instructions to get access to MIMIC-IV dataset.\n", + "\n", + "\n", + "Save downloaded files in the fikder raw_data\n", + "\n", + "The structure should look like below\n", + "- raw_data/mimiciv_2_0/hosp\n", + "- raw_data/mimiciv_2_0/icu" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
DATA EXTRACTION" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Please select what prediction task you want to perform ?\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4380137ad15a41a7bb2c71dab9ebfae8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(options=('Mortality', 'Length of Stay', 'Readmission', 'Phenotype'), value='Mortality')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(\"Please select what prediction task you want to perform ?\")\n", + "task_ratio = widgets.RadioButtons(options=['Mortality','Length of Stay','Readmission','Phenotype'],value='Mortality')\n", + "display(task_ratio)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Refining Cohort and Prediction Task Definition\n", + "\n", + "Based on your current selection following block will provide option to further refine prediction task and cohort associated with it:\n", + "\n", + "- First you will refine the prediction task choosing from following options -\n", + " - **Length of Stay** - You can select from two predefined options or enter custom number of days to predict length of stay greater than number of days.\n", + "\n", + " - **Readmission** - You can select from two predefined options or enter custom number of days to predict readmission after \"number of days\" after previous admission.\n", + "\n", + " - **Phenotype Prediction** - You can select from four major chronic diseases to predict its future outcome\n", + "\n", + " - Heart failure\n", + " - CAD (Coronary Artery Disease)\n", + " - CKD (Chronic Kidney Disease)\n", + " - COPD (Chronic obstructive pulmonary disease)\n", + "\n", + "- Second, you will choose whether to perform above task using ICU or non-ICU admissions data\n", + "\n", + "- Third, you can refine the 
cohort selection for any of the above choosen prediction tasks by including the admission samples admitted with particular chronic disease - \n", + " - Heart failure\n", + " - CAD (Coronary Artery Disease)\n", + " - CKD (Chronic Kidney Disease)\n", + " - COPD (Chronic obstructive pulmonary disease)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Please select below if you want to work with ICU or Non-ICU data:\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2753dae2c46f49abb5c8623756c319b0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(options=('ICU', 'Non-ICU'), value='ICU')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Please select if you want to perform the chosen prediction task for a specific disease.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f53d7dd969044115bedd712be0ba9a5f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(options=('No Disease Filter', 'Heart Failure', 'CKD', 'CAD', 'COPD'), value='No Disease Filter')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def create_length_of_stay_widgets():\n", + " radio_options = ['Length of Stay ≥ 3', 'Length of Stay ≥ 7', 'Custom']\n", + " radio_input = widgets.RadioButtons(options=radio_options, value='Length of Stay ≥ 3')\n", + " slider = widgets.IntSlider(value=3, min=1, max=10, step=1, continuous_update=False)\n", + " display(radio_input, widgets.HBox([widgets.Label('Length of stay ≥ (days):', layout={'width': '180px'}), slider]))\n", + " return radio_input, slider\n", + "\n", + "def create_readmission_widgets():\n", + " radio_options = ['30 Day Readmission', '60 Day Readmission', '90 Day Readmission', '120 
Day Readmission', 'Custom']\n", + " radio_input = widgets.RadioButtons(options=radio_options, value='30 Day Readmission')\n", + " slider = widgets.IntSlider(value=30, min=10, max=150, step=10)\n", + " display(radio_input, widgets.HBox([widgets.Label('Readmission after (days):', layout={'width': '180px'}), slider]))\n", + " return radio_input, slider\n", + "\n", + "def create_phenotype_widgets():\n", + " radio_options = ['Heart Failure in 30 days', 'CAD in 30 days', 'CKD in 30 days', 'COPD in 30 days']\n", + " radio_input = widgets.RadioButtons(options=radio_options, value='Heart Failure in 30 days')\n", + " display(radio_input)\n", + " return radio_input\n", + "\n", + "def create_mortality_widgets():\n", + " radio_input = widgets.RadioButtons(options=['Mortality'], value='Mortality')\n", + " return radio_input\n", + "\n", + "if task_ratio.value != 'Mortality':\n", + " print(\"Please select to precise the prediction task \")\n", + "if task_ratio.value == 'Length of Stay':\n", + " los_radio, los_slider = create_length_of_stay_widgets()\n", + "elif task_ratio.value == 'Readmission':\n", + " readmission_radio, readmission_slider = create_readmission_widgets()\n", + "elif task_ratio.value == 'Phenotype':\n", + " phenotype_radio = create_phenotype_widgets()\n", + "elif task_ratio.value == 'Mortality':\n", + " mortality_radio = create_mortality_widgets()\n", + "\n", + "print(\"Please select below if you want to work with ICU or Non-ICU data:\")\n", + "icu_type_input = widgets.RadioButtons(options=['ICU', 'Non-ICU'], value='ICU')\n", + "display(icu_type_input)\n", + "\n", + "print(\"Please select if you want to perform the chosen prediction task for a specific disease.\")\n", + "disease_filter_input = widgets.RadioButtons(options=['No Disease Filter', 'Heart Failure', 'CKD', 'CAD', 'COPD'], value='No Disease Filter')\n", + "display(disease_filter_input)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def 
get_time_from_input():\n", + " task_type = task_ratio.value\n", + " if task_type == 'Length of Stay' and los_radio.value == 'Custom':\n", + " return los_slider.value\n", + " elif task_type == 'Readmission' and readmission_radio.value == 'Custom':\n", + " return readmission_slider.value\n", + " elif task_type == 'Readmission':\n", + " return int(readmission_radio.value.split()[0])\n", + " elif task_type == 'Length of Stay':\n", + " return int(los_radio.value.split()[4])\n", + " elif task_type == 'Phenotype':\n", + " return 30\n", + " return 0\n", + "\n", + "def get_disease_label():\n", + " if task_ratio.value != 'Phenotype':\n", + " return None\n", + " task_type = phenotype_radio.value\n", + " disease_mapping = {\n", + " 'Heart Failure in 30 days': DiseaseCode.HEARTH_FAILURE,\n", + " 'CAD in 30 days': DiseaseCode.CAD,\n", + " 'CKD in 30 days': DiseaseCode.CKD,\n", + " 'COPD in 30 days': DiseaseCode.COPD\n", + " }\n", + " return disease_mapping.get(task_type, \"\")\n", + "\n", + "def convert_to_icd_code(disease):\n", + " if (disease==\"Heart Failure\"):\n", + " icd_code=DiseaseCode.HEARTH_FAILURE\n", + " elif (disease==\"CKD\"):\n", + " icd_code=DiseaseCode.CKD\n", + " elif (disease==\"COPD\"):\n", + " icd_code=DiseaseCode.COPD\n", + " elif (disease==\"CAD\"):\n", + " icd_code=DiseaseCode.CAD\n", + " else:\n", + " icd_code=None\n", + " return icd_code \n", + "\n", + "def convert_to_prediction_task(task_text):\n", + " if task_text == 'Length of Stay':\n", + " return TargetType.LOS\n", + " elif task_text == 'Mortality': \n", + " return TargetType.MORTALITY\n", + " else:\n", + " return TargetType.READMISSION" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(, None, None, 0, True)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# DEBUG\n", + "(convert_to_prediction_task(task_ratio.value), \n", + "get_disease_label() if 
task_ratio.value == 'Phenotype' else None, \n", + "convert_to_icd_code(disease_filter_input.value) ,\n", + "get_time_from_input(), \n", + "(icu_type_input.value==\"ICU\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:===========MIMIC-IV v2.0============\n", + "INFO:root:EXTRACTING FOR: ICU | MORTALITY | 0 |\n", + "INFO:root:[ MORTALITY LABELS FINISHED: 10 Mortality Cases ]\n", + "INFO:root:[SUCCESSFULLY SAVED COHORT DATA]\n" + ] + } + ], + "source": [ + "prediction_task = PredictionTask(\n", + " target_type = convert_to_prediction_task(task_ratio.value), \n", + " disease_readmission= get_disease_label() if task_ratio.value == 'Phenotype' else None, \n", + " disease_selection=convert_to_icd_code(disease_filter_input.value) ,\n", + " nb_days=get_time_from_input(), \n", + " use_icu=(icu_type_input.value==\"ICU\")\n", + ")\n", + "cohort_extractor = CohortExtractor(prediction_task=prediction_task)\n", + "cohort = cohort_extractor.extract()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
FEATURE EXTRACTION\n", + "Features available for ICU data -\n", + "- Diagnosis (https://mimic.mit.edu/docs/iv/modules/hosp/diagnoses_icd/)\n", + "- Procedures (https://mimic.mit.edu/docs/iv/modules/icu/procedureevents/)\n", + "- Medications (https://mimic.mit.edu/docs/iv/modules/icu/inputevents/)\n", + "- Output Events (https://mimic.mit.edu/docs/iv/modules/icu/outputevents/)\n", + "- Chart Events (https://mimic.mit.edu/docs/iv/modules/icu/chartevents/)\n", + "\n", + "Features available for Non-ICU data -\n", + "- Diagnosis (https://mimic.mit.edu/docs/iv/modules/hosp/diagnoses_icd/)\n", + "- Procedures (https://mimic.mit.edu/docs/iv/modules/hosp/procedures_icd/)\n", + "- Medications (https://mimic.mit.edu/docs/iv/modules/hosp/prescriptions/)\n", + "- Lab Events (https://mimic.mit.edu/docs/iv/modules/hosp/labevents/)\n", + "\n", + "All features will be saved in **./preproc_data/features/**" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature Selection\n", + "Which Features you want to include for cohort?\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dafe8aa61f444b1aba9538b6a4553f20", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Checkbox(value=False, description='Diagnosis')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9fb3415c769f4211a8280fdb19fbfb18", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Checkbox(value=False, description='Output Events')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5393df8a9da34b7d8bfea88681bd3b74", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Checkbox(value=False, description='Chart Events(Labs and Vitals)')" + ] + }, + 
"metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4b99ad904e8a4650b8d19a237801bcaf", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Checkbox(value=False, description='Procedures')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c558b62a4d2f49ff9e6c9290c7f9c359", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Checkbox(value=False, description='Medications')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "**Please run below cell to extract selected features**\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:Comm:handle_msg[dafe8aa61f444b1aba9538b6a4553f20]({'header': {'date': datetime.datetime(2023, 12, 11, 15, 33, 25, 713000, tzinfo=tzutc()), 'msg_id': '0676aaaf-830a-4a7d-a54b-b74545f800bd', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': '0676aaaf-830a-4a7d-a54b-b74545f800bd', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': 'dafe8aa61f444b1aba9538b6a4553f20', 'data': {'method': 'update', 'state': {'value': True}, 'buffer_paths': []}}, 'buffers': []})\n", + "DEBUG:Comm:handle_msg[9fb3415c769f4211a8280fdb19fbfb18]({'header': {'date': datetime.datetime(2023, 12, 11, 15, 33, 26, 741000, tzinfo=tzutc()), 'msg_id': '731bbc8f-b3c6-4552-a23e-ecda79ee3a18', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': '731bbc8f-b3c6-4552-a23e-ecda79ee3a18', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '9fb3415c769f4211a8280fdb19fbfb18', 'data': {'method': 'update', 
'state': {'value': True}, 'buffer_paths': []}}, 'buffers': []})\n", + "DEBUG:Comm:handle_msg[5393df8a9da34b7d8bfea88681bd3b74]({'header': {'date': datetime.datetime(2023, 12, 11, 15, 33, 27, 168000, tzinfo=tzutc()), 'msg_id': '8d3a1d76-0688-4e55-b964-5bf7cbbb3a30', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': '8d3a1d76-0688-4e55-b964-5bf7cbbb3a30', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '5393df8a9da34b7d8bfea88681bd3b74', 'data': {'method': 'update', 'state': {'value': True}, 'buffer_paths': []}}, 'buffers': []})\n", + "DEBUG:Comm:handle_msg[4b99ad904e8a4650b8d19a237801bcaf]({'header': {'date': datetime.datetime(2023, 12, 11, 15, 33, 27, 739000, tzinfo=tzutc()), 'msg_id': '0eea441a-3aaa-4672-98c2-9122d73b851e', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': '0eea441a-3aaa-4672-98c2-9122d73b851e', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '4b99ad904e8a4650b8d19a237801bcaf', 'data': {'method': 'update', 'state': {'value': True}, 'buffer_paths': []}}, 'buffers': []})\n", + "DEBUG:Comm:handle_msg[c558b62a4d2f49ff9e6c9290c7f9c359]({'header': {'date': datetime.datetime(2023, 12, 11, 15, 33, 28, 631000, tzinfo=tzutc()), 'msg_id': 'bc2a646c-bb26-40dc-af86-87e2db4d4c79', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': 'bc2a646c-bb26-40dc-af86-87e2db4d4c79', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': 'c558b62a4d2f49ff9e6c9290c7f9c359', 'data': {'method': 'update', 'state': {'value': True}, 'buffer_paths': []}}, 'buffers': []})\n" + ] + } + ], + "source": [ + "print(\"Feature Selection\")\n", + "if 
cohort_extractor.prediction_task.use_icu:\n", + " print(\"Which Features you want to include for cohort?\")\n", + " dia_input = widgets.Checkbox(description='Diagnosis')\n", + " display(dia_input)\n", + " out_input = widgets.Checkbox(description='Output Events')\n", + " display(out_input)\n", + " chart_input = widgets.Checkbox(description='Chart Events(Labs and Vitals)')\n", + " display(chart_input)\n", + " proc_input = widgets.Checkbox(description='Procedures')\n", + " display(proc_input)\n", + " med_input = widgets.Checkbox(description='Medications')\n", + " display(med_input)\n", + "else:\n", + " print(\"Which Features you want to include for cohort?\")\n", + " dia_input = widgets.Checkbox(description='Diagnosis')\n", + " display(dia_input)\n", + " lab_input = widgets.Checkbox(description='Labs')\n", + " display(lab_input)\n", + " proc_input = widgets.Checkbox(description='Procedures')\n", + " display(proc_input)\n", + " med_input = widgets.Checkbox(description='Medications')\n", + " display(med_input)\n", + "print(\"**Please run below cell to extract selected features**\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('cohort_ICU_mortality_0_', True, True, False, True, True, True, True)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#DEBUG\n", + "(\n", + " cohort_extractor.cohort_output,\n", + " prediction_task.use_icu,\n", + " dia_input.value,\n", + " not prediction_task.use_icu and lab_input.value,\n", + " prediction_task.use_icu and chart_input.value,\n", + " med_input.value,\n", + "\n", + " prediction_task.use_icu and out_input.value,\n", + " proc_input.value,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:[EXTRACTING DIAGNOSIS DATA]\n" + ] + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "INFO:root:[SUCCESSFULLY SAVED DIAGNOSES DATA]\n", + "INFO:root:[EXTRACTING PROCEDURES DATA]\n", + "INFO:root:# Unique Events: 82\n", + "INFO:root:# Admissions: 138\n", + "INFO:root:Total rows: 1435\n", + "INFO:root:[SUCCESSFULLY SAVED PROCEDURES DATA]\n", + "INFO:root:[EXTRACTING MEDICATIONS DATA]\n", + "INFO:root:Number of unique types of drugs: 76\n", + "INFO:root:Number of admissions: 136\n", + "INFO:root:Total number of rows: 11038\n", + "INFO:root:[SUCCESSFULLY SAVED MEDICATIONS DATA]\n", + "INFO:root:[EXTRACTING OUTPUT EVENTS DATA]\n", + "INFO:root:# Unique Events: 39\n", + "INFO:root:# Admissions: 137\n", + "INFO:root:Total rows: 9362\n", + "INFO:root:[SUCCESSFULLY SAVED OUTPUT EVENTS DATA]\n", + "INFO:root:[EXTRACTING CHART EVENTS DATA]\n", + "1it [00:00, 1.02it/s]\n", + "INFO:root:# Unique Events: 298\n", + "INFO:root:# Admissions: 140\n", + "INFO:root:Total rows: 162571\n", + "INFO:root:# Unique Events: 298\n", + "INFO:root:# Admissions: 140\n", + "INFO:root:Total rows: 162571\n", + "INFO:root:[SUCCESSFULLY SAVED CHART EVENTS DATA]\n" + ] + } + ], + "source": [ + "feature_extractor= FeatureExtractor(\n", + " cohort_output=cohort_extractor.cohort_output,\n", + " use_icu=prediction_task.use_icu,\n", + " for_diagnoses=dia_input.value,\n", + " for_labs= not prediction_task.use_icu and lab_input.value,\n", + " for_output_events= prediction_task.use_icu and out_input.value,\n", + " for_chart_events=prediction_task.use_icu and chart_input.value,\n", + " for_procedures=proc_input.value,\n", + " for_medications= med_input.value,\n", + ")\n", + "\n", + "features = feature_extractor.save_features()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
CLINICAL GROUPING\n", + "Grouping medical codes will reduce dimensional space of features.\n", + "\n", + "Default options selected below will group medical codes to reduce feature dimension space.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Do you want to group ICD 10 DIAG codes ?\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2910dfea5a8a4e118722900171948a66", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(index=2, layout=Layout(width='100%'), options=('Keep both ICD-9 and ICD-10 codes', 'Convert ICD-9…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "**Please run below cell to perform feature preprocessing**\n" + ] + } + ], + "source": [ + "if feature_extractor.for_diagnoses:\n", + " print(\"Do you want to group ICD 10 DIAG codes ?\")\n", + " group_dia_icd_input = widgets.RadioButtons(options=['Keep both ICD-9 and ICD-10 codes','Convert ICD-9 to ICD-10 codes','Convert ICD-9 to ICD-10 and group ICD-10 codes'],value='Convert ICD-9 to ICD-10 and group ICD-10 codes',layout={'width': '100%'})\n", + " display(group_dia_icd_input) \n", + "\n", + "if not prediction_task.use_icu:\n", + " if feature_extractor.for_medications:\n", + " print(\"Do you want to group Medication codes to use Non propietary names?\")\n", + " group_med_code_input = widgets.RadioButtons(options=['Yes','No'],value='Yes',layout={'width': '100%'})\n", + " display(group_med_code_input)\n", + " if feature_extractor.for_procedures:\n", + " print(\"Which ICD codes for Procedures you want to keep in data?\")\n", + " group_proc_icd_input = widgets.RadioButtons(options=['ICD-9 and ICD-10','ICD-10'],value='ICD-10',layout={'width': '100%'})\n", + " display(group_proc_icd_input)\n", + "print(\"**Please run below cell to perform feature 
preprocessing**\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from pipeline.feature.diagnoses import IcdGroupOption\n", + "\n", + "group_diag_icd = IcdGroupOption.KEEP\n", + "if feature_extractor.for_diagnoses:\n", + " if group_dia_icd_input.value == \"Keep both ICD-9 and ICD-10 codes\":\n", + " group_dia_icd_input = IcdGroupOption.KEEP\n", + " elif group_dia_icd_input.value == \"Convert ICD-9 to ICD-10 codes\":\n", + " group_dia_icd_input = IcdGroupOption.CONVERT\n", + " elif group_dia_icd_input.value == \"Convert ICD-9 to ICD-10 and group ICD-10 codes\":\n", + " group_dia_icd_input = IcdGroupOption.GROUP\n", + "\n", + "\n", + "group_med_code = feature_extractor.for_medications and (not prediction_task.use_icu) and (group_med_code_input.value==\"Yes\")\n", + "keep_proc_icd9 = prediction_task.use_icu or not(feature_extractor.for_procedures and (group_proc_icd_input.value==\"ICD-10\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(,\n", + " True,\n", + " True,\n", + " False,\n", + " False,\n", + " False,\n", + " False)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(group_diag_icd, \n", + "group_med_code,\n", + "keep_proc_icd9,\n", + "False,\n", + "False,\n", + "False,\n", + "False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:[PROCESSING DIAGNOSIS DATA]\n", + "INFO:root:Total number of rows: 2647\n", + "INFO:root:[SUCCESSFULLY SAVED DIAGNOSES DATA]\n" + ] + } + ], + "source": [ + "from pipeline.features_preprocessor import FeaturePreprocessor\n", + "feat_preproc = FeaturePreprocessor(feature_extractor=feature_extractor, \n", + " group_diag_icd=group_diag_icd, \n", + " group_med_code=group_med_code,\n", + " 
keep_proc_icd9=keep_proc_icd9,\n", + " clean_chart=False,\n", + " impute_outlier_chart=False,\n", + " clean_labs=False,\n", + " impute_labs=False,\n", + " )\n", + "preproc = feat_preproc.preprocess_no_event_features()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(,\n", + " True,\n", + " True,\n", + " False,\n", + " False,\n", + " False,\n", + " False)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(group_diag_icd, group_med_code,keep_proc_icd9,False,False,False,False,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. SUMMARY OF FEATURES\n", + "\n", + "This step will generate summary of all features extracted so far.
\n", + "It will save summary files in **./preproc_data/summary/**
\n", + "- These files provide summary about **mean frequency** of medical codes per admission.
\n", + "- It also provides **total occurrence count** of each medical code.
\n", + "- For labs and chart events it will also provide
**missing %**, which tells how many rows for a certain medical code have missing values.\n", + "\n", + "Please use this information to further refine your cohort by selecting
which medical codes in each feature you want to keep and
which codes you would like to remove for downstream analysis tasks.\n", + "\n", + "**Please run below cell to generate summary files**" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "summaries = feat_preproc.save_summaries()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Feature Selection\n", + "\n", + "Based on the files generated in the previous step and other information gathered by you,
\n", + "Please select which medical codes you want to include in this study.\n", + "\n", + "Please run below cell to select options for which features you want to perform feature selection.\n", + "\n", + "- Select **Yes** if you want to select a subset of medical codes for that feature and
**edit** the corresponding feature file for it.\n", + "- Select **No** if you want to keep all the codes in a feature." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Do you want to do Feature Selection for Diagnoses \n", + " (If yes, please edit list of codes in ./data/summary/diag_features.csv)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a7269594cc614e42a5419a18686a8cd5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(index=1, options=('Yes', 'No'), value='No')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Do you want to do Feature Selection for Medications \n", + " (If yes, please edit list of codes in ./data/summary/med_features.csv)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "99ec2392121145e199ee5514747f0e9a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(index=1, options=('Yes', 'No'), value='No')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Do you want to do Feature Selection for Procedures \n", + " (If yes, please edit list of codes in ./data/summary/proc_features.csv)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "97c98f43d03345958b51265d37173fea", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(index=1, options=('Yes', 'No'), value='No')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Do you want to do Feature Selection for Output event \n", + " (If yes, please edit list of codes in ./data/summary/out_features.csv)\n" + ] + }, + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "180e3da541a241e3bb680e2096173230", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(index=1, options=('Yes', 'No'), value='No')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Do you want to do Feature Selection for Chart events \n", + " (If yes, please edit list of codes in ./data/summary/chart_features.csv)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b88c35d877374ae6a7ed61d8c58fc55b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(index=1, options=('Yes', 'No'), value='No')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:Comm:handle_msg[a7269594cc614e42a5419a18686a8cd5]({'header': {'date': datetime.datetime(2023, 12, 11, 15, 33, 51, 698000, tzinfo=tzutc()), 'msg_id': '65b9c316-a36d-4bdf-b1c0-85ea90cb52dc', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': '65b9c316-a36d-4bdf-b1c0-85ea90cb52dc', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': 'a7269594cc614e42a5419a18686a8cd5', 'data': {'method': 'update', 'state': {'index': 0}, 'buffer_paths': []}}, 'buffers': []})\n", + "DEBUG:Comm:handle_msg[99ec2392121145e199ee5514747f0e9a]({'header': {'date': datetime.datetime(2023, 12, 11, 15, 33, 52, 759000, tzinfo=tzutc()), 'msg_id': '6cb9e6ed-a5cf-4c38-beaf-5caec1e950c2', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': '6cb9e6ed-a5cf-4c38-beaf-5caec1e950c2', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '99ec2392121145e199ee5514747f0e9a', 'data': 
{'method': 'update', 'state': {'index': 0}, 'buffer_paths': []}}, 'buffers': []})\n", + "DEBUG:Comm:handle_msg[97c98f43d03345958b51265d37173fea]({'header': {'date': datetime.datetime(2023, 12, 11, 15, 33, 53, 682000, tzinfo=tzutc()), 'msg_id': '6bf5c215-49dc-470e-9596-6ac8c782350e', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': '6bf5c215-49dc-470e-9596-6ac8c782350e', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '97c98f43d03345958b51265d37173fea', 'data': {'method': 'update', 'state': {'index': 0}, 'buffer_paths': []}}, 'buffers': []})\n", + "DEBUG:Comm:handle_msg[180e3da541a241e3bb680e2096173230]({'header': {'date': datetime.datetime(2023, 12, 11, 15, 33, 54, 860000, tzinfo=tzutc()), 'msg_id': '19a3da7e-0a9b-4599-8f01-4ff6464cecdb', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': '19a3da7e-0a9b-4599-8f01-4ff6464cecdb', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': '180e3da541a241e3bb680e2096173230', 'data': {'method': 'update', 'state': {'index': 0}, 'buffer_paths': []}}, 'buffers': []})\n", + "DEBUG:Comm:handle_msg[b88c35d877374ae6a7ed61d8c58fc55b]({'header': {'date': datetime.datetime(2023, 12, 11, 15, 33, 55, 463000, tzinfo=tzutc()), 'msg_id': '27b76f35-1cd4-48f8-b45c-ba1c01464494', 'msg_type': 'comm_msg', 'session': 'e8845a06-cb7d-41e1-bf48-372172e275ed', 'username': '6b58c396-60d8-4f33-b913-49b4b6a7dd8b', 'version': '5.2'}, 'msg_id': '27b76f35-1cd4-48f8-b45c-ba1c01464494', 'msg_type': 'comm_msg', 'parent_header': {}, 'metadata': {}, 'content': {'comm_id': 'b88c35d877374ae6a7ed61d8c58fc55b', 'data': {'method': 'update', 'state': {'index': 0}, 'buffer_paths': []}}, 'buffers': []})\n" + ] + } + ], + "source": [ + "if feature_extractor.for_diagnoses:\n", + " print(\"Do you 
want to do Feature Selection for Diagnoses \\n (If yes, please edit list of codes in ./data/summary/diag_features.csv)\")\n", + " select_dia_input = widgets.RadioButtons(options=['Yes','No'],value='No')\n", + " display(select_dia_input) \n", + "if feature_extractor.for_medications:\n", + " print(\"Do you want to do Feature Selection for Medications \\n (If yes, please edit list of codes in ./data/summary/med_features.csv)\")\n", + " select_med_input = widgets.RadioButtons(options=['Yes','No'],value='No')\n", + " display(select_med_input) \n", + "if feature_extractor.for_procedures:\n", + " print(\"Do you want to do Feature Selection for Procedures \\n (If yes, please edit list of codes in ./data/summary/proc_features.csv)\")\n", + " select_proc_input = widgets.RadioButtons(options=['Yes','No'],value='No')\n", + " display(select_proc_input) \n", + "if prediction_task.use_icu and feature_extractor.for_output_events:\n", + " print(\"Do you want to do Feature Selection for Output event \\n (If yes, please edit list of codes in ./data/summary/out_features.csv)\")\n", + " select_out_input = widgets.RadioButtons(options=['Yes','No'],value='No')\n", + " display(select_out_input) \n", + "if prediction_task.use_icu and feature_extractor.for_chart_events:\n", + " print(\"Do you want to do Feature Selection for Chart events \\n (If yes, please edit list of codes in ./data/summary/chart_features.csv)\")\n", + " select_chart_input = widgets.RadioButtons(options=['Yes','No'],value='No')\n", + " display(select_chart_input) \n", + "if not(prediction_task.use_icu) and feature_extractor.for_labs:\n", + " print(\"Do you want to do Feature Selection for Labs \\n (If yes, please edit list of codes in ./data/summary/lab_features.csv)\")\n", + " select_lab_input = widgets.RadioButtons(options=['Yes','No'],value='No')\n", + " display(select_lab_input) " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(False, 
True, True, True, True, False, False)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(prediction_task.use_icu, select_diag, select_med,select_proc, select_lab,select_chart, select_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from pipeline.feature_selector import FeatureSelector\n", + "\n", + "\n", + "select_diag=select_dia_input.value == 'Yes' if feature_extractor.for_diagnoses else False\n", + "select_med=select_med_input.value == 'Yes' if feature_extractor.for_medications else False\n", + "select_proc=select_proc_input.value == 'Yes' if feature_extractor.for_procedures else False\n", + "select_out=select_out_input.value == 'Yes' if prediction_task.use_icu and feature_extractor.for_output_events else False\n", + "select_chart=select_chart_input.value == 'Yes' if prediction_task.use_icu and feature_extractor.for_chart_events else False\n", + "select_lab=select_lab_input.value == 'Yes' if not (prediction_task.use_icu) and feature_extractor.for_labs else False\n", + "\n", + "feature_selector = FeatureSelector(prediction_task.use_icu, select_diag, select_med,select_proc, select_lab,select_chart, select_out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. CLEANING OF FEATURES\n", + "Below you will have option to to clean lab and chart events by performing outlier removal and unit conversion.\n", + "\n", + "Outlier removal is performed to remove values higher than selected **right threshold** percentile and lower than selected **left threshold** percentile among all values for each itemid. 
\n", + "\n", + "**Please run below cell to select preprocessing for diferent features**" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Outlier removal in values of chart events ?\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "25076133c50e48afbfe25fc07ac0e8db", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(layout=Layout(height='40px', width='100%'), options=('No outlier detection', 'Impute Outlier (def…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6161acba75304c98be6eeda34e3934e2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(Label(value='Right Outlier Threshold', layout=Layout(width='150px')), IntSlider(value=98, layou…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3e23f6ab80974a0ba8b8da7853818794", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(Label(value='Left Outlier Threshold', layout=Layout(width='150px')), IntSlider(value=0, layout=…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "if (prediction_task.use_icu and select_chart) or (not(prediction_task.use_icu) and select_lab):\n", + " event_name = \"chart\" if prediction_task.use_icu else \"lab\"\n", + " print(f\"Outlier removal in values of {event_name} events ?\")\n", + " layout = widgets.Layout(width='100%', height='40px') #set width and height\n", + "\n", + " outlier_input = widgets.RadioButtons(options=['No outlier detection','Impute Outlier (default:98)','Remove outliers (default:98)'],value='No outlier detection',layout=layout)\n", + " display(outlier_input)\n", + " right_outlier=widgets.IntSlider(\n", + " 
value=98,\n", + " min=90,\n", + " max=99,\n", + " step=1,\n", + " disabled=False,layout={'width': '100%'}\n", + " )\n", + " left_outlier=widgets.IntSlider(\n", + " value=0,\n", + " min=0,\n", + " max=10,\n", + " step=1,\n", + " disabled=False,layout={'width': '100%'}\n", + " )\n", + " display(widgets.HBox([widgets.Label('Right Outlier Threshold',layout={'width': '150px'}), right_outlier]))\n", + " display(widgets.HBox([widgets.Label('Left Outlier Threshold',layout={'width': '150px'}), left_outlier]))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not(prediction_task.use_icu) and select_lab" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "98" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "right_outlier.value" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "right_thresh=100\n", + "left_thresh = 0\n", + "clean_chart = False\n", + "clean_lab = False\n", + "if (prediction_task.use_icu and select_chart):\n", + " clean_chart=outlier_input.value!='No outlier detection'\n", + " right_thresh = right_outlier.value\n", + " left_thresh = left_outlier.value\n", + "if (not(prediction_task.use_icu) and select_lab):\n", + " clean_lab=outlier_input.value!='No outlier detection'\n", + " right_thresh = right_outlier.value\n", + " left_thresh = left_outlier.value\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(None, False, False, False, False, True, True, 98, 0)" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"(None,False,False,clean_chart,clean_chart,clean_lab, clean_lab,right_thresh,left_thresh)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "feat_preproc = FeaturePreprocessor(feature_extractor=feature_extractor, \n", + " group_diag_icd=None,\n", + " group_med_code=False,\n", + " keep_proc_icd9=False,\n", + " \n", + "\n", + " clean_chart=clean_chart,\n", + " impute_outlier_chart = clean_chart,\n", + " clean_labs=clean_lab,\n", + " impute_labs = clean_lab,\n", + "\n", + " thresh = right_thresh,\n", + " left_thresh=left_thresh\n", + " )\n", + "preproc = feat_preproc.preproc_events_features()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Time-Series Representation\n", + "In this section, please choose how you want to process and represent time-series data.\n", + "\n", + "- First option is to select the length of time-series data you want to include for this study. (Default is 72 hours)\n", + "\n", + "- Second option is to select bucket size which tells in what size time windows you want to divide your time-series.
\n", + "For example, if you select **2** bucket size, it will aggregate data for every 2 hours and
a time-series of length 24 hours will be represented as time-series with 12 time-windows
where data for every 2 hours is aggregated from the original raw time-series.\n", + "\n", + "During this step, we will also save the time-series data in data dictionaries in the format that can be directly used for following deep learning analysis.\n", + "\n", + "### Imputation\n", + "You can also choose if you want to impute lab/chart values. The imputation will be done by forward fill and mean or median imputation.
\n", + "Values will be forward fill first and if no value exists for that admission we will use mean or median value for the patient.\n", + "\n", + "The data dictionaries will be saved in **./data/dict/**\n", + "\n", + "Please refer the readme to know the structure of data dictionaries.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prediction_task.target_type" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=======Time-series Data Represenation=======\n", + "Length of data to be included for time-series prediction ?\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cd267a86fa524e31a387d158d70173ae", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(options=('First 72 hours', 'First 48 hours', 'First 24 hours', 'Custom'), value='First 72 hours')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6a93b3e471a74c40bd3733aa658f0bd5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(Label(value='Fisrt (in hours):', layout=Layout(width='150px')), IntSlider(value=72, description…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "What time bucket size you want to choose ?\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "592e8e287214438ba539ed045574b03b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(options=('1 hour', '2 hour', '3 hour', '4 hour', '5 hour', 'Custom'), value='1 hour')" + ] + }, + "metadata": {}, + 
"output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6ddba3d30a92453e9ac4989f4a35336c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(Label(value='Bucket Size (in hours):', layout=Layout(width='150px')), IntSlider(value=1, max=6,…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Do you want to forward fill and mean or median impute lab/chart values to form continuous data signal?\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0a303ac3c4514203bcd9107e717f57c0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(options=('No Imputation', 'forward fill and mean', 'forward fill and median'), value='No Imputati…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "If you have choosen mortality prediction task, then what prediction window length you want to keep?\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4ac0e3cd00184c0e8f51d2017695856b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(options=('2 hours', '4 hours', '6 hours', '8 hours', 'Custom'), value='2 hours')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "58f7f0b018aa4a5bb32243376d6a21d7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(Label(value='Prediction window (in hours)', layout=Layout(width='180px')), IntSlider(value=2, m…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "**Please run below cell to perform time-series represenation and save in data dictionaries**\n" + ] + } + ], + 
"source": [ + "print(\"=======Time-series Data Represenation=======\")\n", + "\n", + "print(\"Length of data to be included for time-series prediction ?\")\n", + "if(prediction_task.target_type== TargetType.MORTALITY):\n", + " radio_input8 = widgets.RadioButtons(options=['First 72 hours','First 48 hours','First 24 hours','Custom'],value='First 72 hours')\n", + " display(radio_input8)\n", + " text2=widgets.IntSlider(\n", + " value=72,\n", + " min=24,\n", + " max=72,\n", + " step=1,\n", + " description='Fisrt',\n", + " disabled=False\n", + " )\n", + " display(widgets.HBox([widgets.Label('Fisrt (in hours):',layout={'width': '150px'}), text2]))\n", + "elif(prediction_task.target_type== TargetType.READMISSION):\n", + " radio_input8 = widgets.RadioButtons(options=['Last 72 hours','Last 48 hours','Last 24 hours','Custom'],value='Last 72 hours')\n", + " display(radio_input8)\n", + " text2=widgets.IntSlider(\n", + " value=72,\n", + " min=24,\n", + " max=72,\n", + " step=1,\n", + " description='Last',\n", + " disabled=False\n", + " )\n", + " display(widgets.HBox([widgets.Label('Last (in hours):',layout={'width': '150px'}), text2]))\n", + "elif(prediction_task.target_type== TargetType.LOS):\n", + " radio_input8 = widgets.RadioButtons(options=['First 12 hours','First 24 hours','Custom'],value='First 24 hours')\n", + " display(radio_input8)\n", + " text2=widgets.IntSlider(\n", + " value=72,\n", + " min=12,\n", + " max=72,\n", + " step=1,\n", + " description='First',\n", + " disabled=False\n", + " )\n", + " display(widgets.HBox([widgets.Label('Fisrt (in hours):',layout={'width': '150px'}), text2]))\n", + " \n", + " \n", + "print(\"What time bucket size you want to choose ?\")\n", + "radio_input7 = widgets.RadioButtons(options=['1 hour','2 hour','3 hour','4 hour','5 hour','Custom'],value='1 hour')\n", + "display(radio_input7)\n", + "text1=widgets.IntSlider(\n", + " value=1,\n", + " min=1,\n", + " max=6,\n", + " step=1,\n", + " disabled=False\n", + " )\n", + "#display(text1)\n", + 
"display(widgets.HBox([widgets.Label('Bucket Size (in hours):',layout={'width': '150px'}), text1]))\n", + "print(\"Do you want to forward fill and mean or median impute lab/chart values to form continuous data signal?\")\n", + "radio_impute = widgets.RadioButtons(options=['No Imputation', 'forward fill and mean','forward fill and median'],value='No Imputation')\n", + "display(radio_impute) \n", + "\n", + "radio_input6 = widgets.RadioButtons(options=['0 hours','2 hours','4 hours','6 hours'],value='0 hours')\n", + "if(prediction_task.target_type== TargetType.MORTALITY):\n", + " print(\"If you have choosen mortality prediction task, then what prediction window length you want to keep?\")\n", + " radio_input6 = widgets.RadioButtons(options=['2 hours','4 hours','6 hours','8 hours','Custom'],value='2 hours')\n", + " display(radio_input6)\n", + " text3=widgets.IntSlider(\n", + " value=2,\n", + " min=2,\n", + " max=8,\n", + " step=1,\n", + " disabled=False\n", + " )\n", + " display(widgets.HBox([widgets.Label('Prediction window (in hours)',layout={'width': '180px'}), text3]))\n", + "print(\"**Please run below cell to perform time-series represenation and save in data dictionaries**\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "if (radio_input6.value=='Custom'):\n", + " predW=int(text3.value)\n", + "else:\n", + " predW=int(radio_input6.value[0].strip())\n", + "if (radio_input7.value=='Custom'):\n", + " bucket=int(text1.value)\n", + "else:\n", + " bucket=int(radio_input7.value[0].strip())\n", + "if (radio_input8.value=='Custom'):\n", + " include=int(text2.value)\n", + "else:\n", + " include=int(radio_input8.value.split()[1])\n", + "if (radio_impute.value=='forward fill and mean'):\n", + " impute='Mean'\n", + "elif (radio_impute.value=='forward fill and median'):\n", + " impute='Median'\n", + "else:\n", + " impute=False\n", + "\n", + "# if data_icu:\n", + "# 
gen=data_generation_icu.Generator(cohort_output,data_mort,data_admn,data_los,diag_flag,proc_flag,out_flag,chart_flag,med_flag,impute,include,bucket,predW)\n", + "# #gen=data_generation_icu.Generator(cohort_output,data_mort,diag_flag,False,False,chart_flag,False,impute,include,bucket,predW)\n", + "# #if chart_flag:\n", + "# # gen=data_generation_icu.Generator(cohort_output,data_mort,False,False,False,chart_flag,False,impute,include,bucket,predW)\n", + "# else:\n", + "# gen=data_generation.Generator(cohort_output,data_mort,data_admn,data_los,diag_flag,lab_flag,proc_flag,med_flag,impute,include,bucket,predW)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'cohort_Non-ICU_readmission_30_I25'" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cohort_extractor.cohort_output" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "from pipeline.data_generator import DataGenerator\n", + "#DataGenerator()\n", + "\n", + "#cohort = generate_admission_cohort(cohort_extractor.cohort_output)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from pipeline.file_info.preproc.cohort import COHORT_PATH, CohortHeader, NonIcuCohortHeader\n", + "\n", + "data = pd.read_csv(\n", + " COHORT_PATH / f\"{cohort_extractor.cohort_output}.csv.gz\",\n", + " compression=\"gzip\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. 
Machine Learning Models\n", + "\n", + "Below we provide options to select -\n", + "- Type of machine learning model\n", + "- Whether to concatenate or aggregate time-series features.\n", + " For example, if the EHR data has collected value for Blood Pressure for one year over 4 time windows of 3 months each then,\n", + " - **Conactenate** will concatenate all four values resulting in 4 different features for blood pressure,\n", + " - **Aggregate** will aggregate (mean) over four time windows resulting in one feature for blood pressure." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=======Machine :earning Models=======\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dd967624cad24063913264116696f534", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(index=2, options=('Logistic Regression', 'Random Forest', 'Gradient Bossting', 'Xgboost'), value=…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Do you wnat to conactenate the time-series feature\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b246493882164e79acdd545e94a40342", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(options=('Conactenate', 'Aggregate'), value='Conactenate')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Please select below option for cross-validation\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "71e68d480e9742deacce186515771a75", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(index=1, options=('No CV', '5-fold CV', '10-fold CV'), value='5-fold CV')" + ] + }, + "metadata": {}, + 
"output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Do you want to do oversampling for minority calss ?\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "545a474ddd31435392b099cfb0420b17", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "RadioButtons(options=('True', 'False'), value='True')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(\"=======Machine :earning Models=======\")\n", + "radio_input5 = widgets.RadioButtons(options=['Logistic Regression','Random Forest','Gradient Bossting','Xgboost'],value='Gradient Bossting')\n", + "display(radio_input5)\n", + "print(\"Do you wnat to conactenate the time-series feature\")\n", + "radio_input6 = widgets.RadioButtons(options=['Conactenate','Aggregate'],value='Conactenate')\n", + "display(radio_input6)\n", + "print(\"Please select below option for cross-validation\")\n", + "radio_input7 = widgets.RadioButtons(options=['No CV','5-fold CV','10-fold CV'],value='5-fold CV')\n", + "display(radio_input7)\n", + "print(\"Do you want to do oversampling for minority calss ?\")\n", + "radio_input8 = widgets.RadioButtons(options=['True','False'],value='True')\n", + "display(radio_input8)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/model/data_generation.py b/model/data_generation.py index 5f1a55c2bb..3461d336dc 100644 --- a/model/data_generation.py +++ b/model/data_generation.py @@ -7,23 +7,44 @@ import os import sys from pathlib import Path 
-sys.path.append(os.path.dirname(os.path.abspath(__file__)) + './../..') + +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "./../..") if not os.path.exists("./data/dict"): os.makedirs("./data/dict") - -class Generator(): - def __init__(self,cohort_output,if_mort,if_admn,if_los,feat_cond,feat_lab,feat_proc,feat_med,impute,include_time=24,bucket=1,predW=0): - self.impute=impute - self.feat_cond,self.feat_proc,self.feat_med,self.feat_lab = feat_cond,feat_proc,feat_med,feat_lab - self.cohort_output=cohort_output - + + +class Generator: + def __init__( + self, + cohort_output, + if_mort, + if_admn, + if_los, + feat_cond, + feat_lab, + feat_proc, + feat_med, + impute, + include_time=24, + bucket=1, + predW=0, + ): + self.impute = impute + self.feat_cond, self.feat_proc, self.feat_med, self.feat_lab = ( + feat_cond, + feat_proc, + feat_med, + feat_lab, + ) + self.cohort_output = cohort_output + self.data = self.generate_adm() print("[ READ COHORT ]") self.generate_feat() print("[ READ ALL FEATURES ]") if if_mort: print(predW) - self.mortality_length(include_time,predW) + self.mortality_length(include_time, predW) print("[ PROCESSED TIME SERIES TO EQUAL LENGTH ]") elif if_admn: self.readmission_length(include_time) @@ -32,510 +53,630 @@ def __init__(self,cohort_output,if_mort,if_admn,if_los,feat_cond,feat_lab,feat_p self.los_length(include_time) print("[ PROCESSED TIME SERIES TO EQUAL LENGTH ]") self.smooth_meds(bucket) - - #if(self.feat_lab): + + # if(self.feat_lab): # print("[ ======READING LABS ]") # nhid=len(self.hids) # for n in range(0,nhids,10000): # self.generate_labs(self.hids[n,n+10000]) print("[ SUCCESSFULLY SAVED DATA DICTIONARIES ]") - + def generate_feat(self): - if(self.feat_cond): + if self.feat_cond: print("[ ======READING DIAGNOSIS ]") self.generate_cond() - if(self.feat_proc): + if self.feat_proc: print("[ ======READING PROCEDURES ]") self.generate_proc() - if(self.feat_med): + if self.feat_med: print("[ ======READING MEDICATIONS ]") 
self.generate_meds() - if(self.feat_lab): + if self.feat_lab: print("[ ======READING LABS ]") self.generate_labs() - - + def generate_adm(self): - data=pd.read_csv(f"./data/cohort/{self.cohort_output}.csv.gz", compression='gzip', header=0, index_col=None) - data['admittime'] = pd.to_datetime(data['admittime']) - data['dischtime'] = pd.to_datetime(data['dischtime']) - data['los']=pd.to_timedelta(data['dischtime']-data['admittime'],unit='h') - data['los']=data['los'].astype(str) - data[['days', 'dummy','hours']] = data['los'].str.split(' ', -1, expand=True) - data[['hours','min','sec']] = data['hours'].str.split(':', -1, expand=True) - data['los']=pd.to_numeric(data['days'])*24+pd.to_numeric(data['hours']) - data=data.drop(columns=['days', 'dummy','hours','min','sec']) - data=data[data['los']>0] - data['Age']=data['Age'].astype(int) + data = pd.read_csv( + f"./data/cohort/{self.cohort_output}.csv.gz", + compression="gzip", + header=0, + index_col=None, + ) + data["admittime"] = pd.to_datetime(data["admittime"]) + data["dischtime"] = pd.to_datetime(data["dischtime"]) + data["los"] = pd.to_timedelta(data["dischtime"] - data["admittime"], unit="h") + data["los"] = data["los"].astype(str) + data[["days", "dummy", "hours"]] = data["los"].str.split(" ", expand=True) + data[["hours", "min", "sec"]] = data["hours"].str.split(":", expand=True) + data["los"] = pd.to_numeric(data["days"]) * 24 + pd.to_numeric(data["hours"]) + data = data.drop(columns=["days", "dummy", "hours", "min", "sec"]) + data = data[data["los"] > 0] + data["Age"] = data["Age"].astype(int) return data - + def generate_cond(self): - cond=pd.read_csv("./data/features/preproc_diag.csv.gz", compression='gzip', header=0, index_col=None) - cond=cond[cond['hadm_id'].isin(self.data['hadm_id'])] - cond_per_adm = cond.groupby('hadm_id').size().max() + cond = pd.read_csv( + "./data/features/preproc_diag.csv.gz", + compression="gzip", + header=0, + index_col=None, + ) + cond = 
cond[cond["hadm_id"].isin(self.data["hadm_id"])] + cond_per_adm = cond.groupby("hadm_id").size().max() self.cond, self.cond_per_adm = cond, cond_per_adm - + def generate_proc(self): - proc=pd.read_csv("./data/features/preproc_proc.csv.gz", compression='gzip', header=0, index_col=None) - proc=proc[proc['hadm_id'].isin(self.data['hadm_id'])] - proc[['start_days', 'dummy','start_hours']] = proc['proc_time_from_admit'].str.split(' ', -1, expand=True) - proc[['start_hours','min','sec']] = proc['start_hours'].str.split(':', -1, expand=True) - proc['start_time']=pd.to_numeric(proc['start_days'])*24+pd.to_numeric(proc['start_hours']) - proc=proc.drop(columns=['start_days', 'dummy','start_hours','min','sec']) - proc=proc[proc['start_time']>=0] - + proc = pd.read_csv( + "./data/features/preproc_proc.csv.gz", + compression="gzip", + header=0, + index_col=None, + ) + proc = proc[proc["hadm_id"].isin(self.data["hadm_id"])] + proc[["start_days", "dummy", "start_hours"]] = proc[ + "proc_time_from_admit" + ].str.split(" ", expand=True) + proc[["start_hours", "min", "sec"]] = proc["start_hours"].str.split( + ":", expand=True + ) + proc["start_time"] = pd.to_numeric(proc["start_days"]) * 24 + pd.to_numeric( + proc["start_hours"] + ) + proc = proc.drop(columns=["start_days", "dummy", "start_hours", "min", "sec"]) + proc = proc[proc["start_time"] >= 0] + ###Remove where event time is after discharge time - proc=pd.merge(proc,self.data[['hadm_id','los']],on='hadm_id',how='left') - proc['sanity']=proc['los']-proc['start_time'] - proc=proc[proc['sanity']>0] - del proc['sanity'] - - self.proc=proc - + proc = pd.merge(proc, self.data[["hadm_id", "los"]], on="hadm_id", how="left") + proc["sanity"] = proc["los"] - proc["start_time"] + proc = proc[proc["sanity"] > 0] + del proc["sanity"] + + self.proc = proc + def generate_labs(self): chunksize = 10000000 - final=pd.DataFrame() - for labs in tqdm(pd.read_csv("./data/features/preproc_labs.csv.gz", compression='gzip', header=0, 
index_col=None,chunksize=chunksize)): - labs=labs[labs['hadm_id'].isin(self.data['hadm_id'])] - labs[['start_days', 'dummy','start_hours']] = labs['lab_time_from_admit'].str.split(' ', -1, expand=True) - labs[['start_hours','min','sec']] = labs['start_hours'].str.split(':', -1, expand=True) - labs['start_time']=pd.to_numeric(labs['start_days'])*24+pd.to_numeric(labs['start_hours']) - labs=labs.drop(columns=['start_days', 'dummy','start_hours','min','sec']) - labs=labs[labs['start_time']>=0] + final = pd.DataFrame() + for labs in tqdm( + pd.read_csv( + "./data/features/preproc_labs.csv.gz", + compression="gzip", + header=0, + index_col=None, + chunksize=chunksize, + ) + ): + labs = labs[labs["hadm_id"].isin(self.data["hadm_id"])] + labs[["start_days", "dummy", "start_hours"]] = labs[ + "lab_time_from_admit" + ].str.split(" ", expand=True) + labs[["start_hours", "min", "sec"]] = labs["start_hours"].str.split( + ":", expand=True + ) + labs["start_time"] = pd.to_numeric(labs["start_days"]) * 24 + pd.to_numeric( + labs["start_hours"] + ) + labs = labs.drop( + columns=["start_days", "dummy", "start_hours", "min", "sec"] + ) + labs = labs[labs["start_time"] >= 0] ###Remove where event time is after discharge time - labs=pd.merge(labs,self.data[['hadm_id','los']],on='hadm_id',how='left') - labs['sanity']=labs['los']-labs['start_time'] - labs=labs[labs['sanity']>0] - del labs['sanity'] - + labs = pd.merge( + labs, self.data[["hadm_id", "los"]], on="hadm_id", how="left" + ) + labs["sanity"] = labs["los"] - labs["start_time"] + labs = labs[labs["sanity"] > 0] + del labs["sanity"] + if final.empty: - final=labs + final = labs else: - final=final.append(labs, ignore_index=True) + final = pd.concat([final, labs], ignore_index=True) + + self.labs = final - self.labs=final - def generate_meds(self): - meds=pd.read_csv("./data/features/preproc_med.csv.gz", compression='gzip', header=0, index_col=None) - meds[['start_days', 'dummy','start_hours']] = 
meds['start_hours_from_admit'].str.split(' ', -1, expand=True) - meds[['start_hours','min','sec']] = meds['start_hours'].str.split(':', -1, expand=True) - meds['start_time']=pd.to_numeric(meds['start_days'])*24+pd.to_numeric(meds['start_hours']) - meds[['start_days', 'dummy','start_hours']] = meds['stop_hours_from_admit'].str.split(' ', -1, expand=True) - meds[['start_hours','min','sec']] = meds['start_hours'].str.split(':', -1, expand=True) - meds['stop_time']=pd.to_numeric(meds['start_days'])*24+pd.to_numeric(meds['start_hours']) - meds=meds.drop(columns=['start_days', 'dummy','start_hours','min','sec']) + meds = pd.read_csv( + "./data/features/preproc_med.csv.gz", + compression="gzip", + header=0, + index_col=None, + ) + meds[["start_days", "dummy", "start_hours"]] = meds[ + "start_hours_from_admit" + ].str.split(" ", expand=True) + meds[["start_hours", "min", "sec"]] = meds["start_hours"].str.split( + ":", -1, expand=True + ) + meds["start_time"] = pd.to_numeric(meds["start_days"]) * 24 + pd.to_numeric( + meds["start_hours"] + ) + meds[["start_days", "dummy", "start_hours"]] = meds[ + "stop_hours_from_admit" + ].str.split(" ", expand=True) + meds[["start_hours", "min", "sec"]] = meds["start_hours"].str.split( + ":", expand=True + ) + meds["stop_time"] = pd.to_numeric(meds["start_days"]) * 24 + pd.to_numeric( + meds["start_hours"] + ) + meds = meds.drop(columns=["start_days", "dummy", "start_hours", "min", "sec"]) #####Sanity check - meds['sanity']=meds['stop_time']-meds['start_time'] - meds=meds[meds['sanity']>0] - del meds['sanity'] + meds["sanity"] = meds["stop_time"] - meds["start_time"] + meds = meds[meds["sanity"] > 0] + del meds["sanity"] #####Select hadm_id as in main file - meds=meds[meds['hadm_id'].isin(self.data['hadm_id'])] - meds=pd.merge(meds,self.data[['hadm_id','los']],on='hadm_id',how='left') + meds = meds[meds["hadm_id"].isin(self.data["hadm_id"])] + meds = pd.merge(meds, self.data[["hadm_id", "los"]], on="hadm_id", how="left") #####Remove 
where start time is after end of visit - meds['sanity']=meds['los']-meds['start_time'] - meds=meds[meds['sanity']>0] - del meds['sanity'] + meds["sanity"] = meds["los"] - meds["start_time"] + meds = meds[meds["sanity"] > 0] + del meds["sanity"] ####Any stop_time after end of visit is set at end of visit - meds.loc[meds['stop_time'] > meds['los'],'stop_time']=meds.loc[meds['stop_time'] > meds['los'],'los'] - del meds['los'] - - meds['dose_val_rx']=meds['dose_val_rx'].apply(pd.to_numeric, errors='coerce') - - - self.meds=meds - - - def mortality_length(self,include_time,predW): - self.los=include_time - self.data=self.data[(self.data['los']>=include_time+predW)] - self.hids=self.data['hadm_id'].unique() - - if(self.feat_cond): - self.cond=self.cond[self.cond['hadm_id'].isin(self.data['hadm_id'])] - - self.data['los']=include_time + meds.loc[meds["stop_time"] > meds["los"], "stop_time"] = meds.loc[ + meds["stop_time"] > meds["los"], "los" + ] + del meds["los"] + + meds["dose_val_rx"] = meds["dose_val_rx"].apply(pd.to_numeric, errors="coerce") + + self.meds = meds + + def mortality_length(self, include_time, predW): + self.los = include_time + self.data = self.data[(self.data["los"] >= include_time + predW)] + self.hids = self.data["hadm_id"].unique() + + if self.feat_cond: + self.cond = self.cond[self.cond["hadm_id"].isin(self.data["hadm_id"])] + + self.data["los"] = include_time ###MEDS - if(self.feat_med): - self.meds=self.meds[self.meds['hadm_id'].isin(self.data['hadm_id'])] - self.meds=self.meds[self.meds['start_time']<=include_time] - self.meds.loc[self.meds.stop_time >include_time, 'stop_time']=include_time - - + if self.feat_med: + self.meds = self.meds[self.meds["hadm_id"].isin(self.data["hadm_id"])] + self.meds = self.meds[self.meds["start_time"] <= include_time] + self.meds.loc[ + self.meds.stop_time > include_time, "stop_time" + ] = include_time + ###PROCS - if(self.feat_proc): - self.proc=self.proc[self.proc['hadm_id'].isin(self.data['hadm_id'])] - 
self.proc=self.proc[self.proc['start_time']<=include_time] - + if self.feat_proc: + self.proc = self.proc[self.proc["hadm_id"].isin(self.data["hadm_id"])] + self.proc = self.proc[self.proc["start_time"] <= include_time] + ###LAB - if(self.feat_lab): - self.labs=self.labs[self.labs['hadm_id'].isin(self.data['hadm_id'])] - self.labs=self.labs[self.labs['start_time']<=include_time] - - - self.los=include_time - - def los_length(self,include_time): - self.los=include_time - self.data=self.data[(self.data['los']>=include_time)] - self.hids=self.data['hadm_id'].unique() - - if(self.feat_cond): - self.cond=self.cond[self.cond['hadm_id'].isin(self.data['hadm_id'])] - - self.data['los']=include_time + if self.feat_lab: + self.labs = self.labs[self.labs["hadm_id"].isin(self.data["hadm_id"])] + self.labs = self.labs[self.labs["start_time"] <= include_time] + + self.los = include_time + + def los_length(self, include_time): + self.los = include_time + self.data = self.data[(self.data["los"] >= include_time)] + self.hids = self.data["hadm_id"].unique() + + if self.feat_cond: + self.cond = self.cond[self.cond["hadm_id"].isin(self.data["hadm_id"])] + + self.data["los"] = include_time ###MEDS - if(self.feat_med): - self.meds=self.meds[self.meds['hadm_id'].isin(self.data['hadm_id'])] - self.meds=self.meds[self.meds['start_time']<=include_time] - self.meds.loc[self.meds.stop_time >include_time, 'stop_time']=include_time - - + if self.feat_med: + self.meds = self.meds[self.meds["hadm_id"].isin(self.data["hadm_id"])] + self.meds = self.meds[self.meds["start_time"] <= include_time] + self.meds.loc[ + self.meds.stop_time > include_time, "stop_time" + ] = include_time + ###PROCS - if(self.feat_proc): - self.proc=self.proc[self.proc['hadm_id'].isin(self.data['hadm_id'])] - self.proc=self.proc[self.proc['start_time']<=include_time] - + if self.feat_proc: + self.proc = self.proc[self.proc["hadm_id"].isin(self.data["hadm_id"])] + self.proc = self.proc[self.proc["start_time"] <= include_time] 
+ ###LAB - if(self.feat_lab): - self.labs=self.labs[self.labs['hadm_id'].isin(self.data['hadm_id'])] - self.labs=self.labs[self.labs['start_time']<=include_time] - - - #self.los=include_time - - def readmission_length(self,include_time): - self.los=include_time - self.data=self.data[(self.data['los']>=include_time)] - self.hids=self.data['hadm_id'].unique() - if(self.feat_cond): - self.cond=self.cond[self.cond['hadm_id'].isin(self.data['hadm_id'])] - self.data['select_time']=self.data['los']-include_time - self.data['los']=include_time + if self.feat_lab: + self.labs = self.labs[self.labs["hadm_id"].isin(self.data["hadm_id"])] + self.labs = self.labs[self.labs["start_time"] <= include_time] + + # self.los=include_time + + def readmission_length(self, include_time): + self.los = include_time + self.data = self.data[(self.data["los"] >= include_time)] + self.hids = self.data["hadm_id"].unique() + if self.feat_cond: + self.cond = self.cond[self.cond["hadm_id"].isin(self.data["hadm_id"])] + self.data["select_time"] = self.data["los"] - include_time + self.data["los"] = include_time ####Make equal length input time series and remove data for pred window if needed - + ###MEDS - if(self.feat_med): - self.meds=self.meds[self.meds['hadm_id'].isin(self.data['hadm_id'])] - self.meds=pd.merge(self.meds,self.data[['hadm_id','select_time']],on='hadm_id',how='left') - self.meds['stop_time']=self.meds['stop_time']-self.meds['select_time'] - self.meds['start_time']=self.meds['start_time']-self.meds['select_time'] - self.meds=self.meds[self.meds['stop_time']>=0] - self.meds.loc[self.meds.start_time <0, 'start_time']=0 - + if self.feat_med: + self.meds = self.meds[self.meds["hadm_id"].isin(self.data["hadm_id"])] + self.meds = pd.merge( + self.meds, + self.data[["hadm_id", "select_time"]], + on="hadm_id", + how="left", + ) + self.meds["stop_time"] = self.meds["stop_time"] - self.meds["select_time"] + self.meds["start_time"] = self.meds["start_time"] - self.meds["select_time"] + 
self.meds = self.meds[self.meds["stop_time"] >= 0] + self.meds.loc[self.meds.start_time < 0, "start_time"] = 0 + ###PROCS - if(self.feat_proc): - self.proc=self.proc[self.proc['hadm_id'].isin(self.data['hadm_id'])] - self.proc=pd.merge(self.proc,self.data[['hadm_id','select_time']],on='hadm_id',how='left') - self.proc['start_time']=self.proc['start_time']-self.proc['select_time'] - self.proc=self.proc[self.proc['start_time']>=0] - + if self.feat_proc: + self.proc = self.proc[self.proc["hadm_id"].isin(self.data["hadm_id"])] + self.proc = pd.merge( + self.proc, + self.data[["hadm_id", "select_time"]], + on="hadm_id", + how="left", + ) + self.proc["start_time"] = self.proc["start_time"] - self.proc["select_time"] + self.proc = self.proc[self.proc["start_time"] >= 0] + ###LABS - if(self.feat_lab): - self.labs=self.labs[self.labs['hadm_id'].isin(self.data['hadm_id'])] - self.labs=pd.merge(self.labs,self.data[['hadm_id','select_time']],on='hadm_id',how='left') - self.labs['start_time']=self.labs['start_time']-self.labs['select_time'] - self.labs=self.labs[self.labs['start_time']>=0] - - - def smooth_meds(self,bucket): - final_meds=pd.DataFrame() - final_proc=pd.DataFrame() - final_labs=pd.DataFrame() - - if(self.feat_med): - self.meds=self.meds.sort_values(by=['start_time']) - if(self.feat_proc): - self.proc=self.proc.sort_values(by=['start_time']) - - t=0 - for i in tqdm(range(0,self.los,bucket)): + if self.feat_lab: + self.labs = self.labs[self.labs["hadm_id"].isin(self.data["hadm_id"])] + self.labs = pd.merge( + self.labs, + self.data[["hadm_id", "select_time"]], + on="hadm_id", + how="left", + ) + self.labs["start_time"] = self.labs["start_time"] - self.labs["select_time"] + self.labs = self.labs[self.labs["start_time"] >= 0] + + def smooth_meds(self, bucket): + final_meds = pd.DataFrame() + final_proc = pd.DataFrame() + final_labs = pd.DataFrame() + + if self.feat_med: + self.meds = self.meds.sort_values(by=["start_time"]) + if self.feat_proc: + self.proc = 
self.proc.sort_values(by=["start_time"]) + + t = 0 + for i in tqdm(range(0, self.los, bucket)): ###MEDS - if(self.feat_med): - sub_meds=self.meds[(self.meds['start_time']>=i) & (self.meds['start_time']= i) + & (self.meds["start_time"] < i + bucket) + ] + .groupby(["hadm_id", "drug_name"]) + .agg( + { + "stop_time": "max", + "subject_id": "max", + "dose_val_rx": np.nanmean, + } + ) + ) + sub_meds = sub_meds.reset_index() + sub_meds["start_time"] = t + sub_meds["stop_time"] = sub_meds["stop_time"] / bucket if final_meds.empty: - final_meds=sub_meds + final_meds = sub_meds else: - final_meds=final_meds.append(sub_meds) - + final_meds = pd.concat([final_meds, sub_meds], ignore_index=True) + + ###PROC - if(self.feat_proc): - sub_proc=self.proc[(self.proc['start_time']>=i) & (self.proc['start_time']= i) + & (self.proc["start_time"] < i + bucket) + ] + .groupby(["hadm_id", "icd_code"]) + .agg({"subject_id": "max"}) + ) + sub_proc = sub_proc.reset_index() + sub_proc["start_time"] = t if final_proc.empty: - final_proc=sub_proc - else: - final_proc=final_proc.append(sub_proc) - + final_proc = sub_proc + else: + final_proc = pd.concat([final_proc, sub_proc], ignore_index=True) + ###LABS - if(self.feat_lab): - sub_labs=self.labs[(self.labs['start_time']>=i) & (self.labs['start_time']= i) + & (self.labs["start_time"] < i + bucket) + ] + .groupby(["hadm_id", "itemid"]) + .agg({"subject_id": "max", "valuenum": np.nanmean}) + ) + sub_labs = sub_labs.reset_index() + sub_labs["start_time"] = t if final_labs.empty: - final_labs=sub_labs - else: - final_labs=final_labs.append(sub_labs) - - t=t+1 - los=int(self.los/bucket) - + final_labs = sub_labs + else: + final_labs = pd.concat([final_labs, sub_labs], ignore_index=True) + + + t = t + 1 + los = int(self.los / bucket) + ###MEDS - if(self.feat_med): - f2_meds=final_meds.groupby(['hadm_id','drug_name']).size() - self.med_per_adm=f2_meds.groupby('hadm_id').sum().reset_index()[0].max() - 
self.medlength_per_adm=final_meds.groupby('hadm_id').size().max() - + if self.feat_med: + f2_meds = final_meds.groupby(["hadm_id", "drug_name"]).size() + self.med_per_adm = f2_meds.groupby("hadm_id").sum().reset_index()[0].max() + self.medlength_per_adm = final_meds.groupby("hadm_id").size().max() + ###PROC - if(self.feat_proc): - f2_proc=final_proc.groupby(['hadm_id','icd_code']).size() - self.proc_per_adm=f2_proc.groupby('hadm_id').sum().reset_index()[0].max() - self.proclength_per_adm=final_proc.groupby('hadm_id').size().max() - - ###LABS - if(self.feat_lab): - f2_labs=final_labs.groupby(['hadm_id','itemid']).size() - self.labs_per_adm=f2_labs.groupby('hadm_id').sum().reset_index()[0].max() - self.labslength_per_adm=final_labs.groupby('hadm_id').size().max() + if self.feat_proc: + f2_proc = final_proc.groupby(["hadm_id", "icd_code"]).size() + self.proc_per_adm = f2_proc.groupby("hadm_id").sum().reset_index()[0].max() + self.proclength_per_adm = final_proc.groupby("hadm_id").size().max() + + ###LABS + if self.feat_lab: + f2_labs = final_labs.groupby(["hadm_id", "itemid"]).size() + self.labs_per_adm = f2_labs.groupby("hadm_id").sum().reset_index()[0].max() + self.labslength_per_adm = final_labs.groupby("hadm_id").size().max() ###CREATE DICT print("[ PROCESSED TIME SERIES TO EQUAL TIME INTERVAL ]") - self.create_Dict(final_meds,final_proc,final_labs,los) - - - def create_Dict(self,meds,proc,labs,los): + self.create_Dict(final_meds, final_proc, final_labs, los) + + def create_Dict(self, meds, proc, labs, los): print("[ CREATING DATA DICTIONARIES ]") - dataDic={} - labels_csv=pd.DataFrame(columns=['hadm_id','label']) - labels_csv['hadm_id']=pd.Series(self.hids) - labels_csv['label']=0 + dataDic = {} + labels_csv = pd.DataFrame(columns=["hadm_id", "label"]) + labels_csv["hadm_id"] = pd.Series(self.hids) + labels_csv["label"] = 0 for hid in self.hids: - grp=self.data[self.data['hadm_id']==hid] - #print(grp.head()) - #print(grp['gender']) - #print(int(grp['Age'])) - 
#print(grp['ethnicity'].iloc[0]) - dataDic[hid]={'Cond':{},'Proc':{},'Med':{},'Lab':{},'ethnicity':grp['ethnicity'].iloc[0],'age':int(grp['Age']),'gender':grp['gender'].iloc[0],'label':int(grp['label'])} - labels_csv.loc[labels_csv['hadm_id']==hid,'label']=int(grp['label']) + grp = self.data[self.data["hadm_id"] == hid] + # print(grp.head()) + # print(grp['gender']) + # print(int(grp['Age'])) + # print(grp['ethnicity'].iloc[0]) + dataDic[hid] = { + "Cond": {}, + "Proc": {}, + "Med": {}, + "Lab": {}, + "ethnicity": grp["ethnicity"].iloc[0], + "age": int(grp["Age"]), + "gender": grp["gender"].iloc[0], + "label": int(grp["label"]), + } + labels_csv.loc[labels_csv["hadm_id"] == hid, "label"] = int(grp["label"]) for hid in tqdm(self.hids): - grp=self.data[self.data['hadm_id']==hid] - demo_csv=grp[['Age','gender','ethnicity','insurance']] - if not os.path.exists("./data/csv/"+str(hid)): - os.makedirs("./data/csv/"+str(hid)) - demo_csv.to_csv('./data/csv/'+str(hid)+'/demo.csv',index=False) - - dyn_csv=pd.DataFrame() + grp = self.data[self.data["hadm_id"] == hid] + demo_csv = grp[["Age", "gender", "ethnicity", "insurance"]] + if not os.path.exists("./data/csv/" + str(hid)): + os.makedirs("./data/csv/" + str(hid)) + demo_csv.to_csv("./data/csv/" + str(hid) + "/demo.csv", index=False) + + dyn_csv = pd.DataFrame() ###MEDS - if(self.feat_med): - feat=meds['drug_name'].unique() - df2=meds[meds['hadm_id']==hid] - if df2.shape[0]==0: - val=pd.DataFrame(np.zeros([los,len(feat)]),columns=feat) - val=val.fillna(0) - val.columns=pd.MultiIndex.from_product([["MEDS"], val.columns]) + if self.feat_med: + feat = meds["drug_name"].unique() + df2 = meds[meds["hadm_id"] == hid] + if df2.shape[0] == 0: + val = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + val = val.fillna(0) + val.columns = pd.MultiIndex.from_product([["MEDS"], val.columns]) else: - val=df2.pivot_table(index='start_time',columns='drug_name',values='dose_val_rx') - 
df2=df2.pivot_table(index='start_time',columns='drug_name',values='stop_time') - #print(df2.shape) + val = df2.pivot_table( + index="start_time", columns="drug_name", values="dose_val_rx" + ) + df2 = df2.pivot_table( + index="start_time", columns="drug_name", values="stop_time" + ) + # print(df2.shape) add_indices = pd.Index(range(los)).difference(df2.index) - add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna(np.nan) - df2=pd.concat([df2, add_df]) - df2=df2.sort_index() - df2=df2.ffill() - df2=df2.fillna(0) - - val=pd.concat([val, add_df]) - val=val.sort_index() - val=val.ffill() - val=val.fillna(-1) - #print(df2.head()) - df2.iloc[:,0:]=df2.iloc[:,0:].sub(df2.index,0) - df2[df2>0]=1 - df2[df2<0]=0 - val.iloc[:,0:]=df2.iloc[:,0:]*val.iloc[:,0:] - #print(df2.head()) - dataDic[hid]['Med']['signal']=df2.iloc[:,0:].to_dict(orient="list") - dataDic[hid]['Med']['val']=val.iloc[:,0:].to_dict(orient="list") - - - feat_df=pd.DataFrame(columns=list(set(feat)-set(val.columns))) - - val=pd.concat([val,feat_df],axis=1) - - val=val[feat] - val=val.fillna(0) - - val.columns=pd.MultiIndex.from_product([["MEDS"], val.columns]) - if(dyn_csv.empty): - dyn_csv=val + add_df = pd.DataFrame( + index=add_indices, columns=df2.columns + ).fillna(np.nan) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.ffill() + df2 = df2.fillna(0) + + val = pd.concat([val, add_df]) + val = val.sort_index() + val = val.ffill() + val = val.fillna(-1) + # print(df2.head()) + df2.iloc[:, 0:] = df2.iloc[:, 0:].sub(df2.index, 0) + df2[df2 > 0] = 1 + df2[df2 < 0] = 0 + val.iloc[:, 0:] = df2.iloc[:, 0:] * val.iloc[:, 0:] + # print(df2.head()) + dataDic[hid]["Med"]["signal"] = df2.iloc[:, 0:].to_dict( + orient="list" + ) + dataDic[hid]["Med"]["val"] = val.iloc[:, 0:].to_dict(orient="list") + + feat_df = pd.DataFrame(columns=list(set(feat) - set(val.columns))) + + val = pd.concat([val, feat_df], axis=1) + + val = val[feat] + val = val.fillna(0) + + val.columns = 
pd.MultiIndex.from_product([["MEDS"], val.columns]) + if dyn_csv.empty: + dyn_csv = val else: - dyn_csv=pd.concat([dyn_csv,val],axis=1) + dyn_csv = pd.concat([dyn_csv, val], axis=1) - - ###PROCS - if(self.feat_proc): - feat=proc['icd_code'].unique() - df2=proc[proc['hadm_id']==hid] - if df2.shape[0]==0: - df2=pd.DataFrame(np.zeros([los,len(feat)]),columns=feat) - df2=df2.fillna(0) - df2.columns=pd.MultiIndex.from_product([["PROC"], df2.columns]) + if self.feat_proc: + feat = proc["icd_code"].unique() + df2 = proc[proc["hadm_id"] == hid] + if df2.shape[0] == 0: + df2 = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + df2 = df2.fillna(0) + df2.columns = pd.MultiIndex.from_product([["PROC"], df2.columns]) else: - df2['val']=1 - df2=df2.pivot_table(index='start_time',columns='icd_code',values='val') - #print(df2.shape) + df2["val"] = 1 + df2 = df2.pivot_table( + index="start_time", columns="icd_code", values="val" + ) + # print(df2.shape) add_indices = pd.Index(range(los)).difference(df2.index) - add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna(np.nan) - df2=pd.concat([df2, add_df]) - df2=df2.sort_index() - df2=df2.fillna(0) - df2[df2>0]=1 - #print(df2.head()) - dataDic[hid]['Proc']=df2.to_dict(orient="list") - - feat_df=pd.DataFrame(columns=list(set(feat)-set(df2.columns))) - df2=pd.concat([df2,feat_df],axis=1) - - df2=df2[feat] - df2=df2.fillna(0) - df2.columns=pd.MultiIndex.from_product([["PROC"], df2.columns]) - - if(dyn_csv.empty): - dyn_csv=df2 + add_df = pd.DataFrame( + index=add_indices, columns=df2.columns + ).fillna(np.nan) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.fillna(0) + df2[df2 > 0] = 1 + # print(df2.head()) + dataDic[hid]["Proc"] = df2.to_dict(orient="list") + + feat_df = pd.DataFrame(columns=list(set(feat) - set(df2.columns))) + df2 = pd.concat([df2, feat_df], axis=1) + + df2 = df2[feat] + df2 = df2.fillna(0) + df2.columns = pd.MultiIndex.from_product([["PROC"], df2.columns]) + + if 
dyn_csv.empty: + dyn_csv = df2 else: - dyn_csv=pd.concat([dyn_csv,df2],axis=1) - + dyn_csv = pd.concat([dyn_csv, df2], axis=1) + ###LABS - if(self.feat_lab): - feat=labs['itemid'].unique() - df2=labs[labs['hadm_id']==hid] - if df2.shape[0]==0: - val=pd.DataFrame(np.zeros([los,len(feat)]),columns=feat) - val=val.fillna(0) - val.columns=pd.MultiIndex.from_product([["LAB"], val.columns]) + if self.feat_lab: + feat = labs["itemid"].unique() + df2 = labs[labs["hadm_id"] == hid] + if df2.shape[0] == 0: + val = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + val = val.fillna(0) + val.columns = pd.MultiIndex.from_product([["LAB"], val.columns]) else: - val=df2.pivot_table(index='start_time',columns='itemid',values='valuenum') - df2['val']=1 - df2=df2.pivot_table(index='start_time',columns='itemid',values='val') - #print(df2.shape) + val = df2.pivot_table( + index="start_time", columns="itemid", values="valuenum" + ) + df2["val"] = 1 + df2 = df2.pivot_table( + index="start_time", columns="itemid", values="val" + ) + # print(df2.shape) add_indices = pd.Index(range(los)).difference(df2.index) - add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna(np.nan) - df2=pd.concat([df2, add_df]) - df2=df2.sort_index() - df2=df2.fillna(0) - - val=pd.concat([val, add_df]) - val=val.sort_index() - if self.impute=='Mean': - val=val.ffill() - val=val.bfill() - val=val.fillna(val.mean()) - elif self.impute=='Median': - val=val.ffill() - val=val.bfill() - val=val.fillna(val.median()) - val=val.fillna(0) - - df2[df2>0]=1 - df2[df2<0]=0 - - #print(df2.head()) - dataDic[hid]['Lab']['signal']=df2.iloc[:,0:].to_dict(orient="list") - dataDic[hid]['Lab']['val']=val.iloc[:,0:].to_dict(orient="list") - - feat_df=pd.DataFrame(columns=list(set(feat)-set(val.columns))) - val=pd.concat([val,feat_df],axis=1) - - val=val[feat] - val=val.fillna(0) - val.columns=pd.MultiIndex.from_product([["LAB"], val.columns]) - - if(dyn_csv.empty): - dyn_csv=val + add_df = pd.DataFrame( + 
index=add_indices, columns=df2.columns + ).fillna(np.nan) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.fillna(0) + + val = pd.concat([val, add_df]) + val = val.sort_index() + if self.impute == "Mean": + val = val.ffill() + val = val.bfill() + val = val.fillna(val.mean()) + elif self.impute == "Median": + val = val.ffill() + val = val.bfill() + val = val.fillna(val.median()) + val = val.fillna(0) + + df2[df2 > 0] = 1 + df2[df2 < 0] = 0 + + # print(df2.head()) + dataDic[hid]["Lab"]["signal"] = df2.iloc[:, 0:].to_dict( + orient="list" + ) + dataDic[hid]["Lab"]["val"] = val.iloc[:, 0:].to_dict(orient="list") + + feat_df = pd.DataFrame(columns=list(set(feat) - set(val.columns))) + val = pd.concat([val, feat_df], axis=1) + + val = val[feat] + val = val.fillna(0) + val.columns = pd.MultiIndex.from_product([["LAB"], val.columns]) + + if dyn_csv.empty: + dyn_csv = val else: - dyn_csv=pd.concat([dyn_csv,val],axis=1) - - #Save temporal data to csv - dyn_csv.to_csv('./data/csv/'+str(hid)+'/dynamic.csv',index=False) - + dyn_csv = pd.concat([dyn_csv, val], axis=1) + + # Save temporal data to csv + dyn_csv.to_csv("./data/csv/" + str(hid) + "/dynamic.csv", index=False) + ##########COND######### - if(self.feat_cond): - feat=self.cond['new_icd_code'].unique() - grp=self.cond[self.cond['hadm_id']==hid] - if(grp.shape[0]==0): - dataDic[hid]['Cond']={'fids':list([''])} - feat_df=pd.DataFrame(np.zeros([1,len(feat)]),columns=feat) - grp=feat_df.fillna(0) - grp.columns=pd.MultiIndex.from_product([["COND"], grp.columns]) + if self.feat_cond: + feat = self.cond["new_icd_code"].unique() + grp = self.cond[self.cond["hadm_id"] == hid] + if grp.shape[0] == 0: + dataDic[hid]["Cond"] = {"fids": list([""])} + feat_df = pd.DataFrame(np.zeros([1, len(feat)]), columns=feat) + grp = feat_df.fillna(0) + grp.columns = pd.MultiIndex.from_product([["COND"], grp.columns]) else: - dataDic[hid]['Cond']={'fids':list(grp['new_icd_code'])} - grp['val']=1 - grp=grp.drop_duplicates() - 
grp=grp.pivot(index='hadm_id',columns='new_icd_code',values='val').reset_index(drop=True) - feat_df=pd.DataFrame(columns=list(set(feat)-set(grp.columns))) - grp=pd.concat([grp,feat_df],axis=1) - grp=grp.fillna(0) - grp=grp[feat] - grp.columns=pd.MultiIndex.from_product([["COND"], grp.columns]) - grp.to_csv('./data/csv/'+str(hid)+'/static.csv',index=False) - labels_csv.to_csv('./data/csv/labels.csv',index=False) - - + dataDic[hid]["Cond"] = {"fids": list(grp["new_icd_code"])} + grp["val"] = 1 + grp = grp.drop_duplicates() + grp = grp.pivot( + index="hadm_id", columns="new_icd_code", values="val" + ).reset_index(drop=True) + feat_df = pd.DataFrame(columns=list(set(feat) - set(grp.columns))) + grp = pd.concat([grp, feat_df], axis=1) + grp = grp.fillna(0) + grp = grp[feat] + grp.columns = pd.MultiIndex.from_product([["COND"], grp.columns]) + grp.to_csv("./data/csv/" + str(hid) + "/static.csv", index=False) + labels_csv.to_csv("./data/csv/labels.csv", index=False) + ######SAVE DICTIONARIES############## - metaDic={'Cond':{},'Proc':{},'Med':{},'Lab':{},'LOS':{}} - metaDic['LOS']=los - with open("./data/dict/dataDic", 'wb') as fp: + metaDic = {"Cond": {}, "Proc": {}, "Med": {}, "Lab": {}, "LOS": {}} + metaDic["LOS"] = los + with open("./data/dict/dataDic", "wb") as fp: pickle.dump(dataDic, fp) - with open("./data/dict/hadmDic", 'wb') as fp: + with open("./data/dict/hadmDic", "wb") as fp: pickle.dump(self.hids, fp) - - with open("./data/dict/ethVocab", 'wb') as fp: - pickle.dump(list(self.data['ethnicity'].unique()), fp) - self.eth_vocab = self.data['ethnicity'].nunique() - - with open("./data/dict/ageVocab", 'wb') as fp: - pickle.dump(list(self.data['Age'].unique()), fp) - self.age_vocab = self.data['Age'].nunique() - - with open("./data/dict/insVocab", 'wb') as fp: - pickle.dump(list(self.data['insurance'].unique()), fp) - self.ins_vocab = self.data['insurance'].nunique() - - if(self.feat_med): - with open("./data/dict/medVocab", 'wb') as fp: - 
pickle.dump(list(meds['drug_name'].unique()), fp) - self.med_vocab = meds['drug_name'].nunique() - metaDic['Med']=self.med_per_adm - - if(self.feat_cond): - with open("./data/dict/condVocab", 'wb') as fp: - pickle.dump(list(self.cond['new_icd_code'].unique()), fp) - self.cond_vocab = self.cond['new_icd_code'].nunique() - metaDic['Cond']=self.cond_per_adm - - if(self.feat_proc): - with open("./data/dict/procVocab", 'wb') as fp: - pickle.dump(list(proc['icd_code'].unique()), fp) - self.proc_vocab = proc['icd_code'].unique() - metaDic['Proc']=self.proc_per_adm - - if(self.feat_lab): - with open("./data/dict/labsVocab", 'wb') as fp: - pickle.dump(list(labs['itemid'].unique()), fp) - self.lab_vocab = labs['itemid'].unique() - metaDic['Lab']=self.labs_per_adm - - with open("./data/dict/metaDic", 'wb') as fp: - pickle.dump(metaDic, fp) - + with open("./data/dict/ethVocab", "wb") as fp: + pickle.dump(list(self.data["ethnicity"].unique()), fp) + self.eth_vocab = self.data["ethnicity"].nunique() + + with open("./data/dict/ageVocab", "wb") as fp: + pickle.dump(list(self.data["Age"].unique()), fp) + self.age_vocab = self.data["Age"].nunique() + with open("./data/dict/insVocab", "wb") as fp: + pickle.dump(list(self.data["insurance"].unique()), fp) + self.ins_vocab = self.data["insurance"].nunique() + if self.feat_med: + with open("./data/dict/medVocab", "wb") as fp: + pickle.dump(list(meds["drug_name"].unique()), fp) + self.med_vocab = meds["drug_name"].nunique() + metaDic["Med"] = self.med_per_adm + + if self.feat_cond: + with open("./data/dict/condVocab", "wb") as fp: + pickle.dump(list(self.cond["new_icd_code"].unique()), fp) + self.cond_vocab = self.cond["new_icd_code"].nunique() + metaDic["Cond"] = self.cond_per_adm + + if self.feat_proc: + with open("./data/dict/procVocab", "wb") as fp: + pickle.dump(list(proc["icd_code"].unique()), fp) + self.proc_vocab = proc["icd_code"].unique() + metaDic["Proc"] = self.proc_per_adm + + if self.feat_lab: + with 
open("./data/dict/labsVocab", "wb") as fp: + pickle.dump(list(labs["itemid"].unique()), fp) + self.lab_vocab = labs["itemid"].unique() + metaDic["Lab"] = self.labs_per_adm + + with open("./data/dict/metaDic", "wb") as fp: + pickle.dump(metaDic, fp) diff --git a/model/data_generation_icu.py b/model/data_generation_icu.py index e9ed83dd0d..14ba0fa1fb 100644 --- a/model/data_generation_icu.py +++ b/model/data_generation_icu.py @@ -8,25 +8,48 @@ import os import sys from pathlib import Path -sys.path.append(os.path.dirname(os.path.abspath(__file__)) + './../..') + +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "./../..") if not os.path.exists("./data/dict"): os.makedirs("./data/dict") if not os.path.exists("./data/csv"): os.makedirs("./data/csv") - -class Generator(): - def __init__(self,cohort_output,if_mort,if_admn,if_los,feat_cond,feat_proc,feat_out,feat_chart,feat_med,impute,include_time=24,bucket=1,predW=6): - self.feat_cond,self.feat_proc,self.feat_out,self.feat_chart,self.feat_med = feat_cond,feat_proc,feat_out,feat_chart,feat_med - self.cohort_output=cohort_output - self.impute=impute + + +class Generator: + def __init__( + self, + cohort_output, + if_mort, + if_admn, + if_los, + feat_cond, + feat_proc, + feat_out, + feat_chart, + feat_med, + impute, + include_time=24, + bucket=1, + predW=6, + ): + ( + self.feat_cond, + self.feat_proc, + self.feat_out, + self.feat_chart, + self.feat_med, + ) = (feat_cond, feat_proc, feat_out, feat_chart, feat_med) + self.cohort_output = cohort_output + self.impute = impute self.data = self.generate_adm() print("[ READ COHORT ]") - + self.generate_feat() print("[ READ ALL FEATURES ]") - + if if_mort: - self.mortality_length(include_time,predW) + self.mortality_length(include_time, predW) print("[ PROCESSED TIME SERIES TO EQUAL LENGTH ]") elif if_admn: self.readmission_length(include_time) @@ -34,701 +57,863 @@ def __init__(self,cohort_output,if_mort,if_admn,if_los,feat_cond,feat_proc,feat_ elif if_los: 
self.los_length(include_time) print("[ PROCESSED TIME SERIES TO EQUAL LENGTH ]") - + self.smooth_meds(bucket) print("[ SUCCESSFULLY SAVED DATA DICTIONARIES ]") - + def generate_feat(self): - if(self.feat_cond): + if self.feat_cond: print("[ ======READING DIAGNOSIS ]") self.generate_cond() - if(self.feat_proc): + if self.feat_proc: print("[ ======READING PROCEDURES ]") self.generate_proc() - if(self.feat_out): + if self.feat_out: print("[ ======READING OUT EVENTS ]") self.generate_out() - if(self.feat_chart): + if self.feat_chart: print("[ ======READING CHART EVENTS ]") self.generate_chart() - if(self.feat_med): + if self.feat_med: print("[ ======READING MEDICATIONS ]") self.generate_meds() + breakpoint() def generate_adm(self): - data=pd.read_csv(f"./data/cohort/{self.cohort_output}.csv.gz", compression='gzip', header=0, index_col=None) - data['intime'] = pd.to_datetime(data['intime']) - data['outtime'] = pd.to_datetime(data['outtime']) - data['los']=pd.to_timedelta(data['outtime']-data['intime'],unit='h') - data['los']=data['los'].astype(str) - data[['days', 'dummy','hours']] = data['los'].str.split(' ', -1, expand=True) - data[['hours','min','sec']] = data['hours'].str.split(':', -1, expand=True) - data['los']=pd.to_numeric(data['days'])*24+pd.to_numeric(data['hours']) - data=data.drop(columns=['days', 'dummy','hours','min','sec']) - data=data[data['los']>0] - data['Age']=data['Age'].astype(int) - #print(data.head()) - #print(data.shape) + data = pd.read_csv( + f"./data/cohort/{self.cohort_output}.csv.gz", + compression="gzip", + header=0, + index_col=None, + ) + data["intime"] = pd.to_datetime(data["intime"]) + data["outtime"] = pd.to_datetime(data["outtime"]) + data["los"] = pd.to_timedelta(data["outtime"] - data["intime"], unit="h") + data["los"] = data["los"].astype(str) + data[["days", "dummy", "hours"]] = data["los"].str.split(" ", expand=True) + data[["hours", "min", "sec"]] = data["hours"].str.split(":", expand=True) + data["los"] = 
pd.to_numeric(data["days"]) * 24 + pd.to_numeric(data["hours"]) + data = data.drop(columns=["days", "dummy", "hours", "min", "sec"]) + data = data[data["los"] > 0] + data["Age"] = data["Age"].astype(int) + # print(data.head()) + # print(data.shape) return data - + def generate_cond(self): - cond=pd.read_csv("./data/features/preproc_diag_icu.csv.gz", compression='gzip', header=0, index_col=None) - cond=cond[cond['stay_id'].isin(self.data['stay_id'])] - cond_per_adm = cond.groupby('stay_id').size().max() + cond = pd.read_csv( + "./data/features/preproc_diag_icu.csv.gz", + compression="gzip", + header=0, + index_col=None, + ) + cond = cond[cond["stay_id"].isin(self.data["stay_id"])] + cond_per_adm = cond.groupby("stay_id").size().max() self.cond, self.cond_per_adm = cond, cond_per_adm - + def generate_proc(self): - proc=pd.read_csv("./data/features/preproc_proc_icu.csv.gz", compression='gzip', header=0, index_col=None) - proc=proc[proc['stay_id'].isin(self.data['stay_id'])] - proc[['start_days', 'dummy','start_hours']] = proc['event_time_from_admit'].str.split(' ', -1, expand=True) - proc[['start_hours','min','sec']] = proc['start_hours'].str.split(':', -1, expand=True) - proc['start_time']=pd.to_numeric(proc['start_days'])*24+pd.to_numeric(proc['start_hours']) - proc=proc.drop(columns=['start_days', 'dummy','start_hours','min','sec']) - proc=proc[proc['start_time']>=0] - + proc = pd.read_csv( + "./data/features/preproc_proc_icu.csv.gz", + compression="gzip", + header=0, + index_col=None, + ) + proc = proc[proc["stay_id"].isin(self.data["stay_id"])] + proc[["start_days", "dummy", "start_hours"]] = proc[ + "event_time_from_admit" + ].str.split(" ", expand=True) + proc[["start_hours", "min", "sec"]] = proc["start_hours"].str.split( + ":", expand=True + ) + proc["start_time"] = pd.to_numeric(proc["start_days"]) * 24 + pd.to_numeric( + proc["start_hours"] + ) + proc = proc.drop(columns=["start_days", "dummy", "start_hours", "min", "sec"]) + proc = proc[proc["start_time"] 
>= 0] + ###Remove where event time is after discharge time - proc=pd.merge(proc,self.data[['stay_id','los']],on='stay_id',how='left') - proc['sanity']=proc['los']-proc['start_time'] - proc=proc[proc['sanity']>0] - del proc['sanity'] - - self.proc=proc - + proc = pd.merge(proc, self.data[["stay_id", "los"]], on="stay_id", how="left") + proc["sanity"] = proc["los"] - proc["start_time"] + proc = proc[proc["sanity"] > 0] + del proc["sanity"] + + self.proc = proc + def generate_out(self): - out=pd.read_csv("./data/features/preproc_out_icu.csv.gz", compression='gzip', header=0, index_col=None) - out=out[out['stay_id'].isin(self.data['stay_id'])] - out[['start_days', 'dummy','start_hours']] = out['event_time_from_admit'].str.split(' ', -1, expand=True) - out[['start_hours','min','sec']] = out['start_hours'].str.split(':', -1, expand=True) - out['start_time']=pd.to_numeric(out['start_days'])*24+pd.to_numeric(out['start_hours']) - out=out.drop(columns=['start_days', 'dummy','start_hours','min','sec']) - out=out[out['start_time']>=0] - + out = pd.read_csv( + "./data/features/preproc_out_icu.csv.gz", + compression="gzip", + header=0, + index_col=None, + ) + out = out[out["stay_id"].isin(self.data["stay_id"])] + out[["start_days", "dummy", "start_hours"]] = out[ + "event_time_from_admit" + ].str.split(" ", expand=True) + out[["start_hours", "min", "sec"]] = out["start_hours"].str.split( + ":", expand=True + ) + out["start_time"] = pd.to_numeric(out["start_days"]) * 24 + pd.to_numeric( + out["start_hours"] + ) + out = out.drop(columns=["start_days", "dummy", "start_hours", "min", "sec"]) + out = out[out["start_time"] >= 0] + ###Remove where event time is after discharge time - out=pd.merge(out,self.data[['stay_id','los']],on='stay_id',how='left') - out['sanity']=out['los']-out['start_time'] - out=out[out['sanity']>0] - del out['sanity'] - - self.out=out - - + out = pd.merge(out, self.data[["stay_id", "los"]], on="stay_id", how="left") + out["sanity"] = out["los"] - 
out["start_time"] + out = out[out["sanity"] > 0] + del out["sanity"] + + self.out = out + def generate_chart(self): chunksize = 5000000 - final=pd.DataFrame() - for chart in tqdm(pd.read_csv("./data/features/preproc_chart_icu.csv.gz", compression='gzip', header=0, index_col=None,chunksize=chunksize)): - chart=chart[chart['stay_id'].isin(self.data['stay_id'])] - chart[['start_days', 'dummy','start_hours']] = chart['event_time_from_admit'].str.split(' ', -1, expand=True) - chart[['start_hours','min','sec']] = chart['start_hours'].str.split(':', -1, expand=True) - chart['start_time']=pd.to_numeric(chart['start_days'])*24+pd.to_numeric(chart['start_hours']) - chart=chart.drop(columns=['start_days', 'dummy','start_hours','min','sec','event_time_from_admit']) - chart=chart[chart['start_time']>=0] + final = pd.DataFrame() + for chart in tqdm( + pd.read_csv( + "./data/features/preproc_chart_icu.csv.gz", + compression="gzip", + header=0, + index_col=None, + chunksize=chunksize, + ) + ): + chart = chart[chart["stay_id"].isin(self.data["stay_id"])] + chart[["start_days", "dummy", "start_hours"]] = chart[ + "event_time_from_admit" + ].str.split(" ", expand=True) + chart[["start_hours", "min", "sec"]] = chart["start_hours"].str.split( + ":", expand=True + ) + chart["start_time"] = pd.to_numeric( + chart["start_days"] + ) * 24 + pd.to_numeric(chart["start_hours"]) + chart = chart.drop( + columns=[ + "start_days", + "dummy", + "start_hours", + "min", + "sec", + "event_time_from_admit", + ] + ) + chart = chart[chart["start_time"] >= 0] ###Remove where event time is after discharge time - chart=pd.merge(chart,self.data[['stay_id','los']],on='stay_id',how='left') - chart['sanity']=chart['los']-chart['start_time'] - chart=chart[chart['sanity']>0] - del chart['sanity'] - del chart['los'] - + chart = pd.merge( + chart, self.data[["stay_id", "los"]], on="stay_id", how="left" + ) + chart["sanity"] = chart["los"] - chart["start_time"] + chart = chart[chart["sanity"] > 0] + del 
chart["sanity"] + del chart["los"] + if final.empty: - final=chart + final = chart else: - final=final.append(chart, ignore_index=True) - - self.chart=final - - - + final = pd.concat([final, chart], ignore_index=True) + + self.chart = final + def generate_meds(self): - meds=pd.read_csv("./data/features/preproc_med_icu.csv.gz", compression='gzip', header=0, index_col=None) - meds[['start_days', 'dummy','start_hours']] = meds['start_hours_from_admit'].str.split(' ', -1, expand=True) - meds[['start_hours','min','sec']] = meds['start_hours'].str.split(':', -1, expand=True) - meds['start_time']=pd.to_numeric(meds['start_days'])*24+pd.to_numeric(meds['start_hours']) - meds[['start_days', 'dummy','start_hours']] = meds['stop_hours_from_admit'].str.split(' ', -1, expand=True) - meds[['start_hours','min','sec']] = meds['start_hours'].str.split(':', -1, expand=True) - meds['stop_time']=pd.to_numeric(meds['start_days'])*24+pd.to_numeric(meds['start_hours']) - meds=meds.drop(columns=['start_days', 'dummy','start_hours','min','sec']) + meds = pd.read_csv( + "./data/features/preproc_med_icu.csv.gz", + compression="gzip", + header=0, + index_col=None, + ) + meds[["start_days", "dummy", "start_hours"]] = meds[ + "start_hours_from_admit" + ].str.split(" ", expand=True) + meds[["start_hours", "min", "sec"]] = meds["start_hours"].str.split( + ":", expand=True + ) + meds["start_time"] = pd.to_numeric(meds["start_days"]) * 24 + pd.to_numeric( + meds["start_hours"] + ) + meds[["start_days", "dummy", "start_hours"]] = meds[ + "stop_hours_from_admit" + ].str.split(" ", expand=True) + meds[["start_hours", "min", "sec"]] = meds["start_hours"].str.split( + ":", expand=True + ) + meds["stop_time"] = pd.to_numeric(meds["start_days"]) * 24 + pd.to_numeric( + meds["start_hours"] + ) + meds = meds.drop(columns=["start_days", "dummy", "start_hours", "min", "sec"]) #####Sanity check - meds['sanity']=meds['stop_time']-meds['start_time'] - meds=meds[meds['sanity']>0] - del meds['sanity'] + 
meds["sanity"] = meds["stop_time"] - meds["start_time"] + meds = meds[meds["sanity"] > 0] + del meds["sanity"] #####Select hadm_id as in main file - meds=meds[meds['stay_id'].isin(self.data['stay_id'])] - meds=pd.merge(meds,self.data[['stay_id','los']],on='stay_id',how='left') + meds = meds[meds["stay_id"].isin(self.data["stay_id"])] + meds = pd.merge(meds, self.data[["stay_id", "los"]], on="stay_id", how="left") #####Remove where start time is after end of visit - meds['sanity']=meds['los']-meds['start_time'] - meds=meds[meds['sanity']>0] - del meds['sanity'] + meds["sanity"] = meds["los"] - meds["start_time"] + meds = meds[meds["sanity"] > 0] + del meds["sanity"] ####Any stop_time after end of visit is set at end of visit - meds.loc[meds['stop_time'] > meds['los'],'stop_time']=meds.loc[meds['stop_time'] > meds['los'],'los'] - del meds['los'] - - meds['rate']=meds['rate'].apply(pd.to_numeric, errors='coerce') - meds['amount']=meds['amount'].apply(pd.to_numeric, errors='coerce') - - self.meds=meds - - def mortality_length(self,include_time,predW): - print("include_time",include_time) - self.los=include_time - self.data=self.data[(self.data['los']>=include_time+predW)] - self.hids=self.data['stay_id'].unique() - - if(self.feat_cond): - self.cond=self.cond[self.cond['stay_id'].isin(self.data['stay_id'])] - - self.data['los']=include_time + meds.loc[meds["stop_time"] > meds["los"], "stop_time"] = meds.loc[ + meds["stop_time"] > meds["los"], "los" + ] + del meds["los"] + + meds["rate"] = meds["rate"].apply(pd.to_numeric, errors="coerce") + meds["amount"] = meds["amount"].apply(pd.to_numeric, errors="coerce") + + self.meds = meds + + def mortality_length(self, include_time, predW): + print("include_time", include_time) + self.los = include_time + self.data = self.data[(self.data["los"] >= include_time + predW)] + self.hids = self.data["stay_id"].unique() + + if self.feat_cond: + self.cond = self.cond[self.cond["stay_id"].isin(self.data["stay_id"])] + + self.data["los"] 
= include_time ####Make equal length input time series and remove data for pred window if needed - + ###MEDS - if(self.feat_med): - self.meds=self.meds[self.meds['stay_id'].isin(self.data['stay_id'])] - self.meds=self.meds[self.meds['start_time']<=include_time] - self.meds.loc[self.meds.stop_time >include_time, 'stop_time']=include_time - - + if self.feat_med: + self.meds = self.meds[self.meds["stay_id"].isin(self.data["stay_id"])] + self.meds = self.meds[self.meds["start_time"] <= include_time] + self.meds.loc[ + self.meds.stop_time > include_time, "stop_time" + ] = include_time + ###PROCS - if(self.feat_proc): - self.proc=self.proc[self.proc['stay_id'].isin(self.data['stay_id'])] - self.proc=self.proc[self.proc['start_time']<=include_time] - + if self.feat_proc: + self.proc = self.proc[self.proc["stay_id"].isin(self.data["stay_id"])] + self.proc = self.proc[self.proc["start_time"] <= include_time] + ###OUT - if(self.feat_out): - self.out=self.out[self.out['stay_id'].isin(self.data['stay_id'])] - self.out=self.out[self.out['start_time']<=include_time] - - ###CHART - if(self.feat_chart): - self.chart=self.chart[self.chart['stay_id'].isin(self.data['stay_id'])] - self.chart=self.chart[self.chart['start_time']<=include_time] - - #self.los=include_time - def los_length(self,include_time): - print("include_time",include_time) - self.los=include_time - self.data=self.data[(self.data['los']>=include_time)] - self.hids=self.data['stay_id'].unique() - - if(self.feat_cond): - self.cond=self.cond[self.cond['stay_id'].isin(self.data['stay_id'])] - - self.data['los']=include_time + if self.feat_out: + self.out = self.out[self.out["stay_id"].isin(self.data["stay_id"])] + self.out = self.out[self.out["start_time"] <= include_time] + + ###CHART + if self.feat_chart: + self.chart = self.chart[self.chart["stay_id"].isin(self.data["stay_id"])] + self.chart = self.chart[self.chart["start_time"] <= include_time] + + # self.los=include_time + + def los_length(self, include_time): + 
print("include_time", include_time) + self.los = include_time + self.data = self.data[(self.data["los"] >= include_time)] + self.hids = self.data["stay_id"].unique() + + if self.feat_cond: + self.cond = self.cond[self.cond["stay_id"].isin(self.data["stay_id"])] + + self.data["los"] = include_time ####Make equal length input time series and remove data for pred window if needed - + ###MEDS - if(self.feat_med): - self.meds=self.meds[self.meds['stay_id'].isin(self.data['stay_id'])] - self.meds=self.meds[self.meds['start_time']<=include_time] - self.meds.loc[self.meds.stop_time >include_time, 'stop_time']=include_time - - + if self.feat_med: + self.meds = self.meds[self.meds["stay_id"].isin(self.data["stay_id"])] + self.meds = self.meds[self.meds["start_time"] <= include_time] + self.meds.loc[ + self.meds.stop_time > include_time, "stop_time" + ] = include_time + ###PROCS - if(self.feat_proc): - self.proc=self.proc[self.proc['stay_id'].isin(self.data['stay_id'])] - self.proc=self.proc[self.proc['start_time']<=include_time] - + if self.feat_proc: + self.proc = self.proc[self.proc["stay_id"].isin(self.data["stay_id"])] + self.proc = self.proc[self.proc["start_time"] <= include_time] + ###OUT - if(self.feat_out): - self.out=self.out[self.out['stay_id'].isin(self.data['stay_id'])] - self.out=self.out[self.out['start_time']<=include_time] - - ###CHART - if(self.feat_chart): - self.chart=self.chart[self.chart['stay_id'].isin(self.data['stay_id'])] - self.chart=self.chart[self.chart['start_time']<=include_time] - - def readmission_length(self,include_time): - self.los=include_time - self.data=self.data[(self.data['los']>=include_time)] - self.hids=self.data['stay_id'].unique() - - if(self.feat_cond): - self.cond=self.cond[self.cond['stay_id'].isin(self.data['stay_id'])] - self.data['select_time']=self.data['los']-include_time - self.data['los']=include_time + if self.feat_out: + self.out = self.out[self.out["stay_id"].isin(self.data["stay_id"])] + self.out = 
self.out[self.out["start_time"] <= include_time] + + ###CHART + if self.feat_chart: + self.chart = self.chart[self.chart["stay_id"].isin(self.data["stay_id"])] + self.chart = self.chart[self.chart["start_time"] <= include_time] + + def readmission_length(self, include_time): + self.los = include_time + self.data = self.data[(self.data["los"] >= include_time)] + self.hids = self.data["stay_id"].unique() + + if self.feat_cond: + self.cond = self.cond[self.cond["stay_id"].isin(self.data["stay_id"])] + self.data["select_time"] = self.data["los"] - include_time + self.data["los"] = include_time ####Make equal length input time series and remove data for pred window if needed - + ###MEDS - if(self.feat_med): - self.meds=self.meds[self.meds['stay_id'].isin(self.data['stay_id'])] - self.meds=pd.merge(self.meds,self.data[['stay_id','select_time']],on='stay_id',how='left') - self.meds['stop_time']=self.meds['stop_time']-self.meds['select_time'] - self.meds['start_time']=self.meds['start_time']-self.meds['select_time'] - self.meds=self.meds[self.meds['stop_time']>=0] - self.meds.loc[self.meds.start_time <0, 'start_time']=0 - + if self.feat_med: + self.meds = self.meds[self.meds["stay_id"].isin(self.data["stay_id"])] + self.meds = pd.merge( + self.meds, + self.data[["stay_id", "select_time"]], + on="stay_id", + how="left", + ) + self.meds["stop_time"] = self.meds["stop_time"] - self.meds["select_time"] + self.meds["start_time"] = self.meds["start_time"] - self.meds["select_time"] + self.meds = self.meds[self.meds["stop_time"] >= 0] + self.meds.loc[self.meds.start_time < 0, "start_time"] = 0 + ###PROCS - if(self.feat_proc): - self.proc=self.proc[self.proc['stay_id'].isin(self.data['stay_id'])] - self.proc=pd.merge(self.proc,self.data[['stay_id','select_time']],on='stay_id',how='left') - self.proc['start_time']=self.proc['start_time']-self.proc['select_time'] - self.proc=self.proc[self.proc['start_time']>=0] - + if self.feat_proc: + self.proc = 
self.proc[self.proc["stay_id"].isin(self.data["stay_id"])] + self.proc = pd.merge( + self.proc, + self.data[["stay_id", "select_time"]], + on="stay_id", + how="left", + ) + self.proc["start_time"] = self.proc["start_time"] - self.proc["select_time"] + self.proc = self.proc[self.proc["start_time"] >= 0] + ###OUT - if(self.feat_out): - self.out=self.out[self.out['stay_id'].isin(self.data['stay_id'])] - self.out=pd.merge(self.out,self.data[['stay_id','select_time']],on='stay_id',how='left') - self.out['start_time']=self.out['start_time']-self.out['select_time'] - self.out=self.out[self.out['start_time']>=0] - - ###CHART - if(self.feat_chart): - self.chart=self.chart[self.chart['stay_id'].isin(self.data['stay_id'])] - self.chart=pd.merge(self.chart,self.data[['stay_id','select_time']],on='stay_id',how='left') - self.chart['start_time']=self.chart['start_time']-self.chart['select_time'] - self.chart=self.chart[self.chart['start_time']>=0] - - - def smooth_meds(self,bucket): - final_meds=pd.DataFrame() - final_proc=pd.DataFrame() - final_out=pd.DataFrame() - final_chart=pd.DataFrame() - - if(self.feat_med): - self.meds=self.meds.sort_values(by=['start_time']) - if(self.feat_proc): - self.proc=self.proc.sort_values(by=['start_time']) - if(self.feat_out): - self.out=self.out.sort_values(by=['start_time']) - if(self.feat_chart): - self.chart=self.chart.sort_values(by=['start_time']) - - t=0 - for i in tqdm(range(0,self.los,bucket)): + if self.feat_out: + self.out = self.out[self.out["stay_id"].isin(self.data["stay_id"])] + self.out = pd.merge( + self.out, + self.data[["stay_id", "select_time"]], + on="stay_id", + how="left", + ) + self.out["start_time"] = self.out["start_time"] - self.out["select_time"] + self.out = self.out[self.out["start_time"] >= 0] + + ###CHART + if self.feat_chart: + self.chart = self.chart[self.chart["stay_id"].isin(self.data["stay_id"])] + self.chart = pd.merge( + self.chart, + self.data[["stay_id", "select_time"]], + on="stay_id", + how="left", + ) 
+ self.chart["start_time"] = ( + self.chart["start_time"] - self.chart["select_time"] + ) + self.chart = self.chart[self.chart["start_time"] >= 0] + + def smooth_meds(self, bucket): + final_meds = pd.DataFrame() + final_proc = pd.DataFrame() + final_out = pd.DataFrame() + final_chart = pd.DataFrame() + + if self.feat_med: + self.meds = self.meds.sort_values(by=["start_time"]) + if self.feat_proc: + self.proc = self.proc.sort_values(by=["start_time"]) + if self.feat_out: + self.out = self.out.sort_values(by=["start_time"]) + if self.feat_chart: + self.chart = self.chart.sort_values(by=["start_time"]) + + t = 0 + for i in tqdm(range(0, self.los, bucket)): ###MEDS - if(self.feat_med): - sub_meds=self.meds[(self.meds['start_time']>=i) & (self.meds['start_time']= i) + & (self.meds["start_time"] < i + bucket) + ] + .groupby(["stay_id", "itemid", "orderid"]) + .agg( + { + "stop_time": "max", + "subject_id": "max", + "rate": np.nanmean, + "amount": np.nanmean, + } + ) + ) + sub_meds = sub_meds.reset_index() + sub_meds["start_time"] = t + sub_meds["stop_time"] = sub_meds["stop_time"] / bucket if final_meds.empty: - final_meds=sub_meds + final_meds = sub_meds else: - final_meds=final_meds.append(sub_meds) - + final_meds = pd.concat([final_meds, sub_meds], ignore_index=True) + ###PROC - if(self.feat_proc): - sub_proc=self.proc[(self.proc['start_time']>=i) & (self.proc['start_time']= i) + & (self.proc["start_time"] < i + bucket) + ] + .groupby(["stay_id", "itemid"]) + .agg({"subject_id": "max"}) + ) + sub_proc = sub_proc.reset_index() + sub_proc["start_time"] = t if final_proc.empty: - final_proc=sub_proc - else: - final_proc=final_proc.append(sub_proc) - - ###OUT - if(self.feat_out): - sub_out=self.out[(self.out['start_time']>=i) & (self.out['start_time']= i) + & (self.out["start_time"] < i + bucket) + ] + .groupby(["stay_id", "itemid"]) + .agg({"subject_id": "max"}) + ) + sub_out = sub_out.reset_index() + sub_out["start_time"] = t if final_out.empty: - final_out=sub_out - 
else: - final_out=final_out.append(sub_out) - - - ###CHART - if(self.feat_chart): - sub_chart=self.chart[(self.chart['start_time']>=i) & (self.chart['start_time']= i) + & (self.chart["start_time"] < i + bucket) + ] + .groupby(["stay_id", "itemid"]) + .agg({"valuenum": np.nanmean}) + ) + sub_chart = sub_chart.reset_index() + sub_chart["start_time"] = t if final_chart.empty: - final_chart=sub_chart - else: - final_chart=final_chart.append(sub_chart) - - t=t+1 - print("bucket",bucket) - los=int(self.los/bucket) - - + final_chart = sub_chart + else: + final_chart = pd.concat([final_chart, sub_chart], ignore_index=True) + + t = t + 1 + print("bucket", bucket) + los = int(self.los / bucket) + ###MEDS - if(self.feat_med): - f2_meds=final_meds.groupby(['stay_id','itemid','orderid']).size() - self.med_per_adm=f2_meds.groupby('stay_id').sum().reset_index()[0].max() - self.medlength_per_adm=final_meds.groupby('stay_id').size().max() - + if self.feat_med: + f2_meds = final_meds.groupby(["stay_id", "itemid", "orderid"]).size() + self.med_per_adm = f2_meds.groupby("stay_id").sum().reset_index()[0].max() + self.medlength_per_adm = final_meds.groupby("stay_id").size().max() + ###PROC - if(self.feat_proc): - f2_proc=final_proc.groupby(['stay_id','itemid']).size() - self.proc_per_adm=f2_proc.groupby('stay_id').sum().reset_index()[0].max() - self.proclength_per_adm=final_proc.groupby('stay_id').size().max() - + if self.feat_proc: + f2_proc = final_proc.groupby(["stay_id", "itemid"]).size() + self.proc_per_adm = f2_proc.groupby("stay_id").sum().reset_index()[0].max() + self.proclength_per_adm = final_proc.groupby("stay_id").size().max() + ###OUT - if(self.feat_out): - f2_out=final_out.groupby(['stay_id','itemid']).size() - self.out_per_adm=f2_out.groupby('stay_id').sum().reset_index()[0].max() - self.outlength_per_adm=final_out.groupby('stay_id').size().max() - - + if self.feat_out: + f2_out = final_out.groupby(["stay_id", "itemid"]).size() + self.out_per_adm = 
f2_out.groupby("stay_id").sum().reset_index()[0].max() + self.outlength_per_adm = final_out.groupby("stay_id").size().max() + ###chart - if(self.feat_chart): - f2_chart=final_chart.groupby(['stay_id','itemid']).size() - self.chart_per_adm=f2_chart.groupby('stay_id').sum().reset_index()[0].max() - self.chartlength_per_adm=final_chart.groupby('stay_id').size().max() - + if self.feat_chart: + f2_chart = final_chart.groupby(["stay_id", "itemid"]).size() + self.chart_per_adm = ( + f2_chart.groupby("stay_id").sum().reset_index()[0].max() + ) + self.chartlength_per_adm = final_chart.groupby("stay_id").size().max() + print("[ PROCESSED TIME SERIES TO EQUAL TIME INTERVAL ]") ###CREATE DICT -# if(self.feat_chart): -# self.create_chartDict(final_chart,los) -# else: - self.create_Dict(final_meds,final_proc,final_out,final_chart,los) - - - def create_chartDict(self,chart,los): - dataDic={} + # if(self.feat_chart): + # self.create_chartDict(final_chart,los) + # else: + self.create_Dict(final_meds, final_proc, final_out, final_chart, los) + + def create_chartDict(self, chart, los): + dataDic = {} for hid in self.hids: - grp=self.data[self.data['stay_id']==hid] - dataDic[hid]={'Chart':{},'label':int(grp['label'])} + grp = self.data[self.data["stay_id"] == hid] + dataDic[hid] = {"Chart": {}, "label": int(grp["label"])} for hid in tqdm(self.hids): ###CHART - if(self.feat_chart): - df2=chart[chart['stay_id']==hid] - val=df2.pivot_table(index='start_time',columns='itemid',values='valuenum') - df2['val']=1 - df2=df2.pivot_table(index='start_time',columns='itemid',values='val') - #print(df2.shape) + if self.feat_chart: + df2 = chart[chart["stay_id"] == hid] + val = df2.pivot_table( + index="start_time", columns="itemid", values="valuenum" + ) + df2["val"] = 1 + df2 = df2.pivot_table( + index="start_time", columns="itemid", values="val" + ) + # print(df2.shape) add_indices = pd.Index(range(los)).difference(df2.index) - add_df = pd.DataFrame(index=add_indices, 
columns=df2.columns).fillna(np.nan) - df2=pd.concat([df2, add_df]) - df2=df2.sort_index() - df2=df2.fillna(0) - - val=pd.concat([val, add_df]) - val=val.sort_index() - if self.impute=='Mean': - val=val.ffill() - val=val.bfill() - val=val.fillna(val.mean()) - elif self.impute=='Median': - val=val.ffill() - val=val.bfill() - val=val.fillna(val.median()) - val=val.fillna(0) - - - df2[df2>0]=1 - df2[df2<0]=0 - #print(df2.head()) - dataDic[hid]['Chart']['signal']=df2.iloc[:,0:].to_dict(orient="list") - dataDic[hid]['Chart']['val']=val.iloc[:,0:].to_dict(orient="list") - - - + add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna( + np.nan + ) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.fillna(0) + + val = pd.concat([val, add_df]) + val = val.sort_index() + if self.impute == "Mean": + val = val.ffill() + val = val.bfill() + val = val.fillna(val.mean()) + elif self.impute == "Median": + val = val.ffill() + val = val.bfill() + val = val.fillna(val.median()) + val = val.fillna(0) + + df2[df2 > 0] = 1 + df2[df2 < 0] = 0 + # print(df2.head()) + dataDic[hid]["Chart"]["signal"] = df2.iloc[:, 0:].to_dict(orient="list") + dataDic[hid]["Chart"]["val"] = val.iloc[:, 0:].to_dict(orient="list") + ######SAVE DICTIONARIES############## - with open("./data/dict/metaDic", 'rb') as fp: - metaDic=pickle.load(fp) - - with open("./data/dict/dataChartDic", 'wb') as fp: + with open("./data/dict/metaDic", "rb") as fp: + metaDic = pickle.load(fp) + + with open("./data/dict/dataChartDic", "wb") as fp: pickle.dump(dataDic, fp) - - with open("./data/dict/chartVocab", 'wb') as fp: - pickle.dump(list(chart['itemid'].unique()), fp) - self.chart_vocab = chart['itemid'].nunique() - metaDic['Chart']=self.chart_per_adm - - - with open("./data/dict/metaDic", 'wb') as fp: + with open("./data/dict/chartVocab", "wb") as fp: + pickle.dump(list(chart["itemid"].unique()), fp) + self.chart_vocab = chart["itemid"].nunique() + metaDic["Chart"] = self.chart_per_adm + + with 
open("./data/dict/metaDic", "wb") as fp: pickle.dump(metaDic, fp) - - - def create_Dict(self,meds,proc,out,chart,los): - dataDic={} + + def create_Dict(self, meds, proc, out, chart, los): + dataDic = {} print(los) - labels_csv=pd.DataFrame(columns=['stay_id','label']) - labels_csv['stay_id']=pd.Series(self.hids) - labels_csv['label']=0 -# print("# Unique gender",self.data.gender.nunique()) -# print("# Unique ethnicity",self.data.ethnicity.nunique()) -# print("# Unique insurance",self.data.insurance.nunique()) + labels_csv = pd.DataFrame(columns=["stay_id", "label"]) + labels_csv["stay_id"] = pd.Series(self.hids) + labels_csv["label"] = 0 + # print("# Unique gender",self.data.gender.nunique()) + # print("# Unique ethnicity",self.data.ethnicity.nunique()) + # print("# Unique insurance",self.data.insurance.nunique()) for hid in self.hids: - grp=self.data[self.data['stay_id']==hid] - dataDic[hid]={'Cond':{},'Proc':{},'Med':{},'Out':{},'Chart':{},'ethnicity':grp['ethnicity'].iloc[0],'age':int(grp['Age']),'gender':grp['gender'].iloc[0],'label':int(grp['label'])} - labels_csv.loc[labels_csv['stay_id']==hid,'label']=int(grp['label']) - + grp = self.data[self.data["stay_id"] == hid] + dataDic[hid] = { + "Cond": {}, + "Proc": {}, + "Med": {}, + "Out": {}, + "Chart": {}, + "ethnicity": grp["ethnicity"].iloc[0], + "age": int(grp["Age"]), + "gender": grp["gender"].iloc[0], + "label": int(grp["label"]), + } + labels_csv.loc[labels_csv["stay_id"] == hid, "label"] = int(grp["label"]) - #print(static_csv.head()) + # print(static_csv.head()) for hid in tqdm(self.hids): - grp=self.data[self.data['stay_id']==hid] - demo_csv=grp[['Age','gender','ethnicity','insurance']] - if not os.path.exists("./data/csv/"+str(hid)): - os.makedirs("./data/csv/"+str(hid)) - demo_csv.to_csv('./data/csv/'+str(hid)+'/demo.csv',index=False) - - dyn_csv=pd.DataFrame() + grp = self.data[self.data["stay_id"] == hid] + demo_csv = grp[["Age", "gender", "ethnicity", "insurance"]] + if not 
os.path.exists("./data/csv/" + str(hid)): + os.makedirs("./data/csv/" + str(hid)) + demo_csv.to_csv("./data/csv/" + str(hid) + "/demo.csv", index=False) + + dyn_csv = pd.DataFrame() ###MEDS - if(self.feat_med): - feat=meds['itemid'].unique() - df2=meds[meds['stay_id']==hid] - if df2.shape[0]==0: - amount=pd.DataFrame(np.zeros([los,len(feat)]),columns=feat) - amount=amount.fillna(0) - amount.columns=pd.MultiIndex.from_product([["MEDS"], amount.columns]) + if self.feat_med: + feat = meds["itemid"].unique() + df2 = meds[meds["stay_id"] == hid] + if df2.shape[0] == 0: + amount = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + amount = amount.fillna(0) + amount.columns = pd.MultiIndex.from_product( + [["MEDS"], amount.columns] + ) else: - rate=df2.pivot_table(index='start_time',columns='itemid',values='rate') - #print(rate) - amount=df2.pivot_table(index='start_time',columns='itemid',values='amount') - df2=df2.pivot_table(index='start_time',columns='itemid',values='stop_time') - #print(df2.shape) + rate = df2.pivot_table( + index="start_time", columns="itemid", values="rate" + ) + # print(rate) + amount = df2.pivot_table( + index="start_time", columns="itemid", values="amount" + ) + df2 = df2.pivot_table( + index="start_time", columns="itemid", values="stop_time" + ) + # print(df2.shape) add_indices = pd.Index(range(los)).difference(df2.index) - add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna(np.nan) - df2=pd.concat([df2, add_df]) - df2=df2.sort_index() - df2=df2.ffill() - df2=df2.fillna(0) - - rate=pd.concat([rate, add_df]) - rate=rate.sort_index() - rate=rate.ffill() - rate=rate.fillna(-1) - - amount=pd.concat([amount, add_df]) - amount=amount.sort_index() - amount=amount.ffill() - amount=amount.fillna(-1) - #print(df2.head()) - df2.iloc[:,0:]=df2.iloc[:,0:].sub(df2.index,0) - df2[df2>0]=1 - df2[df2<0]=0 - rate.iloc[:,0:]=df2.iloc[:,0:]*rate.iloc[:,0:] - amount.iloc[:,0:]=df2.iloc[:,0:]*amount.iloc[:,0:] - #print(df2.head()) - 
dataDic[hid]['Med']['signal']=df2.iloc[:,0:].to_dict(orient="list") - dataDic[hid]['Med']['rate']=rate.iloc[:,0:].to_dict(orient="list") - dataDic[hid]['Med']['amount']=amount.iloc[:,0:].to_dict(orient="list") - - - feat_df=pd.DataFrame(columns=list(set(feat)-set(amount.columns))) - # print(feat) - # print(amount.columns) - # print(amount.head()) - amount=pd.concat([amount,feat_df],axis=1) - - amount=amount[feat] - amount=amount.fillna(0) - # print(amount.columns) - amount.columns=pd.MultiIndex.from_product([["MEDS"], amount.columns]) - - if(dyn_csv.empty): - dyn_csv=amount + add_df = pd.DataFrame( + index=add_indices, columns=df2.columns + ).fillna(np.nan) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.ffill() + df2 = df2.fillna(0) + + rate = pd.concat([rate, add_df]) + rate = rate.sort_index() + rate = rate.ffill() + rate = rate.fillna(-1) + + amount = pd.concat([amount, add_df]) + amount = amount.sort_index() + amount = amount.ffill() + amount = amount.fillna(-1) + # print(df2.head()) + df2.iloc[:, 0:] = df2.iloc[:, 0:].sub(df2.index, 0) + df2[df2 > 0] = 1 + df2[df2 < 0] = 0 + rate.iloc[:, 0:] = df2.iloc[:, 0:] * rate.iloc[:, 0:] + amount.iloc[:, 0:] = df2.iloc[:, 0:] * amount.iloc[:, 0:] + # print(df2.head()) + dataDic[hid]["Med"]["signal"] = df2.iloc[:, 0:].to_dict( + orient="list" + ) + dataDic[hid]["Med"]["rate"] = rate.iloc[:, 0:].to_dict( + orient="list" + ) + dataDic[hid]["Med"]["amount"] = amount.iloc[:, 0:].to_dict( + orient="list" + ) + + feat_df = pd.DataFrame( + columns=list(set(feat) - set(amount.columns)) + ) + # print(feat) + # print(amount.columns) + # print(amount.head()) + amount = pd.concat([amount, feat_df], axis=1) + + amount = amount[feat] + amount = amount.fillna(0) + # print(amount.columns) + amount.columns = pd.MultiIndex.from_product( + [["MEDS"], amount.columns] + ) + + if dyn_csv.empty: + dyn_csv = amount else: - dyn_csv=pd.concat([dyn_csv,amount],axis=1) - - - - - + dyn_csv = pd.concat([dyn_csv, amount], 
axis=1) + ###PROCS - if(self.feat_proc): - feat=proc['itemid'].unique() - df2=proc[proc['stay_id']==hid] - if df2.shape[0]==0: - df2=pd.DataFrame(np.zeros([los,len(feat)]),columns=feat) - df2=df2.fillna(0) - df2.columns=pd.MultiIndex.from_product([["PROC"], df2.columns]) + if self.feat_proc: + feat = proc["itemid"].unique() + df2 = proc[proc["stay_id"] == hid] + if df2.shape[0] == 0: + df2 = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + df2 = df2.fillna(0) + df2.columns = pd.MultiIndex.from_product([["PROC"], df2.columns]) else: - df2['val']=1 - #print(df2) - df2=df2.pivot_table(index='start_time',columns='itemid',values='val') - #print(df2.shape) + df2["val"] = 1 + # print(df2) + df2 = df2.pivot_table( + index="start_time", columns="itemid", values="val" + ) + # print(df2.shape) add_indices = pd.Index(range(los)).difference(df2.index) - add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna(np.nan) - df2=pd.concat([df2, add_df]) - df2=df2.sort_index() - df2=df2.fillna(0) - df2[df2>0]=1 - #print(df2.head()) - dataDic[hid]['Proc']=df2.to_dict(orient="list") - - - feat_df=pd.DataFrame(columns=list(set(feat)-set(df2.columns))) - df2=pd.concat([df2,feat_df],axis=1) - - df2=df2[feat] - df2=df2.fillna(0) - df2.columns=pd.MultiIndex.from_product([["PROC"], df2.columns]) - - if(dyn_csv.empty): - dyn_csv=df2 + add_df = pd.DataFrame( + index=add_indices, columns=df2.columns + ).fillna(np.nan) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.fillna(0) + df2[df2 > 0] = 1 + # print(df2.head()) + dataDic[hid]["Proc"] = df2.to_dict(orient="list") + + feat_df = pd.DataFrame(columns=list(set(feat) - set(df2.columns))) + df2 = pd.concat([df2, feat_df], axis=1) + + df2 = df2[feat] + df2 = df2.fillna(0) + df2.columns = pd.MultiIndex.from_product([["PROC"], df2.columns]) + + if dyn_csv.empty: + dyn_csv = df2 else: - dyn_csv=pd.concat([dyn_csv,df2],axis=1) - - - - + dyn_csv = pd.concat([dyn_csv, df2], axis=1) + ###OUT - if(self.feat_out): - 
feat=out['itemid'].unique() - df2=out[out['stay_id']==hid] - if df2.shape[0]==0: - df2=pd.DataFrame(np.zeros([los,len(feat)]),columns=feat) - df2=df2.fillna(0) - df2.columns=pd.MultiIndex.from_product([["OUT"], df2.columns]) + if self.feat_out: + feat = out["itemid"].unique() + df2 = out[out["stay_id"] == hid] + if df2.shape[0] == 0: + df2 = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + df2 = df2.fillna(0) + df2.columns = pd.MultiIndex.from_product([["OUT"], df2.columns]) else: - df2['val']=1 - df2=df2.pivot_table(index='start_time',columns='itemid',values='val') - #print(df2.shape) + df2["val"] = 1 + df2 = df2.pivot_table( + index="start_time", columns="itemid", values="val" + ) + # print(df2.shape) add_indices = pd.Index(range(los)).difference(df2.index) - add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna(np.nan) - df2=pd.concat([df2, add_df]) - df2=df2.sort_index() - df2=df2.fillna(0) - df2[df2>0]=1 - #print(df2.head()) - dataDic[hid]['Out']=df2.to_dict(orient="list") - - feat_df=pd.DataFrame(columns=list(set(feat)-set(df2.columns))) - df2=pd.concat([df2,feat_df],axis=1) - - df2=df2[feat] - df2=df2.fillna(0) - df2.columns=pd.MultiIndex.from_product([["OUT"], df2.columns]) - - if(dyn_csv.empty): - dyn_csv=df2 + add_df = pd.DataFrame( + index=add_indices, columns=df2.columns + ).fillna(np.nan) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.fillna(0) + df2[df2 > 0] = 1 + # print(df2.head()) + dataDic[hid]["Out"] = df2.to_dict(orient="list") + + feat_df = pd.DataFrame(columns=list(set(feat) - set(df2.columns))) + df2 = pd.concat([df2, feat_df], axis=1) + + df2 = df2[feat] + df2 = df2.fillna(0) + df2.columns = pd.MultiIndex.from_product([["OUT"], df2.columns]) + + if dyn_csv.empty: + dyn_csv = df2 else: - dyn_csv=pd.concat([dyn_csv,df2],axis=1) - - - + dyn_csv = pd.concat([dyn_csv, df2], axis=1) + ###CHART - if(self.feat_chart): - feat=chart['itemid'].unique() - df2=chart[chart['stay_id']==hid] - if 
df2.shape[0]==0: - val=pd.DataFrame(np.zeros([los,len(feat)]),columns=feat) - val=val.fillna(0) - val.columns=pd.MultiIndex.from_product([["CHART"], val.columns]) + if self.feat_chart: + feat = chart["itemid"].unique() + df2 = chart[chart["stay_id"] == hid] + if df2.shape[0] == 0: + val = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + val = val.fillna(0) + val.columns = pd.MultiIndex.from_product([["CHART"], val.columns]) else: - val=df2.pivot_table(index='start_time',columns='itemid',values='valuenum') - df2['val']=1 - df2=df2.pivot_table(index='start_time',columns='itemid',values='val') - #print(df2.shape) + val = df2.pivot_table( + index="start_time", columns="itemid", values="valuenum" + ) + df2["val"] = 1 + df2 = df2.pivot_table( + index="start_time", columns="itemid", values="val" + ) + # print(df2.shape) add_indices = pd.Index(range(los)).difference(df2.index) - add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna(np.nan) - df2=pd.concat([df2, add_df]) - df2=df2.sort_index() - df2=df2.fillna(0) - - val=pd.concat([val, add_df]) - val=val.sort_index() - if self.impute=='Mean': - val=val.ffill() - val=val.bfill() - val=val.fillna(val.mean()) - elif self.impute=='Median': - val=val.ffill() - val=val.bfill() - val=val.fillna(val.median()) - val=val.fillna(0) - - - df2[df2>0]=1 - df2[df2<0]=0 - #print(df2.head()) - dataDic[hid]['Chart']['signal']=df2.iloc[:,0:].to_dict(orient="list") - dataDic[hid]['Chart']['val']=val.iloc[:,0:].to_dict(orient="list") - - feat_df=pd.DataFrame(columns=list(set(feat)-set(val.columns))) - val=pd.concat([val,feat_df],axis=1) - - val=val[feat] - val=val.fillna(0) - val.columns=pd.MultiIndex.from_product([["CHART"], val.columns]) - - if(dyn_csv.empty): - dyn_csv=val + add_df = pd.DataFrame( + index=add_indices, columns=df2.columns + ).fillna(np.nan) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.fillna(0) + + val = pd.concat([val, add_df]) + val = val.sort_index() + if self.impute == 
"Mean": + val = val.ffill() + val = val.bfill() + val = val.fillna(val.mean()) + elif self.impute == "Median": + val = val.ffill() + val = val.bfill() + val = val.fillna(val.median()) + val = val.fillna(0) + + df2[df2 > 0] = 1 + df2[df2 < 0] = 0 + # print(df2.head()) + dataDic[hid]["Chart"]["signal"] = df2.iloc[:, 0:].to_dict( + orient="list" + ) + dataDic[hid]["Chart"]["val"] = val.iloc[:, 0:].to_dict( + orient="list" + ) + + feat_df = pd.DataFrame(columns=list(set(feat) - set(val.columns))) + val = pd.concat([val, feat_df], axis=1) + + val = val[feat] + val = val.fillna(0) + val.columns = pd.MultiIndex.from_product([["CHART"], val.columns]) + + if dyn_csv.empty: + dyn_csv = val else: - dyn_csv=pd.concat([dyn_csv,val],axis=1) - - #Save temporal data to csv - dyn_csv.to_csv('./data/csv/'+str(hid)+'/dynamic.csv',index=False) - + dyn_csv = pd.concat([dyn_csv, val], axis=1) + + # Save temporal data to csv + dyn_csv.to_csv("./data/csv/" + str(hid) + "/dynamic.csv", index=False) + ##########COND######### - if(self.feat_cond): - feat=self.cond['new_icd_code'].unique() - grp=self.cond[self.cond['stay_id']==hid] - if(grp.shape[0]==0): - dataDic[hid]['Cond']={'fids':list([''])} - feat_df=pd.DataFrame(np.zeros([1,len(feat)]),columns=feat) - grp=feat_df.fillna(0) - grp.columns=pd.MultiIndex.from_product([["COND"], grp.columns]) + if self.feat_cond: + feat = self.cond["new_icd_code"].unique() + grp = self.cond[self.cond["stay_id"] == hid] + if grp.shape[0] == 0: + dataDic[hid]["Cond"] = {"fids": list([""])} + feat_df = pd.DataFrame(np.zeros([1, len(feat)]), columns=feat) + grp = feat_df.fillna(0) + grp.columns = pd.MultiIndex.from_product([["COND"], grp.columns]) else: - dataDic[hid]['Cond']={'fids':list(grp['new_icd_code'])} - grp['val']=1 - grp=grp.drop_duplicates() - grp=grp.pivot(index='stay_id',columns='new_icd_code',values='val').reset_index(drop=True) - feat_df=pd.DataFrame(columns=list(set(feat)-set(grp.columns))) - grp=pd.concat([grp,feat_df],axis=1) - 
grp=grp.fillna(0) - grp=grp[feat] - grp.columns=pd.MultiIndex.from_product([["COND"], grp.columns]) - grp.to_csv('./data/csv/'+str(hid)+'/static.csv',index=False) - labels_csv.to_csv('./data/csv/labels.csv',index=False) - - + dataDic[hid]["Cond"] = {"fids": list(grp["new_icd_code"])} + grp["val"] = 1 + grp = grp.drop_duplicates() + grp = grp.pivot( + index="stay_id", columns="new_icd_code", values="val" + ).reset_index(drop=True) + feat_df = pd.DataFrame(columns=list(set(feat) - set(grp.columns))) + grp = pd.concat([grp, feat_df], axis=1) + grp = grp.fillna(0) + grp = grp[feat] + grp.columns = pd.MultiIndex.from_product([["COND"], grp.columns]) + grp.to_csv("./data/csv/" + str(hid) + "/static.csv", index=False) + labels_csv.to_csv("./data/csv/labels.csv", index=False) + ######SAVE DICTIONARIES############## - metaDic={'Cond':{},'Proc':{},'Med':{},'Out':{},'Chart':{},'LOS':{}} - metaDic['LOS']=los - with open("./data/dict/dataDic", 'wb') as fp: + metaDic = {"Cond": {}, "Proc": {}, "Med": {}, "Out": {}, "Chart": {}, "LOS": {}} + metaDic["LOS"] = los + with open("./data/dict/dataDic", "wb") as fp: pickle.dump(dataDic, fp) - with open("./data/dict/hadmDic", 'wb') as fp: + with open("./data/dict/hadmDic", "wb") as fp: pickle.dump(self.hids, fp) - - with open("./data/dict/ethVocab", 'wb') as fp: - pickle.dump(list(self.data['ethnicity'].unique()), fp) - self.eth_vocab = self.data['ethnicity'].nunique() - - with open("./data/dict/ageVocab", 'wb') as fp: - pickle.dump(list(self.data['Age'].unique()), fp) - self.age_vocab = self.data['Age'].nunique() - - with open("./data/dict/insVocab", 'wb') as fp: - pickle.dump(list(self.data['insurance'].unique()), fp) - self.ins_vocab = self.data['insurance'].nunique() - - if(self.feat_med): - with open("./data/dict/medVocab", 'wb') as fp: - pickle.dump(list(meds['itemid'].unique()), fp) - self.med_vocab = meds['itemid'].nunique() - metaDic['Med']=self.med_per_adm - - if(self.feat_out): - with open("./data/dict/outVocab", 'wb') as fp: 
- pickle.dump(list(out['itemid'].unique()), fp) - self.out_vocab = out['itemid'].nunique() - metaDic['Out']=self.out_per_adm - - if(self.feat_chart): - with open("./data/dict/chartVocab", 'wb') as fp: - pickle.dump(list(chart['itemid'].unique()), fp) - self.chart_vocab = chart['itemid'].nunique() - metaDic['Chart']=self.chart_per_adm - - if(self.feat_cond): - with open("./data/dict/condVocab", 'wb') as fp: - pickle.dump(list(self.cond['new_icd_code'].unique()), fp) - self.cond_vocab = self.cond['new_icd_code'].nunique() - metaDic['Cond']=self.cond_per_adm - - if(self.feat_proc): - with open("./data/dict/procVocab", 'wb') as fp: - pickle.dump(list(proc['itemid'].unique()), fp) - self.proc_vocab = proc['itemid'].nunique() - metaDic['Proc']=self.proc_per_adm - - with open("./data/dict/metaDic", 'wb') as fp: - pickle.dump(metaDic, fp) - - - + with open("./data/dict/ethVocab", "wb") as fp: + pickle.dump(list(self.data["ethnicity"].unique()), fp) + self.eth_vocab = self.data["ethnicity"].nunique() + + with open("./data/dict/ageVocab", "wb") as fp: + pickle.dump(list(self.data["Age"].unique()), fp) + self.age_vocab = self.data["Age"].nunique() + + with open("./data/dict/insVocab", "wb") as fp: + pickle.dump(list(self.data["insurance"].unique()), fp) + self.ins_vocab = self.data["insurance"].nunique() + if self.feat_med: + with open("./data/dict/medVocab", "wb") as fp: + pickle.dump(list(meds["itemid"].unique()), fp) + self.med_vocab = meds["itemid"].nunique() + metaDic["Med"] = self.med_per_adm + + if self.feat_out: + with open("./data/dict/outVocab", "wb") as fp: + pickle.dump(list(out["itemid"].unique()), fp) + self.out_vocab = out["itemid"].nunique() + metaDic["Out"] = self.out_per_adm + + if self.feat_chart: + with open("./data/dict/chartVocab", "wb") as fp: + pickle.dump(list(chart["itemid"].unique()), fp) + self.chart_vocab = chart["itemid"].nunique() + metaDic["Chart"] = self.chart_per_adm + + if self.feat_cond: + with open("./data/dict/condVocab", "wb") as fp: + 
pickle.dump(list(self.cond["new_icd_code"].unique()), fp) + self.cond_vocab = self.cond["new_icd_code"].nunique() + metaDic["Cond"] = self.cond_per_adm + + if self.feat_proc: + with open("./data/dict/procVocab", "wb") as fp: + pickle.dump(list(proc["itemid"].unique()), fp) + self.proc_vocab = proc["itemid"].nunique() + metaDic["Proc"] = self.proc_per_adm + + with open("./data/dict/metaDic", "wb") as fp: + pickle.dump(metaDic, fp) diff --git a/old_pipeline_script.py b/old_pipeline_script.py new file mode 100644 index 0000000000..6b4cb2dacf --- /dev/null +++ b/old_pipeline_script.py @@ -0,0 +1,65 @@ +from preprocessing.day_intervals_preproc.day_intervals_cohort_v2 import extract_data +from preprocessing.hosp_module_preproc.feature_selection_icu import ( + feature_icu, + preprocess_features_icu, + generate_summary_icu, + features_selection_icu, +) +from model.data_generation_icu import Generator + +cohort_output = extract_data( + "ICU", + "Mortality", + 0, + "No Disease Filter", + "d:\\Work\\Repos\\MIMIC-IV-Data-Pipeline", + "", +) + +feature_icu("cohort_icu_mortality_0_", "mimiciv", True, True, True, True, True) + +preprocess_features_icu( + "cohort_icu_mortality_0_", + True, + "Convert ICD-9 to ICD-10 and group ICD-10 codes", + False, + False, + False, + 0, + 0, +) + +generate_summary_icu(True, True, True, True, True) +features_selection_icu( + "cohort_icu_mortality_0_", + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, +) + +preprocess_features_icu( + "cohort_icu_mortality_0_", False, False, True, True, True, 98, 0 +) + +gen = Generator( + "cohort_icu_mortality_0_", + True, + False, + False, + True, + True, + True, + True, + True, + False, + 72, + 1, + 2, +) diff --git a/pipeline/cohort_extractor.py b/pipeline/cohort_extractor.py new file mode 100644 index 0000000000..1b2df6759f --- /dev/null +++ b/pipeline/cohort_extractor.py @@ -0,0 +1,145 @@ +from typing import Tuple +import pandas as pd +import logging +from 
pipeline.file_info.raw.hosp import ( + load_hosp_patients, + load_hosp_admissions, + HospAdmissions, +) +from pipeline.file_info.raw.icu import load_icu_icustays +from pipeline.file_info.preproc.cohort import CohortHeader +from pipeline.prediction_task import PredictionTask +from pipeline.preprocessing.visit import ( + make_patients, + make_icu_visits, + make_no_icu_visits, + filter_visits, +) +from pipeline.preprocessing.cohort import Cohort + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger() + + +class CohortExtractor: + """ + Extracts cohort data based on specified prediction tasks and ICU status. + + Attributes: + prediction_task (PredictionTask): The prediction task to be used for cohort extraction. + cohort_output (Path): The path for the output of the cohort data. + """ + + def __init__( + self, + prediction_task: PredictionTask, + cohort_output: str = None, + ): + self.prediction_task = prediction_task + self.cohort_output = cohort_output + + def get_icu_status(self) -> str: + """Determines the ICU status based on the prediction task.""" + return "ICU" if self.prediction_task.use_icu else "Non-ICU" + + def generate_extract_log(self) -> str: + """Generates a log message for the extraction process.""" + icu_log = self.get_icu_status() + task_info = f"{icu_log} | {self.prediction_task.target_type}" + if self.prediction_task.disease_readmission: + task_info += f" DUE TO {self.prediction_task.disease_readmission}" + + if self.prediction_task.disease_selection: + task_info += f" ADMITTED DUE TO {self.prediction_task.disease_selection}" + + return f"EXTRACTING FOR: {task_info} | {self.prediction_task.nb_days} |".upper() + + def generate_output_suffix(self) -> str: + """Generates a suffix for the output file based on the task details.""" + return ( + self.get_icu_status() # .lower() + + "_" + + self.prediction_task.target_type.lower().replace(" ", "_") + + "_" + + str(self.prediction_task.nb_days) + + "_" + + ( + 
self.prediction_task.disease_readmission + if self.prediction_task.disease_readmission + else "" + ) + ) + + def fill_outputs(self) -> None: + """Fills in the output details based on the prediction task.""" + disease_selection = ( + f"_{self.prediction_task.disease_selection}" + if self.prediction_task.disease_selection + else "" + ) + self.cohort_output = ( + self.cohort_output + or f"cohort_{self.generate_output_suffix()}{disease_selection}" + ) + + def load_hospital_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Loads hospital patient and admission data.""" + return load_hosp_patients(), load_hosp_admissions() + + def create_visits(self, hosp_patients, hosp_admissions): + if self.prediction_task.use_icu: + icu_icustays = load_icu_icustays() + return make_icu_visits( + icu_icustays, hosp_patients, self.prediction_task.target_type + ) + else: + return make_no_icu_visits(hosp_admissions, self.prediction_task.target_type) + + def filter_and_merge_visits( + self, + visits: pd.DataFrame, + hosp_patients: pd.DataFrame, + hosp_admissions: pd.DataFrame, + ) -> pd.DataFrame: + """Filters and merges visit records with patient and admission data.""" + visits = filter_visits( + visits, + self.prediction_task.disease_readmission, + self.prediction_task.disease_selection, + ) + patients_data = make_patients(hosp_patients) + patients_filtered = patients_data.loc[patients_data["age"] >= 18] + admissions_info = hosp_admissions[ + [ + HospAdmissions.HOSPITAL_ADMISSION_ID, + HospAdmissions.INSURANCE, + HospAdmissions.RACE, + ] + ] + visits = visits.merge(patients_filtered, on=CohortHeader.PATIENT_ID) + visits = visits.merge(admissions_info, on=CohortHeader.HOSPITAL_ADMISSION_ID) + return visits + + def extract(self) -> Cohort: + """ + Extracts the cohort data based on specified criteria and saves it. + + Returns: + Cohort: The extracted and processed cohort data. 
+ """ + logger.info("===========MIMIC-IV v2.0============") + self.fill_outputs() + logger.info(self.generate_extract_log()) + + hosp_patients, hosp_admissions = self.load_hospital_data() + visits = self.create_visits(hosp_patients, hosp_admissions) + visits = self.filter_and_merge_visits(visits, hosp_patients, hosp_admissions) + self.fill_outputs() + cohort = Cohort( + icu=self.prediction_task.use_icu, + name=self.cohort_output, + ) + cohort.prepare_labels(visits, self.prediction_task) + cohort.save() + cohort.save_summary() + return cohort diff --git a/pipeline/conversion/icd.py b/pipeline/conversion/icd.py new file mode 100644 index 0000000000..eb34d18493 --- /dev/null +++ b/pipeline/conversion/icd.py @@ -0,0 +1,40 @@ +import pandas as pd +import numpy as np + +from pipeline.file_info.common import load_static_icd_map, IcdMap +from pipeline.file_info.raw.hosp import HospDiagnosesIcd + +ROOT_ICD_CONVERT = "root_icd10_convert" + + +class IcdConverter: + def __init__(self): + self.conversions_icd_9_10 = self._get_conversions_icd_9_10() + + def _get_conversions_icd_9_10(self) -> dict: + """Create mapping dictionary ICD9 -> ICD10""" + icd_map_df = load_static_icd_map() + filtered_df = icd_map_df[icd_map_df[IcdMap.DIAGNOISIS_CODE].str.len() == 3] + filtered_df = filtered_df.drop_duplicates(subset=IcdMap.DIAGNOISIS_CODE) + return dict(zip(filtered_df[IcdMap.DIAGNOISIS_CODE], filtered_df[IcdMap.ICD10])) + + def standardize_icd(self, df: pd.DataFrame) -> pd.DataFrame: + """Standardizes ICD codes in a DataFrame.""" + df[ROOT_ICD_CONVERT] = df.apply( + lambda row: self.conversions_icd_9_10.get( + row[HospDiagnosesIcd.ICD_CODE][:3], np.nan + ) + if row[HospDiagnosesIcd.ICD_VERSION] == 9 + else row[HospDiagnosesIcd.ICD_CODE], + axis=1, + ) + df[HospDiagnosesIcd.ROOT] = df[ROOT_ICD_CONVERT].apply( + lambda x: x[:3] if type(x) is str else np.nan + ) + return df + + def get_pos_ids(self, diag: pd.DataFrame, ICD10_code: str) -> pd.Series: + """Extracts unique hospital admission 
IDs where 'root' contains a specific ICD-10 code.""" + return diag[diag[HospDiagnosesIcd.ROOT].str.contains(ICD10_code, na=False)][ + HospDiagnosesIcd.HOSPITAL_ADMISSION_ID + ].unique() diff --git a/pipeline/conversion/ndc.py b/pipeline/conversion/ndc.py new file mode 100644 index 0000000000..a96bb876b8 --- /dev/null +++ b/pipeline/conversion/ndc.py @@ -0,0 +1,65 @@ +import pandas as pd +import numpy as np + +from pipeline.file_info.common import MAP_NDC_PATH +from enum import StrEnum + + +class NdcMappingHeader(StrEnum): + PRODUCT_NDC = "productndc" + NON_PROPRIETARY_NAME = "nonproprietaryname" + PHARM_CLASSES = "pharm_classes" + NEW_NDC = "new_ndc" + + +def prepare_ndc_mapping() -> pd.DataFrame: + ndc_map = read_ndc_mapping()[ + [ + NdcMappingHeader.PRODUCT_NDC, + NdcMappingHeader.NON_PROPRIETARY_NAME, + NdcMappingHeader.PHARM_CLASSES, + ] + ] + ndc_map[NdcMappingHeader.NON_PROPRIETARY_NAME] = ( + ndc_map[NdcMappingHeader.NON_PROPRIETARY_NAME].fillna("").str.lower() + ) + # Normalize the NDC codes in the mapping table so that they can be merged + ndc_map.loc[:, NdcMappingHeader.NEW_NDC] = ndc_map[ + NdcMappingHeader.PRODUCT_NDC + ].apply(format_ndc_table) + ndc_map = ndc_map.drop_duplicates( + subset=[NdcMappingHeader.NEW_NDC, NdcMappingHeader.NON_PROPRIETARY_NAME] + ) + return ndc_map + + +def ndc_to_str(ndc: int) -> str: + """Converts NDC code to a string with leading zeros restored, keeping only the first 9 digits.""" + if ndc < 0: # Handling dummy values + return np.nan + ndc_str = str(ndc).zfill(11) + return ndc_str[:-2] + + +def format_ndc_table(ndc: str) -> str: + """Formats NDC code from the mapping table to the standard 11-digit format, taking only the first 9 digits.""" + parts = ndc.split("-") + formatted_ndc = "".join( + part.zfill(length) for part, length in zip(parts, [5, 4, 2]) + ) + return formatted_ndc[:9] # Taking only the manufacturer and product sections + + +def read_ndc_mapping() -> pd.DataFrame: + """Reads and processes NDC mapping table 
from a file.""" + ndc_map = pd.read_csv(MAP_NDC_PATH, delimiter="\t", encoding="latin1") + ndc_map.columns = ndc_map.columns.str.lower() + return ndc_map + + +def get_EPC(s: str) -> list: + """Extracts the Established Pharmacologic Class (EPC) from a string.""" + if not isinstance(s, str): + return np.nan + + return [phrase for phrase in s.split(",") if "[EPC]" in phrase] diff --git a/pipeline/conversion/uom.py b/pipeline/conversion/uom.py new file mode 100644 index 0000000000..d727c0120f --- /dev/null +++ b/pipeline/conversion/uom.py @@ -0,0 +1,32 @@ +import pandas as pd +import numpy as np + + +def drop_wrong_uom(data: pd.DataFrame, cut_off) -> pd.DataFrame: + """Drop rows with uncommon units of measurement for each itemid, based on a cut-off frequency. + + Args: + data (pd.DataFrame): The input DataFrame containing the data. + cut_off: The cut-off frequency used to determine uncommon units of measurement. + + Returns: + pd.DataFrame: The filtered DataFrame with rows dropped based on uncommon units of measurement. 
class DataGenerator:
    """Build bucketed per-admission time series from the extracted feature files.

    The methods communicate through attributes: ``generate_features`` loads
    ``self.cohort`` and the per-feature tables, ``length_by_target`` trims
    them and sets ``self.hids``, ``smooth_ini`` sorts the event tables by
    ``start_time`` and ``smooth_tqdm`` buckets them and hands the result to
    ``DictMaker``.
    """

    def __init__(
        self,
        cohort_output: pd.DataFrame,
        feature_extractor: FeatureExtractor,
        include_time: int = 24,
        bucket: int = 1,
        predW: int = 0,
        target_type: TargetType = TargetType.LOS,
    ):
        """Store the configuration and initialise empty feature tables.

        Args:
            cohort_output: Cohort reference forwarded unchanged to
                ``read_cohort``.
            feature_extractor: Flags selecting ICU mode and the enabled
                features.
            include_time: Observation window, in the unit of ``start_time``;
                admissions with a shorter ``los`` are dropped and it bounds
                the bucketing range.
            bucket: Width of one time bucket, same unit as ``include_time``.
            predW: Prediction window; stored but not read in this class.
            target_type: Selects the trimming method in ``length_by_target``.
        """
        self.cohort_output = cohort_output
        self.feature_extractor = feature_extractor
        self.include_time = include_time
        self.bucket = bucket
        self.predW = predW
        self.target_type = target_type
        # Per-feature event tables, filled by generate_features().
        self.dia = pd.DataFrame()
        self.proc = pd.DataFrame()
        self.out = pd.DataFrame()
        self.chart = pd.DataFrame()
        self.med = pd.DataFrame()
        self.lab = pd.DataFrame()
        # Per-admission maxima, filled by generate_features()/smooth_tqdm().
        self.med_per_adm = pd.DataFrame()
        self.out_per_adm = pd.DataFrame()
        self.chart_per_adm = pd.DataFrame()
        self.dia_per_adm = pd.DataFrame()
        self.proc_per_adm = pd.DataFrame()
        self.labs_per_adm = pd.DataFrame()

    def generate_features(self):
        """Load the cohort and every enabled feature table for it.

        Output and chart events are only read in ICU mode, labs only in
        non-ICU mode. Chart events and labs are read with ``chunksize`` and
        passed to their feature class as a chunk iterator.
        """
        fe = self.feature_extractor
        icu = fe.use_icu
        self.cohort = read_cohort(self.cohort_output, icu)

        if fe.for_diagnoses:
            print("[ ======READING DIAGNOSIS ]")
            preproc_dia = pd.read_csv(
                EXTRACT_DIAG_ICU_PATH if icu else EXTRACT_DIAG_PATH,
                compression="gzip",
            )
            dia = Diagnoses(use_icu=icu, df=preproc_dia)
            self.dia, self.dia_per_adm = dia.generate_fun(self.cohort)

        if fe.for_procedures:
            print("[ ======READING PROCEDURES ]")
            preproc_proc = pd.read_csv(
                EXTRACT_PROC_ICU_PATH if icu else EXTRACT_PROC_PATH,
                compression="gzip",
            )
            proc = Procedures(use_icu=icu, df=preproc_proc)
            self.proc = proc.generate_fun(self.cohort)

        if icu and fe.for_output_events:
            print("[ ======READING OUTPUT ]")
            preproc_out = pd.read_csv(EXTRACT_OUT_ICU_PATH, compression="gzip")
            self.out = OutputEvents(df=preproc_out).generate_fun(self.cohort)

        if icu and fe.for_chart_events:
            print("[ ======READING CHART ]")
            preproc_chart = pd.read_csv(
                EXTRACT_CHART_ICU_PATH, compression="gzip", chunksize=5000000
            )
            self.chart = Chart(df=preproc_chart).generate_fun(self.cohort)

        if fe.for_medications:
            print("[ ======READING MEDICATIONS ]")
            preproc_med = pd.read_csv(
                EXTRACT_MED_ICU_PATH if icu else EXTRACT_MED_PATH,
                compression="gzip",
            )
            med = Medications(use_icu=icu, df=preproc_med)
            self.med = med.generate_fun(self.cohort)

        if not icu and fe.for_labs:
            print("[ ======READING LABS ]")
            preproc_labs = pd.read_csv(
                EXTRACT_LABS_PATH, compression="gzip", chunksize=5000000
            )
            self.lab = Lab(df=preproc_labs).generate_fun(self.cohort)
        # A leftover ``breakpoint()`` used to sit here and dropped every run
        # into the debugger; it has been removed.

    def length_by_target(self):
        """Restrict cohort and feature tables to the observation window.

        Keeps admissions whose ``los`` is at least ``include_time``, records
        their ids in ``self.hids`` and calls the target-specific trimming
        method (``mortality_length``, ``read_length`` or ``los_length``) of
        every enabled feature. An unrecognised target only filters the cohort.
        """
        fe = self.feature_extractor
        icu = fe.use_icu
        self.los = self.include_time
        self.cohort = self.cohort[self.cohort["los"] >= self.include_time]
        # DictMaker matches these ids against "stay_id" in ICU mode and
        # "hadm_id" otherwise, so collect them from the same column.
        self.hids = self.cohort["stay_id" if icu else "hadm_id"].unique()

        if self.target_type == TargetType.MORTALITY:
            method_name = "mortality_length"
        elif self.target_type == TargetType.READMISSION:
            method_name = "read_length"
        elif self.target_type == TargetType.LOS:
            method_name = "los_length"
        else:
            return

        # The readmission variants take fewer arguments than the other two.
        if self.target_type == TargetType.READMISSION:
            dia_args = ()
            event_args = (self.cohort,)
        else:
            dia_args = (self.cohort,)
            event_args = (self.cohort, self.include_time)

        def trimmed(feature, args):
            getattr(feature, method_name)(*args)
            return feature.df

        if fe.for_diagnoses:
            self.dia = trimmed(Diagnoses(use_icu=icu, df=self.dia), dia_args)
        if fe.for_procedures:
            self.proc = trimmed(Procedures(use_icu=icu, df=self.proc), event_args)
        if icu and fe.for_output_events:
            self.out = trimmed(OutputEvents(df=self.out), event_args)
        if icu and fe.for_chart_events:
            self.chart = trimmed(Chart(df=self.chart), event_args)
        if fe.for_medications:
            # Previously built from ``self.chart`` for mortality/readmission,
            # which replaced the medication table with trimmed chart events.
            self.med = trimmed(Medications(use_icu=icu, df=self.med), event_args)
        # NOTE(review): labs are not trimmed for any target - confirm intended.
        print("[ PROCESSED TIME SERIES TO EQUAL LENGTH ]")

    def smooth_ini(self):
        """Sort the enabled event tables by ``start_time``."""
        fe = self.feature_extractor
        if fe.for_medications:
            self.med = self.med.sort_values(by=["start_time"])
        if fe.for_procedures:
            self.proc = self.proc.sort_values(by=["start_time"])
        if fe.for_output_events and fe.use_icu:
            self.out = self.out.sort_values(by=["start_time"])
        if fe.for_chart_events and fe.use_icu:
            self.chart = self.chart.sort_values(by=["start_time"])

    @staticmethod
    def _accumulate(collected, piece):
        """Append one bucket's rows to the rows collected so far."""
        if collected.empty:
            return piece
        return pd.concat([collected, piece], ignore_index=True)

    @staticmethod
    def _per_admission_stats(events, keys):
        """Return the two per-admission maxima for one bucketed table.

        ``keys[0]`` is the admission id column. The first value is the
        largest per-admission sum of the ``keys`` group sizes, the second the
        largest number of rows of any admission.
        """
        id_col = keys[0]
        per_adm = events.groupby(keys).size().groupby(id_col).sum().max()
        length_per_adm = events.groupby(id_col).size().max()
        return per_adm, length_per_adm

    def smooth_tqdm(self):
        """Bucket every enabled table, then build and save the dictionaries.

        For each window ``[i, i + bucket)`` up to ``include_time`` the
        feature's ``smooth_meds_step`` is called with the running bucket
        index; the pieces are concatenated, per-admission maxima are stored
        on ``self`` and everything is passed to ``DictMaker``.
        """
        fe = self.feature_extractor
        icu = fe.use_icu
        id_col = "stay_id" if icu else "hadm_id"

        # A fresh wrapper per step, exactly as before, in the original order.
        factories = {}
        if fe.for_medications:
            factories["med"] = lambda: Medications(use_icu=icu, df=self.med)
        if fe.for_procedures:
            factories["proc"] = lambda: Procedures(use_icu=icu, df=self.proc)
        if fe.for_output_events and icu:
            factories["out"] = lambda: OutputEvents(df=self.out)
        if fe.for_chart_events and icu:
            factories["chart"] = lambda: Chart(df=self.chart)
        if fe.for_labs and not icu:
            factories["lab"] = lambda: Lab(df=self.lab)

        finals = {
            key: pd.DataFrame() for key in ("med", "proc", "out", "chart", "lab")
        }
        t = 0
        for i in tqdm(range(0, self.include_time, self.bucket)):
            for key, make_feature in factories.items():
                piece = make_feature().smooth_meds_step(self.bucket, i, t)
                finals[key] = self._accumulate(finals[key], piece)
            t = t + 1
        los = int(self.include_time / self.bucket)

        final_meds = finals["med"]
        final_proc = finals["proc"]
        final_out = finals["out"]
        final_chart = finals["chart"]
        final_lab = finals["lab"]

        if fe.for_medications:
            med_keys = (
                ["stay_id", "itemid", "orderid"] if icu else ["hadm_id", "drug_name"]
            )
            self.med_per_adm, self.medlength_per_adm = self._per_admission_stats(
                final_meds, med_keys
            )
        if fe.for_procedures:
            proc_keys = [id_col, "itemid" if icu else "icd_code"]
            self.proc_per_adm, self.proclength_per_adm = self._per_admission_stats(
                final_proc, proc_keys
            )
        if icu:
            if fe.for_output_events:
                self.out_per_adm, self.outlength_per_adm = self._per_admission_stats(
                    final_out, ["stay_id", "itemid"]
                )
            if fe.for_chart_events:
                (
                    self.chart_per_adm,
                    self.chartlength_per_adm,
                ) = self._per_admission_stats(final_chart, ["stay_id", "itemid"])
        elif fe.for_labs:
            # Was gated on ``for_procedures``: lab statistics were skipped
            # (or computed on an empty table) depending on an unrelated flag.
            self.labs_per_adm, self.labslength_per_adm = self._per_admission_stats(
                final_lab, ["hadm_id", "itemid"]
            )

        dict_maker = DictMaker(
            self.feature_extractor,
            self.hids,
            self.med_per_adm,
            self.out_per_adm,
            self.chart_per_adm,
            self.dia_per_adm,
            self.proc_per_adm,
            self.labs_per_adm,
        )
        dict_maker.create_dict(
            self.dia,
            final_meds,
            final_proc,
            final_out,
            final_lab,
            final_chart,
            self.cohort,
            los,
        )
        dict_maker.save_dictionaries(
            self.dia,
            final_meds,
            final_proc,
            final_out,
            final_lab,
            final_chart,
            self.cohort,
            los,
        )
cohort[cohort[group_col] == hid] + if len(grp) == 0: + self.dataDic[hid] = { + "Cond": {}, + "Proc": {}, + "Med": {}, + "Out": {}, + "Chart": {}, + "Lab": {}, + "ethnicity": {}, + "age": {}, + "gender": {}, + "label": {}, + } + + else: + self.dataDic[hid] = { + "Cond": {}, + "Proc": {}, + "Med": {}, + "Out": {}, + "Chart": {}, + "Lab": {}, + "ethnicity": grp["ethnicity"].iloc[0], + "age": int(grp["age"].iloc[0]), + "gender": grp["gender"].iloc[0], + "label": int(grp["label"].iloc[0]), + } + self.labels_csv.loc[self.labels_csv[group_col] == hid, "label"] = int( + grp["label"].iloc[0] + ) + for hid in tqdm(self.hids): + grp = cohort[cohort[group_col] == hid] + self.demo_csv = grp[["age", "gender", "ethnicity", "insurance"]] + if not os.path.exists(CSVPATH / str(hid)): + os.makedirs(CSVPATH / str(hid)) + self.demo_csv.to_csv(CSVPATH / str(hid) / "demo.csv", index=False) + + dyn_csv = pd.DataFrame() + if self.feature_extractor.for_medications: + self.process_med_by_hid(meds, los, hid, dyn_csv) + if self.feature_extractor.for_procedures: + self.process_proc_by_hid(proc, los, hid, dyn_csv) + if self.feature_extractor.for_labs and not self.feature_extractor.use_icu: + self.process_lab_by_hid(labs, los, hid, dyn_csv) + if self.feature_extractor.for_labs and self.feature_extractor.use_icu: + self.process_out_by_hid(out, los, hid, dyn_csv) + if ( + self.feature_extractor.for_chart_events + and self.feature_extractor.use_icu + ): + self.process_chart_by_hid(chart, los, hid, dyn_csv) + # self.save_csv_files() + dyn_csv.to_csv(CSVPATH / str(hid) / "dynamic.csv", index=False) + if self.feature_extractor.for_diagnoses: + self.process_dia_by_hid(diag, los, hid, dyn_csv) + + grp.to_csv(CSVPATH / str(hid) / "static.csv", index=False) + self.labels_csv.to_csv(CSVPATH / str(hid) / "labels.csv", index=False) + + def process_med_by_hid(self, feature, los, hid, dyn_csv): + group_col = "stay_id" if self.feature_extractor.use_icu else "hadm_id" + code_col = "itemid" if 
self.feature_extractor.use_icu else "drug_name" + feat = feature[code_col].unique() + df2 = feature[feature[group_col] == hid] + if df2.shape[0] == 0: + if self.feature_extractor.use_icu: + amount = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + amount = amount.fillna(0) + amount.columns = pd.MultiIndex.from_product([["MEDS"], amount.columns]) + else: + val = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + val = val.fillna(0) + val.columns = pd.MultiIndex.from_product([["MEDS"], val.columns]) + else: + if self.feature_extractor.use_icu: + rate = df2.pivot_table( + index="start_time", columns="itemid", values="rate" + ) + amount = df2.pivot_table( + index="start_time", columns="itemid", values="amount" + ) + df2 = df2.pivot_table( + index="start_time", columns="itemid", values="stop_time" + ) + add_indices = pd.Index(range(los)).difference(df2.index) + add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna( + np.nan + ) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.ffill() + df2 = df2.fillna(0) + rate = pd.concat([rate, add_df]) + rate = rate.sort_index() + rate = rate.ffill() + rate = rate.fillna(-1) + amount = pd.concat([amount, add_df]) + amount = amount.sort_index() + amount = amount.ffill() + amount = amount.fillna(-1) + df2.iloc[:, 0:] = df2.iloc[:, 0:].sub(df2.index, 0) + df2[df2 > 0] = 1 + df2[df2 < 0] = 0 + rate.iloc[:, 0:] = df2.iloc[:, 0:] * rate.iloc[:, 0:] + amount.iloc[:, 0:] = df2.iloc[:, 0:] * amount.iloc[:, 0:] + self.dataDic[hid]["Med"]["signal"] = df2.iloc[:, 0:].to_dict( + orient="list" + ) + self.dataDic[hid]["Med"]["rate"] = rate.iloc[:, 0:].to_dict( + orient="list" + ) + self.dataDic[hid]["Med"]["amount"] = amount.iloc[:, 0:].to_dict( + orient="list" + ) + feat_df = pd.DataFrame(columns=list(set(feat) - set(amount.columns))) + amount = pd.concat([amount, feat_df], axis=1) + amount = amount[feat] + amount = amount.fillna(0) + amount.columns = pd.MultiIndex.from_product([["MEDS"], 
def process_proc_by_hid(self, feature, los, hid, dyn_csv):
    """Build the 0/1 procedure matrix of one admission.

    Rows are time buckets, columns are procedure codes (``itemid`` in ICU
    mode, ``icd_code`` otherwise). For an admission with procedure rows the
    per-code signal is stored in ``self.dataDic[hid]["Proc"]``.

    Args:
        feature: Bucketed procedure events of all admissions.
        los: Number of time buckets; buckets ``0..los-1`` always get a row.
        hid: Admission id, matched against ``stay_id`` or ``hadm_id``.
        dyn_csv: Dynamic-feature frame built so far for this admission.

    Returns:
        ``dyn_csv`` extended column-wise with the ``("PROC", code)`` columns,
        or just those columns when ``dyn_csv`` is empty. The argument itself
        is not modified, so callers must use the return value.
    """
    icu = self.feature_extractor.use_icu
    group_col = "stay_id" if icu else "hadm_id"
    code_col = "itemid" if icu else "icd_code"
    feat = feature[code_col].unique()
    rows = feature[feature[group_col] == hid]

    if len(rows) == 0:
        signal = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat)
    else:
        # ``assign`` works on a copy, so the caller's frame is never written.
        signal = rows.assign(val=1).pivot_table(
            index="start_time", columns=code_col, values="val"
        )
        # Add a row for every bucket in range(los) that has no event.
        timeline = signal.index.union(pd.RangeIndex(los))
        signal = signal.reindex(timeline).fillna(0)
        signal = (signal > 0).astype(float)
        self.dataDic[hid]["Proc"] = signal.to_dict(orient="list")
        # Same column set and order for every admission.
        signal = signal.reindex(columns=feat).fillna(0)

    signal.columns = pd.MultiIndex.from_product([["PROC"], signal.columns])
    if dyn_csv.empty:
        return signal
    return pd.concat([dyn_csv, signal], axis=1)
def process_chart_by_hid(self, feature, los, hid, dyn_csv):
    """Build the per-bucket chart-event matrix of one ICU stay.

    Rows are time buckets, columns are ``itemid`` values. For a stay with
    chart rows, the 0/1 signal and the imputed values are stored in
    ``self.dataDic[hid]["Chart"]`` under ``"signal"`` and ``"val"``.

    Args:
        feature: Bucketed chart events of all stays.
        los: Number of time buckets; buckets ``0..los-1`` always get a row.
        hid: Stay id, matched against ``stay_id``.
        dyn_csv: Dynamic-feature frame built so far for this stay.

    Returns:
        ``dyn_csv`` extended column-wise with the ``("CHART", itemid)`` value
        columns, or just those columns when ``dyn_csv`` is empty. The
        argument itself is not modified, so callers must use the return value.
    """
    feat = feature["itemid"].unique()
    rows = feature[feature["stay_id"] == hid]

    if len(rows) == 0:
        val = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat)
    else:
        val = rows.pivot_table(index="start_time", columns="itemid", values="valuenum")
        # ``assign`` works on a copy, so the caller's frame is never written.
        signal = rows.assign(val=1).pivot_table(
            index="start_time", columns="itemid", values="val"
        )
        # Add a row for every bucket in range(los) that has no event.
        timeline = signal.index.union(pd.RangeIndex(los))
        signal = signal.reindex(timeline).fillna(0)
        val = val.reindex(index=timeline, columns=signal.columns)

        # ``impute`` is not set by ``__init__``; reading ``self.impute``
        # directly raised AttributeError. Without it, gaps are zero-filled.
        impute = getattr(self, "impute", None)
        if impute == "Mean":
            val = val.ffill().bfill()
            val = val.fillna(val.mean())
        elif impute == "Median":
            val = val.ffill().bfill()
            val = val.fillna(val.median())
        val = val.fillna(0)

        signal = (signal > 0).astype(float)
        self.dataDic[hid]["Chart"]["signal"] = signal.to_dict(orient="list")
        self.dataDic[hid]["Chart"]["val"] = val.to_dict(orient="list")

        # Same column set and order for every stay.
        val = val.reindex(columns=feat).fillna(0)

    val.columns = pd.MultiIndex.from_product([["CHART"], val.columns])
    if dyn_csv.empty:
        return val
    return pd.concat([dyn_csv, val], axis=1)
feature["itemid"].unique() + df2 = feature[feature["hadm_id"] == hid] + if df2.shape[0] == 0: + val = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + val = val.fillna(0) + val.columns = pd.MultiIndex.from_product([["LAB"], val.columns]) + else: + val = df2.pivot_table( + index="start_time", columns="itemid", values="valuenum" + ) + df2["val"] = 1 + df2 = df2.pivot_table(index="start_time", columns="itemid", values="val") + add_indices = pd.Index(range(los)).difference(df2.index) + add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna(np.nan) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.fillna(0) + + val = pd.concat([val, add_df]) + val = val.sort_index() + if self.impute == "Mean": + val = val.ffill() + val = val.bfill() + val = val.fillna(val.mean()) + elif self.impute == "Median": + val = val.ffill() + val = val.bfill() + val = val.fillna(val.median()) + val = val.fillna(0) + + df2[df2 > 0] = 1 + df2[df2 < 0] = 0 + + # print(df2.head()) + self.dataDic[hid]["Lab"]["signal"] = df2.iloc[:, 0:].to_dict(orient="list") + self.dataDic[hid]["Lab"]["val"] = val.iloc[:, 0:].to_dict(orient="list") + + feat_df = pd.DataFrame(columns=list(set(feat) - set(val.columns))) + val = pd.concat([val, feat_df], axis=1) + + val = val[feat] + val = val.fillna(0) + val.columns = pd.MultiIndex.from_product([["LAB"], val.columns]) + + if dyn_csv.empty: + dyn_csv = val + else: + dyn_csv = pd.concat([dyn_csv, val], axis=1) + + # if dyn_csv.empty: + # dyn_csv = amount_or_val + # else: + # dyn_csv = pd.concat([dyn_csv, amount_or_val], axis=1) + + def process_dia_by_hid(self, feature, los, hid, dyn_csv): + if self.feature_extractor.use_icu: + feat = feature["new_icd_code"].unique() + grp = feature[feature["stay_id"] == hid] + if grp.shape[0] == 0: + self.dataDic[hid]["Cond"] = {"fids": list([""])} + feat_df = pd.DataFrame(np.zeros([1, len(feat)]), columns=feat) + grp = feat_df.fillna(0) + grp.columns = pd.MultiIndex.from_product([["COND"], 
def save_dictionaries(
    self, diag, meds, proc, out, labs, chart, cohort: pd.DataFrame, los
):
    """Pickle the data dictionary, the vocabularies and the meta dictionary.

    All files are written into ``DICT_PATH``, which is created if missing.
    For every vocabulary the list of unique values is pickled and the
    number of distinct values is stored on ``self`` (``eth_vocab``,
    ``med_vocab``, ...). ``metaDic`` holds ``los`` under ``"LOS"`` and the
    per-admission maxima of each enabled feature.

    Args:
        diag, meds, proc, out, labs, chart: Feature tables whose code
            columns supply the vocabularies. ``out`` and ``chart`` are only
            read in ICU mode, ``labs`` only in non-ICU mode.
        cohort: Cohort table providing ``ethnicity``, ``age`` and
            ``insurance``.
        los: Number of time buckets, stored as ``metaDic["LOS"]``.
    """
    fe = self.feature_extractor
    icu = fe.use_icu
    os.makedirs(DICT_PATH, exist_ok=True)

    def dump(name, payload):
        with open(DICT_PATH / name, "wb") as fp:
            pickle.dump(payload, fp)

    def dump_vocab(name, column):
        # Pickle the unique values and return how many distinct ones exist.
        dump(name, list(column.unique()))
        return column.nunique()

    if icu:
        metaDic = {
            "Cond": {},
            "Proc": {},
            "Med": {},
            "Out": {},
            "Chart": {},
            "LOS": los,
        }
    else:
        metaDic = {"Cond": {}, "Proc": {}, "Med": {}, "Lab": {}, "LOS": los}

    dump("dataDic", self.dataDic)
    dump("hadmDic", self.hids)
    self.eth_vocab = dump_vocab("ethVocab", cohort["ethnicity"])
    self.age_vocab = dump_vocab("ageVocab", cohort["age"])
    self.ins_vocab = dump_vocab("insVocab", cohort["insurance"])

    if fe.for_medications:
        self.med_vocab = dump_vocab(
            "medVocab", meds["itemid" if icu else "drug_name"]
        )
        metaDic["Med"] = self.med_per_adm

    if icu and fe.for_output_events:
        self.out_vocab = dump_vocab("outVocab", out["itemid"])
        metaDic["Out"] = self.out_per_adm

    if icu and fe.for_chart_events:
        self.chart_vocab = dump_vocab("chartVocab", chart["itemid"])
        metaDic["Chart"] = self.chart_per_adm

    if fe.for_diagnoses:
        self.cond_vocab = dump_vocab("condVocab", diag["new_icd_code"])
        metaDic["Cond"] = self.dia_per_adm

    if fe.for_procedures:
        # The non-ICU branch used to store the ``.unique()`` array here
        # instead of the distinct count used for every other vocabulary.
        self.proc_vocab = dump_vocab(
            "procVocab", proc["itemid" if icu else "icd_code"]
        )
        metaDic["Proc"] = self.proc_per_adm

    if not icu and fe.for_labs:
        # Same ``.unique()`` vs ``.nunique()`` mix-up as for procedures.
        self.lab_vocab = dump_vocab("labsVocab", labs["itemid"])
        metaDic["Lab"] = self.labs_per_adm

    dump("metaDic", metaDic)
def extract_from(self, cohort: pd.DataFrame) -> pd.DataFrame:
    """Load the ICU chart events chunk by chunk and keep the cohort's rows.

    Each chunk from ``load_icu_chart_events`` goes through
    ``process_chunk_chart_events``; the pieces are concatenated, passed to
    ``drop_wrong_uom`` with a 0.95 cut-off and reduced to the
    ``ChartEventsHeader`` columns. The result is stored in ``self.df`` and
    returned.
    """
    logger.info("[EXTRACTING CHART EVENTS DATA]")
    pieces = []
    for raw_chunk in tqdm(load_icu_chart_events(self.chunksize)):
        pieces.append(self.process_chunk_chart_events(raw_chunk, cohort))
    merged = pd.concat(pieces, ignore_index=True)

    # Statistics before the unit-of-measurement filter.
    logger.info(f"# Unique Events: {merged[ChartEventsHeader.ITEM_ID].nunique()}")
    logger.info(f"# Admissions: {merged[ChartEventsHeader.STAY_ID].nunique()}")
    logger.info(f"Total rows: {merged.shape[0]}")

    filtered = drop_wrong_uom(merged, 0.95)

    # Statistics after the unit-of-measurement filter.
    logger.info(f"# Unique Events: {filtered[ChartEventsHeader.ITEM_ID].nunique()}")
    logger.info(f"# Admissions: {filtered[ChartEventsHeader.STAY_ID].nunique()}")
    logger.info(f"Total rows: {filtered.shape[0]}")

    header_columns = [header.value for header in ChartEventsHeader]
    self.df = filtered[header_columns]
    return self.df
def impute_outlier(self, impute, thresh, left_thresh):
    """Run outlier handling on the chart values and store the result.

    Delegates to ``outlier_imputation``, grouping by the item id column and
    operating on the numeric value column; the outcome replaces ``self.df``.

    Args:
        impute: Forwarded unchanged to ``outlier_imputation``.
        thresh: Forwarded unchanged; presumably the upper outlier threshold
            (confirm against ``outlier_imputation``).
        left_thresh: Forwarded unchanged; presumably the lower outlier
            threshold (confirm against ``outlier_imputation``).

    Returns:
        The processed DataFrame, which is also stored in ``self.df``.
    """
    logger.info("[PROCESSING CHART EVENTS DATA]")
    self.df = outlier_imputation(
        self.df,
        ChartEventsHeader.ITEM_ID,
        ChartEventsHeader.VALUE_NUM,
        thresh,
        left_thresh,
        impute,
    )

    # Logging formats lazily with %-style arguments. The message had no
    # placeholder for the count, so the handler reported a formatting error
    # and the number of rows was never logged.
    logger.info("Total number of rows: %s", self.df.shape[0])
    logger.info("[SUCCESSFULLY SAVED CHART EVENTS DATA]")
    return self.df
def smooth_meds_step(self, bucket, i, t):
    """Average chart values inside one time bucket.

    Rows of ``self.df`` whose ``start_time`` lies in ``[i, i + bucket)`` are
    grouped by ``stay_id`` and ``itemid`` and ``valuenum`` is averaged per
    group.

    Args:
        bucket: Width of the window, in the unit of ``start_time``.
        i: Inclusive lower bound of the window.
        t: Value written into the ``start_time`` column of every output row.

    Returns:
        A new frame with columns ``stay_id``, ``itemid``, ``valuenum`` and
        ``start_time``; ``self.df`` is not modified.
    """
    starts = self.df["start_time"]
    inside_window = (starts >= i) & (starts < i + bucket)
    averaged = (
        self.df[inside_window]
        .groupby(["stay_id", "itemid"])
        .agg({"valuenum": "mean"})
        .reset_index()
    )
    # Every row of the bucket is re-labelled with the bucket index.
    averaged["start_time"] = t
    return averaged
# dataDic[hid]["Chart"]["signal"] = df2.iloc[:, 0:].to_dict(orient="list") + # dataDic[hid]["Chart"]["val"] = val.iloc[:, 0:].to_dict(orient="list") + + # feat_df = pd.DataFrame(columns=list(set(feat) - set(val.columns))) + # val = pd.concat([val, feat_df], axis=1) + + # val = val[feat] + # val = val.fillna(0) + # val.columns = pd.MultiIndex.from_product([["CHART"], val.columns]) + # return val diff --git a/pipeline/feature/diagnoses.py b/pipeline/feature/diagnoses.py new file mode 100644 index 0000000000..970a64d545 --- /dev/null +++ b/pipeline/feature/diagnoses.py @@ -0,0 +1,122 @@ +from enum import StrEnum +from pipeline.conversion.icd import IcdConverter +from pipeline.feature.feature_abc import Feature, Name +import logging +import pandas as pd +from pipeline.file_info.preproc.feature import ( + DiagnosesHeader, + DiagnosesIcuHeader, +) +from pipeline.file_info.preproc.cohort import CohortHeader, IcuCohortHeader +from pipeline.file_info.preproc.feature import PreprocDiagnosesHeader +from pipeline.file_info.raw.hosp import load_hosp_diagnosis_icd + +logger = logging.getLogger() + + +class IcdGroupOption(StrEnum): + KEEP = "Keep both ICD-9 and ICD-10 codes" + CONVERT = "Convert ICD-9 to ICD-10 codes" + GROUP = "Convert ICD-9 to ICD-10 and group ICD-10 codes" + + +MEAN_FREQUENCY_HEADER = "mean_frequency" + + +class Diagnoses(Feature): + def __init__(self, use_icu: bool, df: pd.DataFrame = pd.DataFrame()): + self.use_icu = use_icu + self.df = df + + def name() -> str: + return Name.DIAGNOSES + + def df(self) -> pd.DataFrame: + return self.df + + def extract_from(self, cohort: pd.DataFrame) -> pd.DataFrame: + logger.info("[EXTRACTING DIAGNOSIS DATA]") + hosp_diagnose = load_hosp_diagnosis_icd() + admissions_cohort_cols = ( + [ + CohortHeader.HOSPITAL_ADMISSION_ID, + IcuCohortHeader.STAY_ID, + CohortHeader.LABEL, + ] + if self.use_icu + else [CohortHeader.HOSPITAL_ADMISSION_ID, CohortHeader.LABEL] + ) + diag = hosp_diagnose.merge( + cohort[admissions_cohort_cols], + 
def preproc(self, group_diag_icd: IcdGroupOption) -> pd.DataFrame:
    """Select which ICD representation becomes the diagnosis code.

    The column chosen by ``group_diag_icd`` is copied into
    ``PreprocDiagnosesHeader.NEW_ICD_CODE``; ``self.df`` is then reduced to
    the ``PreprocDiagnosesHeader`` columns (plus the stay id when
    ``self.use_icu`` is set).

    Args:
        group_diag_icd: One of the ``IcdGroupOption`` members.

    Returns:
        The reduced frame, also stored on ``self.df``.

    Raises:
        KeyError: If ``group_diag_icd`` is not a supported option.
    """
    logger.info("[PROCESSING DIAGNOSIS DATA]")
    source_column_by_option = {
        IcdGroupOption.KEEP: DiagnosesHeader.ICD_CODE,
        IcdGroupOption.CONVERT: DiagnosesHeader.ROOT_ICD10,
        IcdGroupOption.GROUP: DiagnosesHeader.ROOT,
    }
    # ``.get`` used to return None for an unknown option, which only failed
    # later as an opaque ``KeyError: None`` from the column lookup. Fail
    # here instead, with the same exception type and a message naming the
    # offending value.
    if group_diag_icd not in source_column_by_option:
        raise KeyError(f"Unsupported ICD grouping option: {group_diag_icd!r}")
    preproc_code = source_column_by_option[group_diag_icd]

    self.df[PreprocDiagnosesHeader.NEW_ICD_CODE] = self.df[preproc_code]
    self.df = self.df[
        [c for c in PreprocDiagnosesHeader]
        + ([DiagnosesIcuHeader.STAY_ID] if self.use_icu else [])
    ]
    self.icd_group_option = group_diag_icd
    logger.info(f"Total number of rows: {self.df.shape[0]}")
    return self.df
class Feature(ABC):
    """Abstract base class for a feature in the dataset.

    Defines the structure and required methods for a feature. Concrete
    features must provide ``name``, ``df``, ``extract_from``, ``preproc``
    and ``summary``.
    """

    @staticmethod
    @abstractmethod
    def name():
        """Return the display name of the feature."""

    # ``abstractproperty`` is deprecated in favour of stacking ``property``
    # on ``abstractmethod``. The member is declared once, and its body no
    # longer reads ``self.df``, which would recurse into this same property.
    @property
    @abstractmethod
    def df(self):
        """Return the feature data held by the instance."""

    @abstractmethod
    def extract_from(self, cohort: pd.DataFrame) -> pd.DataFrame:
        """Generate the feature data and return it as a DataFrame."""

    @abstractmethod
    def preproc(self) -> None:
        """Preprocess the feature data."""

    @abstractmethod
    def summary(self) -> None:
        """Generate a summary of the feature."""
def process_lab_chunk(
    self, chunk: pd.DataFrame, admissions: pd.DataFrame, cohort: pd.DataFrame
) -> pd.DataFrame:
    """Clean one chunk of lab events and attach cohort timing to it.

    Rows without a numeric value are dropped, missing units are filled with
    0, and only patients present in ``cohort`` are kept. Rows lacking a
    hospital admission id are passed to ``impute_hadm_ids`` and the imputed
    id replaces the missing one. Both parts are then concatenated and handed
    to ``merge_with_cohort_and_calculate_lab_time``.

    Args:
        chunk: Raw lab-events rows.
        admissions: Admissions table forwarded to ``impute_hadm_ids``.
        cohort: Cohort frame used for the patient filter and the final merge.

    Returns:
        Whatever ``merge_with_cohort_and_calculate_lab_time`` returns for the
        recombined chunk.
    """
    hadm_col = HospLabEvents.HOSPITAL_ADMISSION_ID

    kept = chunk.dropna(subset=[HospLabEvents.VALUE_NUM])
    kept = kept.fillna({HospLabEvents.VALUE_UOM: 0})
    in_cohort = kept[LabEventsHeader.PATIENT_ID].isin(cohort[CohortHeader.PATIENT_ID])
    kept = kept[in_cohort]

    # Split once on whether the admission id is already known.
    has_hadm = kept[hadm_col].notna()
    linked = kept[has_hadm]
    unlinked = kept[~has_hadm].copy()

    imputed = impute_hadm_ids(unlinked, admissions)
    imputed[hadm_col] = imputed[INPUTED_HOSPITAL_ADMISSION_ID_HEADER]
    wanted_columns = [
        HospLabEvents.PATIENT_ID,
        HospLabEvents.HOSPITAL_ADMISSION_ID,
        HospLabEvents.ITEM_ID,
        HospLabEvents.CHART_TIME,
        HospLabEvents.VALUE_NUM,
        HospLabEvents.VALUE_UOM,
    ]
    imputed = imputed[wanted_columns]

    recombined = pd.concat([linked, imputed], ignore_index=True)
    return self.merge_with_cohort_and_calculate_lab_time(recombined, cohort)
def impute_outlier(self, impute, thresh, left_thresh):
    """Run outlier handling on the lab events and keep the result.

    ``self.df`` is replaced by the frame returned from
    ``outlier_imputation``, called with the item-id column as grouping key
    and the numeric-value column as the value to process.

    Args:
        impute: Forwarded to ``outlier_imputation`` as its last argument.
        thresh: Forwarded to ``outlier_imputation`` (presumably the upper
            threshold -- confirm against that helper).
        left_thresh: Forwarded to ``outlier_imputation`` (presumably the
            lower threshold -- confirm against that helper).

    Returns:
        The processed frame, also stored on ``self.df``.
    """
    # Progress goes through the module logger, like every other step of this
    # feature, rather than through bare print() calls that bypass the
    # logging configuration.
    logger.info("[PROCESSING LABS DATA]")
    self.df = outlier_imputation(
        self.df,
        HospLabEvents.ITEM_ID,
        HospLabEvents.VALUE_NUM,
        thresh,
        left_thresh,
        impute,
    )
    logger.info(f"Total number of rows: {self.df.shape[0]}")
    logger.info("[SUCCESSFULLY SAVED LABS DATA]")
    return self.df
def smooth_meds_step(self, bucket, i, t):
    """Aggregate lab values inside one time bucket.

    Rows of ``self.df`` with ``start_time`` in ``[i, i + bucket)`` are
    grouped by ``hadm_id`` and ``itemid``; per group the maximum
    ``subject_id`` and the mean ``valuenum`` are kept.

    Args:
        bucket: Width of the window, in the unit of ``start_time``.
        i: Inclusive lower bound of the window.
        t: Value written into the ``start_time`` column of every output row.

    Returns:
        A new frame with columns ``hadm_id``, ``itemid``, ``subject_id``,
        ``valuenum`` and ``start_time``; ``self.df`` is not modified.
    """
    lower_ok = self.df["start_time"] >= i
    upper_ok = self.df["start_time"] < i + bucket
    window = self.df[lower_ok & upper_ok]

    grouped = window.groupby(["hadm_id", "itemid"])
    result = grouped.agg({"subject_id": "max", "valuenum": "mean"}).reset_index()
    # All rows of the bucket share the bucket label.
    result["start_time"] = t
    return result
values="valuenum" + # ) + # df2["val"] = 1 + # df2 = df2.pivot_table(index="start_time", columns="itemid", values="val") + # # print(df2.shape) + # add_indices = pd.Index(range(los)).difference(df2.index) + # add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna(np.nan) + # df2 = pd.concat([df2, add_df]) + # df2 = df2.sort_index() + # df2 = df2.fillna(0) + + # val = pd.concat([val, add_df]) + # val = val.sort_index() + # if self.impute == "Mean": + # val = val.ffill() + # val = val.bfill() + # val = val.fillna(val.mean()) + # elif self.impute == "Median": + # val = val.ffill() + # val = val.bfill() + # val = val.fillna(val.median()) + # val = val.fillna(0) + + # df2[df2 > 0] = 1 + # df2[df2 < 0] = 0 + + # # print(df2.head()) + # dataDic[hid]["Lab"]["signal"] = df2.iloc[:, 0:].to_dict(orient="list") + # dataDic[hid]["Lab"]["val"] = val.iloc[:, 0:].to_dict(orient="list") + + # feat_df = pd.DataFrame(columns=list(set(feat) - set(val.columns))) + # val = pd.concat([val, feat_df], axis=1) + + # val = val[feat] + # val = val.fillna(0) + # val.columns = pd.MultiIndex.from_product([["LAB"], val.columns]) + # return val diff --git a/pipeline/feature/medications.py b/pipeline/feature/medications.py new file mode 100644 index 0000000000..81e45e09a9 --- /dev/null +++ b/pipeline/feature/medications.py @@ -0,0 +1,364 @@ +from pipeline.feature.feature_abc import Feature, Name +import logging +import pandas as pd +import numpy as np +from pipeline.conversion.ndc import ( + NdcMappingHeader, + get_EPC, + ndc_to_str, + prepare_ndc_mapping, +) +from pipeline.file_info.preproc.feature import ( + MedicationsHeader, + IcuMedicationHeader, + NonIcuMedicationHeader, + PreprocMedicationHeader, +) +from pipeline.file_info.preproc.cohort import ( + CohortHeader, + IcuCohortHeader, + NonIcuCohortHeader, +) +from pipeline.file_info.raw.hosp import ( + HospPrescriptions, + load_hosp_prescriptions, +) +from pipeline.file_info.raw.icu import ( + InputEvents, + load_input_events, +) 
def __init__(
    self,
    use_icu: bool,
    df: pd.DataFrame | None = None,
    group_code: bool = False,
):
    """Create a medications feature.

    Args:
        use_icu: Whether the feature is built for ICU stays. It also decides
            whether ``self.admid`` is the stay id or the hospital admission
            id.
        df: Already extracted medication data. When omitted, a new empty
            frame is created for this instance.
        group_code: Stored as ``self.group_code``.
    """
    self.use_icu = use_icu
    self.group_code = group_code
    # A ``pd.DataFrame()`` written in the signature is evaluated once and
    # shared by every instance built without ``df``; in-place column
    # assignments on ``self.df`` would leak between those instances.
    self.df = pd.DataFrame() if df is None else df
    self.final_df = pd.DataFrame()
    self.admid = (
        IcuCohortHeader.STAY_ID
        if self.use_icu
        else CohortHeader.HOSPITAL_ADMISSION_ID
    )
def preproc(self, group_code: bool):
    """Derive the drug-name column and drop incomplete rows.

    ``PreprocMedicationHeader.DRUG_NAME`` is filled from the non-proprietary
    name when ``group_code`` is true and from the raw drug column otherwise;
    both source columns are then removed.

    Args:
        group_code: Whether drugs are grouped by non-proprietary name.

    Returns:
        The processed frame, also stored on ``self.df``.
    """
    med: pd.DataFrame = self.df
    logger.info("[PROCESSING MEDICATIONS DATA]")
    source_column = (
        NonIcuMedicationHeader.NON_PROPRIEATARY_NAME
        if group_code
        else NonIcuMedicationHeader.DRUG
    )
    med[PreprocMedicationHeader.DRUG_NAME] = med[source_column]
    med = med.drop(
        columns=[
            NonIcuMedicationHeader.NON_PROPRIEATARY_NAME,
            NonIcuMedicationHeader.DRUG,
        ]
    )
    # ``dropna`` returns a new frame; the result has to be kept, otherwise
    # the call is a no-op and the row count logged below is unfiltered.
    med = med.dropna()
    self.group_code = group_code
    self.df = med
    logger.info(f"Total number of rows: {med.shape[0]}")
    return med
def generate_fun(self, cohort: pd.DataFrame):
    """Turn medication offsets into whole-hour times and clip them to the stay.

    ``start_hours_from_admit`` and ``stop_hours_from_admit`` are parsed from
    ``"<d> days <hh>:<mm>:<ss>"`` text into ``days * 24 + hh`` (minutes and
    seconds are discarded). Rows are kept only when they stop after they
    start, belong to an admission in ``cohort`` and start before ``los``; a
    stop time beyond ``los`` is set to ``los``. Dose columns are converted to
    numbers, with unparsable entries coerced to NaN.

    Args:
        cohort: Frame holding the ``self.admid`` column and ``los``.

    Returns:
        The processed frame, also stored on ``self.df``.
    """
    frame: pd.DataFrame = self.df
    scratch_columns = ["start_days", "dummy", "start_hours", "min", "sec"]

    def offset_in_hours(source_column: str) -> pd.Series:
        # The scratch columns are written onto the frame itself, in the same
        # order as before, so the incoming object ends up with the same
        # columns.
        frame[["start_days", "dummy", "start_hours"]] = frame[
            source_column
        ].str.split(" ", expand=True)
        frame[["start_hours", "min", "sec"]] = frame["start_hours"].str.split(
            ":", expand=True
        )
        return pd.to_numeric(frame["start_days"]) * 24 + pd.to_numeric(
            frame["start_hours"]
        )

    frame["start_time"] = offset_in_hours("start_hours_from_admit")
    frame["stop_time"] = offset_in_hours("stop_hours_from_admit")
    frame = frame.drop(columns=scratch_columns)

    # Sanity check: an administration must stop strictly after it starts.
    frame = frame[frame["stop_time"] - frame["start_time"] > 0]

    # Keep the admissions of the cohort and bring in their length of stay.
    admission_key = self.admid
    frame = frame[frame[admission_key].isin(cohort[admission_key])]
    frame = pd.merge(
        frame, cohort[[admission_key, "los"]], on=admission_key, how="left"
    )

    # Remove rows whose start time is at or after the end of the visit.
    frame = frame[frame["los"] - frame["start_time"] > 0]

    # Any stop time after the end of the visit is set to the end of the visit.
    overruns = frame["stop_time"] > frame["los"]
    frame.loc[overruns, "stop_time"] = frame.loc[overruns, "los"]
    del frame["los"]

    numeric_columns = ("rate", "amount") if self.use_icu else ("dose_val_rx",)
    for column in numeric_columns:
        frame[column] = frame[column].apply(pd.to_numeric, errors="coerce")

    self.df = frame
    return frame
"max", "subject_id": "max", "dose_val_rx": "mean"} + ) + + sub_meds = self.df[ + (self.df["start_time"] >= i) & (self.df["start_time"] < i + bucket) + ] + sub_meds = sub_meds.groupby(group_cols).agg(agg_funcs).reset_index() + sub_meds["start_time"] = t + sub_meds["stop_time"] = sub_meds["stop_time"] / bucket + return sub_meds + + # def dict_step(self, hid, los, dataDic): + # feat = self.final_df["itemid" if self.use_icu else "drug_name"].unique() + # df2 = self.final_df[ + # self.final_df["stay_id" if self.use_icu else "hadm_id"] == hid + # ] + # if df2.shape[0] == 0: + # val = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + # val = val.fillna(0) + # val.columns = pd.MultiIndex.from_product([["MEDS"], val.columns]) + # else: + # if self.use_icu: + # rate = df2.pivot_table( + # index="start_time", columns="itemid", values="rate" + # ) + # amount = df2.pivot_table( + # index="start_time", columns="itemid", values="amount" + # ) + # else: + # val = df2.pivot_table( + # index="start_time", columns="drug_name", values="dose_val_rx" + # ) + + # df2 = df2.pivot_table( + # index="start_time", + # columns="itemid" if self.use_icu else "drug_name", + # values="stop_time", + # ) + # add_indices = pd.Index(range(los)).difference(df2.index) + # add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna(np.nan) + # df2 = pd.concat([df2, add_df]) + # df2 = df2.sort_index() + # df2 = df2.ffill() + # df2 = df2.fillna(0) + # if self.use_icu: + # rate = pd.concat([rate, add_df]) + # rate = rate.sort_index() + # rate = rate.ffill() + # rate = rate.fillna(-1) + # amount = pd.concat([amount, add_df]) + # amount = amount.sort_index() + # amount = amount.ffill() + # amount = amount.fillna(-1) + # else: + # val = pd.concat([val, add_df]) + # val = val.sort_index() + # val = val.ffill() + # val = val.fillna(-1) + + # df2.iloc[:, 0:] = df2.iloc[:, 0:].sub(df2.index, 0) + # df2[df2 > 0] = 1 + # df2[df2 < 0] = 0 + # val.iloc[:, 0:] = df2.iloc[:, 0:] * val.iloc[:, 0:] + # # 
print(df2.head()) + # if self.use_icu: + # dataDic.iloc[:, 0:].to_dict(orient="list") + # dataDic[hid]["Med"]["rate"] = rate.iloc[:, 0:].to_dict(orient="list") + # dataDic[hid]["Med"]["amount"] = amount.iloc[:, 0:].to_dict( + # orient="list" + # ) + # else: + # dataDic[hid]["Med"]["signal"] = df2.iloc[:, 0:].to_dict(orient="list") + # dataDic[hid]["Med"]["val"] = val.iloc[:, 0:].to_dict(orient="list") + + # feat_df = pd.DataFrame(columns=list(set(feat) - set(val.columns))) + + # val = pd.concat([val, feat_df], axis=1) + + # val = val[feat] + # val = val.fillna(0) + + # val.columns = pd.MultiIndex.from_product([["MEDS"], val.columns]) + # return val diff --git a/pipeline/feature/output_events.py b/pipeline/feature/output_events.py new file mode 100644 index 0000000000..9cba3bd4f8 --- /dev/null +++ b/pipeline/feature/output_events.py @@ -0,0 +1,152 @@ +from pipeline.feature.feature_abc import Feature, Name +import logging +import pandas as pd +import numpy as np +from pipeline.file_info.preproc.feature import ( + OutputEventsHeader, +) +from pipeline.file_info.preproc.cohort import IcuCohortHeader +from pipeline.file_info.raw.icu import load_icu_output_events, OuputputEvents + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger() + + +class OutputEvents(Feature): + def __init__(self, df: pd.DataFrame = pd.DataFrame()): + self.df = df + self.final_df = pd.DataFrame() + + def name() -> str: + return Name.OUTPUT + + def df(self): + return self.df + + def extract_from(self, cohort: pd.DataFrame) -> pd.DataFrame: + """Function for getting hosp observations pertaining to a pickled cohort. 
def generate_fun(self, cohort):
    """Convert output-event offsets to whole hours and trim them to the stay.

    Only rows whose ``stay_id`` occurs in ``cohort`` are used.
    ``event_time_from_admit`` is parsed from ``"<d> days <hh>:<mm>:<ss>"``
    text into ``start_time = days * 24 + hh``. Rows whose start time is not
    strictly before the stay's ``los`` are removed, and the helper columns
    ``los`` and ``event_time_from_admit`` are dropped.

    Args:
        cohort: Frame with ``stay_id`` and ``los`` columns.

    Returns:
        The processed frame, also stored on ``self.df``.
    """
    # A leftover ``breakpoint()`` used to sit here and stopped every call in
    # the debugger; it has been removed.
    out: pd.DataFrame = self.df[self.df["stay_id"].isin(cohort["stay_id"])].copy()
    time_split = out["event_time_from_admit"].str.extract(
        r"(\d+) days (\d+):(\d+):(\d+)"
    )
    # Groups 0 and 1 are the day and hour fields; minutes and seconds are
    # ignored.
    out["start_time"] = pd.to_numeric(time_split[0]) * 24 + pd.to_numeric(
        time_split[1]
    )
    # Remove entries whose event time is at or after the length of stay.
    out = out.merge(cohort[["stay_id", "los"]], on="stay_id", how="left")
    out = out[out["los"] - out["start_time"] > 0]
    out = out.drop(columns=["los", "event_time_from_admit"])

    self.df = out
    return out
def __init__(
    self,
    use_icu: bool,
    df: pd.DataFrame | None = None,
    keep_icd9: bool = True,
):
    """Create a procedures feature.

    Args:
        use_icu: Whether the feature is built for ICU stays. It selects the
            admission-id column (``self.adm_id``) and the time-from-admit
            column (``self.time_from_admit``).
        df: Already extracted procedure data. When omitted, a new empty
            frame is created for this instance.
        keep_icd9: Stored as ``self.keep_icd9``.
    """
    self.use_icu = use_icu
    self.keep_icd9 = keep_icd9
    # The default used to be the ``pd.DataFrame`` class itself rather than
    # an instance, so ``self.df`` was a type until something replaced it.
    self.df = pd.DataFrame() if df is None else df
    self.final_df = pd.DataFrame()
    self.adm_id = (
        IcuCohortHeader.STAY_ID
        if self.use_icu
        else CohortHeader.HOSPITAL_ADMISSION_ID
    )
    self.time_from_admit = (
        IcuProceduresHeader.EVENT_TIME_FROM_ADMIT
        if self.use_icu
        else NonIcuProceduresHeader.PROC_TIME_FROM_ADMIT
    )
unique_procedures_count}") + + logger.info( + f"\nValue counts of each ICD version:\n {procedures[NonIcuProceduresHeader.ICD_VERSION].value_counts()}" + ) + logger.info( + f"# Admissions:{procedures[CohortHeader.HOSPITAL_ADMISSION_ID].nunique()}" + ) + logger.info(f"Total number of rows: {procedures.shape[0]}") + + def preproc(self, keep_icd9: bool): + logger.info("[PROCESSING PROCEDURES DATA]") + proc = self.df.copy() + if not keep_icd9: + proc = proc.loc[proc[NonIcuProceduresHeader.ICD_VERSION] == 10] + proc = proc[ + [ + ProceduresHeader.PATIENT_ID, + ProceduresHeader.HOSPITAL_ADMISSION_ID, + NonIcuProceduresHeader.ICD_CODE, + NonIcuProceduresHeader.CHART_DATE, + NonIcuProceduresHeader.ADMIT_TIME, + NonIcuProceduresHeader.PROC_TIME_FROM_ADMIT, + ] + ] + if not keep_icd9: + proc = proc.dropna() + self.keep_icd9 = keep_icd9 + logger.info(f"Total number of rows: {proc.shape[0]}") + self.df = proc + return self.df + + def summary(self): + proc: pd.DataFrame = self.df + feature_name = ( + IcuProceduresHeader.ITEM_ID + if self.use_icu + else NonIcuProceduresHeader.ICD_CODE + ) + freq = ( + proc.groupby([self.adm_id, feature_name]) + .size() + .reset_index(name="mean_frequency") + ) + freq = freq.groupby(feature_name)["mean_frequency"].mean().reset_index() + total = proc.groupby(feature_name).size().reset_index(name="total_count") + summary = pd.merge(freq, total, on=feature_name, how="right") + summary = summary.fillna(0) + return summary + + def generate_fun(self, cohort: pd.DataFrame): + proc: pd.DataFrame = self.df + proc = proc[proc[self.adm_id].isin(cohort[self.adm_id])] + proc[["start_days", "dummy", "start_hours"]] = proc[ + self.time_from_admit + ].str.split(" ", expand=True) + proc[["start_hours", "min", "sec"]] = proc["start_hours"].str.split( + ":", expand=True + ) + proc["start_time"] = pd.to_numeric(proc["start_days"]) * 24 + pd.to_numeric( + proc["start_hours"] + ) + proc = proc.drop(columns=["start_days", "dummy", "start_hours", "min", "sec"]) + proc = 
proc[proc["start_time"] >= 0] + + ###Remove where event time is after discharge time + proc = pd.merge(proc, cohort[[self.adm_id, "los"]], on=self.adm_id, how="left") + proc["sanity"] = proc["los"] - proc["start_time"] + proc = proc[proc["sanity"] > 0] + del proc["sanity"] + self.df = proc + return proc + + def mortality_length(self, cohort, include_time): + self.df = self.df[self.df[self.adm_id].isin(cohort[self.adm_id])] + self.df = self.df[self.df[self.adm_id] <= include_time] + + def los_length(self, cohort, include_time): + self.df = self.df[self.df[self.adm_id].isin(cohort[self.adm_id])] + self.df = self.df[self.df[self.adm_id] <= include_time] + + def read_length(self, cohort): + col = "stay_id" if self.use_icu else "hadm_id" + self.df = self.df[self.df[col].isin(cohort[col])] + self.df = pd.merge(self.df, cohort[[col, "select_time"]], on=col, how="left") + self.df["start_time"] = self.proc["start_time"] - self.proc["select_time"] + self.df = self.df[self.df["start_time"] >= 0] + + def smooth_meds_step(self, bucket, i, t): + sub_proc = ( + self.df[(self.df["start_time"] >= i) & (self.df["start_time"] < i + bucket)] + .groupby(["stay_id", "itemid"] if self.use_icu else ["hadm_id", "icd_code"]) + .agg({"subject_id": "max"}) + ) + sub_proc = sub_proc.reset_index() + sub_proc["start_time"] = t + return sub_proc + + # def smooth_meds(self): + # f2_df = self.final_df.groupby( + # ["stay_id", "itemid", "orderid"] + # if self.use_icu + # else ["hadm_id", "icd_code"] + # ).size() + # df_per_adm = f2_df.groupby(self.adm_id).sum().reset_index()[0].max() + # dflength_per_adm = self.final_df.groupby(self.adm_id).size().max() + # return f2_df, df_per_adm, dflength_per_adm + + # def dict_step(self, hid, los, dataDic): + # if self.use_icu: + # feat = self.final_df["itemid"].unique() + # df2 = self.final_df[self.final_df["stay_id"] == hid] + # if df2.shape[0] == 0: + # df2 = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + # df2 = df2.fillna(0) + # df2.columns = 
pd.MultiIndex.from_product([["PROC"], df2.columns]) + # else: + # df2["val"] = 1 + # # print(df2) + # df2 = df2.pivot_table( + # index="start_time", columns="itemid", values="val" + # ) + # # print(df2.shape) + # add_indices = pd.Index(range(los)).difference(df2.index) + # add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna( + # np.nan + # ) + # df2 = pd.concat([df2, add_df]) + # df2 = df2.sort_index() + # df2 = df2.fillna(0) + # df2[df2 > 0] = 1 + # # print(df2.head()) + # dataDic[hid]["Proc"] = df2.to_dict(orient="list") + + # feat_df = pd.DataFrame(columns=list(set(feat) - set(df2.columns))) + # df2 = pd.concat([df2, feat_df], axis=1) + + # df2 = df2[feat] + # df2 = df2.fillna(0) + # df2.columns = pd.MultiIndex.from_product([["PROC"], df2.columns]) + # else: + # feat = self.final_df["icd_code"].unique() + # df2 = self.final_df[self.final_df["hadm_id"] == hid] + # if df2.shape[0] == 0: + # df2 = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + # df2 = df2.fillna(0) + # df2.columns = pd.MultiIndex.from_product([["PROC"], df2.columns]) + # else: + # df2["val"] = 1 + # df2 = df2.pivot_table( + # index="start_time", columns="icd_code", values="val" + # ) + # add_indices = pd.Index(range(los)).difference(df2.index) + # add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna( + # np.nan + # ) + # df2 = pd.concat([df2, add_df]) + # df2 = df2.sort_index() + # df2 = df2.fillna(0) + # df2[df2 > 0] = 1 + # dataDic[hid]["Proc"] = df2.to_dict(orient="list") + + # feat_df = pd.DataFrame(columns=list(set(feat) - set(df2.columns))) + # df2 = pd.concat([df2, feat_df], axis=1) + + # df2 = df2[feat] + # df2 = df2.fillna(0) + # df2.columns = pd.MultiIndex.from_product([["PROC"], df2.columns]) + + # return df2 diff --git a/pipeline/feature_selector.py b/pipeline/feature_selector.py new file mode 100644 index 0000000000..3eb7aac901 --- /dev/null +++ b/pipeline/feature_selector.py @@ -0,0 +1,139 @@ +import pandas as pd +import logging +from 
# TODO: replace hard-coded column and class names
class FeatureSelector:
    """Filter saved feature extracts down to the codes listed in feature files.

    For every enabled feature the extract file is read, rows whose code is
    not in the matching ``*_features.csv`` file are dropped, and the extract
    file is overwritten with the result.
    """

    def __init__(
        self,
        use_icu: bool,
        select_dia: bool,
        select_med: bool,
        select_proc: bool,
        select_labs: bool,
        select_chart: bool,
        select_out: bool,
    ):
        """Record which features to filter and whether ICU extracts are used."""
        self.use_icu = use_icu
        self.select_dia = select_dia
        self.select_med = select_med
        self.select_proc = select_proc
        self.select_labs = select_labs
        self.select_chart = select_chart
        self.select_out = select_out

    def feature_selection(self) -> List[pd.DataFrame]:
        """Filter each enabled feature and return the filtered tables in order.

        Returns:
            One DataFrame per enabled feature, in the order diagnoses,
            medications, procedures, labs, chart events, output events.
        """
        features: List[pd.DataFrame] = []
        if self.select_dia:
            features.append(
                self.process_feature_selection(
                    EXTRACT_DIAG_ICU_PATH if self.use_icu else EXTRACT_DIAG_PATH,
                    DIAG_FEATURES_PATH,
                    PreprocDiagnosesHeader.NEW_ICD_CODE.value,
                    "Diagnosis",
                )
            )

        if self.select_med:
            path = EXTRACT_MED_ICU_PATH if self.use_icu else EXTRACT_MED_PATH
            feature_name = (
                IcuMedicationHeader.ITEM_ID
                if self.use_icu
                else PreprocMedicationHeader.DRUG_NAME
            )
            features.append(
                self.process_feature_selection(
                    path, MED_FEATURES_PATH, feature_name, "Medications"
                )
            )

        if self.select_proc:
            path = EXTRACT_PROC_ICU_PATH if self.use_icu else EXTRACT_PROC_PATH
            features.append(
                self.process_feature_selection(
                    path,
                    PROC_FEATURES_PATH,
                    IcuProceduresHeader.ITEM_ID
                    if self.use_icu
                    else NonIcuProceduresHeader.ICD_CODE.value,
                    "Procedures",
                )
            )

        if self.select_labs:
            # Labs are read in chunks of 10,000,000 rows and concatenated.
            labs = self.concat_csv_chunks(EXTRACT_LABS_PATH, 10000000)
            feature_df = pd.read_csv(LABS_FEATURES_PATH)
            labs = labs[labs["itemid"].isin(feature_df["itemid"].unique())]
            self.log_and_save(labs, EXTRACT_LABS_PATH, "Labs")
            features.append(labs)

        if self.select_chart:
            features.append(
                self.process_feature_selection(
                    EXTRACT_CHART_ICU_PATH,
                    CHART_FEATURES_PATH,
                    "itemid",
                    # Fixed: this branch used to log itself as "Output Events".
                    "Chart Events",
                )
            )

        if self.select_out:
            features.append(
                self.process_feature_selection(
                    EXTRACT_OUT_ICU_PATH, OUT_FEATURES_PATH, "itemid", "Output Events"
                )
            )

        return features

    def process_feature_selection(
        self, data_path: Path, feature_path: Path, feature_col: str, data_type: str
    ):
        """Keep rows of ``data_path`` whose ``feature_col`` is in ``feature_path``.

        Args:
            data_path: Gzip-compressed CSV extract; overwritten with the result.
            feature_path: CSV listing the selected values of ``feature_col``.
            feature_col: Column present in both files.
            data_type: Label used in the log messages.

        Returns:
            The filtered DataFrame.
        """
        data_df = pd.read_csv(data_path, compression="gzip")
        feature_df = pd.read_csv(feature_path)
        data_df = data_df[data_df[feature_col].isin(feature_df[feature_col].unique())]
        self.log_and_save(data_df, data_path, data_type)
        return data_df

    def concat_csv_chunks(self, file_path: Path, chunksize: int):
        """Read a gzip-compressed CSV in chunks and return it as one DataFrame."""
        chunks = pd.read_csv(file_path, compression="gzip", chunksize=chunksize)
        return pd.concat(chunks, ignore_index=True)

    def log_and_save(self, df: pd.DataFrame, path: Path, data_type: str):
        """Log the row count, then write ``df`` to ``path`` as gzip CSV."""
        logger.info(f"Total number of rows in {data_type}: {df.shape[0]}")
        df.to_csv(path, compression="gzip", index=False)
        logger.info(f"[SUCCESSFULLY SAVED {data_type} DATA]")
class FeatureExtractor:
    """
    Extracts various features from a cohort based on specified conditions.

    Attributes:
        cohort_output (str): Output path or identifier for the cohort.
        use_icu (bool): Flag to indicate whether ICU data should be used.
        for_diagnoses (bool): Flag to extract diagnosis features.
        for_output_events (bool): Flag to extract output event features.
        for_chart_events (bool): Flag to extract chart event features.
        for_procedures (bool): Flag to extract procedure features.
        for_medications (bool): Flag to extract medication features.
        for_labs (bool): Flag to extract lab event features.
    """

    def __init__(
        self,
        cohort_output: str,
        use_icu: bool,
        for_diagnoses: bool,
        for_output_events: bool,
        for_chart_events: bool,
        for_procedures: bool,
        for_medications: bool,
        for_labs: bool,
    ):
        self.cohort_output = cohort_output
        self.use_icu = use_icu
        self.for_diagnoses = for_diagnoses
        self.for_output_events = for_output_events
        self.for_chart_events = for_chart_events
        self.for_procedures = for_procedures
        self.for_medications = for_medications
        self.for_labs = for_labs

    def save_features(self) -> dict[str, pd.DataFrame]:
        """
        Loads the cohort, extracts every enabled feature and saves each one.

        Output and chart events are only extracted in ICU mode; lab events
        only in non-ICU mode.

        Returns:
            dict[str, pd.DataFrame]: Extracted tables keyed by feature name.
        """
        cohort = load_cohort(self.use_icu, self.cohort_output)
        # Each entry: (enabled?, factory for the feature object, save path).
        # Factories defer construction so disabled features are never built.
        feature_specs = [
            (
                self.for_diagnoses,
                lambda: Diagnoses(use_icu=self.use_icu),
                EXTRACT_DIAG_ICU_PATH if self.use_icu else EXTRACT_DIAG_PATH,
            ),
            (
                self.for_procedures,
                lambda: Procedures(use_icu=self.use_icu),
                EXTRACT_PROC_ICU_PATH if self.use_icu else EXTRACT_PROC_PATH,
            ),
            (
                self.for_medications,
                lambda: Medications(use_icu=self.use_icu),
                EXTRACT_MED_ICU_PATH if self.use_icu else EXTRACT_MED_PATH,
            ),
            (
                self.for_output_events and self.use_icu,
                lambda: OutputEvents(),
                EXTRACT_OUT_ICU_PATH,
            ),
            (
                self.for_chart_events and self.use_icu,
                lambda: Chart(),
                EXTRACT_CHART_ICU_PATH,
            ),
            (
                self.for_labs and not self.use_icu,
                lambda: Lab(),
                EXTRACT_LABS_PATH,
            ),
        ]
        features: dict[str, pd.DataFrame] = {}
        for enabled, build_feature, path in feature_specs:
            if not enabled:
                continue
            feature = build_feature()
            extracted = feature.extract_from(cohort)
            # ``name`` is looked up and called on the class, not the instance.
            feature_name = feature.__class__.name()
            features[feature_name] = extracted
            save_data(extracted, path, feature_name)

        return features
# TODO: consider removing the dependency on FeatureExtractor
class FeaturePreprocessor:
    """Run the post-extraction steps on the saved feature files.

    The steps are: preprocessing of diagnoses/medications/procedures,
    summary generation, feature selection, and outlier cleaning of chart
    (ICU) or lab (non-ICU) events.
    """

    def __init__(
        self,
        feature_extractor: FeatureExtractor,
        group_diag_icd: IcdGroupOption,
        group_med_code: bool,
        keep_proc_icd9: bool,
        clean_chart: bool = False,
        impute_outlier_chart: bool = False,
        clean_labs: bool = False,
        impute_labs: bool = False,
        thresh: int = 100,
        left_thresh: int = 0,
    ):
        """Store the extractor (source of the feature flags) and all options.

        Args:
            feature_extractor: Supplies ``use_icu`` and the ``for_*`` flags.
            group_diag_icd: Grouping option forwarded to diagnosis preprocessing.
            group_med_code: Forwarded to medication preprocessing.
            keep_proc_icd9: Forwarded to procedure preprocessing.
            clean_chart: Enable outlier cleaning of chart events (ICU only).
            impute_outlier_chart: Forwarded to ``Chart.impute_outlier``.
            clean_labs: Enable outlier cleaning of lab events (non-ICU only).
            impute_labs: Forwarded to ``Lab.impute_outlier``.
            thresh: Forwarded to ``impute_outlier`` as the threshold.
            left_thresh: Forwarded to ``impute_outlier`` as the left threshold.
        """
        self.feature_extractor = feature_extractor
        self.group_diag_icd = group_diag_icd
        self.group_med_code = group_med_code
        self.keep_proc_icd9 = keep_proc_icd9
        self.clean_chart = clean_chart
        self.impute_outlier_chart = impute_outlier_chart
        self.clean_labs = clean_labs
        self.impute_labs = impute_labs
        self.thresh = thresh
        self.left_thresh = left_thresh

    def preprocess_no_event_features(self):
        """Delegate to ``NoEventFeaturePreprocessor`` and return its result."""
        preprocessor = NoEventFeaturePreprocessor(
            self.feature_extractor,
            self.group_diag_icd,
            self.group_med_code,
            self.keep_proc_icd9,
        )
        return preprocessor.preprocess()

    def save_summaries(self):
        """Delegate to ``Summarizer`` and return its result."""
        summarizer = Summarizer(self.feature_extractor)
        return summarizer.save_summaries()

    def feature_selection(self) -> List[pd.DataFrame]:
        """Run ``FeatureSelector`` with the extractor's flags."""
        extractor = self.feature_extractor
        feature_selector = FeatureSelector(
            use_icu=extractor.use_icu,
            select_dia=extractor.for_diagnoses,
            select_med=extractor.for_medications,
            select_proc=extractor.for_procedures,
            select_chart=extractor.for_chart_events,
            select_labs=extractor.for_labs,
            select_out=extractor.for_output_events,
        )
        return feature_selector.feature_selection()

    def preproc_events_features(self) -> List[pd.DataFrame]:
        """Clean chart (ICU) or lab (non-ICU) events, save and return them.

        Returns:
            The cleaned tables that were written back to the extract files.
        """
        event_preproc_features: List[pd.DataFrame] = []
        if self.clean_chart and self.feature_extractor.use_icu:
            extract_chart = pd.read_csv(EXTRACT_CHART_ICU_PATH, compression="gzip")
            chart = Chart(df=extract_chart)
            preproc_chart = chart.impute_outlier(
                self.impute_outlier_chart,
                self.thresh,
                self.left_thresh,
            )
            save_data(preproc_chart, EXTRACT_CHART_ICU_PATH, "CHART EVENTS")
            event_preproc_features.append(preproc_chart)
        if self.clean_labs and not self.feature_extractor.use_icu:
            extract_labs = pd.read_csv(EXTRACT_LABS_PATH, compression="gzip")
            lab = Lab(df=extract_labs)
            preproc_lab = lab.impute_outlier(
                impute=self.impute_labs,
                thresh=self.thresh,
                left_thresh=self.left_thresh,
            )
            save_data(preproc_lab, EXTRACT_LABS_PATH, "LABS EVENTS")
            # Fixed: return the table that was just saved, as the chart branch
            # does; previously the result of a separate ``lab.preproc()`` call
            # was appended instead.
            event_preproc_features.append(preproc_lab)
        return event_preproc_features

    def preprocess(self):
        """Run all steps in order; individual results are not returned."""
        self.preprocess_no_event_features()
        self.save_summaries()
        self.feature_selection()
        self.preproc_events_features()
DIAGNOISIS_TYPE = "diagnosis_type" + DIAGNOISIS_CODE = "diagnosis_code" + DIAGNOISIS_DESCRIPTION = "diagnosis_description" + ICD9 = "icd9cm" + ICD10 = "icd10cm" + FLAGS = "flags" + + +def load_static_icd_map() -> pd.DataFrame: + return pd.read_csv(MAP_PATH, delimiter="\t") + + +class NdcMap(StrEnum): + NON_PROPRIETARY_NAME = "NONPROPRIETARYNAME" + + +def load_ndc_mapping() -> pd.DataFrame: + return pd.read_csv(MAP_NDC_PATH, delimiter="\t") + + +def save_data(data: pd.DataFrame, path: Path, data_name: str) -> pd.DataFrame: + """Save DataFrame to specified path.""" + data.to_csv(path, compression="gzip", index=False) + logger.info(f"[SUCCESSFULLY SAVED {data_name} DATA]") + return data diff --git a/pipeline/file_info/preproc/cohort.py b/pipeline/file_info/preproc/cohort.py new file mode 100644 index 0000000000..65093e47c0 --- /dev/null +++ b/pipeline/file_info/preproc/cohort.py @@ -0,0 +1,53 @@ +from enum import StrEnum +from pipeline.file_info.common import PREPROC_PATH +import pandas as pd +import logging + +logger = logging.getLogger() +COHORT_PATH = PREPROC_PATH / "cohort" + + +# split common header icu header non icu header +class CohortHeader(StrEnum): + PATIENT_ID = "subject_id" + HOSPITAL_ADMISSION_ID = "hadm_id" + FIRST_CARE_UNIT = "first_careunit" + LAST_CARE_UNIT = "last_careunit" + LOS = "los" + AGE = "age" + MIN_VALID_YEAR = "min_valid_year" + DOD = "dod" + GENDER = "gender" + INSURANCE = "insurance" + ETHICITY = "ethnicity" + LABEL = "label" + + +class IcuCohortHeader(StrEnum): + STAY_ID = "stay_id" + IN_TIME = "intime" + OUT_TIME = "outtime" + + +class NonIcuCohortHeader(StrEnum): + ADMIT_TIME = "admittime" + DISCH_TIME = "dischtime" + + +def load_cohort(use_icu: bool, cohort_ouput: str) -> pd.DataFrame: + """Load cohort data from a CSV file.""" + cohort_path = COHORT_PATH / f"{cohort_ouput}.csv.gz" + try: + return pd.read_csv( + cohort_path, + compression="gzip", + parse_dates=[ + IcuCohortHeader.IN_TIME if use_icu else NonIcuCohortHeader.ADMIT_TIME + 
], + ) + except FileNotFoundError: + logger.error(f"Cohort file not found at {cohort_path}") + raise + except Exception as e: + logger.error(f"Error loading cohort file: {e}") + raise diff --git a/pipeline/file_info/preproc/feature.py b/pipeline/file_info/preproc/feature.py new file mode 100644 index 0000000000..a26ca41278 --- /dev/null +++ b/pipeline/file_info/preproc/feature.py @@ -0,0 +1,126 @@ +from enum import StrEnum + +from pipeline.file_info.common import PREPROC_PATH + + +FEATURE_PATH = PREPROC_PATH / "features" +FEATURE_EXTRACT_PATH = FEATURE_PATH / "extract" +FEATURE_PREPROC_PATH = FEATURE_PATH / "preproc" +FEATURE_SUMMARY_PATH = FEATURE_PATH / "summary" + + +EXTRACT_DIAG_PATH = FEATURE_EXTRACT_PATH / "diag.csv.gz" +EXTRACT_DIAG_ICU_PATH = FEATURE_EXTRACT_PATH / "diag_icu.csv.gz" + + +class DiagnosesHeader(StrEnum): + PATIENT_ID = "subject_id" + HOSPITAL_ADMISSION_ID = "hadm_id" + ICD_CODE = "icd_code" + ROOT_ICD10 = "root_icd10_convert" + ROOT = "root" + + +class DiagnosesIcuHeader(StrEnum): + STAY_ID = "stay_id" + + +class PreprocDiagnosesHeader(StrEnum): + PATIENT_ID = "subject_id" + HOSPITAL_ADMISSION_ID = "hadm_id" + NEW_ICD_CODE = "new_icd_code" + + +EXTRACT_PROC_PATH = FEATURE_EXTRACT_PATH / "proc.csv.gz" +EXTRACT_PROC_ICU_PATH = FEATURE_EXTRACT_PATH / "proc_icu.csv.gz" + + +class ProceduresHeader(StrEnum): + PATIENT_ID = "subject_id" + HOSPITAL_ADMISSION_ID = "hadm_id" + + +class IcuProceduresHeader(StrEnum): + STAY_ID = "stay_id" + ITEM_ID = "itemid" + START_TIME = "starttime" + IN_TIME = "intime" + EVENT_TIME_FROM_ADMIT = "event_time_from_admit" + + +class NonIcuProceduresHeader(StrEnum): + ICD_CODE = "icd_code" + ICD_VERSION = "icd_version" + CHART_DATE = "chartdate" + ADMIT_TIME = "admittime" + PROC_TIME_FROM_ADMIT = "proc_time_from_admit" + + +EXTRACT_MED_ICU_PATH = FEATURE_EXTRACT_PATH / "med_icu.csv.gz" +EXTRACT_MED_PATH = FEATURE_EXTRACT_PATH / "med.csv.gz" + + +class MedicationsHeader(StrEnum): + PATIENT_ID = "subject_id" + 
HOSPITAL_ADMISSION_ID = "hadm_id" + START_TIME = "starttime" + START_HOURS_FROM_ADMIT = "start_hours_from_admit" + STOP_HOURS_FROM_ADMIT = "stop_hours_from_admit" + + +class IcuMedicationHeader(StrEnum): + STAY_ID = "stay_id" + ITEM_ID = "itemid" + END_TIME = "endtime" + RATE = "rate" + AMOUNT = "amount" + ORDER_ID = "orderid" + + +class NonIcuMedicationHeader(StrEnum): + STOP_TIME = "stoptime" + DRUG = "drug" + NON_PROPRIEATARY_NAME = "nonproprietaryname" + DOSE_VAL_RX = "dose_val_rx" + EPC = "EPC" + + +class PreprocMedicationHeader(StrEnum): + DRUG_NAME = "drug_name" + + +EXTRACT_OUT_ICU_PATH = FEATURE_EXTRACT_PATH / "out_icu.csv.gz" + + +class OutputEventsHeader(StrEnum): + PATIENT_ID = "subject_id" + HOSPITAL_ADMISSION_ID = "hadm_id" + STAY_ID = "stay_id" + ITEM_ID = "itemid" + CHART_TIME = "charttime" + IN_TIME = "intime" + EVENT_TIME_FROM_ADMIT = "event_time_from_admit" + + +EXTRACT_LABS_PATH = FEATURE_EXTRACT_PATH / "labs.csv.gz" +PREPROC_LABS_ICU_PATH = FEATURE_PREPROC_PATH / "labs.csv.gz" + + +class LabEventsHeader(StrEnum): + PATIENT_ID = "subject_id" + HOSPITAL_ADMISSION_ID = "hadm_id" + ITEM_ID = "itemid" + CHART_TIME = "charttime" + ADMIT_TIME = "admittime" + LAB_TIME_FROM_ADMIT = "lab_time_from_admit" + VALUE_NUM = "valuenum" + + +EXTRACT_CHART_ICU_PATH = FEATURE_EXTRACT_PATH / "chart_icu.csv.gz" + + +class ChartEventsHeader(StrEnum): + STAY_ID = "stay_id" + ITEM_ID = "itemid" + VALUE_NUM = "valuenum" + EVENT_TIME_FROM_ADMIT = "event_time_from_admit" diff --git a/pipeline/file_info/preproc/summary.py b/pipeline/file_info/preproc/summary.py new file mode 100644 index 0000000000..fd10357b88 --- /dev/null +++ b/pipeline/file_info/preproc/summary.py @@ -0,0 +1,22 @@ +from pipeline.file_info.common import PREPROC_PATH + +SUMMARY_PATH = PREPROC_PATH / "summary" + +DIAG_FEATURES_PATH = SUMMARY_PATH / "diag_features.csv" +DIAG_SUMMARY_PATH = SUMMARY_PATH / "diag_summary.csv" + +MED_FEATURES_PATH = SUMMARY_PATH / "med_features.csv" +MED_SUMMARY_PATH = 
SUMMARY_PATH / "med_summary.csv" + +OUT_FEATURES_PATH = SUMMARY_PATH / "out_features.csv" +OUT_SUMMARY_PATH = SUMMARY_PATH / "out_summary.csv" + +PROC_FEATURES_PATH = SUMMARY_PATH / "proc_features.csv" +PROC_SUMMARY_PATH = SUMMARY_PATH / "proc_summary.csv" + +LABS_FEATURES_PATH = SUMMARY_PATH / "labs_features.csv" +LABS_SUMMARY_PATH = SUMMARY_PATH / "labs_summary.csv" + + +CHART_FEATURES_PATH = SUMMARY_PATH / "chart_features.csv" +CHART_SUMMARY_PATH = SUMMARY_PATH / "chart_summary.csv" diff --git a/pipeline/file_info/raw/hosp.py b/pipeline/file_info/raw/hosp.py new file mode 100644 index 0000000000..203ef75563 --- /dev/null +++ b/pipeline/file_info/raw/hosp.py @@ -0,0 +1,135 @@ +from enum import StrEnum +import pandas as pd +from pipeline.file_info.common import RAW_PATH + +""" +The Hosp module provides all data acquired from the hospital wide electronic health record +""" + +HOSP = "hosp" + +HOSP_DIAGNOSES_ICD_PATH = RAW_PATH / HOSP / "diagnoses_icd.csv.gz" +HOSP_PATIENTS_PATH = RAW_PATH / HOSP / "patients.csv.gz" +HOSP_LAB_EVENTS_PATH = RAW_PATH / HOSP / "labevents.csv.gz" +HOSP_ADMISSIONS_PATH = RAW_PATH / HOSP / "admissions.csv.gz" +HOSP_PREDICTIONS_PATH = RAW_PATH / HOSP / "prescriptions.csv.gz" +HOSP_PROCEDURES_ICD_PATH = RAW_PATH / HOSP / "procedures_icd.csv.gz" + + +# information regarding a patient +class HospPatients(StrEnum): + ID = "subject_id" # patient id + ANCHOR_YEAR = "anchor_year" # shifted year for the patient + ANCHOR_AGE = "anchor_age" # patient’s age in the anchor_year + ANCHOR_YEAR_GROUP = "anchor_year_group" # anchor_year occurred during this range + DOD = "dod" # de-identified date of death for the patient + GENDER = "gender" + + +def load_hosp_patients() -> pd.DataFrame: + return pd.read_csv( + HOSP_PATIENTS_PATH, + compression="gzip", + parse_dates=[HospPatients.DOD], + ) + + +# information regarding a patient’s admission to the hospital +class HospAdmissions(StrEnum): + ID = "hadm_id" # hospitalization id + PATIENT_ID = "subject_id" # 
patient id + ADMITTIME = "admittime" # datetime the patient was admitted to the hospital + DISCHTIME = "dischtime" # datetime the patient was discharged from the hospital + HOSPITAL_EXPIRE_FLAG = "hospital_expire_flag" # whether the patient died within the given hospitalization + LOS = "los" + HOSPITAL_ADMISSION_ID = "hadm_id" + INSURANCE = "insurance" + RACE = "race" + + +def load_hosp_admissions() -> pd.DataFrame: + return pd.read_csv( + HOSP_ADMISSIONS_PATH, + compression="gzip", + parse_dates=[HospAdmissions.ADMITTIME.value, HospAdmissions.DISCHTIME.value], + ) + + +class HospDiagnosesIcd(StrEnum): + SUBJECT_ID = "subject_id" # patient id + HOSPITAL_ADMISSION_ID = "hadm_id" # patient hospitalization id + SEQ_NUM = "seq_num" # priority assigned to the diagnoses + ICD_CODE = "icd_code" # International Coding Definitions code + ICD_VERSION = "icd_version" # version for the coding system + # added + ICD10 = "root_icd10_convert" + ROOT = "root" + + +def load_hosp_diagnosis_icd() -> pd.DataFrame: + return pd.read_csv(HOSP_DIAGNOSES_ICD_PATH, compression="gzip") + + +class HospLabEvents(StrEnum): + PATIENT_ID = "subject_id" + HOSPITAL_ADMISSION_ID = "hadm_id" + CHART_TIME = "charttime" + ITEM_ID = "itemid" + ADMIT_TIME = "admittime" + LAB_TIME_FROM_ADMIT = "lab_time_from_admit" + VALUE_NUM = "valuenum" + VALUE_UOM = "valueuom" + + +def load_hosp_lab_events(chunksize: int, use_cols=None) -> pd.DataFrame: + return pd.read_csv( + HOSP_LAB_EVENTS_PATH, + compression="gzip", + parse_dates=["charttime"], + chunksize=chunksize, + usecols=use_cols, + ) + + +class HospProceduresIcd(StrEnum): + PATIENT_ID = "subject_id" + HOSPITAL_ADMISSION_ID = "hadm_id" + SEQ_NUM = "seq_num" + CHART_DATE = "chartdate" + ICD_CODE = "icd_code" + ICD_VERSION = "icd_version" + + +def load_hosp_procedures_icd() -> pd.DataFrame: + return pd.read_csv( + HOSP_PROCEDURES_ICD_PATH, + compression="gzip", + parse_dates=[HospProceduresIcd.CHART_DATE.value], + ).drop_duplicates() + + +class 
HospPrescriptions(StrEnum): + PATIENT_ID = "subject_id" + HOSPITAL_ADMISSION_ID = "hadm_id" + DRUG = "drug" + START_TIME = "starttime" + STOP_TIME = "stoptime" + NDC = "ndc" + DOSE_VAL_RX = "dose_val_rx" + + +def load_hosp_prescriptions() -> pd.DataFrame: + return pd.read_csv( + HOSP_PREDICTIONS_PATH, + compression="gzip", + usecols=[ + HospPrescriptions.PATIENT_ID, + HospPrescriptions.HOSPITAL_ADMISSION_ID, + HospPrescriptions.DRUG, + HospPrescriptions.START_TIME, + HospPrescriptions.STOP_TIME, + HospPrescriptions.NDC, + HospPrescriptions.DOSE_VAL_RX, + ], + parse_dates=[HospPrescriptions.START_TIME, HospPrescriptions.STOP_TIME], + ) diff --git a/pipeline/file_info/raw/icu.py b/pipeline/file_info/raw/icu.py new file mode 100644 index 0000000000..66384668b3 --- /dev/null +++ b/pipeline/file_info/raw/icu.py @@ -0,0 +1,115 @@ +from enum import StrEnum +import pandas as pd +from pipeline.file_info.common import RAW_PATH + +""" +The ICU module contains information collected from the clinical information system used within the ICU. + +""" +ICU = "icu" + +ICU_ICUSTAY_PATH = RAW_PATH / ICU / "icustays.csv.gz" +ICU_INPUT_EVENT_PATH = RAW_PATH / ICU / "inputevents.csv.gz" +ICU_OUTPUT_EVENT_PATH = RAW_PATH / ICU / "outputevents.csv.gz" +ICU_CHART_EVENTS_PATH = RAW_PATH / ICU / "chartevents.csv.gz" +ICU_PROCEDURE_EVENTS_PATH = RAW_PATH / ICU / "procedureevents.csv.gz" + + +# information regarding ICU stays +class IcuStays(StrEnum): + PATIENT_ID = "subject_id" # patient id + ID = "stay_id" # icu stay id + HOSPITAL_ADMISSION_ID = "hadm_id" # patient hospitalization id + INTIME = "intime" # datetime the patient was transferred into the ICU. + OUTTIME = "outtime" # datetime the patient was transferred out the ICU. + LOS = "los" # length of stay for the patient for the given ICU stay in fractional days. + # added? 
+ ADMITTIME = "admittime" + + +def load_icu_icustays() -> pd.DataFrame: + return pd.read_csv( + ICU_ICUSTAY_PATH, + compression="gzip", + parse_dates=[IcuStays.INTIME, IcuStays.OUTTIME], + ) + + +# Information regarding patient outputs including urine, drainage... +class OuputputEvents(StrEnum): + SUBJECT_ID = "subject_id" # patient id + HOSPITAL_ADMISSION_ID = "hadm_id" # patient hospitalization id + STAY_ID = "stay_id" # patient icu stay id + ITEM_ID = "itemid" # single measurement type id + CHART_TIME = "charttime" # time of an output event + + +def load_icu_output_events() -> pd.DataFrame: + return pd.read_csv( + ICU_OUTPUT_EVENT_PATH, + compression="gzip", + parse_dates=[OuputputEvents.CHART_TIME], + ).drop_duplicates() + + +class ChartEvents(StrEnum): + STAY_ID = "stay_id" + CHARTTIME = "charttime" + ITEMID = "itemid" + VALUENUM = "valuenum" + VALUEOM = "valueuom" + + +def load_icu_chart_events(chunksize: int) -> pd.DataFrame: + return pd.read_csv( + ICU_CHART_EVENTS_PATH, + compression="gzip", + usecols=[c for c in ChartEvents], + parse_dates=[ChartEvents.CHARTTIME], + chunksize=chunksize, + ) + + +def load_icu_chart_events(chunksize: int) -> pd.DataFrame: + return pd.read_csv( + ICU_CHART_EVENTS_PATH, + compression="gzip", + usecols=[c for c in ChartEvents], + parse_dates=[ChartEvents.CHARTTIME.value], + chunksize=chunksize, + ) + + +class InputEvents(StrEnum): + SUBJECT_ID = "subject_id" + STAY_ID = "stay_id" + ITEMID = "itemid" + STARTTIME = "starttime" + ENDTIME = "endtime" + RATE = "rate" + AMOUNT = "amount" + ORDERID = "orderid" + + +def load_input_events() -> pd.DataFrame: + return pd.read_csv( + ICU_INPUT_EVENT_PATH, + compression="gzip", + usecols=[f for f in InputEvents], + parse_dates=[InputEvents.STARTTIME, InputEvents.ENDTIME], + ) + + +class ProceduresEvents(StrEnum): + STAY_ID = "stay_id" + START_TIME = "starttime" + ITEM_ID = "itemid" + + +def load_icu_procedure_events() -> pd.DataFrame: + return pd.read_csv( + ICU_PROCEDURE_EVENTS_PATH, + 
class NoEventFeaturePreprocessor:
    """Preprocess the extracted diagnoses, medications and procedures files.

    Each selected feature is read from its extract file, preprocessed by the
    matching feature class, and written back to the same path.
    """

    def __init__(
        self,
        feature_extractor: FeatureExtractor,
        group_diag_icd: IcdGroupOption,
        group_med_code: bool,
        keep_proc_icd9: bool,
    ):
        # Options are stored as-is and handed to the feature classes later.
        self.keep_proc_icd9 = keep_proc_icd9
        self.group_med_code = group_med_code
        self.group_diag_icd = group_diag_icd
        self.feature_extractor = feature_extractor

    def preprocess(self) -> List[pd.DataFrame]:
        """Preprocess every selected feature and return the resulting frames.

        Diagnoses are handled for both settings; medications and procedures
        are handled here only when the extractor is not in ICU mode.

        Returns:
            The preprocessed DataFrames, in the order diagnoses, medications,
            procedures (skipping any that were not selected).
        """
        extractor = self.feature_extractor
        results: List[pd.DataFrame] = []

        if extractor.for_diagnoses:
            diag_path = (
                EXTRACT_DIAG_ICU_PATH if extractor.use_icu else EXTRACT_DIAG_PATH
            )
            raw_diag = pd.read_csv(diag_path, compression="gzip")
            diagnoses = Diagnoses(use_icu=extractor.use_icu, df=raw_diag)
            clean_diag = diagnoses.preproc(self.group_diag_icd)
            save_data(
                clean_diag,
                EXTRACT_DIAG_ICU_PATH if extractor.use_icu else EXTRACT_DIAG_PATH,
                "DIAGNOSES",
            )
            results.append(clean_diag)

        # Medications and procedures are only preprocessed in the non-ICU setting.
        if extractor.use_icu:
            return results

        if extractor.for_medications:
            raw_med = pd.read_csv(EXTRACT_MED_PATH, compression="gzip")
            clean_med = Medications(use_icu=False, df=raw_med).preproc(
                self.group_med_code
            )
            save_data(clean_med, EXTRACT_MED_PATH, "MEDICATIONS")
            results.append(clean_med)

        if extractor.for_procedures:
            raw_proc = pd.read_csv(EXTRACT_PROC_PATH, compression="gzip")
            clean_proc = Procedures(use_icu=False, df=raw_proc).preproc(
                self.keep_proc_icd9
            )
            save_data(clean_proc, EXTRACT_PROC_PATH, "PROCEDURES")
            results.append(clean_proc)

        return results
def hadm_imputer(
    charttime: pd.Timestamp,
    hadm_old: Union[str, float],
    hadm_ids_w_timestamps: List[Tuple[str, pd.Timestamp, pd.Timestamp]],
) -> Tuple[Union[str, None], Union[pd.Timestamp, None], Union[pd.Timestamp, None]]:
    """Pick the hospital admission a charted event belongs to.

    Args:
        charttime: Time at which the event was charted.
        hadm_old: Admission id already recorded on the event, or a missing
            value (``NaN``/``None``) when there is none.
        hadm_ids_w_timestamps: The patient's admissions as
            ``(admission id, admit time, discharge time)`` tuples.

    Returns:
        The matching ``(admission id, admit time, discharge time)`` entry, or
        ``(None, None, None)`` when no admission can be assigned. The id is
        always returned exactly as it is stored in ``hadm_ids_w_timestamps``.
    """
    # If the event already carries an id that this patient really has, keep it.
    # Both sides are compared as strings: the recorded id is normalised to a
    # string below, while the admissions list may hold integer ids, which
    # would otherwise never compare equal.
    if not pd.isna(hadm_old):
        hadm_old = str(int(hadm_old))
        for h_id, adm_time, disch_time in hadm_ids_w_timestamps:
            if str(h_id) == hadm_old:
                return h_id, adm_time, disch_time

    # Otherwise keep the admissions whose stay contains the chart time ...
    candidates = [
        entry
        for entry in hadm_ids_w_timestamps
        if entry[1] <= charttime <= entry[2]
    ]
    if not candidates:
        return None, None, None

    # ... and return the one admitted closest to the chart time (first wins on ties).
    return min(candidates, key=lambda entry: abs(charttime - entry[1]))
+ chunk_size] for i in range(0, len(lab_table), chunk_size) + ] + process_func = partial( + process_chunk, subject_hadm_admittime_tracker=subject_hadm_admittime_tracker + ) + + # Parallel processing + with Pool(8) as pool: + processed_chunks = pool.map(process_func, chunks) + non_empty_chunks = [chunk.dropna(how="all", axis=1) for chunk in processed_chunks] + # Consolidate processed chunks + return pd.concat(non_empty_chunks, ignore_index=True) diff --git a/pipeline/preprocessing/cohort.py b/pipeline/preprocessing/cohort.py new file mode 100644 index 0000000000..1d6f8f642e --- /dev/null +++ b/pipeline/preprocessing/cohort.py @@ -0,0 +1,123 @@ +import pandas as pd +import numpy as np +import datetime +from pipeline.file_info.common import save_data +from pipeline.file_info.preproc.cohort import ( + COHORT_PATH, + CohortHeader, + NonIcuCohortHeader, + IcuCohortHeader, +) +import logging +from pathlib import Path +from pipeline.file_info.raw.hosp import HospAdmissions + +from pipeline.prediction_task import PredictionTask, TargetType + +logger = logging.getLogger() + + +class Cohort: + def __init__( + self, + icu: bool, + name: str, + df: pd.DataFrame = pd.DataFrame(), + ): + self.df = df + self.icu = icu + self.name = name + self.summary_name = f"summary_{name}" + self.admit_col = ( + IcuCohortHeader.IN_TIME if self.icu else NonIcuCohortHeader.ADMIT_TIME + ) + self.disch_col = ( + IcuCohortHeader.OUT_TIME if self.icu else NonIcuCohortHeader.DISCH_TIME + ) + + def prepare_mort_labels(self, visits: pd.DataFrame): + visits = visits.dropna(subset=[self.admit_col, self.disch_col]) + visits[CohortHeader.DOD] = pd.to_datetime(visits[CohortHeader.DOD]) + visits[CohortHeader.LABEL] = np.where( + (visits[CohortHeader.DOD] >= visits[self.admit_col]) + & (visits[CohortHeader.DOD] <= visits[self.disch_col]), + 1, + 0, + ) + logger.info( + f"[ MORTALITY LABELS FINISHED: {visits[CohortHeader.LABEL].sum()} Mortality Cases ]" + ) + return visits + + def prepare_read_labels(self, 
visits: pd.DataFrame, nb_days: int): + gap = datetime.timedelta(days=nb_days) + visits["next_admit"] = ( + visits.sort_values(by=[self.admit_col]) + .groupby(CohortHeader.PATIENT_ID)[self.admit_col] + .shift(-1) + ) + visits["time_to_next"] = visits["next_admit"] - visits[self.disch_col] + visits[CohortHeader.LABEL] = ( + visits["time_to_next"].notnull() & (visits["time_to_next"] <= gap) + ).astype(int) + readmit_cases = visits[CohortHeader.LABEL].sum() + logger.info( + f"[ READMISSION LABELS FINISHED: {readmit_cases} Readmission Cases ]" + ) + return visits.drop(columns=["next_admit", "time_to_next"]) + + def prepare_los_labels(self, visits: pd.DataFrame, nb_days): + visits = visits.dropna( + subset=[self.admit_col, self.disch_col, CohortHeader.LOS] + ) + visits[CohortHeader.LABEL] = (visits[CohortHeader.LOS] > nb_days).astype(int) + logger.info( + f"[ LOS LABELS FINISHED: {visits[CohortHeader.LABEL].sum()} LOS Cases ]" + ) + return visits + + def prepare_labels(self, visits: pd.DataFrame, prediction_task: PredictionTask): + if prediction_task.target_type == TargetType.MORTALITY: + df = self.prepare_mort_labels(visits) + elif prediction_task.target_type == TargetType.READMISSION: + df = self.prepare_read_labels(visits, prediction_task.nb_days) + elif prediction_task.target_type == TargetType.LOS: + df = self.prepare_los_labels(visits, prediction_task.nb_days) + df = df.sort_values(by=[CohortHeader.PATIENT_ID, self.admit_col]) + self.df = df.rename(columns={HospAdmissions.RACE: CohortHeader.ETHICITY}) + + def save(self): + save_data(self.df, COHORT_PATH / f"{self.name}.csv.gz", "COHORT") + + def save_summary(self): + summary = "\n".join( + [ + f"{self.df} FOR {' ICU' if self.icu else ''} DATA", + f"# Admission Records: {self.df.shape[0]}", + f"# Patients: {self.df[CohortHeader.PATIENT_ID].nunique()}", + f"# Positive cases: {self.df[self.df[CohortHeader.LABEL]==1].shape[0]}", + f"# Negative cases: {self.df[self.df[CohortHeader.LABEL]==0].shape[0]}", + ] + ) + with 
def compute_outlier_imputation(
    arr: np.ndarray, cut_off: float, left_thresh: float, impute: bool
) -> np.ndarray:
    """Impute or blank out outliers in an array based on percentile thresholds.

    Args:
        arr (np.ndarray): The input values. The array is not modified.
        cut_off (float): The upper percentile threshold to define outliers.
        left_thresh (float): The lower percentile threshold to define outliers.
        impute (bool): If True, outliers are clipped to the threshold values.
            If False, they are replaced with NaN.

    Returns:
        np.ndarray: A new float array with outliers clipped or set to NaN.
        Missing values in the input stay missing and are ignored when the
        thresholds are computed.
    """
    values = np.asarray(arr, dtype=float)

    # No bounds can be derived from an empty or all-missing array.
    if values.size == 0 or np.isnan(values).all():
        return values.copy()

    # nanpercentile keeps a single NaN from turning both bounds into NaN.
    lower_bound, upper_bound = np.nanpercentile(values, [left_thresh, cut_off])

    if impute:
        # Clip into a new array: clipping in place fails on integer input
        # and would silently modify the caller's data.
        return np.clip(values, lower_bound, upper_bound)
    return np.where(
        (values < lower_bound) | (values > upper_bound), np.nan, values
    )
+ """ + + def impute_group(group: pd.Series) -> pd.Series: + arr = group.values + imputed_arr = compute_outlier_imputation(arr, cut_off, left_thresh, impute) + return pd.Series(imputed_arr, index=group.index) + + # Apply the outlier imputation or removal to each group + data[value_attribute] = data.groupby(id_attribute)[value_attribute].transform( + impute_group + ) + + # Optionally drop rows with NaN values in the value_attribute column + if not impute: + data = data.dropna(subset=[value_attribute]) + + return data diff --git a/pipeline/preprocessing/visit.py b/pipeline/preprocessing/visit.py new file mode 100644 index 0000000000..0d295fb2b0 --- /dev/null +++ b/pipeline/preprocessing/visit.py @@ -0,0 +1,126 @@ +import pandas as pd +from pipeline.conversion.icd import IcdConverter +from pipeline.file_info.raw.hosp import ( + HospDiagnosesIcd, + HospPatients, + HospAdmissions, + load_hosp_diagnosis_icd, +) +from pipeline.file_info.raw.icu import IcuStays + +from pipeline.file_info.preproc.cohort import ( + CohortHeader, + IcuCohortHeader, + NonIcuCohortHeader, +) +from pipeline.prediction_task import TargetType +from pipeline.prediction_task import DiseaseCode +from typing import Optional +import logging + + +logger = logging.getLogger() + + +def make_patients(hosp_patients: pd.DataFrame) -> pd.DataFrame: + patients = hosp_patients[ + [ + HospPatients.ID, + HospPatients.ANCHOR_YEAR, + HospPatients.ANCHOR_YEAR_GROUP, + HospPatients.ANCHOR_AGE, + HospPatients.DOD, + HospPatients.GENDER, + ] + ].copy() + max_anchor_year_group = ( + patients[HospPatients.ANCHOR_YEAR_GROUP].str.slice(start=-4).astype(int) + ) + # To identify visits with prediction windows outside the range 2008-2019. 
def filter_visits(
    visits: pd.DataFrame,
    disease_readmission: Optional[DiseaseCode],
    disease_selection: Optional[DiseaseCode],
) -> pd.DataFrame:
    """Filter visits by readmission disease and by disease selection.

    Args:
        visits: Visits to filter; must contain the hospital admission id column.
        disease_readmission: If given, keep only the admissions that
            ``IcdConverter.get_pos_ids`` returns for this disease.
        disease_selection: If given, apply the same filter for this disease
            to whatever remains.

    Returns:
        The filtered visits, or ``visits`` unchanged when neither disease is given.
    """
    # Nothing to filter on: skip loading and standardising the diagnoses table.
    if not disease_readmission and not disease_selection:
        return visits

    icd_converter = IcdConverter()
    diag = load_hosp_diagnosis_icd()[
        [
            HospDiagnosesIcd.ICD_CODE,
            HospDiagnosesIcd.ICD_VERSION,
            HospDiagnosesIcd.HOSPITAL_ADMISSION_ID,
        ]
    ]
    diag = icd_converter.standardize_icd(diag)
    # Diagnoses without a root code after standardisation are dropped.
    diag = diag.dropna(subset=[HospDiagnosesIcd.ROOT])

    # Apply the readmission filter first, then the selection filter.
    for disease in (disease_readmission, disease_selection):
        if disease:
            hids = icd_converter.get_pos_ids(diag, disease)
            visits = visits[visits[CohortHeader.HOSPITAL_ADMISSION_ID].isin(hids)]

    return visits
class Summarizer:
    """Build and save summary tables for the features selected on an extractor."""

    def __init__(
        self,
        feature_extractor: FeatureExtractor,
    ):
        self.feature_extractor = feature_extractor

    def process_feature(
        self,
        feature_class: Type[Feature],
        path: Path,
        summary_path: Path,
        feature_name: str,
        features_path: str,
        use_icu: bool = True,
    ) -> pd.DataFrame:
        """Summarise one extracted feature file and save the results.

        Args:
            feature_class: Feature class to instantiate on the extracted data.
            path: Gzip-compressed CSV holding the extracted feature.
            summary_path: Where the summary table is saved.
            feature_name: Summary column that is exported to ``features_path``.
            features_path: CSV file receiving the ``feature_name`` column.
            use_icu: When True the extractor's ``use_icu`` flag is passed to
                the feature constructor; when False the feature is built from
                the DataFrame alone.

        Returns:
            The summary DataFrame produced by the feature.
        """
        extracted = pd.read_csv(path, compression="gzip")
        if use_icu:
            feature = feature_class(
                use_icu=self.feature_extractor.use_icu,
                df=extracted,
            )
        else:
            feature = feature_class(df=extracted)

        feature_summary = feature.summary()
        title = f"{feature_class.__name__.upper()} SUMMARY"
        save_data(feature_summary, summary_path, title)
        feature_summary[feature_name].to_csv(features_path, index=False)
        return feature_summary

    def save_summaries(self) -> List[pd.DataFrame]:
        """Save a summary for every selected feature and return them all.

        Returns:
            The summaries in the order diagnoses, medications, procedures,
            output events, chart events, labs (skipping unselected features).
        """
        extractor = self.feature_extractor
        collected: List[pd.DataFrame] = []

        if extractor.for_diagnoses:
            diag_summary = self.process_feature(
                Diagnoses,
                EXTRACT_DIAG_ICU_PATH if extractor.use_icu else EXTRACT_DIAG_PATH,
                DIAG_SUMMARY_PATH,
                PreprocDiagnosesHeader.NEW_ICD_CODE,
                DIAG_FEATURES_PATH,
            )
            collected.append(diag_summary)

        if extractor.for_medications:
            med_summary = self.process_feature(
                Medications,
                EXTRACT_MED_ICU_PATH if extractor.use_icu else EXTRACT_MED_PATH,
                MED_SUMMARY_PATH,
                IcuMedicationHeader.ITEM_ID
                if extractor.use_icu
                else PreprocMedicationHeader.DRUG_NAME,
                MED_FEATURES_PATH,
            )
            collected.append(med_summary)

        if extractor.for_procedures:
            proc_summary = self.process_feature(
                Procedures,
                EXTRACT_PROC_ICU_PATH if extractor.use_icu else EXTRACT_PROC_PATH,
                PROC_SUMMARY_PATH,
                IcuProceduresHeader.ITEM_ID
                if extractor.use_icu
                else NonIcuProceduresHeader.ICD_CODE,
                PROC_FEATURES_PATH,
            )
            collected.append(proc_summary)

        if extractor.for_output_events:
            out_summary = self.process_feature(
                OutputEvents,
                EXTRACT_OUT_ICU_PATH,
                OUT_SUMMARY_PATH,
                OutputEventsHeader.ITEM_ID,
                OUT_FEATURES_PATH,
                use_icu=False,
            )
            collected.append(out_summary)

        if extractor.for_chart_events:
            chart_summary = self.process_feature(
                Chart,
                EXTRACT_CHART_ICU_PATH,
                CHART_SUMMARY_PATH,
                ChartEventsHeader.ITEM_ID,
                CHART_FEATURES_PATH,
                use_icu=False,
            )
            collected.append(chart_summary)

        if extractor.for_labs:
            # Labs are read in chunks (with a progress bar) and concatenated
            # before being summarised.
            lab_chunks = pd.read_csv(
                EXTRACT_LABS_PATH, compression="gzip", chunksize=10000000
            )
            all_labs = pd.concat(tqdm(lab_chunks), ignore_index=True)
            lab_summary = Lab(df=all_labs).summary()
            save_data(lab_summary, LABS_SUMMARY_PATH, "LABS SUMMARY")
            lab_summary[LabEventsHeader.ITEM_ID].to_csv(
                LABS_FEATURES_PATH, index=False
            )
            collected.append(lab_summary)

        return collected
print(static_csv.head()) + for hid in tqdm(self.hids): + grp = self.data[self.data["stay_id"] == hid] + demo_csv = grp[["Age", "gender", "ethnicity", "insurance"]] + if not os.path.exists("./data/csv/" + str(hid)): + os.makedirs("./data/csv/" + str(hid)) + demo_csv.to_csv("./data/csv/" + str(hid) + "/demo.csv", index=False) + + dyn_csv = pd.DataFrame() + ###MEDS + if self.feat_med: + feat = meds["itemid"].unique() + df2 = meds[meds["stay_id"] == hid] + if df2.shape[0] == 0: + amount = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + amount = amount.fillna(0) + amount.columns = pd.MultiIndex.from_product([["MEDS"], amount.columns]) + else: + rate = df2.pivot_table( + index="start_time", columns="itemid", values="rate" + ) + # print(rate) + amount = df2.pivot_table( + index="start_time", columns="itemid", values="amount" + ) + df2 = df2.pivot_table( + index="start_time", columns="itemid", values="stop_time" + ) + # print(df2.shape) + add_indices = pd.Index(range(los)).difference(df2.index) + add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna( + np.nan + ) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.ffill() + df2 = df2.fillna(0) + + rate = pd.concat([rate, add_df]) + rate = rate.sort_index() + rate = rate.ffill() + rate = rate.fillna(-1) + + amount = pd.concat([amount, add_df]) + amount = amount.sort_index() + amount = amount.ffill() + amount = amount.fillna(-1) + # print(df2.head()) + df2.iloc[:, 0:] = df2.iloc[:, 0:].sub(df2.index, 0) + df2[df2 > 0] = 1 + df2[df2 < 0] = 0 + rate.iloc[:, 0:] = df2.iloc[:, 0:] * rate.iloc[:, 0:] + amount.iloc[:, 0:] = df2.iloc[:, 0:] * amount.iloc[:, 0:] + # print(df2.head()) + dataDic[hid]["Med"]["signal"] = df2.iloc[:, 0:].to_dict(orient="list") + dataDic[hid]["Med"]["rate"] = rate.iloc[:, 0:].to_dict(orient="list") + dataDic[hid]["Med"]["amount"] = amount.iloc[:, 0:].to_dict( + orient="list" + ) + + feat_df = pd.DataFrame(columns=list(set(feat) - set(amount.columns))) + # 
print(feat) + # print(amount.columns) + # print(amount.head()) + amount = pd.concat([amount, feat_df], axis=1) + + amount = amount[feat] + amount = amount.fillna(0) + # print(amount.columns) + amount.columns = pd.MultiIndex.from_product([["MEDS"], amount.columns]) + + if dyn_csv.empty: + dyn_csv = amount + else: + dyn_csv = pd.concat([dyn_csv, amount], axis=1) + + ###PROCS + if self.feat_proc: + feat = proc["itemid"].unique() + df2 = proc[proc["stay_id"] == hid] + if df2.shape[0] == 0: + df2 = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + df2 = df2.fillna(0) + df2.columns = pd.MultiIndex.from_product([["PROC"], df2.columns]) + else: + df2["val"] = 1 + # print(df2) + df2 = df2.pivot_table( + index="start_time", columns="itemid", values="val" + ) + # print(df2.shape) + add_indices = pd.Index(range(los)).difference(df2.index) + add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna( + np.nan + ) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.fillna(0) + df2[df2 > 0] = 1 + # print(df2.head()) + dataDic[hid]["Proc"] = df2.to_dict(orient="list") + + feat_df = pd.DataFrame(columns=list(set(feat) - set(df2.columns))) + df2 = pd.concat([df2, feat_df], axis=1) + + df2 = df2[feat] + df2 = df2.fillna(0) + df2.columns = pd.MultiIndex.from_product([["PROC"], df2.columns]) + + if dyn_csv.empty: + dyn_csv = df2 + else: + dyn_csv = pd.concat([dyn_csv, df2], axis=1) + + ###OUT + if self.feat_out: + feat = out["itemid"].unique() + df2 = out[out["stay_id"] == hid] + if df2.shape[0] == 0: + df2 = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + df2 = df2.fillna(0) + df2.columns = pd.MultiIndex.from_product([["OUT"], df2.columns]) + else: + df2["val"] = 1 + df2 = df2.pivot_table( + index="start_time", columns="itemid", values="val" + ) + # print(df2.shape) + add_indices = pd.Index(range(los)).difference(df2.index) + add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna( + np.nan + ) + df2 = pd.concat([df2, add_df]) + 
df2 = df2.sort_index() + df2 = df2.fillna(0) + df2[df2 > 0] = 1 + # print(df2.head()) + dataDic[hid]["Out"] = df2.to_dict(orient="list") + + feat_df = pd.DataFrame(columns=list(set(feat) - set(df2.columns))) + df2 = pd.concat([df2, feat_df], axis=1) + + df2 = df2[feat] + df2 = df2.fillna(0) + df2.columns = pd.MultiIndex.from_product([["OUT"], df2.columns]) + + if dyn_csv.empty: + dyn_csv = df2 + else: + dyn_csv = pd.concat([dyn_csv, df2], axis=1) + + ###CHART + if self.feat_chart: + feat = chart["itemid"].unique() + df2 = chart[chart["stay_id"] == hid] + if df2.shape[0] == 0: + val = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + val = val.fillna(0) + val.columns = pd.MultiIndex.from_product([["CHART"], val.columns]) + else: + val = df2.pivot_table( + index="start_time", columns="itemid", values="valuenum" + ) + df2["val"] = 1 + df2 = df2.pivot_table( + index="start_time", columns="itemid", values="val" + ) + # print(df2.shape) + add_indices = pd.Index(range(los)).difference(df2.index) + add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna( + np.nan + ) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.fillna(0) + + val = pd.concat([val, add_df]) + val = val.sort_index() + if self.impute == "Mean": + val = val.ffill() + val = val.bfill() + val = val.fillna(val.mean()) + elif self.impute == "Median": + val = val.ffill() + val = val.bfill() + val = val.fillna(val.median()) + val = val.fillna(0) + + df2[df2 > 0] = 1 + df2[df2 < 0] = 0 + # print(df2.head()) + dataDic[hid]["Chart"]["signal"] = df2.iloc[:, 0:].to_dict(orient="list") + dataDic[hid]["Chart"]["val"] = val.iloc[:, 0:].to_dict(orient="list") + + feat_df = pd.DataFrame(columns=list(set(feat) - set(val.columns))) + val = pd.concat([val, feat_df], axis=1) + + val = val[feat] + val = val.fillna(0) + val.columns = pd.MultiIndex.from_product([["CHART"], val.columns]) + + if dyn_csv.empty: + dyn_csv = val + else: + dyn_csv = pd.concat([dyn_csv, val], axis=1) + + # Save 
temporal data to csv + dyn_csv.to_csv("./data/csv/" + str(hid) + "/dynamic.csv", index=False) + + ##########COND######### + if self.feat_cond: + feat = self.cond["new_icd_code"].unique() + grp = self.cond[self.cond["stay_id"] == hid] + if grp.shape[0] == 0: + dataDic[hid]["Cond"] = {"fids": list([""])} + feat_df = pd.DataFrame(np.zeros([1, len(feat)]), columns=feat) + grp = feat_df.fillna(0) + grp.columns = pd.MultiIndex.from_product([["COND"], grp.columns]) + else: + dataDic[hid]["Cond"] = {"fids": list(grp["new_icd_code"])} + grp["val"] = 1 + grp = grp.drop_duplicates() + grp = grp.pivot( + index="stay_id", columns="new_icd_code", values="val" + ).reset_index(drop=True) + feat_df = pd.DataFrame(columns=list(set(feat) - set(grp.columns))) + grp = pd.concat([grp, feat_df], axis=1) + grp = grp.fillna(0) + grp = grp[feat] + grp.columns = pd.MultiIndex.from_product([["COND"], grp.columns]) + grp.to_csv("./data/csv/" + str(hid) + "/static.csv", index=False) + labels_csv.to_csv("./data/csv/labels.csv", index=False) + + ######SAVE DICTIONARIES############## + metaDic = {"Cond": {}, "Proc": {}, "Med": {}, "Out": {}, "Chart": {}, "LOS": {}} + metaDic["LOS"] = los + with open("./data/dict/dataDic", "wb") as fp: + pickle.dump(dataDic, fp) + + with open("./data/dict/hadmDic", "wb") as fp: + pickle.dump(self.hids, fp) + + with open("./data/dict/ethVocab", "wb") as fp: + pickle.dump(list(self.data["ethnicity"].unique()), fp) + self.eth_vocab = self.data["ethnicity"].nunique() + + with open("./data/dict/ageVocab", "wb") as fp: + pickle.dump(list(self.data["Age"].unique()), fp) + self.age_vocab = self.data["Age"].nunique() + + with open("./data/dict/insVocab", "wb") as fp: + pickle.dump(list(self.data["insurance"].unique()), fp) + self.ins_vocab = self.data["insurance"].nunique() + + if self.feat_med: + with open("./data/dict/medVocab", "wb") as fp: + pickle.dump(list(meds["itemid"].unique()), fp) + self.med_vocab = meds["itemid"].nunique() + metaDic["Med"] = self.med_per_adm + + 
if self.feat_out: + with open("./data/dict/outVocab", "wb") as fp: + pickle.dump(list(out["itemid"].unique()), fp) + self.out_vocab = out["itemid"].nunique() + metaDic["Out"] = self.out_per_adm + + if self.feat_chart: + with open("./data/dict/chartVocab", "wb") as fp: + pickle.dump(list(chart["itemid"].unique()), fp) + self.chart_vocab = chart["itemid"].nunique() + metaDic["Chart"] = self.chart_per_adm + + if self.feat_cond: + with open("./data/dict/condVocab", "wb") as fp: + pickle.dump(list(self.cond["new_icd_code"].unique()), fp) + self.cond_vocab = self.cond["new_icd_code"].nunique() + metaDic["Cond"] = self.cond_per_adm + + if self.feat_proc: + with open("./data/dict/procVocab", "wb") as fp: + pickle.dump(list(proc["itemid"].unique()), fp) + self.proc_vocab = proc["itemid"].nunique() + metaDic["Proc"] = self.proc_per_adm + + with open("./data/dict/metaDic", "wb") as fp: + pickle.dump(metaDic, fp) diff --git a/pipeline/temp_no_icu b/pipeline/temp_no_icu new file mode 100644 index 0000000000..b4a6c8a9bf --- /dev/null +++ b/pipeline/temp_no_icu @@ -0,0 +1,249 @@ +def create_Dict(self, meds, proc, labs, los): + print("[ CREATING DATA DICTIONARIES ]") + dataDic = {} + labels_csv = pd.DataFrame(columns=["hadm_id", "label"]) + labels_csv["hadm_id"] = pd.Series(self.hids) + labels_csv["label"] = 0 + for hid in self.hids: + grp = self.data[self.data["hadm_id"] == hid] + # print(grp.head()) + # print(grp['gender']) + # print(int(grp['Age'])) + # print(grp['ethnicity'].iloc[0]) + dataDic[hid] = { + "Cond": {}, + "Proc": {}, + "Med": {}, + "Lab": {}, + "ethnicity": grp["ethnicity"].iloc[0], + "age": int(grp["Age"]), + "gender": grp["gender"].iloc[0], + "label": int(grp["label"]), + } + labels_csv.loc[labels_csv["hadm_id"] == hid, "label"] = int(grp["label"]) + for hid in tqdm(self.hids): + grp = self.data[self.data["hadm_id"] == hid] + demo_csv = grp[["Age", "gender", "ethnicity", "insurance"]] + if not os.path.exists("./data/csv/" + str(hid)): + os.makedirs("./data/csv/" 
+ str(hid)) + demo_csv.to_csv("./data/csv/" + str(hid) + "/demo.csv", index=False) + + dyn_csv = pd.DataFrame() + ###MEDS + if self.feat_med: + feat = meds["drug_name"].unique() + df2 = meds[meds["hadm_id"] == hid] + if df2.shape[0] == 0: + val = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + val = val.fillna(0) + val.columns = pd.MultiIndex.from_product([["MEDS"], val.columns]) + else: + val = df2.pivot_table( + index="start_time", columns="drug_name", values="dose_val_rx" + ) + df2 = df2.pivot_table( + index="start_time", columns="drug_name", values="stop_time" + ) + # print(df2.shape) + add_indices = pd.Index(range(los)).difference(df2.index) + add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna( + np.nan + ) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.ffill() + df2 = df2.fillna(0) + + val = pd.concat([val, add_df]) + val = val.sort_index() + val = val.ffill() + val = val.fillna(-1) + # print(df2.head()) + df2.iloc[:, 0:] = df2.iloc[:, 0:].sub(df2.index, 0) + df2[df2 > 0] = 1 + df2[df2 < 0] = 0 + val.iloc[:, 0:] = df2.iloc[:, 0:] * val.iloc[:, 0:] + # print(df2.head()) + dataDic[hid]["Med"]["signal"] = df2.iloc[:, 0:].to_dict(orient="list") + dataDic[hid]["Med"]["val"] = val.iloc[:, 0:].to_dict(orient="list") + + feat_df = pd.DataFrame(columns=list(set(feat) - set(val.columns))) + + val = pd.concat([val, feat_df], axis=1) + + val = val[feat] + val = val.fillna(0) + + val.columns = pd.MultiIndex.from_product([["MEDS"], val.columns]) + if dyn_csv.empty: + dyn_csv = val + else: + dyn_csv = pd.concat([dyn_csv, val], axis=1) + + ###PROCS + if self.feat_proc: + feat = proc["icd_code"].unique() + df2 = proc[proc["hadm_id"] == hid] + if df2.shape[0] == 0: + df2 = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + df2 = df2.fillna(0) + df2.columns = pd.MultiIndex.from_product([["PROC"], df2.columns]) + else: + df2["val"] = 1 + df2 = df2.pivot_table( + index="start_time", columns="icd_code", values="val" + ) + # 
print(df2.shape) + add_indices = pd.Index(range(los)).difference(df2.index) + add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna( + np.nan + ) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.fillna(0) + df2[df2 > 0] = 1 + # print(df2.head()) + dataDic[hid]["Proc"] = df2.to_dict(orient="list") + + feat_df = pd.DataFrame(columns=list(set(feat) - set(df2.columns))) + df2 = pd.concat([df2, feat_df], axis=1) + + df2 = df2[feat] + df2 = df2.fillna(0) + df2.columns = pd.MultiIndex.from_product([["PROC"], df2.columns]) + + if dyn_csv.empty: + dyn_csv = df2 + else: + dyn_csv = pd.concat([dyn_csv, df2], axis=1) + + ###LABS + if self.feat_lab: + feat = labs["itemid"].unique() + df2 = labs[labs["hadm_id"] == hid] + if df2.shape[0] == 0: + val = pd.DataFrame(np.zeros([los, len(feat)]), columns=feat) + val = val.fillna(0) + val.columns = pd.MultiIndex.from_product([["LAB"], val.columns]) + else: + val = df2.pivot_table( + index="start_time", columns="itemid", values="valuenum" + ) + df2["val"] = 1 + df2 = df2.pivot_table( + index="start_time", columns="itemid", values="val" + ) + # print(df2.shape) + add_indices = pd.Index(range(los)).difference(df2.index) + add_df = pd.DataFrame(index=add_indices, columns=df2.columns).fillna( + np.nan + ) + df2 = pd.concat([df2, add_df]) + df2 = df2.sort_index() + df2 = df2.fillna(0) + + val = pd.concat([val, add_df]) + val = val.sort_index() + if self.impute == "Mean": + val = val.ffill() + val = val.bfill() + val = val.fillna(val.mean()) + elif self.impute == "Median": + val = val.ffill() + val = val.bfill() + val = val.fillna(val.median()) + val = val.fillna(0) + + df2[df2 > 0] = 1 + df2[df2 < 0] = 0 + + # print(df2.head()) + dataDic[hid]["Lab"]["signal"] = df2.iloc[:, 0:].to_dict(orient="list") + dataDic[hid]["Lab"]["val"] = val.iloc[:, 0:].to_dict(orient="list") + + feat_df = pd.DataFrame(columns=list(set(feat) - set(val.columns))) + val = pd.concat([val, feat_df], axis=1) + + val = val[feat] + val = 
val.fillna(0) + val.columns = pd.MultiIndex.from_product([["LAB"], val.columns]) + + if dyn_csv.empty: + dyn_csv = val + else: + dyn_csv = pd.concat([dyn_csv, val], axis=1) + + # Save temporal data to csv + dyn_csv.to_csv("./data/csv/" + str(hid) + "/dynamic.csv", index=False) + + ##########COND######### + if self.feat_cond: + feat = self.cond["new_icd_code"].unique() + grp = self.cond[self.cond["hadm_id"] == hid] + if grp.shape[0] == 0: + dataDic[hid]["Cond"] = {"fids": list([""])} + feat_df = pd.DataFrame(np.zeros([1, len(feat)]), columns=feat) + grp = feat_df.fillna(0) + grp.columns = pd.MultiIndex.from_product([["COND"], grp.columns]) + else: + dataDic[hid]["Cond"] = {"fids": list(grp["new_icd_code"])} + grp["val"] = 1 + grp = grp.drop_duplicates() + grp = grp.pivot( + index="hadm_id", columns="new_icd_code", values="val" + ).reset_index(drop=True) + feat_df = pd.DataFrame(columns=list(set(feat) - set(grp.columns))) + grp = pd.concat([grp, feat_df], axis=1) + grp = grp.fillna(0) + grp = grp[feat] + grp.columns = pd.MultiIndex.from_product([["COND"], grp.columns]) + grp.to_csv("./data/csv/" + str(hid) + "/static.csv", index=False) + labels_csv.to_csv("./data/csv/labels.csv", index=False) + + ######SAVE DICTIONARIES############## + metaDic = {"Cond": {}, "Proc": {}, "Med": {}, "Lab": {}, "LOS": {}} + metaDic["LOS"] = los + with open("./data/dict/dataDic", "wb") as fp: + pickle.dump(dataDic, fp) + + with open("./data/dict/hadmDic", "wb") as fp: + pickle.dump(self.hids, fp) + + with open("./data/dict/ethVocab", "wb") as fp: + pickle.dump(list(self.data["ethnicity"].unique()), fp) + self.eth_vocab = self.data["ethnicity"].nunique() + + with open("./data/dict/ageVocab", "wb") as fp: + pickle.dump(list(self.data["Age"].unique()), fp) + self.age_vocab = self.data["Age"].nunique() + + with open("./data/dict/insVocab", "wb") as fp: + pickle.dump(list(self.data["insurance"].unique()), fp) + self.ins_vocab = self.data["insurance"].nunique() + + if self.feat_med: + with 
open("./data/dict/medVocab", "wb") as fp: + pickle.dump(list(meds["drug_name"].unique()), fp) + self.med_vocab = meds["drug_name"].nunique() + metaDic["Med"] = self.med_per_adm + + if self.feat_cond: + with open("./data/dict/condVocab", "wb") as fp: + pickle.dump(list(self.cond["new_icd_code"].unique()), fp) + self.cond_vocab = self.cond["new_icd_code"].nunique() + metaDic["Cond"] = self.cond_per_adm + + if self.feat_proc: + with open("./data/dict/procVocab", "wb") as fp: + pickle.dump(list(proc["icd_code"].unique()), fp) + self.proc_vocab = proc["icd_code"].unique() + metaDic["Proc"] = self.proc_per_adm + + if self.feat_lab: + with open("./data/dict/labsVocab", "wb") as fp: + pickle.dump(list(labs["itemid"].unique()), fp) + self.lab_vocab = labs["itemid"].unique() + metaDic["Lab"] = self.labs_per_adm + + with open("./data/dict/metaDic", "wb") as fp: + pickle.dump(metaDic, fp) diff --git a/preprocess_outcomes.py b/preprocess_outcomes.py index 412995dc60..a09fd5fdbe 100644 --- a/preprocess_outcomes.py +++ b/preprocess_outcomes.py @@ -1,4 +1,4 @@ -''' +""" Lrasmy@Zhilab Jan 2021 # This script processes originally extracted data on a distributed platform @@ -23,10 +23,11 @@ # .pts: List of unique Patient ids. Created for validation and comparison purposes # .types: Python dictionary that maps string diagnosis codes to integer diagnosis codes. 
# Main output files for the baseline RNN models are .combined -''' +""" import sys from optparse import OptionParser + try: import cPickle as pickle except: @@ -37,247 +38,293 @@ from datetime import datetime as dt from datetime import timedelta import glob -#import timeit ( for time tracking if required) - - -def load_data( dataFile, labelFile , typeFile , dist=False, exclude=[]): - ## loading Case - print('loading data') - - if dist: - all_files1 = glob.glob(dataFile + "/*.csv") - li1 = [] - for filename in all_files1: - df = pd.read_csv(filename) - li1.append(df) - data_dat = pd.concat(li1).drop_duplicates() - else: data_dat=pd.read_table(dataFile) - - data_dat.columns = ["Pt_id", "ICD", "Time"] - - if len(exclude)>0: - data_dat=data_dat[~(data_dat["ICD"].str.startswith(tuple(exclude)))] - - print('loaded data for: ',data_dat["Pt_id"].nunique()) - print('loading labels') - - if dist: - all_files = glob.glob(labelFile + "/*.csv") - li = [] - for filename in all_files: - df = pd.read_csv(filename) - li.append(df) - - data_lbl_v1 = pd.concat(li).drop_duplicates() - else: data_lbl_v1=pd.read_table(labelFile) - - data_lbl_v1.columns = ["Pt_id", "mort_label","LOS"]#,"vent_label","time_to_intub","Readmission_label","plos_label"] - data_lbl=pd.merge(data_dat["Pt_id"].drop_duplicates(),data_lbl_v1, how='inner').drop_duplicates() - print('loaded labels for: ',data_lbl_v1["Pt_id"].nunique() , ' after primary cleaning ',data_lbl["Pt_id"].nunique()) - print('Mortality Case counts: ',data_lbl[data_lbl["mort_label"]==1]["Pt_id"].nunique()) - #print('Intubation Case counts: ',data_lbl[data_lbl["vent_label"]==1]["Pt_id"].nunique()) - #print('Intubation Case with tti >=1 : ',data_lbl[(data_lbl["vent_label"]==1)& (data_lbl["time_to_intub"]>=1)]["Pt_id"].nunique()) - print('LOS>7 : ',data_lbl[data_lbl["LOS"]>7]["Pt_id"].nunique()) - #print('pLOS>7 : ',data_lbl[data_lbl["plos_label"]==1]["Pt_id"].nunique()) - #print('Readmission case counts : 
',data_lbl[data_lbl["Readmission_label"]==1]["Pt_id"].nunique()) - - ### An example of sampling code: Control Sampling - #print('pt sampling') - #data_sk=data_dat["Pt_id"] - #data_sk=data_sk.drop_duplicates() - #data_sk_samp=data_sk.sample(n=samplesize_pts) ## that is an input arg 7 - #data_dat=data_dat[data_dat["Pt_id"].isin(data_sk_samp.values.tolist())] - #data_lbl=data_lbl[data_lbl["Pt_id"].isin(data_sk_samp.values.tolist())] - - - - ## loading the types - - if typeFile=='NA': - types={"zero_pad":0} - print('new types dictionary') - else: - with open(typeFile, 'rb') as t2: - types=pickle.load(t2) - print('types dictionary loaded') - #end_time = timeit.timeit() - #print ("consumed time for data loading",(_start -end_time)/1000.0 ) - return data_dat, data_lbl, types - - -def pickle_data (data_dat, data_lbl, types, reverse=True): - - full_list=[] + +# import timeit ( for time tracking if required) + + +def load_data(dataFile, labelFile, typeFile, dist=False, exclude=[]): + ## loading Case + print("loading data") + + if dist: + all_files1 = glob.glob(dataFile + "/*.csv") + li1 = [] + for filename in all_files1: + df = pd.read_csv(filename) + li1.append(df) + data_dat = pd.concat(li1).drop_duplicates() + else: + data_dat = pd.read_table(dataFile) + + data_dat.columns = ["Pt_id", "ICD", "Time"] + + if len(exclude) > 0: + data_dat = data_dat[~(data_dat["ICD"].str.startswith(tuple(exclude)))] + + print("loaded data for: ", data_dat["Pt_id"].nunique()) + print("loading labels") + + if dist: + all_files = glob.glob(labelFile + "/*.csv") + li = [] + for filename in all_files: + df = pd.read_csv(filename) + li.append(df) + + data_lbl_v1 = pd.concat(li).drop_duplicates() + else: + data_lbl_v1 = pd.read_table(labelFile) + + data_lbl_v1.columns = [ + "Pt_id", + "mort_label", + "LOS", + ] # ,"vent_label","time_to_intub","Readmission_label","plos_label"] + data_lbl = pd.merge( + data_dat["Pt_id"].drop_duplicates(), data_lbl_v1, how="inner" + ).drop_duplicates() + print( + 
"loaded labels for: ", + data_lbl_v1["Pt_id"].nunique(), + " after primary cleaning ", + data_lbl["Pt_id"].nunique(), + ) + print( + "Mortality Case counts: ", + data_lbl[data_lbl["mort_label"] == 1]["Pt_id"].nunique(), + ) + # print('Intubation Case counts: ',data_lbl[data_lbl["vent_label"]==1]["Pt_id"].nunique()) + # print('Intubation Case with tti >=1 : ',data_lbl[(data_lbl["vent_label"]==1)& (data_lbl["time_to_intub"]>=1)]["Pt_id"].nunique()) + print("LOS>7 : ", data_lbl[data_lbl["LOS"] > 7]["Pt_id"].nunique()) + # print('pLOS>7 : ',data_lbl[data_lbl["plos_label"]==1]["Pt_id"].nunique()) + # print('Readmission case counts : ',data_lbl[data_lbl["Readmission_label"]==1]["Pt_id"].nunique()) + + ### An example of sampling code: Control Sampling + # print('pt sampling') + # data_sk=data_dat["Pt_id"] + # data_sk=data_sk.drop_duplicates() + # data_sk_samp=data_sk.sample(n=samplesize_pts) ## that is an input arg 7 + # data_dat=data_dat[data_dat["Pt_id"].isin(data_sk_samp.values.tolist())] + # data_lbl=data_lbl[data_lbl["Pt_id"].isin(data_sk_samp.values.tolist())] + + ## loading the types + + if typeFile == "NA": + types = {"zero_pad": 0} + print("new types dictionary") + else: + with open(typeFile, "rb") as t2: + types = pickle.load(t2) + print("types dictionary loaded") + # end_time = timeit.timeit() + # print ("consumed time for data loading",(_start -end_time)/1000.0 ) + return data_dat, data_lbl, types + + +def pickle_data(data_dat, data_lbl, types, reverse=True): + full_list = [] index_date = {} time_list = [] - dates_list =[] + dates_list = [] label_list = [] pt_list = [] - dur_list=[] + dur_list = [] newVisit_list = [] - count=0 - - for Pt, group in data_dat.groupby('Pt_id'): - data_i_c = [] - data_dt_c = [] - for Time, subgroup in group.sort_values(['Time'], ascending= not reverse).groupby('Time', sort=False): ### ascending=True normal order ascending=False reveresed order - data_i_c.append(np.array(subgroup['ICD']).tolist())# get ICD codes for each admission 
separately - data_dt_c.append(dt.strptime(Time, '%Y-%m-%d'))#concat dischargetime of each admission - if len(data_i_c) > 0: - # creating the duration in days between visits list, first visit marked with 0 (last in reversed order) - v_dur_c=[] - if len(data_dt_c)<=1: - v_dur_c=[0] - else: - for jx in range (len(data_dt_c)): - if jx==0: - v_dur_c.append(jx) - else: - #xx = ((dt.strptime(data_dt_c[jx-1], '%d-%b-%y'))-(dt.strptime(data_dt_c[jx], '%d-%b-%y'))).days ## use if original data have time information or different date format - if reverse: xx = (data_dt_c[jx-1] - data_dt_c[jx]).days ## reversed order - else: xx = (data_dt_c[jx]- data_dt_c[jx-1]).days ### normal order - v_dur_c.append(xx) - #print(data_i_c) - #print(data_dt_c) - #print(v_dur_c) - #print(types) - ### Diagnosis recoding - newPatient_c = [] - for visit in data_i_c: - newVisit_c = [] - for code in visit: - if code in types: newVisit_c.append(types[code]) - else: - types[code] = max(types.values())+1 - newVisit_c.append(types[code]) - newPatient_c.append(newVisit_c) - #print(newPatient_c) - - if len(data_i_c) > 0: ## only save non-empty entries - label_list.append(data_lbl.loc[data_lbl.Pt_id == Pt, ['mort_label','LOS']#,'vent_label','time_to_intub','Readmission_label','plos_label'] - ].values.squeeze().tolist()) #### LR ammended for multilabel - pt_list.append(Pt) - newVisit_list.append(newPatient_c) - dur_list.append(v_dur_c) - print(label_list) - print(pt_list) - print(dur_list) - print(newVisit_list) - count=count+1 - if count % 1000 == 0: print ('processed %d pts' % count) - return types,pt_list,label_list,newVisit_list,dur_list - -def reparsing(pt_list,label_list,newVisit_list,dur_list): - ### Create the combined list for the Pytorch RNN - fset=[] - print ('Reparsing') + count = 0 + + for Pt, group in data_dat.groupby("Pt_id"): + data_i_c = [] + data_dt_c = [] + for Time, subgroup in group.sort_values( + ["Time"], ascending=not reverse + ).groupby( + "Time", sort=False + ): ### ascending=True 
normal order ascending=False reveresed order + data_i_c.append( + np.array(subgroup["ICD"]).tolist() + ) # get ICD codes for each admission separately + data_dt_c.append( + dt.strptime(Time, "%Y-%m-%d") + ) # concat dischargetime of each admission + if len(data_i_c) > 0: + # creating the duration in days between visits list, first visit marked with 0 (last in reversed order) + v_dur_c = [] + if len(data_dt_c) <= 1: + v_dur_c = [0] + else: + for jx in range(len(data_dt_c)): + if jx == 0: + v_dur_c.append(jx) + else: + # xx = ((dt.strptime(data_dt_c[jx-1], '%d-%b-%y'))-(dt.strptime(data_dt_c[jx], '%d-%b-%y'))).days ## use if original data have time information or different date format + if reverse: + xx = (data_dt_c[jx - 1] - data_dt_c[jx]).days ## reversed order + else: + xx = (data_dt_c[jx] - data_dt_c[jx - 1]).days ### normal order + v_dur_c.append(xx) + # print(data_i_c) + # print(data_dt_c) + # print(v_dur_c) + # print(types) + ### Diagnosis recoding + newPatient_c = [] + for visit in data_i_c: + newVisit_c = [] + for code in visit: + if code in types: + newVisit_c.append(types[code]) + else: + types[code] = max(types.values()) + 1 + newVisit_c.append(types[code]) + newPatient_c.append(newVisit_c) + # print(newPatient_c) + + if len(data_i_c) > 0: ## only save non-empty entries + label_list.append( + data_lbl.loc[ + data_lbl.Pt_id == Pt, + [ + "mort_label", + "LOS", + ], # ,'vent_label','time_to_intub','Readmission_label','plos_label'] + ] + .values.squeeze() + .tolist() + ) #### LR ammended for multilabel + pt_list.append(Pt) + newVisit_list.append(newPatient_c) + dur_list.append(v_dur_c) + print(label_list) + print(pt_list) + print(dur_list) + print(newVisit_list) + count = count + 1 + if count % 1000 == 0: + print("processed %d pts" % count) + return types, pt_list, label_list, newVisit_list, dur_list + + +def reparsing(pt_list, label_list, newVisit_list, dur_list): + ### Create the combined list for the Pytorch RNN + fset = [] + print("Reparsing") for pt_idx 
in range(len(pt_list)): - pt_sk= pt_list[pt_idx] - pt_lbl= label_list[pt_idx] - pt_vis= newVisit_list[pt_idx] - pt_td= dur_list[pt_idx] - d_gr=[] - n_seq=[] - d_a_v=[] - for v in range(len(pt_vis)): - nv=[] - nv.append([pt_td[v]]) - nv.append(pt_vis[v]) - n_seq.append(nv) - n_pt= [pt_sk,pt_lbl,n_seq] - print("n_pt",n_pt) - fset.append(n_pt) + pt_sk = pt_list[pt_idx] + pt_lbl = label_list[pt_idx] + pt_vis = newVisit_list[pt_idx] + pt_td = dur_list[pt_idx] + d_gr = [] + n_seq = [] + d_a_v = [] + for v in range(len(pt_vis)): + nv = [] + nv.append([pt_td[v]]) + nv.append(pt_vis[v]) + n_seq.append(nv) + n_pt = [pt_sk, pt_lbl, n_seq] + print("n_pt", n_pt) + fset.append(n_pt) return fset -def split_data(fset, pt_list, pts_file_pre,outFile): - +def split_data(fset, pt_list, pts_file_pre, outFile): ### Random split to train ,test and validation sets - print ("Splitting") + print("Splitting") - if pts_file_pre=='NA': - print('random split') + if pts_file_pre == "NA": + print("random split") dataSize = len(pt_list) - #np.random.seed(0) + # np.random.seed(0) ind = np.random.permutation(dataSize) nTest = int(0.2 * dataSize) nValid = int(0.1 * dataSize) test_indices = ind[:nTest] - valid_indices = ind[nTest:nTest+nValid] - train_indices = ind[nTest+nValid:] + valid_indices = ind[nTest : nTest + nValid] + train_indices = ind[nTest + nValid :] else: - print ('loading previous splits') - pt_train=pickle.load(open(pts_file_pre+'.train', 'rb')) - pt_valid=pickle.load(open(pts_file_pre+'.valid', 'rb')) - pt_test=pickle.load(open(pts_file_pre+'.test', 'rb')) - test_indices = np.intersect1d(pt_list, pt_test,assume_unique=True, return_indices=True)[1] - valid_indices= np.intersect1d(pt_list, pt_valid,assume_unique=True, return_indices=True)[1] - train_indices= np.intersect1d(pt_list, pt_train,assume_unique=True, return_indices=True)[1] - - for subset in ['train','valid','test']: - if subset =='train': + print("loading previous splits") + pt_train = pickle.load(open(pts_file_pre + 
".train", "rb")) + pt_valid = pickle.load(open(pts_file_pre + ".valid", "rb")) + pt_test = pickle.load(open(pts_file_pre + ".test", "rb")) + test_indices = np.intersect1d( + pt_list, pt_test, assume_unique=True, return_indices=True + )[1] + valid_indices = np.intersect1d( + pt_list, pt_valid, assume_unique=True, return_indices=True + )[1] + train_indices = np.intersect1d( + pt_list, pt_train, assume_unique=True, return_indices=True + )[1] + + for subset in ["train", "valid", "test"]: + if subset == "train": indices = train_indices - elif subset =='valid': + elif subset == "valid": indices = valid_indices - elif subset =='test': + elif subset == "test": indices = test_indices - else: - print ('error') + else: + print("error") break - + #### below comments are mainly because I'm no longer need those theano RETAIN needed data, so comment for now #### only using Pts file , so keeping them for now - - #subset_x = [newVisit_list[i] for i in indices] - #subset_y = [label_list[i] for i in indices] - #subset_t = [dur_list[i] for i in indices] - subset_p = [pt_list[i] for i in indices] - #nseqfile = outFile +'.visits.'+subset - #nlabfile = outFile +'.labels.'+subset - #ntimefile = outFile +'.days.'+subset - nptfile = outFile +'.pts.'+subset - #pickle.dump(subset_x, open(nseqfile, 'wb'),protocol=2) - #pickle.dump(subset_y, open(nlabfile, 'wb'),protocol=2) - #pickle.dump(subset_t, open(ntimefile, 'wb'),protocol=2) - pickle.dump(subset_p, open(nptfile, 'wb'),protocol=2) - - subset_full= [fset[i] for i in indices] - ncombfile = outFile +'.combined.'+subset - pickle.dump(subset_full, open(ncombfile, 'wb'), -1) - -def dump_split_process_data(dataFile, labelFile , typeFile ,outFile , pts_file_pre , dist=False, exclude=[],reverse=True): - - data_dat, data_lbl, types = load_data( dataFile, labelFile , typeFile , dist=dist, exclude=exclude) - types, pt_list , label_list,newVisit_list,dur_list = pickle_data (data_dat, data_lbl, types, reverse=reverse) - fset= reparsing(pt_list , 
label_list , newVisit_list , dur_list) - split_data(fset, pt_list , pts_file_pre,outFile) - pickle.dump(types, open(outFile+'.types', 'wb'), -1) - - ### Creating the full pickled lists ### uncomment if you need to dump the all data before splitting - #pickle.dump(label_list, open(outFile+'.labels', 'wb'), -1) - #pickle.dump(newVisit_list, open(outFile+'.visits', 'wb'), -1) - #pickle.dump(pt_list, open(outFile+'.pts', 'wb'), -1) - #pickle.dump(dur_list, open(outFile+'.days', 'wb'), -1) - - -if __name__ == '__main__': - - dataFile= sys.argv[1] - labelFile= sys.argv[2] - typeFile= sys.argv[3] - outFile = sys.argv[4] - pts_file_pre = sys.argv[5] - #cls_type= sys.argv[6] - #samplesize_pts = int(sys.argv[7]) - parser = OptionParser() - (options, args) = parser.parse_args() - dump_split_process_data(dataFile, labelFile , typeFile ,outFile , pts_file_pre , dist=False, exclude=[]) - + # subset_x = [newVisit_list[i] for i in indices] + # subset_y = [label_list[i] for i in indices] + # subset_t = [dur_list[i] for i in indices] + subset_p = [pt_list[i] for i in indices] + # nseqfile = outFile +'.visits.'+subset + # nlabfile = outFile +'.labels.'+subset + # ntimefile = outFile +'.days.'+subset + nptfile = outFile + ".pts." + subset + # pickle.dump(subset_x, open(nseqfile, 'wb'),protocol=2) + # pickle.dump(subset_y, open(nlabfile, 'wb'),protocol=2) + # pickle.dump(subset_t, open(ntimefile, 'wb'),protocol=2) + pickle.dump(subset_p, open(nptfile, "wb"), protocol=2) + subset_full = [fset[i] for i in indices] + ncombfile = outFile + ".combined." 
+ subset + pickle.dump(subset_full, open(ncombfile, "wb"), -1) +def dump_split_process_data( + dataFile, + labelFile, + typeFile, + outFile, + pts_file_pre, + dist=False, + exclude=[], + reverse=True, +): + data_dat, data_lbl, types = load_data( + dataFile, labelFile, typeFile, dist=dist, exclude=exclude + ) + types, pt_list, label_list, newVisit_list, dur_list = pickle_data( + data_dat, data_lbl, types, reverse=reverse + ) + fset = reparsing(pt_list, label_list, newVisit_list, dur_list) + split_data(fset, pt_list, pts_file_pre, outFile) + pickle.dump(types, open(outFile + ".types", "wb"), -1) - + ### Creating the full pickled lists ### uncomment if you need to dump the all data before splitting + # pickle.dump(label_list, open(outFile+'.labels', 'wb'), -1) + # pickle.dump(newVisit_list, open(outFile+'.visits', 'wb'), -1) + # pickle.dump(pt_list, open(outFile+'.pts', 'wb'), -1) + # pickle.dump(dur_list, open(outFile+'.days', 'wb'), -1) +if __name__ == "__main__": + dataFile = sys.argv[1] + labelFile = sys.argv[2] + typeFile = sys.argv[3] + outFile = sys.argv[4] + pts_file_pre = sys.argv[5] + # cls_type= sys.argv[6] + # samplesize_pts = int(sys.argv[7]) + parser = OptionParser() + (options, args) = parser.parse_args() + dump_split_process_data( + dataFile, labelFile, typeFile, outFile, pts_file_pre, dist=False, exclude=[] + ) diff --git a/preprocessing/day_intervals_preproc/day_intervals_cohort_v2.py b/preprocessing/day_intervals_preproc/day_intervals_cohort_v2.py index 080cbf543b..8da71149e5 100644 --- a/preprocessing/day_intervals_preproc/day_intervals_cohort_v2.py +++ b/preprocessing/day_intervals_preproc/day_intervals_cohort_v2.py @@ -6,14 +6,31 @@ from pathlib import Path from tqdm import tqdm import importlib -import disease_cohort -importlib.reload(disease_cohort) -import disease_cohort -sys.path.append(os.path.dirname(os.path.abspath(__file__)) + './../..') + +import preprocessing.day_intervals_preproc.disease_cohort as disease_cohort + +# 
importlib.reload(disease_cohort) +# import disease_cohort + +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "./../..") if not os.path.exists("./data/cohort"): os.makedirs("./data/cohort") - -def get_visit_pts(mimic4_path:str, group_col:str, visit_col:str, admit_col:str, disch_col:str, adm_visit_col:str, use_mort:bool, use_los:bool, los:int, use_admn:bool, disease_label:str,use_ICU:bool): + + +def get_visit_pts( + mimic4_path: str, + group_col: str, + visit_col: str, + admit_col: str, + disch_col: str, + adm_visit_col: str, + use_mort: bool, + use_los: bool, + los: int, + use_admn: bool, + disease_label: str, + use_ICU: bool, +): """Combines the MIMIC-IV core/patients table information with either the icu/icustays or core/admissions data. Parameters: @@ -25,83 +42,186 @@ def get_visit_pts(mimic4_path:str, group_col:str, visit_col:str, admit_col:str, use_ICU: describes whether to speficially look at ICU visits in icu/icustays OR look at general admissions from core/admissions """ - visit = None # df containing visit information depending on using ICU or not + visit = None # df containing visit information depending on using ICU or not if use_ICU: - visit = pd.read_csv(mimic4_path + "icu/icustays.csv.gz", compression='gzip', header=0, index_col=None, parse_dates=[admit_col, disch_col]) + visit = pd.read_csv( + mimic4_path + "icu/icustays.csv.gz", + compression="gzip", + header=0, + index_col=None, + parse_dates=[admit_col, disch_col], + ) if use_admn: # icustays doesn't have a way to identify if patient died during visit; must # use core/patients to remove such stay_ids for readmission labels - pts = pd.read_csv(mimic4_path + "hosp/patients.csv.gz", compression='gzip', header=0, index_col=None, usecols=['subject_id', 'dod'], parse_dates=['dod']) - visit = visit.merge(pts, how='inner', left_on='subject_id', right_on='subject_id') + pts = pd.read_csv( + mimic4_path + "hosp/patients.csv.gz", + compression="gzip", + header=0, + index_col=None, + 
usecols=["subject_id", "dod"], + parse_dates=["dod"], + ) + visit = visit.merge( + pts, how="inner", left_on="subject_id", right_on="subject_id" + ) visit = visit.loc[(visit.dod.isna()) | (visit.dod >= visit[disch_col])] if len(disease_label): - hids=disease_cohort.extract_diag_cohort(visit['hadm_id'],disease_label,mimic4_path) - visit=visit[visit['hadm_id'].isin(hids['hadm_id'])] - print("[ READMISSION DUE TO "+disease_label+" ]") - + hids = disease_cohort.extract_diag_cohort( + visit["hadm_id"], disease_label, mimic4_path + ) + visit = visit[visit["hadm_id"].isin(hids["hadm_id"])] + print("[ READMISSION DUE TO " + disease_label + " ]") + else: - visit = pd.read_csv(mimic4_path + "hosp/admissions.csv.gz", compression='gzip', header=0, index_col=None, parse_dates=[admit_col, disch_col]) - visit['los']=visit[disch_col]-visit[admit_col] + visit = pd.read_csv( + mimic4_path + "hosp/admissions.csv.gz", + compression="gzip", + header=0, + index_col=None, + parse_dates=[admit_col, disch_col], + ) + visit["los"] = visit[disch_col] - visit[admit_col] visit[admit_col] = pd.to_datetime(visit[admit_col]) - visit[disch_col] = pd.to_datetime(visit[disch_col]) - visit['los']=pd.to_timedelta(visit[disch_col]-visit[admit_col],unit='h') - visit['los']=visit['los'].astype(str) - visit[['days', 'dummy','hours']] = visit['los'].str.split(' ', -1, expand=True) - visit['los']=pd.to_numeric(visit['days']) - visit=visit.drop(columns=['days', 'dummy','hours']) - - + visit[disch_col] = pd.to_datetime(visit[disch_col]) + visit["los"] = pd.to_timedelta(visit[disch_col] - visit[admit_col], unit="h") + visit["los"] = visit["los"].astype(str) + visit[["days", "dummy", "hours"]] = visit["los"].str.split(" ", expand=True) + visit["los"] = pd.to_numeric(visit["days"]) + visit = visit.drop(columns=["days", "dummy", "hours"]) + if use_admn: # remove hospitalizations with a death; impossible for readmission for such visits visit = visit.loc[visit.hospital_expire_flag == 0] if len(disease_label): - 
hids=disease_cohort.extract_diag_cohort(visit['hadm_id'],disease_label,mimic4_path) - visit=visit[visit['hadm_id'].isin(hids['hadm_id'])] - print("[ READMISSION DUE TO "+disease_label+" ]") + hids = disease_cohort.extract_diag_cohort( + visit["hadm_id"], disease_label, mimic4_path + ) + visit = visit[visit["hadm_id"].isin(hids["hadm_id"])] + print("[ READMISSION DUE TO " + disease_label + " ]") pts = pd.read_csv( - mimic4_path + "hosp/patients.csv.gz", compression='gzip', header=0, index_col = None, usecols=[group_col, 'anchor_year', 'anchor_age', 'anchor_year_group', 'dod','gender'] - ) - pts['yob']= pts['anchor_year'] - pts['anchor_age'] # get yob to ensure a given visit is from an adult - pts['min_valid_year'] = pts['anchor_year'] + (2019 - pts['anchor_year_group'].str.slice(start=-4).astype(int)) - + mimic4_path + "hosp/patients.csv.gz", + compression="gzip", + header=0, + index_col=None, + usecols=[ + group_col, + "anchor_year", + "anchor_age", + "anchor_year_group", + "dod", + "gender", + ], + ) + pts["yob"] = ( + pts["anchor_year"] - pts["anchor_age"] + ) # get yob to ensure a given visit is from an adult + pts["min_valid_year"] = pts["anchor_year"] + ( + 2019 - pts["anchor_year_group"].str.slice(start=-4).astype(int) + ) + # Define anchor_year corresponding to the anchor_year_group 2017-2019. 
This is later used to prevent consideration # of visits with prediction windows outside the dataset's time range (2008-2019) - #[[group_col, visit_col, admit_col, disch_col]] + # [[group_col, visit_col, admit_col, disch_col]] if use_ICU: - visit_pts = visit[[group_col, visit_col, adm_visit_col, admit_col, disch_col,'los']].merge( - pts[[group_col, 'anchor_year', 'anchor_age', 'yob', 'min_valid_year', 'dod','gender']], how='inner', left_on=group_col, right_on=group_col + visit_pts = visit[ + [group_col, visit_col, adm_visit_col, admit_col, disch_col, "los"] + ].merge( + pts[ + [ + group_col, + "anchor_year", + "anchor_age", + "yob", + "min_valid_year", + "dod", + "gender", + ] + ], + how="inner", + left_on=group_col, + right_on=group_col, ) else: - visit_pts = visit[[group_col, visit_col, admit_col, disch_col,'los']].merge( - pts[[group_col, 'anchor_year', 'anchor_age', 'yob', 'min_valid_year', 'dod','gender']], how='inner', left_on=group_col, right_on=group_col - ) + visit_pts = visit[[group_col, visit_col, admit_col, disch_col, "los"]].merge( + pts[ + [ + group_col, + "anchor_year", + "anchor_age", + "yob", + "min_valid_year", + "dod", + "gender", + ] + ], + how="inner", + left_on=group_col, + right_on=group_col, + ) # only take adult patients -# visit_pts['Age']=visit_pts[admit_col].dt.year - visit_pts['yob'] -# visit_pts = visit_pts.loc[visit_pts['Age'] >= 18] - visit_pts['Age']=visit_pts['anchor_age'] - visit_pts = visit_pts.loc[visit_pts['Age'] >= 18] - + # visit_pts['Age']=visit_pts[admit_col].dt.year - visit_pts['yob'] + # visit_pts = visit_pts.loc[visit_pts['Age'] >= 18] + visit_pts["Age"] = visit_pts["anchor_age"] + visit_pts = visit_pts.loc[visit_pts["Age"] >= 18] + ##Add Demo data - eth = pd.read_csv(mimic4_path + "hosp/admissions.csv.gz", compression='gzip', header=0, usecols=['hadm_id', 'insurance','race'], index_col=None) - visit_pts= visit_pts.merge(eth, how='inner', left_on='hadm_id', right_on='hadm_id') - + eth = pd.read_csv( + mimic4_path + 
"hosp/admissions.csv.gz", + compression="gzip", + header=0, + usecols=["hadm_id", "insurance", "race"], + index_col=None, + ) + visit_pts = visit_pts.merge(eth, how="inner", left_on="hadm_id", right_on="hadm_id") + if use_ICU: - return visit_pts[[group_col, visit_col, adm_visit_col, admit_col, disch_col,'los', 'min_valid_year', 'dod','Age','gender','race', 'insurance']] + return visit_pts[ + [ + group_col, + visit_col, + adm_visit_col, + admit_col, + disch_col, + "los", + "min_valid_year", + "dod", + "Age", + "gender", + "race", + "insurance", + ] + ] else: - return visit_pts.dropna(subset=['min_valid_year'])[[group_col, visit_col, admit_col, disch_col,'los', 'min_valid_year', 'dod','Age','gender','race', 'insurance']] + return visit_pts.dropna(subset=["min_valid_year"])[ + [ + group_col, + visit_col, + admit_col, + disch_col, + "los", + "min_valid_year", + "dod", + "Age", + "gender", + "race", + "insurance", + ] + ] def validate_row(row, ctrl, invalid, max_year, disch_col, valid_col, gap): """Checks if visit's prediction window potentially extends beyond the dataset range (2008-2019). An 'invalid row' is NOT guaranteed to be outside the range, only potentially outside due to de-identification of MIMIC-IV being done through 3-year time ranges. 
- + To be invalid, the end of the prediction window's year must both extend beyond the maximum seen year - for a patient AND beyond the year that corresponds to the 2017-2019 anchor year range for a patient""" - print("disch_col",row[disch_col]) + for a patient AND beyond the year that corresponds to the 2017-2019 anchor year range for a patient + """ + print("disch_col", row[disch_col]) print(gap) pred_year = (row[disch_col] + gap).year if max_year < pred_year and pred_year > row[valid_col]: @@ -111,109 +231,162 @@ def validate_row(row, ctrl, invalid, max_year, disch_col, valid_col, gap): return ctrl, invalid -def partition_by_los(df:pd.DataFrame, los:int, group_col:str, visit_col:str, admit_col:str, disch_col:str, valid_col:str): - - invalid = df.loc[(df[admit_col].isna()) | (df[disch_col].isna()) | (df['los'].isna())] - cohort = df.loc[(~df[admit_col].isna()) & (~df[disch_col].isna()) & (~df['los'].isna())] - - - #cohort=cohort.fillna(0) - pos_cohort=cohort[cohort['los']>los] - neg_cohort=cohort[cohort['los']<=los] - neg_cohort=neg_cohort.fillna(0) - pos_cohort=pos_cohort.fillna(0) - - pos_cohort['label']=1 - neg_cohort['label']=0 - - cohort=pd.concat([pos_cohort,neg_cohort], axis=0) - cohort=cohort.sort_values(by=[group_col,admit_col]) - #print("cohort",cohort.shape) +def partition_by_los( + df: pd.DataFrame, + los: int, + group_col: str, + visit_col: str, + admit_col: str, + disch_col: str, + valid_col: str, +): + invalid = df.loc[ + (df[admit_col].isna()) | (df[disch_col].isna()) | (df["los"].isna()) + ] + cohort = df.loc[ + (~df[admit_col].isna()) & (~df[disch_col].isna()) & (~df["los"].isna()) + ] + + # cohort=cohort.fillna(0) + pos_cohort = cohort[cohort["los"] > los] + neg_cohort = cohort[cohort["los"] <= los] + neg_cohort = neg_cohort.fillna(0) + pos_cohort = pos_cohort.fillna(0) + + pos_cohort["label"] = 1 + neg_cohort["label"] = 0 + + cohort = pd.concat([pos_cohort, neg_cohort], axis=0) + cohort = cohort.sort_values(by=[group_col, admit_col]) + # 
print("cohort",cohort.shape) print("[ LOS LABELS FINISHED ]") return cohort, invalid - - -def partition_by_readmit(df:pd.DataFrame, gap:datetime.timedelta, group_col:str, visit_col:str, admit_col:str, disch_col:str, valid_col:str): + + +def partition_by_readmit( + df: pd.DataFrame, + gap: datetime.timedelta, + group_col: str, + visit_col: str, + admit_col: str, + disch_col: str, + valid_col: str, +): """Applies labels to individual visits according to whether or not a readmission has occurred within the specified `gap` days. For a given visit, another visit must occur within the gap window for a positive readmission label. - The gap window starts from the disch_col time and the admit_col of subsequent visits are considered.""" - - case = pd.DataFrame() # hadm_ids with readmission within the gap period - ctrl = pd.DataFrame() # hadm_ids without readmission within the gap period - invalid = pd.DataFrame() # hadm_ids that are not considered in the cohort + The gap window starts from the disch_col time and the admit_col of subsequent visits are considered. + """ + + case = pd.DataFrame() # hadm_ids with readmission within the gap period + ctrl = pd.DataFrame() # hadm_ids without readmission within the gap period + invalid = pd.DataFrame() # hadm_ids that are not considered in the cohort # Iterate through groupbys based on group_col (subject_id). Data is sorted by subject_id and admit_col (admittime) # to ensure that the most current hadm_id is last in a group. 
- #grouped= df[[group_col, visit_col, admit_col, disch_col, valid_col]].sort_values(by=[group_col, admit_col]).groupby(group_col) - grouped= df.sort_values(by=[group_col, admit_col]).groupby(group_col) + # grouped= df[[group_col, visit_col, admit_col, disch_col, valid_col]].sort_values(by=[group_col, admit_col]).groupby(group_col) + grouped = df.sort_values(by=[group_col, admit_col]).groupby(group_col) for subject, group in tqdm(grouped): max_year = group.max()[disch_col].year if group.shape[0] <= 1: - #ctrl, invalid = validate_row(group.iloc[0], ctrl, invalid, max_year, disch_col, valid_col, gap) # A group with 1 row has no readmission; goes to ctrl - ctrl = ctrl.append(group.iloc[0]) + # ctrl, invalid = validate_row(group.iloc[0], ctrl, invalid, max_year, disch_col, valid_col, gap) # A group with 1 row has no readmission; goes to ctrl + ctrl = pd.concat([ctrl, pd.DataFrame([group.iloc[0]])], ignore_index=True) else: - for idx in range(group.shape[0]-1): - visit_time = group.iloc[idx][disch_col] # For each index (a unique hadm_id), get its timestamp - if group.loc[ - (group[admit_col] > visit_time) & # Readmissions must come AFTER the current timestamp - (group[admit_col] - visit_time <= gap) # Distance between a timestamp and readmission must be within gap - ].shape[0] >= 1: # If ANY rows meet above requirements, a readmission has occurred after that visit - - case = case.append(group.iloc[idx]) + for idx in range(group.shape[0] - 1): + visit_time = group.iloc[idx][ + disch_col + ] # For each index (a unique hadm_id), get its timestamp + if ( + group.loc[ + (group[admit_col] > visit_time) + & ( # Readmissions must come AFTER the current timestamp + group[admit_col] - visit_time <= gap + ) # Distance between a timestamp and readmission must be within gap + ].shape[0] + >= 1 + ): # If ANY rows meet above requirements, a readmission has occurred after that visit + case = pd.concat( + [case, pd.DataFrame([group.iloc[idx]])], ignore_index=True + ) else: # If no 
readmission is found, only add to ctrl if prediction window is guaranteed to be within the # time range of the dataset (2008-2019). Visits with prediction windows existing in potentially out-of-range # dates (like 2018-2020) are excluded UNLESS the prediction window takes place the same year as the visit, # in which case it is guaranteed to be within 2008-2019 - ctrl = ctrl.append(group.iloc[idx]) + ctrl = pd.concat( + [ctrl, pd.DataFrame([group.iloc[idx]])], ignore_index=True + ) - #ctrl, invalid = validate_row(group.iloc[-1], ctrl, invalid, max_year, disch_col, valid_col, gap) # The last hadm_id datewise is guaranteed to have no readmission logically - ctrl = ctrl.append(group.iloc[-1]) - #print(f"[ {gap.days} DAYS ] {case.shape[0] + ctrl.shape[0]}/{df.shape[0]} {visit_col}s processed") + # ctrl, invalid = validate_row(group.iloc[-1], ctrl, invalid, max_year, disch_col, valid_col, gap) # The last hadm_id datewise is guaranteed to have no readmission logically + ctrl = pd.concat([ctrl, pd.DataFrame([group.iloc[-1]])], ignore_index=True) + # print(f"[ {gap.days} DAYS ] {case.shape[0] + ctrl.shape[0]}/{df.shape[0]} {visit_col}s processed") print("[ READMISSION LABELS FINISHED ]") return case, ctrl, invalid -def partition_by_mort(df:pd.DataFrame, group_col:str, visit_col:str, admit_col:str, disch_col:str, death_col:str): +def partition_by_mort( + df: pd.DataFrame, + group_col: str, + visit_col: str, + admit_col: str, + disch_col: str, + death_col: str, +): """Applies labels to individual visits according to whether or not a death has occurred within the times of the specified admit_col and disch_col""" invalid = df.loc[(df[admit_col].isna()) | (df[disch_col].isna())] cohort = df.loc[(~df[admit_col].isna()) & (~df[disch_col].isna())] - -# cohort["label"] = ( -# (~cohort[death_col].isna()) -# & (cohort[death_col] >= cohort[admit_col]) -# & (cohort[death_col] <= cohort[disch_col]) -# ) -# cohort["label"] = cohort["label"].astype("Int32") - #print("cohort",cohort.shape) 
- #print(np.where(~cohort[death_col].isna(),1,0)) - #print(np.where(cohort.loc[death_col] >= cohort.loc[admit_col],1,0)) - #print(np.where(cohort.loc[death_col] <= cohort.loc[disch_col],1,0)) - cohort['label']=0 - #cohort=cohort.fillna(0) - pos_cohort=cohort[~cohort[death_col].isna()] - neg_cohort=cohort[cohort[death_col].isna()] - neg_cohort=neg_cohort.fillna(0) - pos_cohort=pos_cohort.fillna(0) + + # cohort["label"] = ( + # (~cohort[death_col].isna()) + # & (cohort[death_col] >= cohort[admit_col]) + # & (cohort[death_col] <= cohort[disch_col]) + # ) + # cohort["label"] = cohort["label"].astype("Int32") + # print("cohort",cohort.shape) + # print(np.where(~cohort[death_col].isna(),1,0)) + # print(np.where(cohort.loc[death_col] >= cohort.loc[admit_col],1,0)) + # print(np.where(cohort.loc[death_col] <= cohort.loc[disch_col],1,0)) + cohort["label"] = 0 + # cohort=cohort.fillna(0) + pos_cohort = cohort[~cohort[death_col].isna()] + neg_cohort = cohort[cohort[death_col].isna()] + neg_cohort = neg_cohort.fillna(0) + pos_cohort = pos_cohort.fillna(0) pos_cohort[death_col] = pd.to_datetime(pos_cohort[death_col]) - pos_cohort['label'] = np.where((pos_cohort[death_col] >= pos_cohort[admit_col]) & (pos_cohort[death_col] <= pos_cohort[disch_col]),1,0) - - pos_cohort['label'] = pos_cohort['label'].astype("Int32") - cohort=pd.concat([pos_cohort,neg_cohort], axis=0) - cohort=cohort.sort_values(by=[group_col,admit_col]) - #print("cohort",cohort.shape) + pos_cohort["label"] = np.where( + (pos_cohort[death_col] >= pos_cohort[admit_col]) + & (pos_cohort[death_col] <= pos_cohort[disch_col]), + 1, + 0, + ) + + pos_cohort["label"] = pos_cohort["label"].astype("Int32") + cohort = pd.concat([pos_cohort, neg_cohort], axis=0) + cohort = cohort.sort_values(by=[group_col, admit_col]) + # print("cohort",cohort.shape) print("[ MORTALITY LABELS FINISHED ]") return cohort, invalid -def get_case_ctrls(df:pd.DataFrame, gap:int, group_col:str, visit_col:str, admit_col:str, disch_col:str, 
valid_col:str, death_col:str, use_mort=False,use_admn=False,use_los=False) -> pd.DataFrame: +def get_case_ctrls( + df: pd.DataFrame, + gap: int, + group_col: str, + visit_col: str, + admit_col: str, + disch_col: str, + valid_col: str, + death_col: str, + use_mort=False, + use_admn=False, + use_los=False, +) -> pd.DataFrame: """Handles logic for creating the labelled cohort based on arguments passed to extract(). Parameters: @@ -228,86 +401,142 @@ def get_case_ctrls(df:pd.DataFrame, gap:int, group_col:str, visit_col:str, admit """ case = None # hadm_ids with readmission within the gap period - ctrl = None # hadm_ids without readmission within the gap period - invalid = None # hadm_ids that are not considered in the cohort + ctrl = None # hadm_ids without readmission within the gap period + invalid = None # hadm_ids that are not considered in the cohort if use_mort: - return partition_by_mort(df, group_col, visit_col, admit_col, disch_col, death_col) + return partition_by_mort( + df, group_col, visit_col, admit_col, disch_col, death_col + ) elif use_admn: gap = datetime.timedelta(days=gap) # transform gap into a timedelta to compare with datetime columns - case, ctrl, invalid = partition_by_readmit(df, gap, group_col, visit_col, admit_col, disch_col, valid_col) + case, ctrl, invalid = partition_by_readmit( + df, gap, group_col, visit_col, admit_col, disch_col, valid_col + ) # case hadm_ids are labelled 1 for readmission, ctrls have a 0 label - case['label'] = np.ones(case.shape[0]).astype(int) - ctrl['label'] = np.zeros(ctrl.shape[0]).astype(int) + case["label"] = np.ones(case.shape[0]).astype(int) + ctrl["label"] = np.zeros(ctrl.shape[0]).astype(int) return pd.concat([case, ctrl], axis=0), invalid elif use_los: - return partition_by_los(df, gap, group_col, visit_col, admit_col, disch_col, death_col) + return partition_by_los( + df, gap, group_col, visit_col, admit_col, disch_col, death_col + ) # print(f"[ {gap.days} DAYS ] {invalid.shape[0]} hadm_ids are invalid") 
-def extract_data(use_ICU:str, label:str, time:int, icd_code:str, root_dir, disease_label, cohort_output=None, summary_output=None): +# create extract options: use_icu... +# root_dir (path) -> raw_data_dir and preproc_data_dir + + +def extract_data( + use_ICU: str, + label: str, + time: int, + icd_code: str, + root_dir, + disease_label, + cohort_output=None, + summary_output=None, +): """Extracts cohort data and summary from MIMIC-IV data based on provided parameters. Parameters: cohort_output: name of labelled cohort output file summary_output: name of summary output file use_ICU: state whether to use ICU patient data or not - label: Can either be '{day} day Readmission' or 'Mortality', decides what binary data label signifies""" + label: Can either be '{day} day Readmission' or 'Mortality', decides what binary data label signifies + """ print("===========MIMIC-IV v2.0============") if not cohort_output: - cohort_output="cohort_" + use_ICU.lower() + "_" + label.lower().replace(" ", "_") + "_" + str(time) + "_" + disease_label + cohort_output = ( + "cohort_" + + use_ICU.lower() + + "_" + + label.lower().replace(" ", "_") + + "_" + + str(time) + + "_" + + disease_label + ) if not summary_output: - summary_output="summary_" + use_ICU.lower() + "_" + label.lower().replace(" ", "_") + "_" + str(time) + "_" + disease_label - - if icd_code=="No Disease Filter": + summary_output = ( + "summary_" + + use_ICU.lower() + + "_" + + label.lower().replace(" ", "_") + + "_" + + str(time) + + "_" + + disease_label + ) + + if icd_code == "No Disease Filter": if len(disease_label): - print(f"EXTRACTING FOR: | {use_ICU.upper()} | {label.upper()} DUE TO {disease_label.upper()} | {str(time)} | ") + print( + f"EXTRACTING FOR: | {use_ICU.upper()} | {label.upper()} DUE TO {disease_label.upper()} | {str(time)} | " + ) else: - print(f"EXTRACTING FOR: | {use_ICU.upper()} | {label.upper()} | {str(time)} |") + print( + f"EXTRACTING FOR: | {use_ICU.upper()} | {label.upper()} | {str(time)} |" + 
) else: if len(disease_label): - print(f"EXTRACTING FOR: | {use_ICU.upper()} | {label.upper()} DUE TO {disease_label.upper()} | ADMITTED DUE TO {icd_code.upper()} | {str(time)} |") + print( + f"EXTRACTING FOR: | {use_ICU.upper()} | {label.upper()} DUE TO {disease_label.upper()} | ADMITTED DUE TO {icd_code.upper()} | {str(time)} |" + ) else: - print(f"EXTRACTING FOR: | {use_ICU.upper()} | {label.upper()} | ADMITTED DUE TO {icd_code.upper()} | {str(time)} |") - #print(label) - cohort, invalid = None, None # final labelled output and df of invalid records, respectively + print( + f"EXTRACTING FOR: | {use_ICU.upper()} | {label.upper()} | ADMITTED DUE TO {icd_code.upper()} | {str(time)} |" + ) + # print(label) + cohort, invalid = ( + None, + None, + ) # final labelled output and df of invalid records, respectively pts = None # valid patients generated by get_visit_pts based on use_ICU and label - ICU=use_ICU - group_col, visit_col, admit_col, disch_col, death_col, adm_visit_col = "", "", "", "", "", "" - #print(label) - use_mort = label == "Mortality" # change to boolean value - use_admn=label=='Readmission' - los=0 - use_los= label=='Length of Stay' - - #print(use_mort) - #print(use_admn) - #print(use_los) + ICU = use_ICU + group_col, visit_col, admit_col, disch_col, death_col, adm_visit_col = ( + "", + "", + "", + "", + "", + "", + ) + # print(label) + use_mort = label == "Mortality" # change to boolean value + use_admn = label == "Readmission" + los = 0 + use_los = label == "Length of Stay" + + # print(use_mort) + # print(use_admn) + # print(use_los) if use_los: - los=time - use_ICU = use_ICU == "ICU" # change to boolean value - use_disease=icd_code!="No Disease Filter" - + los = time + use_ICU = use_ICU == "ICU" # change to boolean value + use_disease = icd_code != "No Disease Filter" + if use_ICU: - group_col='subject_id' - visit_col='stay_id' - admit_col='intime' - disch_col='outtime' - death_col='dod' - adm_visit_col='hadm_id' + group_col = "subject_id" + 
visit_col = "stay_id" + admit_col = "intime" + disch_col = "outtime" + death_col = "dod" + adm_visit_col = "hadm_id" else: - group_col='subject_id' - visit_col='hadm_id' - admit_col='admittime' - disch_col='dischtime' - death_col='dod' + group_col = "subject_id" + visit_col = "hadm_id" + admit_col = "admittime" + disch_col = "dischtime" + death_col = "dod" pts = get_visit_pts( - mimic4_path=root_dir+"/mimiciv/2.0/", + mimic4_path=root_dir + "\\raw_data\\mimiciv_2_0\\", group_col=group_col, visit_col=visit_col, admit_col=admit_col, @@ -318,49 +547,104 @@ def extract_data(use_ICU:str, label:str, time:int, icd_code:str, root_dir, disea los=los, use_admn=use_admn, disease_label=disease_label, - use_ICU=use_ICU + use_ICU=use_ICU, ) - #print("pts",pts.head()) - + # print("pts",pts.head()) + # cols to be extracted from get_case_ctrls - cols = [group_col, visit_col, admit_col, disch_col, 'Age','gender','ethnicity','insurance','label'] + cols = [ + group_col, + visit_col, + admit_col, + disch_col, + "Age", + "gender", + "ethnicity", + "insurance", + "label", + ] if use_mort: cols.append(death_col) - cohort, invalid = get_case_ctrls(pts, None, group_col, visit_col, admit_col, disch_col,'min_valid_year', death_col, use_mort=True,use_admn=False,use_los=False) + cohort, invalid = get_case_ctrls( + pts, + None, + group_col, + visit_col, + admit_col, + disch_col, + "min_valid_year", + death_col, + use_mort=True, + use_admn=False, + use_los=False, + ) elif use_admn: interval = time - cohort, invalid = get_case_ctrls(pts, interval, group_col, visit_col, admit_col, disch_col,'min_valid_year', death_col, use_mort=False,use_admn=True,use_los=False) + cohort, invalid = get_case_ctrls( + pts, + interval, + group_col, + visit_col, + admit_col, + disch_col, + "min_valid_year", + death_col, + use_mort=False, + use_admn=True, + use_los=False, + ) elif use_los: - cohort, invalid = get_case_ctrls(pts, los, group_col, visit_col, admit_col, disch_col,'min_valid_year', death_col, 
use_mort=False,use_admn=False,use_los=True) - #print(cohort.head()) - + cohort, invalid = get_case_ctrls( + pts, + los, + group_col, + visit_col, + admit_col, + disch_col, + "min_valid_year", + death_col, + use_mort=False, + use_admn=False, + use_los=True, + ) + # print(cohort.head()) + if use_ICU: cols.append(adm_visit_col) - #print(cohort.head()) - + # print(cohort.head()) + if use_disease: - hids=disease_cohort.extract_diag_cohort(cohort['hadm_id'],icd_code,root_dir+"/mimiciv/2.0/") - #print(hids.shape) - #print(cohort.shape) - #print(len(list(set(hids['hadm_id'].unique()).intersection(set(cohort['hadm_id'].unique()))))) - cohort=cohort[cohort['hadm_id'].isin(hids['hadm_id'])] - cohort_output=cohort_output+"_"+icd_code - summary_output=summary_output+"_"+icd_code - #print(cohort[cols].head()) + hids = disease_cohort.extract_diag_cohort( + cohort["hadm_id"], + icd_code, + root_dir + "\\raw_data\\mimiciv_2_0\\", + ) + # print(hids.shape) + # print(cohort.shape) + # print(len(list(set(hids['hadm_id'].unique()).intersection(set(cohort['hadm_id'].unique()))))) + cohort = cohort[cohort["hadm_id"].isin(hids["hadm_id"])] + cohort_output = cohort_output + "_" + icd_code + summary_output = summary_output + "_" + icd_code + # print(cohort[cols].head()) # save output - cohort=cohort.rename(columns={"race":"ethnicity"}) - cohort[cols].to_csv(root_dir+"/data/cohort/"+cohort_output+".csv.gz", index=False, compression='gzip') + cohort = cohort.rename(columns={"race": "ethnicity"}) + cohort[cols].to_csv( + root_dir + "/data/cohort/" + cohort_output + ".csv.gz", + index=False, + compression="gzip", + ) print("[ COHORT SUCCESSFULLY SAVED ]") - summary = "\n".join([ - f"{label} FOR {ICU} DATA", - f"# Admission Records: {cohort.shape[0]}", - f"# Patients: {cohort[group_col].nunique()}", - f"# Positive cases: {cohort[cohort['label']==1].shape[0]}", - f"# Negative cases: {cohort[cohort['label']==0].shape[0]}" - ]) - + summary = "\n".join( + [ + f"{label} FOR {ICU} DATA", + f"# 
Admission Records: {cohort.shape[0]}", + f"# Patients: {cohort[group_col].nunique()}", + f"# Positive cases: {cohort[cohort['label']==1].shape[0]}", + f"# Negative cases: {cohort[cohort['label']==0].shape[0]}", + ] + ) # save basic summary of data with open(f"./data/cohort/{summary_output}.txt", "w") as f: f.write(summary) @@ -371,22 +655,30 @@ def extract_data(use_ICU:str, label:str, time:int, icd_code:str, root_dir, disea return cohort_output -if __name__ == '__main__': +if __name__ == "__main__": # use_ICU = input("Use ICU Data? (ICU/Non_ICU)\n").strip() # label = input("Please input the intended label:\n").strip() # extract(use_ICU, label) + extract_data( + "Non-ICU", + "Length of Stay", + 3, + "No Disease Filter", + "d:\\Work\\Repos\\MIMIC-IV-Data-Pipeline", + "", + ) - response = input('Extra all datasets? (y/n)').strip().lower() - if response == 'y': - extract_data("ICU", "Mortality") - extract_data("Non-ICU", "Mortality") + # response = input("Extra all datasets? (y/n)").strip().lower() + # if response == "y": + # extract_data("ICU", "Mortality") + # extract_data("Non-ICU", "Mortality") - extract_data("ICU", "30 Day Readmission") - extract_data("Non-ICU", "30 Day Readmission") + # extract_data("ICU", "30 Day Readmission") + # extract_data("Non-ICU", "30 Day Readmission") - extract_data("ICU", "60 Day Readmission") - extract_data("Non-ICU", "60 Day Readmission") + # extract_data("ICU", "60 Day Readmission") + # extract_data("Non-ICU", "60 Day Readmission") - extract_data("ICU", "120 Day Readmission") - extract_data("Non-ICU", "120 Day Readmission") \ No newline at end of file + # extract_data("ICU", "120 Day Readmission") + # extract_data("Non-ICU", "120 Day Readmission") diff --git a/preprocessing/day_intervals_preproc/disease_cohort.py b/preprocessing/day_intervals_preproc/disease_cohort.py index 94097584f8..1c99debbd4 100644 --- a/preprocessing/day_intervals_preproc/disease_cohort.py +++ b/preprocessing/day_intervals_preproc/disease_cohort.py @@ -3,17 
+3,20 @@ # In[ ]: - +from pathlib import Path import pandas as pd import numpy as np import os import sys -sys.path.append(os.path.dirname(os.path.abspath(__file__)) + './../..') + +sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "./../..") +MAP_PATH = Path("utils") / "mappings" / "ICD9_to_ICD10_mapping.txt" + def read_icd_mapping(map_path: str) -> pd.DataFrame: """Reads in mapping table for converting ICD9 to ICD10 codes""" - mapping = pd.read_csv(map_path, header=0, delimiter="\t") + mapping = pd.read_csv(MAP_PATH, header=0, delimiter="\t") mapping.diagnosis_description = mapping.diagnosis_description.apply(str.lower) return mapping @@ -22,7 +25,7 @@ def get_diagnosis_icd(module_path: str) -> pd.DataFrame: """Reads in diagnosis_icd table""" return pd.read_csv( - module_path + "/hosp/diagnoses_icd.csv.gz", compression="gzip", header=0 + module_path + "hosp\\diagnoses_icd.csv.gz", compression="gzip", header=0 ) @@ -67,15 +70,14 @@ def icd_9to10(icd): diag.at[idx, col_name] = new_code count += group.shape[0] - #print(f"{count}/{diag.shape[0]} rows processed") + # print(f"{count}/{diag.shape[0]} rows processed") # Column for just the roots of the converted ICD10 column diag["root"] = diag[col_name].apply(lambda x: x[:3] if type(x) is str else np.nan) - -def preproc_icd_module(h_ids, - module_path: str, ICD10_code: str, icd_map_path: str +def preproc_icd_module( + h_ids, module_path: str, ICD10_code: str, icd_map_path: str ) -> tuple: """Takes an module dataset with ICD codes and puts it in long_format, mapping ICD-codes by a mapping table path""" @@ -89,7 +91,7 @@ def preproc_icd_module(h_ids, diag.dropna(subset=["root"], inplace=True) pos_ids = pd.DataFrame( diag.loc[diag.root.str.contains(ICD10_code)].hadm_id.unique(), - columns=["hadm_id"] + columns=["hadm_id"], ) return pos_ids @@ -98,15 +100,11 @@ def extract_diag_cohort( h_ids, label: str, module_path, - icd_map_path="./utils/mappings/ICD9_to_ICD10_mapping.txt" + 
icd_map_path="./utils/mappings/ICD9_to_ICD10_mapping.txt", ) -> str: """Takes UserInterface parameters, then creates and saves a labelled cohort summary, and error file""" - cohort = preproc_icd_module(h_ids, - module_path, label, icd_map_path - ) + cohort = preproc_icd_module(h_ids, module_path, label, icd_map_path) return cohort - - diff --git a/preprocessing/hosp_module_preproc/feature_selection_icu.py b/preprocessing/hosp_module_preproc/feature_selection_icu.py index a5fa9037fa..44433bcb59 100644 --- a/preprocessing/hosp_module_preproc/feature_selection_icu.py +++ b/preprocessing/hosp_module_preproc/feature_selection_icu.py @@ -2,23 +2,27 @@ import pickle import glob import importlib -#print(os.getcwd()) -#os.chdir('../../') -#print(os.getcwd()) + +# print(os.getcwd()) +# os.chdir('../../') +# print(os.getcwd()) import utils.icu_preprocess_util -from utils.icu_preprocess_util import * +from utils.icu_preprocess_util import * + importlib.reload(utils.icu_preprocess_util) import utils.icu_preprocess_util -from utils.icu_preprocess_util import *# module of preprocessing functions +from utils.icu_preprocess_util import * # module of preprocessing functions import utils.outlier_removal -from utils.outlier_removal import * +from utils.outlier_removal import * + importlib.reload(utils.outlier_removal) import utils.outlier_removal from utils.outlier_removal import * import utils.uom_conversion -from utils.uom_conversion import * +from utils.uom_conversion import * + importlib.reload(utils.uom_conversion) import utils.uom_conversion from utils.uom_conversion import * @@ -29,195 +33,383 @@ if not os.path.exists("./data/features/chartevents"): os.makedirs("./data/features/chartevents") -def feature_icu(cohort_output, version_path, diag_flag=True,out_flag=True,chart_flag=True,proc_flag=True,med_flag=True): + +def feature_icu( + cohort_output, + version_path, + diag_flag=True, + out_flag=True, + chart_flag=True, + proc_flag=True, + med_flag=True, +): if diag_flag: 
print("[EXTRACTING DIAGNOSIS DATA]") - diag = preproc_icd_module("./"+version_path+"/hosp/diagnoses_icd.csv.gz", './data/cohort/'+cohort_output+'.csv.gz', './utils/mappings/ICD9_to_ICD10_mapping.txt', map_code_colname='diagnosis_code') - diag[['subject_id', 'hadm_id', 'stay_id', 'icd_code','root_icd10_convert','root']].to_csv("./data/features/preproc_diag_icu.csv.gz", compression='gzip', index=False) + # diag = preproc_icd_module("./"+version_path+"/hosp/diagnoses_icd.csv.gz", './data/cohort/'+cohort_output+'.csv.gz', './utils/mappings/ICD9_to_ICD10_mapping.txt', map_code_colname='diagnosis_code') + diag = preproc_icd_module( + "d:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\raw_data\\mimiciv_2_0\\hosp\\diagnoses_icd.csv.gz", + "./data/cohort/" + cohort_output + ".csv.gz", + "./utils/mappings/ICD9_to_ICD10_mapping.txt", + map_code_colname="diagnosis_code", + ) + diag[ + [ + "subject_id", + "hadm_id", + "stay_id", + "icd_code", + "root_icd10_convert", + "root", + ] + ].to_csv( + "./data/features/preproc_diag_icu.csv.gz", compression="gzip", index=False + ) print("[SUCCESSFULLY SAVED DIAGNOSIS DATA]") - - if out_flag: + + if out_flag: print("[EXTRACTING OUPTPUT EVENTS DATA]") - out = preproc_out("./"+version_path+"/icu/outputevents.csv.gz", './data/cohort/'+cohort_output+'.csv.gz', 'charttime', dtypes=None, usecols=None) - out[['subject_id', 'hadm_id', 'stay_id', 'itemid', 'charttime', 'intime', 'event_time_from_admit']].to_csv("./data/features/preproc_out_icu.csv.gz", compression='gzip', index=False) + + # out = preproc_out("./"+version_path+"/icu/outputevents.csv.gz", './data/cohort/'+cohort_output+'.csv.gz', 'charttime', dtypes=None, usecols=None) + out = preproc_out( + "d:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\raw_data\\mimiciv_2_0\\icu\\outputevents.csv.gz", + "./data/cohort/" + cohort_output + ".csv.gz", + "charttime", + dtypes=None, + usecols=None, + ) + out[ + [ + "subject_id", + "hadm_id", + "stay_id", + "itemid", + "charttime", + "intime", + 
"event_time_from_admit", + ] + ].to_csv( + "./data/features/preproc_out_icu.csv.gz", compression="gzip", index=False + ) print("[SUCCESSFULLY SAVED OUPTPUT EVENTS DATA]") - + if chart_flag: print("[EXTRACTING CHART EVENTS DATA]") - chart=preproc_chart("./"+version_path+"/icu/chartevents.csv.gz", './data/cohort/'+cohort_output+'.csv.gz', 'charttime', dtypes=None, usecols=['stay_id','charttime','itemid','valuenum','valueuom']) + chart = preproc_chart( + "d:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\raw_data\\mimiciv_2_0\\icu\\chartevents.csv.gz", + "./data/cohort/" + cohort_output + ".csv.gz", + "charttime", + dtypes=None, + usecols=["stay_id", "charttime", "itemid", "valuenum", "valueuom"], + ) chart = drop_wrong_uom(chart, 0.95) - chart[['stay_id', 'itemid','event_time_from_admit','valuenum']].to_csv("./data/features/preproc_chart_icu.csv.gz", compression='gzip', index=False) + chart[["stay_id", "itemid", "event_time_from_admit", "valuenum"]].to_csv( + "./data/features/preproc_chart_icu.csv.gz", compression="gzip", index=False + ) print("[SUCCESSFULLY SAVED CHART EVENTS DATA]") - + if proc_flag: print("[EXTRACTING PROCEDURES DATA]") - proc = preproc_proc("./"+version_path+"/icu/procedureevents.csv.gz", './data/cohort/'+cohort_output+'.csv.gz', 'starttime', dtypes=None, usecols=['stay_id','starttime','itemid']) - proc[['subject_id', 'hadm_id', 'stay_id', 'itemid', 'starttime', 'intime', 'event_time_from_admit']].to_csv("./data/features/preproc_proc_icu.csv.gz", compression='gzip', index=False) + proc = preproc_proc( + "d:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\raw_data\\mimiciv_2_0\\icu\\procedureevents.csv.gz", + "./data/cohort/" + cohort_output + ".csv.gz", + "starttime", + dtypes=None, + usecols=["stay_id", "starttime", "itemid"], + ) + proc[ + [ + "subject_id", + "hadm_id", + "stay_id", + "itemid", + "starttime", + "intime", + "event_time_from_admit", + ] + ].to_csv( + "./data/features/preproc_proc_icu.csv.gz", compression="gzip", index=False + ) print("[SUCCESSFULLY 
SAVED PROCEDURES DATA]") - + if med_flag: print("[EXTRACTING MEDICATIONS DATA]") - med = preproc_meds("./"+version_path+"/icu/inputevents.csv.gz", './data/cohort/'+cohort_output+'.csv.gz') - med[['subject_id', 'hadm_id', 'stay_id', 'itemid' ,'starttime','endtime', 'start_hours_from_admit', 'stop_hours_from_admit','rate','amount','orderid']].to_csv('./data/features/preproc_med_icu.csv.gz', compression='gzip', index=False) + med = preproc_meds( + "d:\\Work\\Repos\\MIMIC-IV-Data-Pipeline\\raw_data\\mimiciv_2_0\\icu\\inputevents.csv.gz", + "./data/cohort/" + cohort_output + ".csv.gz", + ) + med[ + [ + "subject_id", + "hadm_id", + "stay_id", + "itemid", + "starttime", + "endtime", + "start_hours_from_admit", + "stop_hours_from_admit", + "rate", + "amount", + "orderid", + ] + ].to_csv( + "./data/features/preproc_med_icu.csv.gz", compression="gzip", index=False + ) print("[SUCCESSFULLY SAVED MEDICATIONS DATA]") -def preprocess_features_icu(cohort_output, diag_flag, group_diag,chart_flag,clean_chart,impute_outlier_chart,thresh,left_thresh): + +def preprocess_features_icu( + cohort_output, + diag_flag, + group_diag, + chart_flag, + clean_chart, + impute_outlier_chart, + thresh, + left_thresh, +): if diag_flag: print("[PROCESSING DIAGNOSIS DATA]") - diag = pd.read_csv("./data/features/preproc_diag_icu.csv.gz", compression='gzip',header=0) - if(group_diag=='Keep both ICD-9 and ICD-10 codes'): - diag['new_icd_code']=diag['icd_code'] - if(group_diag=='Convert ICD-9 to ICD-10 codes'): - diag['new_icd_code']=diag['root_icd10_convert'] - if(group_diag=='Convert ICD-9 to ICD-10 and group ICD-10 codes'): - diag['new_icd_code']=diag['root'] - - diag=diag[['subject_id', 'hadm_id', 'stay_id', 'new_icd_code']].dropna() - print("Total number of rows",diag.shape[0]) - diag.to_csv("./data/features/preproc_diag_icu.csv.gz", compression='gzip', index=False) + diag = pd.read_csv( + "./data/features/preproc_diag_icu.csv.gz", compression="gzip", header=0 + ) + if group_diag == "Keep both ICD-9 
and ICD-10 codes": + diag["new_icd_code"] = diag["icd_code"] + if group_diag == "Convert ICD-9 to ICD-10 codes": + diag["new_icd_code"] = diag["root_icd10_convert"] + if group_diag == "Convert ICD-9 to ICD-10 and group ICD-10 codes": + diag["new_icd_code"] = diag["root"] + + diag = diag[["subject_id", "hadm_id", "stay_id", "new_icd_code"]].dropna() + print("Total number of rows", diag.shape[0]) + diag.to_csv( + "./data/features/preproc_diag_icu.csv.gz", compression="gzip", index=False + ) print("[SUCCESSFULLY SAVED DIAGNOSIS DATA]") - + if chart_flag: - if clean_chart: + if clean_chart: print("[PROCESSING CHART EVENTS DATA]") - chart = pd.read_csv("./data/features/preproc_chart_icu.csv.gz", compression='gzip',header=0) - chart = outlier_imputation(chart, 'itemid', 'valuenum', thresh,left_thresh,impute_outlier_chart) - -# for i in [227441, 229357, 229358, 229360]: -# try: -# maj = chart.loc[chart.itemid == i].valueuom.value_counts().index[0] -# chart = chart.loc[~((chart.itemid == i) & (chart.valueuom == maj))] -# except IndexError: -# print(f"{idx} not found") - print("Total number of rows",chart.shape[0]) - chart.to_csv("./data/features/preproc_chart_icu.csv.gz", compression='gzip', index=False) + chart = pd.read_csv( + "./data/features/preproc_chart_icu.csv.gz", compression="gzip", header=0 + ) + chart = outlier_imputation( + chart, "itemid", "valuenum", thresh, left_thresh, impute_outlier_chart + ) + + # for i in [227441, 229357, 229358, 229360]: + # try: + # maj = chart.loc[chart.itemid == i].valueuom.value_counts().index[0] + # chart = chart.loc[~((chart.itemid == i) & (chart.valueuom == maj))] + # except IndexError: + # print(f"{idx} not found") + print("Total number of rows", chart.shape[0]) + chart.to_csv( + "./data/features/preproc_chart_icu.csv.gz", + compression="gzip", + index=False, + ) print("[SUCCESSFULLY SAVED CHART EVENTS DATA]") - - - -def generate_summary_icu(diag_flag,proc_flag,med_flag,out_flag,chart_flag): + + +def 
generate_summary_icu(diag_flag, proc_flag, med_flag, out_flag, chart_flag): print("[GENERATING FEATURE SUMMARY]") if diag_flag: - diag = pd.read_csv("./data/features/preproc_diag_icu.csv.gz", compression='gzip',header=0) - freq=diag.groupby(['stay_id','new_icd_code']).size().reset_index(name="mean_frequency") - freq=freq.groupby(['new_icd_code'])['mean_frequency'].mean().reset_index() - total=diag.groupby('new_icd_code').size().reset_index(name="total_count") - summary=pd.merge(freq,total,on='new_icd_code',how='right') - summary=summary.fillna(0) - summary.to_csv('./data/summary/diag_summary.csv',index=False) - summary['new_icd_code'].to_csv('./data/summary/diag_features.csv',index=False) - + diag = pd.read_csv( + "./data/features/preproc_diag_icu.csv.gz", compression="gzip", header=0 + ) + freq = ( + diag.groupby(["stay_id", "new_icd_code"]) + .size() + .reset_index(name="mean_frequency") + ) + freq = freq.groupby(["new_icd_code"])["mean_frequency"].mean().reset_index() + total = diag.groupby("new_icd_code").size().reset_index(name="total_count") + summary = pd.merge(freq, total, on="new_icd_code", how="right") + summary = summary.fillna(0) + summary.to_csv("./data/summary/diag_summary.csv", index=False) + summary["new_icd_code"].to_csv("./data/summary/diag_features.csv", index=False) if med_flag: - med = pd.read_csv("./data/features/preproc_med_icu.csv.gz", compression='gzip',header=0) - freq=med.groupby(['stay_id','itemid']).size().reset_index(name="mean_frequency") - freq=freq.groupby(['itemid'])['mean_frequency'].mean().reset_index() - - missing=med[med['amount']==0].groupby('itemid').size().reset_index(name="missing_count") - total=med.groupby('itemid').size().reset_index(name="total_count") - summary=pd.merge(missing,total,on='itemid',how='right') - summary=pd.merge(freq,summary,on='itemid',how='right') - #summary['missing%']=100*(summary['missing_count']/summary['total_count']) - summary=summary.fillna(0) - 
summary.to_csv('./data/summary/med_summary.csv',index=False) - summary['itemid'].to_csv('./data/summary/med_features.csv',index=False) - - - + med = pd.read_csv( + "./data/features/preproc_med_icu.csv.gz", compression="gzip", header=0 + ) + freq = ( + med.groupby(["stay_id", "itemid"]).size().reset_index(name="mean_frequency") + ) + freq = freq.groupby(["itemid"])["mean_frequency"].mean().reset_index() + + missing = ( + med[med["amount"] == 0] + .groupby("itemid") + .size() + .reset_index(name="missing_count") + ) + total = med.groupby("itemid").size().reset_index(name="total_count") + summary = pd.merge(missing, total, on="itemid", how="right") + summary = pd.merge(freq, summary, on="itemid", how="right") + # summary['missing%']=100*(summary['missing_count']/summary['total_count']) + summary = summary.fillna(0) + summary.to_csv("./data/summary/med_summary.csv", index=False) + summary["itemid"].to_csv("./data/summary/med_features.csv", index=False) + if proc_flag: - proc = pd.read_csv("./data/features/preproc_proc_icu.csv.gz", compression='gzip',header=0) - freq=proc.groupby(['stay_id','itemid']).size().reset_index(name="mean_frequency") - freq=freq.groupby(['itemid'])['mean_frequency'].mean().reset_index() - total=proc.groupby('itemid').size().reset_index(name="total_count") - summary=pd.merge(freq,total,on='itemid',how='right') - summary=summary.fillna(0) - summary.to_csv('./data/summary/proc_summary.csv',index=False) - summary['itemid'].to_csv('./data/summary/proc_features.csv',index=False) - - + proc = pd.read_csv( + "./data/features/preproc_proc_icu.csv.gz", compression="gzip", header=0 + ) + freq = ( + proc.groupby(["stay_id", "itemid"]) + .size() + .reset_index(name="mean_frequency") + ) + freq = freq.groupby(["itemid"])["mean_frequency"].mean().reset_index() + total = proc.groupby("itemid").size().reset_index(name="total_count") + summary = pd.merge(freq, total, on="itemid", how="right") + summary = summary.fillna(0) + 
summary.to_csv("./data/summary/proc_summary.csv", index=False) + summary["itemid"].to_csv("./data/summary/proc_features.csv", index=False) + if out_flag: - out = pd.read_csv("./data/features/preproc_out_icu.csv.gz", compression='gzip',header=0) - freq=out.groupby(['stay_id','itemid']).size().reset_index(name="mean_frequency") - freq=freq.groupby(['itemid'])['mean_frequency'].mean().reset_index() - total=out.groupby('itemid').size().reset_index(name="total_count") - summary=pd.merge(freq,total,on='itemid',how='right') - summary=summary.fillna(0) - summary.to_csv('./data/summary/out_summary.csv',index=False) - summary['itemid'].to_csv('./data/summary/out_features.csv',index=False) - + out = pd.read_csv( + "./data/features/preproc_out_icu.csv.gz", compression="gzip", header=0 + ) + freq = ( + out.groupby(["stay_id", "itemid"]).size().reset_index(name="mean_frequency") + ) + freq = freq.groupby(["itemid"])["mean_frequency"].mean().reset_index() + total = out.groupby("itemid").size().reset_index(name="total_count") + summary = pd.merge(freq, total, on="itemid", how="right") + summary = summary.fillna(0) + summary.to_csv("./data/summary/out_summary.csv", index=False) + summary["itemid"].to_csv("./data/summary/out_features.csv", index=False) + if chart_flag: - chart=pd.read_csv("./data/features/preproc_chart_icu.csv.gz", compression='gzip',header=0) - freq=chart.groupby(['stay_id','itemid']).size().reset_index(name="mean_frequency") - freq=freq.groupby(['itemid'])['mean_frequency'].mean().reset_index() - - missing=chart[chart['valuenum']==0].groupby('itemid').size().reset_index(name="missing_count") - total=chart.groupby('itemid').size().reset_index(name="total_count") - summary=pd.merge(missing,total,on='itemid',how='right') - summary=pd.merge(freq,summary,on='itemid',how='right') - #summary['missing_perc']=100*(summary['missing_count']/summary['total_count']) - #summary=summary.fillna(0) - -# final.groupby('itemid')['missing_count'].sum().reset_index() -# 
final.groupby('itemid')['total_count'].sum().reset_index() -# final.groupby('itemid')['missing%'].mean().reset_index() - summary=summary.fillna(0) - summary.to_csv('./data/summary/chart_summary.csv',index=False) - summary['itemid'].to_csv('./data/summary/chart_features.csv',index=False) + chart = pd.read_csv( + "./data/features/preproc_chart_icu.csv.gz", compression="gzip", header=0 + ) + freq = ( + chart.groupby(["stay_id", "itemid"]) + .size() + .reset_index(name="mean_frequency") + ) + freq = freq.groupby(["itemid"])["mean_frequency"].mean().reset_index() + + missing = ( + chart[chart["valuenum"] == 0] + .groupby("itemid") + .size() + .reset_index(name="missing_count") + ) + total = chart.groupby("itemid").size().reset_index(name="total_count") + summary = pd.merge(missing, total, on="itemid", how="right") + summary = pd.merge(freq, summary, on="itemid", how="right") + # summary['missing_perc']=100*(summary['missing_count']/summary['total_count']) + # summary=summary.fillna(0) + + # final.groupby('itemid')['missing_count'].sum().reset_index() + # final.groupby('itemid')['total_count'].sum().reset_index() + # final.groupby('itemid')['missing%'].mean().reset_index() + summary = summary.fillna(0) + summary.to_csv("./data/summary/chart_summary.csv", index=False) + summary["itemid"].to_csv("./data/summary/chart_features.csv", index=False) print("[SUCCESSFULLY SAVED FEATURE SUMMARY]") - -def features_selection_icu(cohort_output, diag_flag,proc_flag,med_flag,out_flag,chart_flag,group_diag,group_med,group_proc,group_out,group_chart): + + +def features_selection_icu( + cohort_output, + diag_flag, + proc_flag, + med_flag, + out_flag, + chart_flag, + group_diag, + group_med, + group_proc, + group_out, + group_chart, +): if diag_flag: if group_diag: print("[FEATURE SELECTION DIAGNOSIS DATA]") - diag = pd.read_csv("./data/features/preproc_diag_icu.csv.gz", compression='gzip',header=0) - features=pd.read_csv("./data/summary/diag_features.csv",header=0) - 
diag=diag[diag['new_icd_code'].isin(features['new_icd_code'].unique())] - - print("Total number of rows",diag.shape[0]) - diag.to_csv("./data/features/preproc_diag_icu.csv.gz", compression='gzip', index=False) + diag = pd.read_csv( + "./data/features/preproc_diag_icu.csv.gz", compression="gzip", header=0 + ) + features = pd.read_csv("./data/summary/diag_features.csv", header=0) + diag = diag[diag["new_icd_code"].isin(features["new_icd_code"].unique())] + + print("Total number of rows", diag.shape[0]) + diag.to_csv( + "./data/features/preproc_diag_icu.csv.gz", + compression="gzip", + index=False, + ) print("[SUCCESSFULLY SAVED DIAGNOSIS DATA]") - - if med_flag: - if group_med: + + if med_flag: + if group_med: print("[FEATURE SELECTION MEDICATIONS DATA]") - med = pd.read_csv("./data/features/preproc_med_icu.csv.gz", compression='gzip',header=0) - features=pd.read_csv("./data/summary/med_features.csv",header=0) - med=med[med['itemid'].isin(features['itemid'].unique())] - print("Total number of rows",med.shape[0]) - med.to_csv('./data/features/preproc_med_icu.csv.gz', compression='gzip', index=False) + med = pd.read_csv( + "./data/features/preproc_med_icu.csv.gz", compression="gzip", header=0 + ) + features = pd.read_csv("./data/summary/med_features.csv", header=0) + med = med[med["itemid"].isin(features["itemid"].unique())] + print("Total number of rows", med.shape[0]) + med.to_csv( + "./data/features/preproc_med_icu.csv.gz", + compression="gzip", + index=False, + ) print("[SUCCESSFULLY SAVED MEDICATIONS DATA]") - - + if proc_flag: if group_proc: print("[FEATURE SELECTION PROCEDURES DATA]") - proc = pd.read_csv("./data/features/preproc_proc_icu.csv.gz", compression='gzip',header=0) - features=pd.read_csv("./data/summary/proc_features.csv",header=0) - proc=proc[proc['itemid'].isin(features['itemid'].unique())] - print("Total number of rows",proc.shape[0]) - proc.to_csv("./data/features/preproc_proc_icu.csv.gz", compression='gzip', index=False) + proc = pd.read_csv( + 
"./data/features/preproc_proc_icu.csv.gz", compression="gzip", header=0 + ) + features = pd.read_csv("./data/summary/proc_features.csv", header=0) + proc = proc[proc["itemid"].isin(features["itemid"].unique())] + print("Total number of rows", proc.shape[0]) + proc.to_csv( + "./data/features/preproc_proc_icu.csv.gz", + compression="gzip", + index=False, + ) print("[SUCCESSFULLY SAVED PROCEDURES DATA]") - - + if out_flag: - if group_out: + if group_out: print("[FEATURE SELECTION OUTPUT EVENTS DATA]") - out = pd.read_csv("./data/features/preproc_out_icu.csv.gz", compression='gzip',header=0) - features=pd.read_csv("./data/summary/out_features.csv",header=0) - out=out[out['itemid'].isin(features['itemid'].unique())] - print("Total number of rows",out.shape[0]) - out.to_csv("./data/features/preproc_out_icu.csv.gz", compression='gzip', index=False) + out = pd.read_csv( + "./data/features/preproc_out_icu.csv.gz", compression="gzip", header=0 + ) + features = pd.read_csv("./data/summary/out_features.csv", header=0) + out = out[out["itemid"].isin(features["itemid"].unique())] + print("Total number of rows", out.shape[0]) + out.to_csv( + "./data/features/preproc_out_icu.csv.gz", + compression="gzip", + index=False, + ) print("[SUCCESSFULLY SAVED OUTPUT EVENTS DATA]") - + if chart_flag: - if group_chart: + if group_chart: print("[FEATURE SELECTION CHART EVENTS DATA]") - - chart=pd.read_csv("./data/features/preproc_chart_icu.csv.gz", compression='gzip',header=0, index_col=None) - - features=pd.read_csv("./data/summary/chart_features.csv",header=0) - chart=chart[chart['itemid'].isin(features['itemid'].unique())] - print("Total number of rows",chart.shape[0]) - chart.to_csv("./data/features/preproc_chart_icu.csv.gz", compression='gzip', index=False) - print("[SUCCESSFULLY SAVED CHART EVENTS DATA]") \ No newline at end of file + + chart = pd.read_csv( + "./data/features/preproc_chart_icu.csv.gz", + compression="gzip", + header=0, + index_col=None, + ) + + features = 
pd.read_csv("./data/summary/chart_features.csv", header=0) + chart = chart[chart["itemid"].isin(features["itemid"].unique())] + print("Total number of rows", chart.shape[0]) + chart.to_csv( + "./data/features/preproc_chart_icu.csv.gz", + compression="gzip", + index=False, + ) + print("[SUCCESSFULLY SAVED CHART EVENTS DATA]") diff --git a/requirements.txt b/requirements.txt index 58020deb8b..3a5579dc19 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,13 @@ -import_ipynb==0.1.3 -ipywidgets==7.5.1 -Jinja2==2.11.2 -matplotlib==3.2.2 -numpy==1.18.5 -pandas==1.0.5 -scikit_learn==1.0.2 -torch==1.6.0 -tqdm==4.47.0 +matplotlib +pandas +scikit_learn +torch +tqdm +ipywidgets +ipykernel +xgboost +imblearn +import_ipynb +captum +behrt_model +pytorch_pretrained_bert \ No newline at end of file diff --git a/script_pathlib.py b/script_pathlib.py new file mode 100644 index 0000000000..fbef31de4f --- /dev/null +++ b/script_pathlib.py @@ -0,0 +1,71 @@ +from pipeline.cohort_extractor import CohortExtractor +from pipeline.feature.diagnoses import IcdGroupOption +from pipeline.feature_selector import FeatureSelector +from pipeline.features_preprocessor import FeaturePreprocessor +from pipeline.prediction_task import TargetType, PredictionTask, DiseaseCode +from pipeline.features_extractor import FeatureExtractor + +# from pipeline.data_generator import DataGenerator + + +if __name__ == "__main__": + prediction_task = PredictionTask( + target_type=TargetType.READMISSION, + disease_readmission=DiseaseCode.CAD, + disease_selection=None, + nb_days=30, + use_icu=False, + ) + + cohort_extractor = CohortExtractor(prediction_task=prediction_task) + cohort = cohort_extractor.extract() + feature_extractor = FeatureExtractor( + cohort_output=cohort_extractor.cohort_output, + use_icu=prediction_task.use_icu, + for_diagnoses=True, + for_labs=not prediction_task.use_icu, + for_chart_events=prediction_task.use_icu, + for_medications=True, + for_output_events=prediction_task.use_icu, + 
for_procedures=True, + ) + features = feature_extractor.save_features() + + feat_preproc = FeaturePreprocessor( + feature_extractor=feature_extractor, + group_diag_icd=IcdGroupOption.KEEP, + group_med_code=True, + keep_proc_icd9=False, + clean_chart=False, + impute_outlier_chart=False, + clean_labs=False, + impute_labs=False, + ) + preproc = feat_preproc.preprocess_no_event_features() + summaries = feat_preproc.save_summaries() + + feat_select = FeatureSelector( + prediction_task.use_icu, + feature_extractor.for_diagnoses, + feature_extractor.for_medications, + feature_extractor.for_procedures, + not (prediction_task.use_icu) and feature_extractor.for_labs, + prediction_task.use_icu and feature_extractor.for_chart_events, + prediction_task.use_icu and feature_extractor.for_output_events, + ) + + selection = feat_select.feature_selection() + + feat_preproc = FeaturePreprocessor( + feature_extractor=feature_extractor, + group_diag_icd=IcdGroupOption.KEEP, + group_med_code=False, + keep_proc_icd9=False, + clean_chart=False, + impute_outlier_chart=False, + clean_labs=True, + impute_labs=True, + thresh=98, + left_thresh=0, + ) + feat_preproc.preproc_events_features() diff --git a/tests/conversion/test_icd.py b/tests/conversion/test_icd.py new file mode 100644 index 0000000000..9d941a68e7 --- /dev/null +++ b/tests/conversion/test_icd.py @@ -0,0 +1,79 @@ +import pandas as pd +from pipeline.conversion.icd import IcdConverter +from pipeline.file_info.raw.hosp import HospDiagnosesIcd + + +def test_converter(): + """ + Tests the IcdConverter class for standardizing ICD codes and extracting hospital + admission IDs based on specific ICD-10 codes. + + This test validates: + - The conversion of ICD codes from version 9 to version 10. + - The extraction of 'root' ICD codes. + - The retrieval of hospital admission IDs for a given ICD-10 code. 
+ """ + + # Given: Sample ICD codes, versions, and hospital admission IDs + icd_codes = [ + "4139", + "V707", + "41401", + "D696", + "S030XXA", + "S25512A", + "I5022", + "42821", + "4280", + ] + icd_versions = [9, 9, 9, 10, 10, 10, 10, 9, 9] + + admissions = [ + 1, + 1, + 1, + 2, + 3, + 3, + 3, + 4, + 4, + ] + df = pd.DataFrame( + { + "icd_code": icd_codes, + "icd_version": icd_versions, + "hadm_id": admissions, + } + ) + + icd_converter = IcdConverter() + st_dia = icd_converter.standardize_icd(df) + hids = icd_converter.get_pos_ids(st_dia, "I50") + + # Expected results for root ICD-10 conversion and hospital admission IDs + expected_root_icd10 = [ + "I208", + "Z0000", + "I2510", + "D696", + "S030XXA", + "S25512A", + "I5022", + "I50814", + "I50814", + ] + expected_root = [ + "I20", + "Z00", + "I25", + "D69", + "S03", + "S25", + "I50", + "I50", + "I50", + ] + assert st_dia["root_icd10_convert"].values.tolist() == expected_root_icd10 + assert st_dia["root"].values.tolist() == expected_root + assert hids.tolist() == [3, 4] diff --git a/tests/test_cohort_extractor.py b/tests/test_cohort_extractor.py new file mode 100644 index 0000000000..87fcaf99bd --- /dev/null +++ b/tests/test_cohort_extractor.py @@ -0,0 +1,50 @@ +import pytest +from pipeline.cohort_extractor import CohortExtractor +from pipeline.prediction_task import PredictionTask, TargetType + + +@pytest.mark.parametrize( + "use_icu, target_type, nb_days, disease_readmission, disease_selection, expected_admission_records_count, expected_patients_count, expected_positive_cases_count", + [ + (True, TargetType.MORTALITY, 0, None, None, 140, 100, 10), + (True, TargetType.LOS, 3, None, None, 140, 100, 55), + (True, TargetType.LOS, 7, None, None, 140, 100, 20), + (True, TargetType.READMISSION, 30, None, None, 128, 93, 18), + (True, TargetType.READMISSION, 90, None, None, 128, 93, 22), + (True, TargetType.READMISSION, 30, "I50", None, 27, 20, 2), + (True, TargetType.READMISSION, 30, "I25", None, 32, 29, 2), + (True, 
TargetType.READMISSION, 30, "N18", None, 25, 18, 2), + (True, TargetType.READMISSION, 30, "J44", None, 17, 12, 3), + (False, TargetType.MORTALITY, 0, None, None, 275, 100, 15), + (False, TargetType.LOS, 3, None, None, 275, 100, 163), + (False, TargetType.LOS, 7, None, None, 275, 100, 76), + (False, TargetType.READMISSION, 30, None, None, 260, 95, 52), + (False, TargetType.READMISSION, 90, None, None, 260, 95, 86), + (False, TargetType.READMISSION, 30, "I50", None, 55, 23, 13), + # heart failure + (False, TargetType.READMISSION, 30, "I25", None, 68, 32, 13), + (False, TargetType.READMISSION, 30, "N18", None, 63, 22, 10), + (False, TargetType.READMISSION, 30, "J44", None, 26, 12, 7), + (True, TargetType.MORTALITY, 0, None, "I50", 32, 22, 5), + ], +) +def test_cohort_extractor( + use_icu, + target_type, + nb_days, + disease_readmission, + disease_selection, + expected_admission_records_count, + expected_patients_count, + expected_positive_cases_count, +): + prediction_task = PredictionTask( + target_type, disease_readmission, disease_selection, nb_days, use_icu + ) + cohort_extractor = CohortExtractor( + prediction_task=prediction_task, + ) + df = cohort_extractor.extract().df + assert len(df) == expected_admission_records_count + assert df["subject_id"].nunique() == expected_patients_count + assert df["label"].sum() == expected_positive_cases_count diff --git a/tests/test_feature_extractor.py b/tests/test_feature_extractor.py new file mode 100644 index 0000000000..6a7574c69a --- /dev/null +++ b/tests/test_feature_extractor.py @@ -0,0 +1,125 @@ +from pipeline.features_extractor import ( + FeatureExtractor, +) +from pipeline.feature.feature_abc import Name + + +def test_feature_icu_all_true(): + feature_extractor = FeatureExtractor( + cohort_output="cohort_icu_mortality_0_", + use_icu=True, + for_diagnoses=True, + for_output_events=True, + for_chart_events=True, + for_procedures=True, + for_medications=True, + for_labs=True, + ) + result = 
feature_extractor.save_features() + assert len(result) == 5 + assert len(result[Name.DIAGNOSES]) == 2647 + assert result[Name.DIAGNOSES].columns.tolist() == [ + "subject_id", + "hadm_id", + "icd_code", + "root_icd10_convert", + "root", + "stay_id", + ] + assert len(result[Name.PROCEDURES]) == 1435 + assert result[Name.PROCEDURES].columns.tolist() == [ + "subject_id", + "hadm_id", + "stay_id", + "itemid", + "starttime", + "intime", + "event_time_from_admit", + ] + assert len(result[Name.MEDICATIONS]) == 11038 + assert result[Name.MEDICATIONS].columns.tolist() == [ + "subject_id", + "hadm_id", + "starttime", + "start_hours_from_admit", + "stop_hours_from_admit", + "stay_id", + "itemid", + "endtime", + "rate", + "amount", + "orderid", + ] + assert len(result[Name.OUTPUT]) == 9362 + assert result[Name.OUTPUT].columns.tolist() == [ + "subject_id", + "hadm_id", + "stay_id", + "itemid", + "charttime", + "intime", + "event_time_from_admit", + ] + assert len(result[Name.CHART]) == 162571 + assert result[Name.CHART].columns.tolist() == [ + "stay_id", + "itemid", + "valuenum", + "event_time_from_admit", + ] + + +def test_feature_non_icu_all_true(): + feature_extractor = FeatureExtractor( + cohort_output="cohort_Non-ICU_readmission_30_I50", + use_icu=False, + for_diagnoses=True, + for_output_events=True, + for_chart_events=True, + for_procedures=True, + for_medications=True, + for_labs=True, + ) + result = feature_extractor.save_features() + assert len(result) == 4 + assert len(result[Name.DIAGNOSES]) == 1273 + assert result[Name.DIAGNOSES].columns.tolist() == [ + "subject_id", + "hadm_id", + "icd_code", + "root_icd10_convert", + "root", + ] + assert len(result[Name.PROCEDURES]) == 136 + assert result[Name.PROCEDURES].columns.tolist() == [ + "subject_id", + "hadm_id", + "icd_code", + "icd_version", + "chartdate", + "admittime", + "proc_time_from_admit", + ] + assert len(result[Name.MEDICATIONS]) == 4803 + assert result[Name.MEDICATIONS].columns.tolist() == [ + "subject_id", + 
"hadm_id", + "starttime", + "start_hours_from_admit", + "stop_hours_from_admit", + "stoptime", + "drug", + "nonproprietaryname", + "dose_val_rx", + "EPC", + ] + assert len(result[Name.LAB]) == 22029 + assert result[Name.LAB].columns.tolist() == [ + "subject_id", + "hadm_id", + "itemid", + "charttime", + "admittime", + "lab_time_from_admit", + "valuenum", + ] diff --git a/tests/test_feature_preprocessor.py b/tests/test_feature_preprocessor.py new file mode 100644 index 0000000000..e3a97c1f42 --- /dev/null +++ b/tests/test_feature_preprocessor.py @@ -0,0 +1,75 @@ +from pipeline.features_extractor import FeatureExtractor +from pipeline.features_preprocessor import FeaturePreprocessor, IcdGroupOption +from pipeline.data_generator import DataGenerator + + +def test_feature_icu_all_true(): + extractor = FeatureExtractor( + cohort_output="cohort_icu_mortality_0_", + use_icu=True, + for_diagnoses=True, + for_output_events=True, + for_chart_events=True, + for_procedures=True, + for_medications=True, + for_labs=True, + ) + preprocessor = FeaturePreprocessor( + feature_extractor=extractor, + group_diag_icd=IcdGroupOption.GROUP, + group_med_code=True, + keep_proc_icd9=False, + clean_chart=True, + impute_outlier_chart=True, + impute_labs=True, + thresh=98, + left_thresh=2, + clean_labs=True, + ) + extractor.save_features() + preprocessor.preprocess() + generator = DataGenerator( + cohort_output=extractor.cohort_output, + feature_extractor=extractor, + ) + generator.generate_features() + generator.length_by_target() + generator.smooth_ini() + generator.smooth_tqdm() + assert 5 == 5 + + +# def test_feature_non_icu_all_true(): +# extractor = FeatureExtractor( +# cohort_output="cohort_Non-ICU_readmission_30_I50", +# use_icu=False, +# for_diagnoses=True, +# for_output_events=True, +# for_chart_events=True, +# for_procedures=True, +# for_medications=True, +# for_labs=True, +# ) +# preprocessor = FeaturePreprocessor( +# feature_extractor=extractor, +# 
group_diag_icd=IcdGroupOption.GROUP, +# group_med_code=True, +# keep_proc_icd9=False, +# clean_chart=True, +# impute_outlier_chart=True, +# impute_labs=True, +# thresh=95, +# left_thresh=5, +# clean_labs=True, +# ) +# extractor.save_features() +# preprocessor.preprocess() +# generator = DataGenerator( +# cohort_output=extractor.cohort_output, +# feature_extractor=extractor, +# ) +# generator.generate_features() +# generator.length_by_target() +# generator.smooth_ini() +# generator.smooth_tqdm() +# assert 4 == 4 diff --git a/todo.md b/todo.md new file mode 100644 index 0000000000..3918e87b10 --- /dev/null +++ b/todo.md @@ -0,0 +1,66 @@ +# FeatureExtractor + +- cohort preprocessing +- raw file preprocessing +- feature preprocessing / feature cleaning ? + + + +# module my preprocessing + +- mkdir... + +- precommit + +- implementer options pour l extract + +- raw extract column csv + +- handle columns (fewarure selection?) + +- refact filter?, conversion + +- CLARIFY LOG + +- EXPLICIT OPTION CONSEQUENCE ICU... option =-> get columns... + +- summary, more comments, clarification of the pipeline + + + + + +# module feature selection + +- define tests + +- summary, more comments, clarification of the whole pipeline (use column emum...) + +- what are the features: define enum to clarify with header enums + +# finalize preprocessing +- cleaning +- other preprocessing transformation +- output files +- features summary + + +# data_generation + + +# ML_models + +# DL_models + +# tokenization.BEHRT_model + +# evaluation + +# calibraation + +callibrate_output.callibrate + +# project +- requirements.txt +- (docker pipeline?) 
+- options diff --git a/utils/icu_preprocess_util.py b/utils/icu_preprocess_util.py index 43764a2561..9c99092a36 100644 --- a/utils/icu_preprocess_util.py +++ b/utils/icu_preprocess_util.py @@ -9,14 +9,24 @@ from sklearn.preprocessing import MultiLabelBinarizer + ########################## GENERAL ########################## -def dataframe_from_csv(path, compression='gzip', header=0, index_col=0, chunksize=None): - return pd.read_csv(path, compression=compression, header=header, index_col=index_col, chunksize=None) +def dataframe_from_csv(path, compression="gzip", header=0, index_col=0, chunksize=None): + return pd.read_csv( + path, + compression=compression, + header=header, + index_col=index_col, + chunksize=None, + ) + def read_admissions_table(mimic4_path): - admits = dataframe_from_csv(os.path.join(mimic4_path, 'core/admissions.csv.gz')) - admits=admits.reset_index() - admits = admits[['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime', 'ethnicity']] + admits = dataframe_from_csv(os.path.join(mimic4_path, "core/admissions.csv.gz")) + admits = admits.reset_index() + admits = admits[ + ["subject_id", "hadm_id", "admittime", "dischtime", "deathtime", "ethnicity"] + ] admits.admittime = pd.to_datetime(admits.admittime) admits.dischtime = pd.to_datetime(admits.dischtime) admits.deathtime = pd.to_datetime(admits.deathtime) @@ -24,31 +34,43 @@ def read_admissions_table(mimic4_path): def read_patients_table(mimic4_path): - pats = dataframe_from_csv(os.path.join(mimic4_path, 'core/patients.csv.gz')) + pats = dataframe_from_csv(os.path.join(mimic4_path, "core/patients.csv.gz")) pats = pats.reset_index() - pats = pats[['subject_id', 'gender','dod','anchor_age','anchor_year', 'anchor_year_group']] - pats['yob']= pats['anchor_year'] - pats['anchor_age'] - #pats.dob = pd.to_datetime(pats.dob) + pats = pats[ + [ + "subject_id", + "gender", + "dod", + "anchor_age", + "anchor_year", + "anchor_year_group", + ] + ] + pats["yob"] = pats["anchor_year"] - 
pats["anchor_age"] + # pats.dob = pd.to_datetime(pats.dob) pats.dod = pd.to_datetime(pats.dod) return pats ########################## DIAGNOSES ########################## def read_diagnoses_icd_table(mimic4_path): - diag = dataframe_from_csv(os.path.join(mimic4_path, 'hosp/diagnoses_icd.csv.gz')) + diag = dataframe_from_csv(os.path.join(mimic4_path, "hosp/diagnoses_icd.csv.gz")) diag.reset_index(inplace=True) return diag def read_d_icd_diagnoses_table(mimic4_path): - d_icd = dataframe_from_csv(os.path.join(mimic4_path, 'hosp/d_icd_diagnoses.csv.gz')) + d_icd = dataframe_from_csv(os.path.join(mimic4_path, "hosp/d_icd_diagnoses.csv.gz")) d_icd.reset_index(inplace=True) - return d_icd[['icd_code', 'long_title']] + return d_icd[["icd_code", "long_title"]] def read_diagnoses(mimic4_path): return read_diagnoses_icd_table(mimic4_path).merge( - read_d_icd_diagnoses_table(mimic4_path), how='inner', left_on=['icd_code'], right_on=['icd_code'] + read_d_icd_diagnoses_table(mimic4_path), + how="inner", + left_on=["icd_code"], + right_on=["icd_code"], ) @@ -67,12 +89,13 @@ def icd_9to10(icd): return np.nan # Create new column with original codes as default - col_name = 'icd10_convert' - if root: col_name = 'root_' + col_name - df[col_name] = df['icd_code'].values + col_name = "icd10_convert" + if root: + col_name = "root_" + col_name + df[col_name] = df["icd_code"].values # Group identical ICD9 codes, then convert all ICD9 codes within a group to ICD10 - for code, group in df.loc[df.icd_version == 9].groupby(by='icd_code'): + for code, group in df.loc[df.icd_version == 9].groupby(by="icd_code"): new_code = icd_9to10(code) for idx in group.index.values: # Modify values of original df at the indexes in the groups @@ -81,71 +104,108 @@ def icd_9to10(icd): ########################## PROCEDURES ########################## def read_procedures_icd_table(mimic4_path): - proc = dataframe_from_csv(os.path.join(mimic4_path, 'hosp/procedures_icd.csv.gz')) + proc = 
dataframe_from_csv(os.path.join(mimic4_path, "hosp/procedures_icd.csv.gz")) proc.reset_index(inplace=True) return proc def read_d_icd_procedures_table(mimic4_path): - p_icd = dataframe_from_csv(os.path.join(mimic4_path, 'hosp/d_icd_procedures.csv.gz')) + p_icd = dataframe_from_csv( + os.path.join(mimic4_path, "hosp/d_icd_procedures.csv.gz") + ) p_icd.reset_index(inplace=True) - return p_icd[['icd_code', 'long_title']] + return p_icd[["icd_code", "long_title"]] def read_procedures(mimic4_path): return read_procedures_icd_table(mimic4_path).merge( - read_d_icd_procedures_table(mimic4_path), how='inner', left_on=['icd_code'], right_on=['icd_code'] + read_d_icd_procedures_table(mimic4_path), + how="inner", + left_on=["icd_code"], + right_on=["icd_code"], ) ########################## MAPPING ########################## def read_icd_mapping(map_path): - mapping = pd.read_csv(map_path, header=0, delimiter='\t') + mapping = pd.read_csv(map_path, header=0, delimiter="\t") mapping.diagnosis_description = mapping.diagnosis_description.apply(str.lower) return mapping ########################## PREPROCESSING ########################## -def preproc_meds(module_path:str, adm_cohort_path:str) -> pd.DataFrame: - - adm = pd.read_csv(adm_cohort_path, usecols=['hadm_id', 'stay_id', 'intime'], parse_dates = ['intime']) - med = pd.read_csv(module_path, compression='gzip', usecols=['subject_id', 'stay_id', 'itemid', 'starttime', 'endtime','rate','amount','orderid'], parse_dates = ['starttime', 'endtime']) - med = med.merge(adm, left_on = 'stay_id', right_on = 'stay_id', how = 'inner') - med['start_hours_from_admit'] = med['starttime'] - med['intime'] - med['stop_hours_from_admit'] = med['endtime'] - med['intime'] - - #print(med.isna().sum()) - med=med.dropna() - #med[['amount','rate']]=med[['amount','rate']].fillna(0) + +def preproc_meds(module_path: str, adm_cohort_path: str) -> pd.DataFrame: + adm = pd.read_csv( + adm_cohort_path, + usecols=["hadm_id", "stay_id", "intime"], + 
parse_dates=["intime"], + ) + med = pd.read_csv( + module_path, + compression="gzip", + usecols=[ + "subject_id", + "stay_id", + "itemid", + "starttime", + "endtime", + "rate", + "amount", + "orderid", + ], + parse_dates=["starttime", "endtime"], + ) + med = med.merge(adm, left_on="stay_id", right_on="stay_id", how="inner") + med["start_hours_from_admit"] = med["starttime"] - med["intime"] + med["stop_hours_from_admit"] = med["endtime"] - med["intime"] + + # print(med.isna().sum()) + med = med.dropna() + # med[['amount','rate']]=med[['amount','rate']].fillna(0) print("# of unique type of drug: ", med.itemid.nunique()) print("# Admissions: ", med.stay_id.nunique()) - print("# Total rows", med.shape[0]) - + print("# Total rows", med.shape[0]) + return med - -def preproc_proc(dataset_path: str, cohort_path:str, time_col:str, dtypes: dict, usecols: list) -> pd.DataFrame: + + +def preproc_proc( + dataset_path: str, cohort_path: str, time_col: str, dtypes: dict, usecols: list +) -> pd.DataFrame: """Function for getting hosp observations pertaining to a pickled cohort. 
Function is structured to save memory when reading and transforming data.""" def merge_module_cohort() -> pd.DataFrame: """Gets the initial module data with patients anchor year data and only the year of the charttime""" - + # read module w/ custom params - module = pd.read_csv(dataset_path, compression='gzip', usecols=usecols, dtype=dtypes, parse_dates=[time_col]).drop_duplicates() - #print(module.head()) + module = pd.read_csv( + dataset_path, + compression="gzip", + usecols=usecols, + dtype=dtypes, + parse_dates=[time_col], + ).drop_duplicates() + # print(module.head()) # Only consider values in our cohort - cohort = pd.read_csv(cohort_path, compression='gzip', parse_dates = ['intime']) - - #print(module.head()) - #print(cohort.head()) + cohort = pd.read_csv(cohort_path, compression="gzip", parse_dates=["intime"]) + + # print(module.head()) + # print(cohort.head()) # merge module and cohort - return module.merge(cohort[['subject_id','hadm_id','stay_id', 'intime','outtime']], how='inner', left_on='stay_id', right_on='stay_id') + return module.merge( + cohort[["subject_id", "hadm_id", "stay_id", "intime", "outtime"]], + how="inner", + left_on="stay_id", + right_on="stay_id", + ) df_cohort = merge_module_cohort() - df_cohort['event_time_from_admit'] = df_cohort[time_col] - df_cohort['intime'] - - df_cohort=df_cohort.dropna() + df_cohort["event_time_from_admit"] = df_cohort[time_col] - df_cohort["intime"] + + df_cohort = df_cohort.dropna() # Print unique counts and value_counts print("# Unique Events: ", df_cohort.itemid.dropna().nunique()) print("# Admissions: ", df_cohort.stay_id.nunique()) @@ -154,27 +214,41 @@ def merge_module_cohort() -> pd.DataFrame: # Only return module measurements within the observation range, sorted by subject_id return df_cohort -def preproc_out(dataset_path: str, cohort_path:str, time_col:str, dtypes: dict, usecols: list) -> pd.DataFrame: + +def preproc_out( + dataset_path: str, cohort_path: str, time_col: str, dtypes: dict, usecols: 
list +) -> pd.DataFrame: """Function for getting hosp observations pertaining to a pickled cohort. Function is structured to save memory when reading and transforming data.""" def merge_module_cohort() -> pd.DataFrame: """Gets the initial module data with patients anchor year data and only the year of the charttime""" - + # read module w/ custom params - module = pd.read_csv(dataset_path, compression='gzip', usecols=usecols, dtype=dtypes, parse_dates=[time_col]).drop_duplicates() - #print(module.head()) + module = pd.read_csv( + dataset_path, + compression="gzip", + usecols=usecols, + dtype=dtypes, + parse_dates=[time_col], + ).drop_duplicates() + # print(module.head()) # Only consider values in our cohort - cohort = pd.read_csv(cohort_path, compression='gzip', parse_dates = ['intime']) - - #print(module.head()) - #print(cohort.head()) + cohort = pd.read_csv(cohort_path, compression="gzip", parse_dates=["intime"]) + + # print(module.head()) + # print(cohort.head()) # merge module and cohort - return module.merge(cohort[['stay_id', 'intime','outtime']], how='inner', left_on='stay_id', right_on='stay_id') + return module.merge( + cohort[["stay_id", "intime", "outtime"]], + how="inner", + left_on="stay_id", + right_on="stay_id", + ) df_cohort = merge_module_cohort() - df_cohort['event_time_from_admit'] = df_cohort[time_col] - df_cohort['intime'] - df_cohort=df_cohort.dropna() + df_cohort["event_time_from_admit"] = df_cohort[time_col] - df_cohort["intime"] + df_cohort = df_cohort.dropna() # Print unique counts and value_counts print("# Unique Events: ", df_cohort.itemid.nunique()) print("# Admissions: ", df_cohort.stay_id.nunique()) @@ -183,46 +257,62 @@ def merge_module_cohort() -> pd.DataFrame: # Only return module measurements within the observation range, sorted by subject_id return df_cohort -def preproc_chart(dataset_path: str, cohort_path:str, time_col:str, dtypes: dict, usecols: list) -> pd.DataFrame: + +def preproc_chart( + dataset_path: str, cohort_path: str, 
time_col: str, dtypes: dict, usecols: list +) -> pd.DataFrame: """Function for getting hosp observations pertaining to a pickled cohort. Function is structured to save memory when reading and transforming data.""" - + # Only consider values in our cohort - cohort = pd.read_csv(cohort_path, compression='gzip', parse_dates = ['intime']) - df_cohort=pd.DataFrame() - # read module w/ custom params + cohort = pd.read_csv(cohort_path, compression="gzip", parse_dates=["intime"]) + df_cohort = pd.DataFrame() + # read module w/ custom params chunksize = 10000000 - count=0 - nitem=[] - nstay=[] - nrows=0 - for chunk in tqdm(pd.read_csv(dataset_path, compression='gzip', usecols=usecols, dtype=dtypes, parse_dates=[time_col],chunksize=chunksize)): - #print(chunk.head()) - count=count+1 - #chunk['valuenum']=chunk['valuenum'].fillna(0) - chunk=chunk.dropna(subset=['valuenum']) - chunk_merged=chunk.merge(cohort[['stay_id', 'intime']], how='inner', left_on='stay_id', right_on='stay_id') - chunk_merged['event_time_from_admit'] = chunk_merged[time_col] - chunk_merged['intime'] - - del chunk_merged[time_col] - del chunk_merged['intime'] - chunk_merged=chunk_merged.dropna() - chunk_merged=chunk_merged.drop_duplicates() + count = 0 + nitem = [] + nstay = [] + nrows = 0 + for chunk in tqdm( + pd.read_csv( + dataset_path, + compression="gzip", + usecols=usecols, + dtype=dtypes, + parse_dates=[time_col], + chunksize=chunksize, + ) + ): + # print(chunk.head()) + count = count + 1 + # chunk['valuenum']=chunk['valuenum'].fillna(0) + chunk = chunk.dropna(subset=["valuenum"]) + chunk_merged = chunk.merge( + cohort[["stay_id", "intime"]], + how="inner", + left_on="stay_id", + right_on="stay_id", + ) + chunk_merged["event_time_from_admit"] = ( + chunk_merged[time_col] - chunk_merged["intime"] + ) + + del chunk_merged[time_col] + del chunk_merged["intime"] + chunk_merged = chunk_merged.dropna() + chunk_merged = chunk_merged.drop_duplicates() if df_cohort.empty: - df_cohort=chunk_merged + df_cohort 
= chunk_merged else: - df_cohort=df_cohort.append(chunk_merged, ignore_index=True) - - -# nitem.append(chunk_merged.itemid.dropna().unique()) -# nstay=nstay.append(chunk_merged.stay_id.unique()) -# nrows=nrows+chunk_merged.shape[0] - - - + df_cohort = df_cohort.append(chunk_merged, ignore_index=True) + + # nitem.append(chunk_merged.itemid.dropna().unique()) + # nstay=nstay.append(chunk_merged.stay_id.unique()) + # nrows=nrows+chunk_merged.shape[0] + # Print unique counts and value_counts -# print("# Unique Events: ", len(set(nitem))) -# print("# Admissions: ", len(set(nstay))) -# print("Total rows", nrows) + # print("# Unique Events: ", len(set(nitem))) + # print("# Admissions: ", len(set(nstay))) + # print("Total rows", nrows) print("# Unique Events: ", df_cohort.itemid.nunique()) print("# Admissions: ", df_cohort.stay_id.nunique()) print("Total rows", df_cohort.shape[0]) @@ -230,21 +320,33 @@ def preproc_chart(dataset_path: str, cohort_path:str, time_col:str, dtypes: dict # Only return module measurements within the observation range, sorted by subject_id return df_cohort -def preproc_icd_module(module_path:str, adm_cohort_path:str, icd_map_path=None, map_code_colname=None, only_icd10=True) -> pd.DataFrame: - """Takes an module dataset with ICD codes and puts it in long_format, optionally mapping ICD-codes by a mapping table path""" - - def get_module_cohort(module_path:str, cohort_path:str): - module = pd.read_csv(module_path, compression='gzip', header=0) - adm_cohort = pd.read_csv(adm_cohort_path, compression='gzip', header=0) - #print(module.head()) - #print(adm_cohort.head()) - - #adm_cohort = adm_cohort.loc[(adm_cohort.timedelta_years <= 6) & (~adm_cohort.timedelta_years.isna())] - return module.merge(adm_cohort[['hadm_id', 'stay_id', 'label']], how='inner', left_on='hadm_id', right_on='hadm_id') + +def preproc_icd_module( + module_path: str, + adm_cohort_path: str, + icd_map_path=None, + map_code_colname=None, + only_icd10=True, +) -> pd.DataFrame: + 
"""Takes an module dataset with ICD codes and puts it in long_format, optionally mapping ICD-codes by a mapping table path""" + + def get_module_cohort(module_path: str, cohort_path: str): + module = pd.read_csv(module_path, compression="gzip", header=0) + adm_cohort = pd.read_csv(adm_cohort_path, compression="gzip", header=0) + # print(module.head()) + # print(adm_cohort.head()) + + # adm_cohort = adm_cohort.loc[(adm_cohort.timedelta_years <= 6) & (~adm_cohort.timedelta_years.isna())] + return module.merge( + adm_cohort[["hadm_id", "stay_id", "label"]], + how="inner", + left_on="hadm_id", + right_on="hadm_id", + ) def standardize_icd(mapping, df, root=False): """Takes an ICD9 -> ICD10 mapping table and a modulenosis dataframe; adds column with converted ICD10 column""" - + def icd_9to10(icd): # If root is true, only map an ICD 9 -> 10 according to the ICD9's root (first 3 digits) if root: @@ -253,16 +355,17 @@ def icd_9to10(icd): # Many ICD-9's do not have a 1-to-1 mapping; get first index of mapped codes return mapping.loc[mapping[map_code_colname] == icd].icd10cm.iloc[0] except: - #print("Error on code", icd) + # print("Error on code", icd) return np.nan # Create new column with original codes as default - col_name = 'icd10_convert' - if root: col_name = 'root_' + col_name - df[col_name] = df['icd_code'].values + col_name = "icd10_convert" + if root: + col_name = "root_" + col_name + df[col_name] = df["icd_code"].values # Group identical ICD9 codes, then convert all ICD9 codes within a group to ICD10 - for code, group in df.loc[df.icd_version == 9].groupby(by='icd_code'): + for code, group in df.loc[df.icd_version == 9].groupby(by="icd_code"): new_code = icd_9to10(code) for idx in group.index.values: # Modify values of original df at the indexes in the groups @@ -270,30 +373,52 @@ def icd_9to10(icd): if only_icd10: # Column for just the roots of the converted ICD10 column - df['root'] = df[col_name].apply(lambda x: x[:3] if type(x) is str else np.nan) + 
df["root"] = df[col_name].apply( + lambda x: x[:3] if type(x) is str else np.nan + ) module = get_module_cohort(module_path, adm_cohort_path) - #print(module.shape) - #print(module['icd_code'].nunique()) + # print(module.shape) + # print(module['icd_code'].nunique()) # Optional ICD mapping if argument passed if icd_map_path: icd_map = read_icd_mapping(icd_map_path) - #print(icd_map) + # print(icd_map) standardize_icd(icd_map, module, root=True) - print("# unique ICD-9 codes",module[module['icd_version']==9]['icd_code'].nunique()) - print("# unique ICD-10 codes",module[module['icd_version']==10]['icd_code'].nunique()) - print("# unique ICD-10 codes (After converting ICD-9 to ICD-10)",module['root_icd10_convert'].nunique()) - print("# unique ICD-10 codes (After clinical gruping ICD-10 codes)",module['root'].nunique()) + print( + "# unique ICD-9 codes", + module[module["icd_version"] == 9]["icd_code"].nunique(), + ) + print( + "# unique ICD-10 codes", + module[module["icd_version"] == 10]["icd_code"].nunique(), + ) + print( + "# unique ICD-10 codes (After converting ICD-9 to ICD-10)", + module["root_icd10_convert"].nunique(), + ) + print( + "# unique ICD-10 codes (After clinical gruping ICD-10 codes)", + module["root"].nunique(), + ) print("# Admissions: ", module.stay_id.nunique()) print("Total rows", module.shape[0]) return module -def pivot_cohort(df: pd.DataFrame, prefix: str, target_col:str, values='values', use_mlb=False, ohe=True, max_features=None): +def pivot_cohort( + df: pd.DataFrame, + prefix: str, + target_col: str, + values="values", + use_mlb=False, + ohe=True, + max_features=None, +): """Pivots long_format data into a multiindex array: - || feature 1 || ... || feature n || - || subject_id || label || timedelta || + || feature 1 || ... 
|| feature n || + || subject_id || label || timedelta || """ aggfunc = np.mean pivot_df = df.dropna(subset=[target_col]) @@ -303,18 +428,50 @@ def pivot_cohort(df: pd.DataFrame, prefix: str, target_col:str, values='values', output = mlb.fit_transform(pivot_df[target_col].apply(ast.literal_eval)) output = pd.DataFrame(output, columns=mlb.classes_) if max_features: - top_features = output.sum().sort_values(ascending=False).index[:max_features] + top_features = ( + output.sum().sort_values(ascending=False).index[:max_features] + ) output = output[top_features] - pivot_df = pd.concat([pivot_df[['subject_id', 'label', 'timedelta']].reset_index(drop=True), output], axis=1) - pivot_df = pd.pivot_table(pivot_df, index=['subject_id', 'label', 'timedelta'], values=pivot_df.columns[3:], aggfunc=np.max) + pivot_df = pd.concat( + [ + pivot_df[["subject_id", "label", "timedelta"]].reset_index(drop=True), + output, + ], + axis=1, + ) + pivot_df = pd.pivot_table( + pivot_df, + index=["subject_id", "label", "timedelta"], + values=pivot_df.columns[3:], + aggfunc=np.max, + ) else: if max_features: - top_features = pd.Series(pivot_df[['subject_id', target_col]].drop_duplicates()[target_col].value_counts().index[:max_features], name=target_col) - pivot_df = pivot_df.merge(top_features, how='inner', left_on=target_col, right_on=target_col) + top_features = pd.Series( + pivot_df[["subject_id", target_col]] + .drop_duplicates()[target_col] + .value_counts() + .index[:max_features], + name=target_col, + ) + pivot_df = pivot_df.merge( + top_features, how="inner", left_on=target_col, right_on=target_col + ) if ohe: - pivot_df = pd.concat([pivot_df.reset_index(drop=True), pd.Series(np.ones(pivot_df.shape[0], dtype=int), name='values')], axis=1) + pivot_df = pd.concat( + [ + pivot_df.reset_index(drop=True), + pd.Series(np.ones(pivot_df.shape[0], dtype=int), name="values"), + ], + axis=1, + ) aggfunc = np.max - pivot_df = pivot_df.pivot_table(index=['subject_id', 'label', 'timedelta'], 
columns=target_col, values=values, aggfunc=aggfunc) + pivot_df = pivot_df.pivot_table( + index=["subject_id", "label", "timedelta"], + columns=target_col, + values=values, + aggfunc=aggfunc, + ) pivot_df.columns = [prefix + str(i) for i in pivot_df.columns] - return pivot_df \ No newline at end of file + return pivot_df