Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add custom names for pain-related concepts #21

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions 05_CustomConcepts/dutch-umls_custom_concepts.csv

This file was deleted.

2 changes: 2 additions & 0 deletions 05_CustomConcepts/dutch-umls_custom_name_status.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
cui,str,tty
C0002769,epidurale analgesie,P
105 changes: 105 additions & 0 deletions 05_CustomConcepts/dutch-umls_custom_names.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
cui,str,tty,sab
C0456984,uitslag,A,UMCU
C0019080,bloedt,A,UMCU
C0019080,bloeden,A,UMCU
C0225844,RA,A,UMCU
C0225883,RV,A,UMCU
C0225897,LV,A,UMCU
C0011206,delier,P,UMCU
C0003232,antibiotica,P,UMCU
C0000970,pcm,A,UMCU
C0002769,epi,A,UMCU
C0002769,EPI,A,UMCU
C0002769,epiduraal,A,UMCU
C0002771,pijn medicatie,A,UMCU
C0002771,pijnmedicatie,A,UMCU
C0002771,pijnstilling,A,UMCU
C0004604,pijn in de rug,A,UMCU
C0004604,pijnklachten rug,A,UMCU
C0006400,bupi,A,UMCU
C0008031,POB,A,UMCU
C0008031,pob,A,UMCU
C0012091,diclo,A,UMCU
C0018681,hoofdpijnklachten,A,UMCU
C0030049,oxy,A,UMCU
C0030049,oxynorm,A,UMCU
C0030193,pijnklachten,A,UMCU
C0030193,pijnlijk,A,UMCU
C0030193,pijnlijke,A,UMCU
C0030193,pijn aanwezig,A,UMCU
C0078944,dipidolorpomp,A,UMCU
C0232492,pijn in de bovenbuik,A,UMCU
C0234238,zeurderig gevoel,A,UMCU
C0242429,last van de keel,A,UMCU
C0242429,pijnklachten keel,A,UMCU
C0242429,pijnlijke keel,A,UMCU
C0458254,blaaskrampen,A,UMCU
C0458254,krampen,A,UMCU
C0458254,spierkrampen,A,UMCU
C0497172,last van zijn buik,A,UMCU
C0591917,oramorph,A,UMCU
C0701926,ketanest,A,UMCU
C0809927,dipi,A,UMCU
C0877617,pijn door darmkrampen,A,UMCU
C0181226,hot pack,A,UMCU
C0181226,hot-pack,A,UMCU
C0181226,hot packs,A,UMCU
C0181226,hot-packs,A,UMCU
C0180006,cold pack,A,UMCU
C0180006,cold-pack,A,UMCU
C0180006,coldpacks,A,UMCU
C0180006,cold-packs,A,UMCU
C0179267,AD matras,A,UMCU
C0179267,antidecubitus matras,A,UMCU
C0179267,antidecubitusmatras,A,UMCU|SNOMEDCT_NL
C0278138,licht pijnlijk,A,UMCU
C0278138,lichtelijke pijnklachten,A,UMCU
C0278138,lichte pijnklachten,A,UMCU
C0278138,lichte pijn aanwezig,A,UMCU
C0278139,matige pijnklachten,A,UMCU
C0278140,erg pijnlijk,A,UMCU
C0278140,veel pijn,A,UMCU
C0278140,erg veel pijn,A,UMCU
C0278140,ernstige pijnklachten,A,UMCU
C0278140,veel pijnklachten,A,UMCU
C0278140,erg veel pijnklachten,A,UMCU
C0278140,veel pijn aanwezig,A,UMCU
C0278141,ondragelijke pijn,A,UMCU
C0278141,pijn is ondragelijk,A,UMCU
C0278141,pijn is ondraaglijk,A,UMCU
C0278143,intense pijnklachten,A,UMCU
C0234225,pijnvrij,A,UMCU
C0234225,geen pijn,A,UMCU
C0234225,geen pijnklachten,A,UMCU
C4050142,NRS,A,UMCU
C5548091,NRS onder controle,A,UMCU
C5548091,NRS is onder controle,A,UMCU
C5548091,pijn onder controle,A,UMCU
C5548091,pijn is onder controle,A,UMCU
C5548091,pijnklachten onder controle,A,UMCU
C5548091,pijnklachten zijn onder controle,A,UMCU
C3650806,pijn niet onder controle,A,UMCU
C3650806,pijn is niet onder controle,A,UMCU
C3650806,pijnklachten niet onder controle,A,UMCU
C3650806,pijnklachten zijn niet onder controle,A,UMCU
C0582151,houdbare pijn,A,UMCU
C0582151,houdbare pijnklachten,A,UMCU
C0582151,pijn houdbaar,A,UMCU
C0582151,pijn is houdbaar,A,UMCU
C0582151,pijnklachten houdbaar,A,UMCU
C0582151,pijnklachten zijn houdbaar,A,UMCU
C0582151,pijn is nog houdbaar,A,UMCU
C0582151,pijn aanwezig maar houdbaar,A,UMCU
C0582152,onhoudbare pijn,A,UMCU
C0582152,onhoudbare pijnklachten,A,UMCU
C0582152,pijn onhoudbaar,A,UMCU
C0582152,pijn is onhoudbaar,A,UMCU
C0582152,pijn niet houdbaar,A,UMCU
C0582152,pijn is niet houdbaar,A,UMCU
C0582152,pijnklachten onhoudbaar,A,UMCU
C0582152,pijnklachten zijn onhoudbaar,A,UMCU
C0582152,pijnklachten zijn niet houdbaar,A,UMCU
C0582152,pijn is nog niet houdbaar,A,UMCU
C0015846,fentanylpleister,A,UMCU
C0015846,fentanyl pleister,A,UMCU
C5141269,pijn toename,A,UMCU
7 changes: 7 additions & 0 deletions 05_CustomConcepts/dutch-umls_custom_types.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
cui,tui
C0591917,T109
C0591917,T121
C0701926,T109
C0701926,T121
C0809927,T109
C0809927,T121
137 changes: 107 additions & 30 deletions dutch-umls_to_concept-table.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,19 @@
"from sqlalchemy import create_engine\n",
"from utils import clean_name_status_column, convert_title_to_lowercase\n",
"\n",
"pd.set_option('max_colwidth', 400)\n",
"pd.options.display.max_colwidth=400\n",
"pd.options.display.max_rows=200\n",
"\n",
"# Set output version of the generated UMLS dutch concept table\n",
"UMLS_DUTCH_VERSION = 'v1.11'\n",
"UMLS_DUTCH_VERSION = 'v1.12.0'\n",
"\n",
"# Set version of SNOMED to append to UMLS terms\n",
"snomed_dutch_file = Path('04_ConceptDB') / 'snomedct-dutch_v1.3.csv'\n",
"\n",
"# Set custom concepts file\n",
"custom_concepts_file = Path(\"05_CustomConcepts\") / \"dutch-umls_custom_concepts.csv\"\n",
"# Set custom names and types files\n",
"custom_names_file = Path(\"05_CustomConcepts\") / \"dutch-umls_custom_names.csv\"\n",
"custom_types_file = Path(\"05_CustomConcepts\") / \"dutch-umls_custom_types.csv\"\n",
"custom_name_status_file = Path(\"05_CustomConcepts\") / \"dutch-umls_custom_name_status.csv\"\n",
"\n",
"# Output files\n",
"output_file = Path(\"04_ConceptDB\") / f'umls-dutch_{UMLS_DUTCH_VERSION}.csv'\n",
Expand Down Expand Up @@ -104,7 +107,7 @@
"metadata": {},
"source": [
"## Manual corrections\n",
"Some manual corrections. Easiest to do this as close to the source as possible, so they are processed downstream correctly."
"Some manual corrections. Easiest to do this as close to the source as possible, so they are processed correctly downstream."
]
},
{
Expand Down Expand Up @@ -134,6 +137,32 @@
"display(dutch_umls_original.loc[dutch_umls_original.cui == 'C1145670'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c5071a80",
"metadata": {},
"outputs": [],
"source": [
"# C0030193: Pain\n",
"# C0234238: Ache\n",
"display(dutch_umls_original.loc[dutch_umls_original.cui == 'C0030193'])\n",
"display(dutch_umls_original.loc[dutch_umls_original.cui == 'C0234238'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ccf3fa41",
"metadata": {},
"outputs": [],
"source": [
"dutch_umls_original.loc[dutch_umls_original.str == 'pijn', 'cui'] = 'C0030193'\n",
"dutch_umls_original.loc[dutch_umls_original.str == 'pijn NAO', 'cui'] = 'C0030193'\n",
"display(dutch_umls_original.loc[dutch_umls_original.cui == 'C0030193'])\n",
"display(dutch_umls_original.loc[dutch_umls_original.cui == 'C0234238'])"
]
},
{
"cell_type": "markdown",
"id": "ec63d914",
Expand Down Expand Up @@ -897,8 +926,8 @@
"metadata": {},
"outputs": [],
"source": [
"custom_concepts = pd.read_csv(custom_concepts_file)\n",
"custom_concepts"
"custom_names = pd.read_csv(custom_names_file)\n",
"custom_names.head()"
]
},
{
Expand All @@ -909,7 +938,7 @@
"outputs": [],
"source": [
"print(f'Number of rows before adding rows: {dutch_umls_snomed.shape[0]}')\n",
"dutch_umls_snomed = pd.concat([dutch_umls_snomed, custom_concepts])\n",
"dutch_umls_snomed = pd.concat([dutch_umls_snomed, custom_names])\n",
"print(f'Number of rows after adding rows: {dutch_umls_snomed.shape[0]}')"
]
},
Expand Down Expand Up @@ -943,8 +972,31 @@
"query = \"\"\"\n",
"SELECT cui, tui FROM MRSTY\n",
"\"\"\"\n",
"tui_original = pd.read_sql_query(query, con=connection)\n",
"tui_original.head()"
"umls_types = pd.read_sql_query(query, con=connection)\n",
"umls_types.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f2356c01",
"metadata": {},
"outputs": [],
"source": [
"# Load custom types file\n",
"custom_types = pd.read_csv(custom_types_file)\n",
"custom_types.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d86d3e84",
"metadata": {},
"outputs": [],
"source": [
"concept_types = pd.concat([umls_types, custom_types])\n",
"concept_types.head(10)"
]
},
{
Expand All @@ -955,7 +1007,7 @@
"outputs": [],
"source": [
"# Add TUI column to UMLS + SNOMED CUI table\n",
"dutch_umls_snomed = dutch_umls_snomed.merge(tui_original, how='left', on='cui')\n",
"dutch_umls_snomed = dutch_umls_snomed.merge(concept_types, how='left', on='cui')\n",
"dutch_umls_snomed.head(20)"
]
},
Expand All @@ -977,7 +1029,7 @@
"metadata": {},
"outputs": [],
"source": [
"tuis_to_remove = [\n",
"types_to_remove = [\n",
" \n",
" # Concepts & Ideas\n",
" 'T078', # Idea or Concept\n",
Expand All @@ -1004,7 +1056,7 @@
" 'T083', # Geographic Area\n",
"]\n",
" \n",
"dutch_umls_snomed[dutch_umls_snomed.tui.isin(tuis_to_remove)].head()"
"dutch_umls_snomed[dutch_umls_snomed.tui.isin(types_to_remove)].head()"
]
},
{
Expand All @@ -1015,12 +1067,27 @@
"outputs": [],
"source": [
"# Remove rows based on TUI\n",
"rows_to_remove = dutch_umls_snomed[dutch_umls_snomed.tui.isin(tuis_to_remove)].index\n",
"rows_to_remove = dutch_umls_snomed[dutch_umls_snomed.tui.isin(types_to_remove)].index\n",
"print(f'Number of rows before removing rows: {dutch_umls_snomed.shape[0]}')\n",
"dutch_umls_snomed = dutch_umls_snomed.drop(dutch_umls_snomed.index[rows_to_remove])\n",
"print(f'Number of rows after removing rows: {dutch_umls_snomed.shape[0]}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6e7435f",
"metadata": {},
"outputs": [],
"source": [
"# Check whether there are concepts without TUI.\n",
"# This can be caused when adding custom concepts, which originate from ontologies\n",
"# that are not in the UMLS subset generated with MetamorphoSys.\n",
"# For example: a concept from MeSH English is not in the generated UMLS subset of Dutch concepts,\n",
"# so its TUI is also not present in the UMLS subset, and therefore it is not in the UMLS MySQL database.\n",
"dutch_umls_snomed[dutch_umls_snomed.tui.isnull()]"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -1029,7 +1096,28 @@
"outputs": [],
"source": [
"dutch_umls_snomed = dutch_umls_snomed.groupby(['cui', 'str', 'tty', 'sab'])['tui'].apply('|'.join).reset_index()\n",
"print(f'Number of rows after merging TUIs in single value: {dutch_umls_snomed.shape[0]}')"
"print(f'Number of rows after merging TUIs in single value: {len(dutch_umls_snomed)}')"
]
},
{
"cell_type": "markdown",
"id": "6916b424",
"metadata": {},
"source": [
"## Custom name status\n",
"To change the primary/preferred/pretty name of a concept, which is relevant for display purposes in downstream applications such as MedCAT Trainer and MedCAT Service, the name statuses listed in the custom name status file are applied to the concept table."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb738a14",
"metadata": {},
"outputs": [],
"source": [
"custom_name_status = pd.read_csv(custom_name_status_file, dtype='str')\n",
"for index, row in custom_name_status.iterrows():\n",
" dutch_umls_snomed.loc[(dutch_umls_snomed.cui == row['cui']) & (dutch_umls_snomed.str == row['str']), 'tty'] = row['tty']"
]
},
{
Expand All @@ -1038,7 +1126,7 @@
"metadata": {},
"source": [
"### Update column names\n",
"In MedCAT v1.0 the column name specification has changed and is defined as in the [README.md in examples](https://github.com/CogStack/MedCAT/tree/master/examples)."
"In MedCAT v1.0 the column name specification has changed and is defined in the [README.md in examples](https://github.com/CogStack/MedCAT/tree/master/examples)."
]
},
{
Expand Down Expand Up @@ -1086,18 +1174,7 @@
"## Add drug names\n",
"Only run this part below if you want to further expand the concept database with drug names, which adds around 270k lines. Many drugs only have an international name, or use the international name more often than the Dutch name, so adding these from ATC, Drugbank and RXNorm can be a good addition to the concept table. \n",
"\n",
"After assessing the resulting list it will be clear that many names will not be useful in named entity recognition, because they will probably never be used in natural language."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "385e18aa",
"metadata": {},
"outputs": [],
"source": [
"#In case you want to begin from here, load existing concept table:\n",
"#dutch_umls_snomed = pd.read_csv(\"04_ConceptDB/umls-dutch_{UMLS_DUTCH_VERSION}.csv\", dtype=str)"
"After assessing the resulting list it's clear that many names (such as `(±)-2-(p-isobutylphenyl)propionic acid`) will not be useful in named entity recognition, because they will probably never be used in natural language. In a future iteration of this project it would be nice to remove such names from the resulting concept table."
]
},
{
Expand Down Expand Up @@ -1177,11 +1254,11 @@
"outputs": [],
"source": [
"# Add TUI column\n",
"dutch_umls_snomed_drugs = dutch_umls_snomed_drugs.merge(tui_original, how='left', on='cui')\n",
"dutch_umls_snomed_drugs = dutch_umls_snomed_drugs.merge(concept_types, how='left', on='cui')\n",
"print(f'Number of rows containing TUIs: {dutch_umls_snomed_drugs.shape[0]}')\n",
"\n",
"# Remove TUIs that we decided to filter\n",
"rows_to_remove = dutch_umls_snomed_drugs[dutch_umls_snomed_drugs.tui.isin(tuis_to_remove)].index\n",
"rows_to_remove = dutch_umls_snomed_drugs[dutch_umls_snomed_drugs.tui.isin(types_to_remove)].index\n",
"dutch_umls_snomed_drugs = dutch_umls_snomed_drugs.drop(dutch_umls_snomed_drugs.index[rows_to_remove])\n",
"print(f'Number of rows filtering TUIs: {dutch_umls_snomed_drugs.shape[0]}')\n",
"\n",
Expand Down