From f461d52abf3c7eb4981c43ab0a63aca653a333db Mon Sep 17 00:00:00 2001 From: joeflack4 Date: Sun, 23 Jul 2023 17:10:03 -0400 Subject: [PATCH 1/3] - Update: new classes: duplicated some UMLS: classes as Medgen:, if they started with 'C' and a number. - Update: prefixes: In addition to new classes above, renamed UMLS prefix with Medgen for all other classes (which happen to all start with 'CN:' - Update: prefixes: Renamed prior MEDGEN: xref prefixes to Medgen_UID: These IDs don't start with C (CUI; Concept Unique Identifier) or CN (Common Name?). These are internal Medgen UIDs that are duplicative and not for clinical or analytical use. - Rename: bin/ -> src/ - Add: output/: For both release outputs and non-release. - Rename: release/ -> output/release/ - Add: mondo_mapping_status.py: For generating artefacts related to the reporting and management of mappings between Mondo and Medgen. - Add: Python dependency requirements files. - Add: run.sh: For running commands in ODK - Add: config/medgen.sssom-metadata.yml --- .gitignore | 6 +- README.md | 1 + config/medgen.sssom-metadata.yml | 21 ++++++ makefile | 58 +++++++++++---- requirements-unlocked.txt | 1 + requirements.txt | 14 ++++ run.sh | 85 ++++++++++++++++++++++ {bin => src}/make_uid2cui.pl | 0 {bin => src}/medgen2obo.pl | 44 +++++++++++- src/mondo_mapping_status.py | 118 +++++++++++++++++++++++++++++++ 10 files changed, 331 insertions(+), 17 deletions(-) create mode 100644 config/medgen.sssom-metadata.yml create mode 100644 requirements-unlocked.txt create mode 100644 requirements.txt create mode 100644 run.sh rename {bin => src}/make_uid2cui.pl (100%) rename {bin => src}/medgen2obo.pl (72%) create mode 100644 src/mondo_mapping_status.py diff --git a/.gitignore b/.gitignore index 88619f4..e0419c6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,20 +1,22 @@ # Standard dev/ data/cache/ +output/ __pycache__/ .idea/ .DS_Store .env # Specialized +/*.json /*.obo /*.owl /*.tmp /*.tsv /fetch /ftp.ncbi.nlm.nih.gov/ -/release/ *ignore/ +.ipynb_checkpoints/ _archive/ +release/ tmp/ -.ipynb_checkpoints/ diff --git a/README.md b/README.md index f3939de..fe91d76 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ On MacOS, (3) and (4) should be available without the need for installation. ## Setup 1. Give permission to run Perl: `chmod +x ./bin/*.pl` +2. Install Python dependencies: `pip install -r requirements.txt` ## Running the ingest Run: `make all` diff --git a/config/medgen.sssom-metadata.yml b/config/medgen.sssom-metadata.yml new file mode 100644 index 0000000..48ffaf1 --- /dev/null +++ b/config/medgen.sssom-metadata.yml @@ -0,0 +1,21 @@ +creator_id: 0000-0002-2906-7319 +curie_map: + GTR: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GTR/ + HP: http://purl.obolibrary.org/obo/HP_ + MESH: http://identifiers.org/mesh/ + MONDO: http://purl.obolibrary.org/obo/MONDO_ + MedGen: http://purl.obolibrary.org/obo/Medgen_ + MedGen_UID: http://purl.obolibrary.org/obo/Medgen_UID_ + NCIT: http://purl.obolibrary.org/obo/NCIT_ + OMIM: https://omim.org/entry/ + Orphanet: http://www.orpha.net/ORDO/Orphanet_ + SCTID: http://identifiers.org/snomedct/ + UMLS: http://purl.obolibrary.org/obo/UMLS_ + oboInOwl: http://www.geneontology.org/formats/oboInOwl# + owl: http://www.w3.org/2002/07/owl# + rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns# + rdfs: http://www.w3.org/2000/01/rdf-schema# + semapv: https://w3id.org/semapv/ + skos: http://www.w3.org/2004/02/skos/core# + sssom: https://w3id.org/sssom/ +license: http://w3id.org/sssom/license/unspecified diff --git a/makefile b/makefile index f99e73f..347699a 100644 --- a/makefile +++ b/makefile @@ -2,38 +2,54 @@ # Running `make all` will run the full pipeline. Note that if the FTP files have already been downloaded, it'll skip # that part. In order to force re-download, run `make all -B`. .DEFAULT_GOAL := all -.PHONY: all build stage stage-% +.PHONY: all build stage stage-% release-artefacts analysis-artefacts clean deploy-release OBO=http://purl.obolibrary.org/obo PRODUCTS=medgen-disease-extract.obo medgen-disease-extract.owl TODAY ?=$(shell date +%Y-%m-%d) VERSION=v$(TODAY) -all: build stage -build: $(PRODUCTS) +all: build stage clean +release-artefacts: $(PRODUCTS) medgen.sssom.tsv +# analysis-artefacts runs more than just this file; that goal creates multiple files +analysis-artefacts: medgen_terms_mapping_status.tsv +build: release-artefacts analysis-artefacts stage: $(patsubst %, stage-%, $(PRODUCTS)) - mv medgen.obo release/ -stage-%: % | release/ - mv $< release/ + mv medgen.obo output/release/ + mv medgen.sssom.tsv output/release/ +stage-%: % | output/release/ + mv $< output/release/ +clean: + rm medgen.obographs.json + rm uid2cui.tsv + rm *.obo # ---------------------------------------- -# ETL +# Setup dirs # ---------------------------------------- -release/: +tmp/input/: + mkdir -p $@ +output/: + mkdir -p $@ +output/release/: mkdir -p $@ +# ---------------------------------------- +# ETL +# ---------------------------------------- ftp.ncbi.nlm.nih.gov: wget -r -np ftp://ftp.ncbi.nlm.nih.gov/pub/medgen/ && touch $@ uid2cui.tsv: - ./bin/make_uid2cui.pl > $@ + ./src/make_uid2cui.pl > $@ # ---------------------------------------- -# Hacky conversion to obo +# Main artefacts # ---------------------------------------- +# Hacky conversion to obo ---------------- # Relies on MGCONSO.RRF.gz etc being made by 'ftp.ncbi.nlm.nih.gov' step medgen.obo: ftp.ncbi.nlm.nih.gov uid2cui.tsv - ./bin/medgen2obo.pl > $@.tmp && mv $@.tmp $@ + ./src/medgen2obo.pl > $@.tmp && mv $@.tmp $@ # We only care about diseases for now # - NOTE: some cancers seem to appear under Neoplastic-Process @@ -49,6 +65,13 @@ medgen-disease-extract.json: medgen-disease-extract.obo medgen-disease-extract.owl: medgen-disease-extract.obo owltools $< -o $@ +# SSSOM ---------------------------------- +medgen.obographs.json: + robot convert -i medgen-disease-extract.owl -o $@ + +medgen.sssom.tsv: medgen.obographs.json + sssom parse medgen.obographs.json -I obographs-json -m config/medgen.sssom-metadata.yml -o $@ + # ---------------------------------------- # Cycles # ---------------------------------------- @@ -59,6 +82,15 @@ medgen-disease-extract.owl: medgen-disease-extract.obo # ---------------------------------------- # Devops # ---------------------------------------- -deploy-release: | release/ +deploy-release: | output/release/ @test $(VERSION) - gh release create $(VERSION) --notes "New release." --title "$(VERSION)" release/* + gh release create $(VERSION) --notes "New release." --title "$(VERSION)" output/release/* + +# ---------------------------------------- +# Mapping analysis +# ---------------------------------------- +tmp/input/mondo.sssom.tsv: | tmp/input/ + wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@ + +output/medgen_terms_mapping_status.tsv output/obsoleted_medgen_terms_in_mondo.txt: | output/ + python src/mondo_mapping_status.py diff --git a/requirements-unlocked.txt b/requirements-unlocked.txt new file mode 100644 index 0000000..fb6c7ed --- /dev/null +++ b/requirements-unlocked.txt @@ -0,0 +1 @@ +pandas diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..01e7f87 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +distlib==0.3.6 +filelock==3.9.0 +numpy==1.25.1 +pandas==2.0.3 +pbr==5.11.1 +platformdirs==3.1.0 +python-dateutil==2.8.2 +pytz==2023.3 +six==1.16.0 +stevedore==5.0.0 +tzdata==2023.3 +virtualenv==20.20.0 +virtualenv-clone==0.5.7 +virtualenvwrapper==4.8.4 diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..11e1a06 --- /dev/null +++ b/run.sh @@ -0,0 +1,85 @@ +#!/bin/sh +# Wrapper script for docker. +# +# This is used primarily for wrapping the GNU Make workflow. +# Instead of typing "make TARGET", type "./run.sh make TARGET". +# This will run the make workflow within a docker container. +# +# The assumption is that you are working in the src/ontology folder; +# we therefore map the whole repo (../..) to a docker volume. +# +# To use singularity instead of docker, please issue +# export USE_SINGULARITY= +# before running this script. +# +# See README-editors.md for more details. + +if [ -f run.sh.conf ]; then + . ./run.sh.conf +fi + +# Look for a GitHub token +if [ -n "$GH_TOKEN" ]; then + : +elif [ -f ../../.github/token.txt ]; then + GH_TOKEN=$(cat ../../.github/token.txt) +elif [ -f $XDG_CONFIG_HOME/ontology-development-kit/github/token ]; then + GH_TOKEN=$(cat $XDG_CONFIG_HOME/ontology-development-kit/github/token) +elif [ -f "$HOME/Library/Application Support/ontology-development-kit/github/token" ]; then + GH_TOKEN=$(cat "$HOME/Library/Application Support/ontology-development-kit/github/token") +fi + +ODK_IMAGE=${ODK_IMAGE:-odkfull} +TAG_IN_IMAGE=$(echo $ODK_IMAGE | awk -F':' '{ print $2 }') +if [ -n "$TAG_IN_IMAGE" ]; then + # Override ODK_TAG env var if IMAGE already includes a tag + ODK_TAG=$TAG_IN_IMAGE + ODK_IMAGE=$(echo $ODK_IMAGE | awk -F':' '{ print $1 }') +fi +ODK_TAG=${ODK_TAG:-latest} +ODK_JAVA_OPTS=${ODK_JAVA_OPTS:--Xmx20G} +ODK_DEBUG=${ODK_DEBUG:-no} + +# Convert OWLAPI_* environment variables to the OWLAPI as Java options +# See http://owlcs.github.io/owlapi/apidocs_4/org/semanticweb/owlapi/model/parameters/ConfigurationOptions.html +# for a list of allowed options +OWLAPI_OPTIONS_NAMESPACE=org.semanticweb.owlapi.model.parameters.ConfigurationOptions +for owlapi_var in $(env | sed -n s/^OWLAPI_//p) ; do + ODK_JAVA_OPTS="$ODK_JAVA_OPTS -D$OWLAPI_OPTIONS_NAMESPACE.${owlapi_var%=*}=${owlapi_var#*=}" +done + +TIMECMD= +if [ x$ODK_DEBUG = xyes ]; then + # If you wish to change the format string, take care of using + # non-breaking spaces (U+00A0) instead of normal spaces, to + # prevent the shell from tokenizing the format string. + echo "Running ${IMAGE} with ${ODK_JAVA_OPTS} of memory for ROBOT and Java-based pipeline steps." + TIMECMD="/usr/bin/time -f ### DEBUG STATS ###\nElapsed time: %E\nPeak memory: %M kb" +fi + +VOLUME_BIND=$PWD:/work +WORK_DIR=/work + +if [ -n "$ODK_BINDS" ]; then + VOLUME_BIND="$VOLUME_BIND,$ODK_BINDS" +fi + +if [ -n "$USE_SINGULARITY" ]; then + + singularity exec --cleanenv $ODK_SINGULARITY_OPTIONS \ + --env "ROBOT_JAVA_ARGS=$ODK_JAVA_OPTS,JAVA_OPTS=$ODK_JAVA_OPTS" \ + --bind $VOLUME_BIND \ + -W $WORK_DIR \ + docker://obolibrary/$ODK_IMAGE:$ODK_TAG $TIMECMD "$@" +else + BIND_OPTIONS="-v $(echo $VOLUME_BIND | sed 's/,/ -v /')" + docker run $ODK_DOCKER_OPTIONS $BIND_OPTIONS -w $WORK_DIR \ + -e ROBOT_JAVA_ARGS="$ODK_JAVA_OPTS" -e JAVA_OPTS="$ODK_JAVA_OPTS" \ + --rm -ti obolibrary/$ODK_IMAGE:$ODK_TAG $TIMECMD "$@" +fi + +case "$@" in +*update_repo*|*release*) + echo "Please remember to update your ODK image from time to time: https://oboacademy.github.io/obook/howto/odk-update/." + ;; +esac \ No newline at end of file diff --git a/bin/make_uid2cui.pl b/src/make_uid2cui.pl similarity index 100% rename from bin/make_uid2cui.pl rename to src/make_uid2cui.pl diff --git a/bin/medgen2obo.pl b/src/medgen2obo.pl similarity index 72% rename from bin/medgen2obo.pl rename to src/medgen2obo.pl index b05b13f..77af286 100755 --- a/bin/medgen2obo.pl +++ b/src/medgen2obo.pl @@ -82,7 +82,7 @@ chomp; my ($u,$c) = split(/\t/,$_); $uh{$c} = $u; - $th{$c}->{xrefs}->{"MEDGEN:$u"} = 1; + $th{$c}->{xrefs}->{"MedGen_UID:$u"} = 1; } close(F); @@ -104,9 +104,49 @@ my @ids = keys %th; @ids = sort @ids; foreach my $id (@ids) { + if ($id =~ /^C\d+/) { + # TODO: repurpose to func (this is instance 1/2) + my $h = $th{$id}; + print "[Term]\n"; + print "id: UMLS:$id\n"; + print "name: $h->{name}\n"; + foreach my $x (keys %{$h->{xrefs}}) { + $x =~ s@MSH:@MESH:@; + $x =~ s@NCI:@NCIT:@; + $x =~ s@SNOMEDCT_US:@SCTID:@; + print "xref: $x\n"; + } + foreach (keys %{$ssh{$id} || {}}) { + my $ss = mk_subset($_); + print "subset: $ss\n"; + } + foreach my $s (@{$h->{synonyms}}) { + my ($str, $x)= @$s; + $str = escq($str); + print "synonym: \"$str\" RELATED [$x]\n"; + } + my $trelh = $rh{$id}; + foreach my $rel (keys %{$trelh}) { + my $vh = $trelh->{$rel}; + foreach my $v (keys %$vh) { + unless ($v eq $id) { + my $tag = "relationship: $rel"; + if ($rel eq 'isa') { + $tag = 'is_a:'; + } + if ($rel eq 'mapped_to') { + $tag = 'equivalent_to:'; + } + print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n"; + } + } + } + print "\n"; + } + # TODO: repurpose to func (this is instance 2/2) my $h = $th{$id}; print "[Term]\n"; - print "id: UMLS:$id\n"; + print "id: MedGen:$id\n"; print "name: $h->{name}\n"; foreach my $x (keys %{$h->{xrefs}}) { $x =~ s@MSH:@MESH:@; diff --git a/src/mondo_mapping_status.py b/src/mondo_mapping_status.py new file mode 100644 index 0000000..14aef7b --- /dev/null +++ b/src/mondo_mapping_status.py @@ -0,0 +1,118 @@ +"""Mapping status between Medgen and Mondo""" +from pathlib import Path +from typing import List, Set, Tuple + +import pandas as pd + +SRC_DIR = Path(__file__).parent +PROJECT_DIR = SRC_DIR.parent +OUTDIR = PROJECT_DIR / 'output' +RELEASE_OUTDIR = OUTDIR / 'release' +INPUT_DIR = PROJECT_DIR / 'tmp' / 'input' +MONDO_SSSOM_TSV = INPUT_DIR / 'mondo.sssom.tsv' +MEDGEN_SSSOM_TSV = RELEASE_OUTDIR / 'medgen.sssom.tsv' +# MEDGEN_PREFIXES: Some of these are old, some are new, some may not be used. +MEDGEN_PREFIXES = ['Medgen', 'MedGen', 'MEDGEN', 'Medgen_UID', 'MedGen_UID', 'UMLS', 'UMLS_CUI'] +CURIE = str + + +def ids_prefixless(ids: Set[str]) -> Set[str]: + """Remove prefix""" + return set([x.split(':')[1] for x in ids]) + + +def ids_drop_uids(ids: Set[CURIE]) -> Set[CURIE]: + """From a set of Medgen IDs, drop those that are UIDs""" + return set([x for x in ids if x.split(':')[1].startswith('C')]) + +def read_mapping_sources( + mondo_predicate_filter: List[str] = None, + drop_uids=True +) -> Tuple[Set[CURIE], Set[CURIE], Set[CURIE]]: + """Read data sources + :param drop_uids: drop UIDs from Medgen IDs. These are ones that don't start with CN or C, and are IDs that are used + only internally in Medgen and are not stable.""" + medgen_df = pd.read_csv(MEDGEN_SSSOM_TSV, sep='\t', comment='#').fillna('') + # todo: move commented line to .ipynb + # preds = list(medgen_df['predicate_id'].unique()) # oboInOwl:hasDbXref, owl:equivalentClass + medgen_in_medgen: Set[CURIE] = set(list(medgen_df['subject_id'])) + + mondo_df = pd.read_csv(MONDO_SSSOM_TSV, sep='\t', comment='#').fillna('') # n=72,902 + mondo_df['prefix'] = mondo_df['object_id'].apply(lambda x: x.split(':')[0]) + mondo_df = mondo_df[mondo_df['prefix'].isin(MEDGEN_PREFIXES)] # n=16,627 + del mondo_df['prefix'] + # todo: move commented line to .ipynb + # preds = list(mondo_df['predicate_id'].unique()) # only skos:exactMatch + if mondo_predicate_filter: # leaving for now; but has no effect because only skos:exactMatch exists + mondo_df = mondo_df[mondo_df['predicate_id'].isin(mondo_predicate_filter)] + medgen_in_mondo: Set[CURIE] = set(mondo_df['object_id'].tolist()) + + medgen_all_ids = medgen_in_medgen.union(medgen_in_mondo) + + if drop_uids: + medgen_all_ids = ids_drop_uids(medgen_all_ids) + medgen_in_medgen = ids_drop_uids(medgen_in_medgen) + medgen_in_mondo = ids_drop_uids(medgen_in_mondo) + + return medgen_all_ids, medgen_in_medgen, medgen_in_mondo + +def report_obs_medgen_in_mondo(medgen_in_mondo: Set[str], medgen_in_medgen: Set[str]): + """Obsoleted Medgen terms in Mondo""" + # obsoleted_medgen_terms_in_mondo.txt: get a list of obsolete Medgen terms that are still in Mondo + in_mondo_not_in_medgen = medgen_in_mondo.difference(medgen_in_medgen) + obs_medgen_in_mondo_df = pd.DataFrame() + obs_medgen_in_mondo_df['id'] = sorted([x for x in in_mondo_not_in_medgen]) + obs_medgen_in_mondo_df = obs_medgen_in_mondo_df.sort_values(by='id') + obs_medgen_in_mondo_df.to_csv(OUTDIR / 'obsoleted_medgen_terms_in_mondo.txt', index=False, header=False) + +def report_existing_overlap(medgen_all_ids: Set[str], medgen_in_medgen: Set[str], medgen_in_mondo: Set[str], file_suffix: str): + """Get explicit, existing mapping status overlaps between Medgen and Mondo + These are mappings at the time before we began the Medgen ingest, and we this was useful for analytical information + at the time, but we maybe should drop this because not using for curation. We're not keeping the previous + Mondo::Medgen mappings from Mondo.""" + existing_overlap_df = pd.DataFrame() + existing_overlap_df['subject_id'] = list(medgen_all_ids) + existing_overlap_df['in_medgen'] = existing_overlap_df['subject_id'].isin(medgen_in_medgen) + existing_overlap_df['in_mondo'] = existing_overlap_df['subject_id'].isin(medgen_in_mondo) + existing_overlap_df['status'] = existing_overlap_df['subject_id'].apply( + lambda x: + 'medgen' if x in medgen_in_medgen and x not in medgen_in_mondo else + 'mondo' if x in medgen_in_mondo and x not in medgen_in_medgen else + 'both') + existing_overlap_df = existing_overlap_df.sort_values(['status', 'subject_id', 'in_medgen', 'in_mondo']) + # todo: move to .ipynb + # tot_medgen = len(existing_overlap_df[existing_overlap_df['status'] == 'medgen']) # n=66,224 + # tot_mondo = len(existing_overlap_df[existing_overlap_df['status'] == 'mondo']) # n=2,362 + # tot_both = len(existing_overlap_df[existing_overlap_df['status'] == 'both']) # n=14,263 + existing_overlap_df.to_csv(OUTDIR / f'medgen_terms_mapping_status{file_suffix}.tsv', index=False, sep='\t') + +def medgen_mondo_mapping_status(mondo_predicate_filter: List[str] = None): + """Mapping status between Medgen and Mondo""" + # Vars + file_suffix = '' if not mondo_predicate_filter \ + else '-mondo-exacts-only' if mondo_predicate_filter == ['skos:exactMatch'] \ + else '-custom' + # Read sources + medgen_all_ids, medgen_in_medgen, medgen_in_mondo = \ + read_mapping_sources(mondo_predicate_filter=mondo_predicate_filter) + # Special operations + # - Inconsistent prefixes between what Mondo used before and will going forward. In this case, stripping prefixes + # should be OK, at least for now. + medgen_all_ids = ids_prefixless(medgen_all_ids) + medgen_in_medgen = ids_prefixless(medgen_in_medgen) + medgen_in_mondo = ids_prefixless(medgen_in_mondo) + # Report + report_obs_medgen_in_mondo(medgen_in_mondo, medgen_in_medgen) + report_existing_overlap(medgen_all_ids, medgen_in_medgen, medgen_in_mondo, file_suffix) + +def run(): + """Run reports""" + # # filters: could be set up if needed, but current Medgen & previous Mondo only have exactMatch + # filters = [None, ['skos:exactMatch']] + # for f in filters: + # medgen_mondo_mapping_status(f) + medgen_mondo_mapping_status() + + +if __name__ == '__main__': + run() From d62a422102b05ed9535b7db1b2d8fe55aca0b0bd Mon Sep 17 00:00:00 2001 From: joeflack4 Date: Wed, 2 Aug 2023 16:25:52 -0400 Subject: [PATCH 2/3] - Update: medgen2obo.pl: (i) Abstracted adding of classes and their triples as a function, (ii) updated namespacing of classes based on what type of MedGen/UMLS identifier they are. - Update: Namespaces MedGen, MedGen_UI (removed), MedGenCUI - Bugfix: SSSOM metadata yaml had a typo preventing conversion - Bugfix: Makefile: (i) needed to rename a dependency, (ii) needed to run 'analyze' step after 'stage' - Update: Makefile: Simplified some goals - Bugfix: For UMLS CUIs (e.g. starts with C then #s), we chose to do duplicate classes with namespaces UMLS and MedGen. However, I just now made it so that also all references (e.g. xrefs) are also duplicated, e.g. MedGen:1 maps to MedGen:2 and UMLS:2. --- config/medgen.sssom-metadata.yml | 4 +- makefile | 12 ++--- src/medgen2obo.pl | 82 ++++++++++++++------------------ src/mondo_mapping_status.py | 12 ++++- 4 files changed, 53 insertions(+), 57 deletions(-) diff --git a/config/medgen.sssom-metadata.yml b/config/medgen.sssom-metadata.yml index 48ffaf1..d4efde2 100644 --- a/config/medgen.sssom-metadata.yml +++ b/config/medgen.sssom-metadata.yml @@ -4,8 +4,8 @@ curie_map: HP: http://purl.obolibrary.org/obo/HP_ MESH: http://identifiers.org/mesh/ MONDO: http://purl.obolibrary.org/obo/MONDO_ - MedGen: http://purl.obolibrary.org/obo/Medgen_ - MedGen_UID: http://purl.obolibrary.org/obo/Medgen_UID_ + MedGen: http://purl.obolibrary.org/obo/MedGen_ + MedGenCUI: http://purl.obolibrary.org/obo/MedGenCUI_ NCIT: http://purl.obolibrary.org/obo/NCIT_ OMIM: https://omim.org/entry/ Orphanet: http://www.orpha.net/ORDO/Orphanet_ diff --git a/makefile b/makefile index 347699a..d68fb8f 100644 --- a/makefile +++ b/makefile @@ -2,18 +2,17 @@ # Running `make all` will run the full pipeline. Note that if the FTP files have already been downloaded, it'll skip # that part. In order to force re-download, run `make all -B`. .DEFAULT_GOAL := all -.PHONY: all build stage stage-% release-artefacts analysis-artefacts clean deploy-release +.PHONY: all build stage stage-% analyze clean deploy-release OBO=http://purl.obolibrary.org/obo PRODUCTS=medgen-disease-extract.obo medgen-disease-extract.owl TODAY ?=$(shell date +%Y-%m-%d) VERSION=v$(TODAY) -all: build stage clean -release-artefacts: $(PRODUCTS) medgen.sssom.tsv -# analysis-artefacts runs more than just this file; that goal creates multiple files -analysis-artefacts: medgen_terms_mapping_status.tsv -build: release-artefacts analysis-artefacts +all: build stage clean analyze +# analyze: runs more than just this file; that goal creates multiple files +analyze: output/medgen_terms_mapping_status.tsv +build: $(PRODUCTS) medgen.sssom.tsv stage: $(patsubst %, stage-%, $(PRODUCTS)) mv medgen.obo output/release/ mv medgen.sssom.tsv output/release/ @@ -92,5 +91,6 @@ deploy-release: | output/release/ tmp/input/mondo.sssom.tsv: | tmp/input/ wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@ +# creates more than just this file; that goal creates multiple files output/medgen_terms_mapping_status.tsv output/obsoleted_medgen_terms_in_mondo.txt: | output/ python src/mondo_mapping_status.py diff --git a/src/medgen2obo.pl b/src/medgen2obo.pl index 77af286..76b6601 100755 --- a/src/medgen2obo.pl +++ b/src/medgen2obo.pl @@ -1,6 +1,7 @@ #!/usr/bin/perl use strict; +# Vars my %th = (); my %rh = (); my %dh = (); @@ -10,6 +11,7 @@ our $PATH = "ftp.ncbi.nlm.nih.gov/pub/medgen"; +# Execution open(F,"gzip -dc $PATH/MGCONSO.RRF.gz|") || die; while() { next if m@^#@; @@ -82,7 +84,7 @@ chomp; my ($u,$c) = split(/\t/,$_); $uh{$c} = $u; - $th{$c}->{xrefs}->{"MedGen_UID:$u"} = 1; + $th{$c}->{xrefs}->{"MedGen:$u"} = 1; } close(F); @@ -101,57 +103,18 @@ } print "\n"; -my @ids = keys %th; -@ids = sort @ids; -foreach my $id (@ids) { - if ($id =~ /^C\d+/) { - # TODO: repurpose to func (this is instance 1/2) - my $h = $th{$id}; - print "[Term]\n"; - print "id: UMLS:$id\n"; - print "name: $h->{name}\n"; - foreach my $x (keys %{$h->{xrefs}}) { - $x =~ s@MSH:@MESH:@; - $x =~ s@NCI:@NCIT:@; - $x =~ s@SNOMEDCT_US:@SCTID:@; - print "xref: $x\n"; - } - foreach (keys %{$ssh{$id} || {}}) { - my $ss = mk_subset($_); - print "subset: $ss\n"; - } - foreach my $s (@{$h->{synonyms}}) { - my ($str, $x)= @$s; - $str = escq($str); - print "synonym: \"$str\" RELATED [$x]\n"; - } - my $trelh = $rh{$id}; - foreach my $rel (keys %{$trelh}) { - my $vh = $trelh->{$rel}; - foreach my $v (keys %$vh) { - unless ($v eq $id) { - my $tag = "relationship: $rel"; - if ($rel eq 'isa') { - $tag = 'is_a:'; - } - if ($rel eq 'mapped_to') { - $tag = 'equivalent_to:'; - } - print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n"; - } - } - } - print "\n"; - } - # TODO: repurpose to func (this is instance 2/2) +sub add_triples { + my ($prefix, $id) = @_; + my $h = $th{$id}; print "[Term]\n"; - print "id: MedGen:$id\n"; + print "id: $prefix:$id\n"; print "name: $h->{name}\n"; foreach my $x (keys %{$h->{xrefs}}) { $x =~ s@MSH:@MESH:@; $x =~ s@NCI:@NCIT:@; $x =~ s@SNOMEDCT_US:@SCTID:@; + # TODO: change these to skos:exactMatch? print "xref: $x\n"; } foreach (keys %{$ssh{$id} || {}}) { @@ -173,14 +136,39 @@ $tag = 'is_a:'; } if ($rel eq 'mapped_to') { - $tag = 'equivalent_to:'; + # TODO: change these to skos:exactMatch? + $tag = 'equivalent_to:'; # This translates to owl:equivalentClass + # $tag = 'xref:'; # want to get this to translate to skos:exactMatch, but got oboInOwl:hasDbXref instead + } + # Namespaces: Different ones based on if is a UMLS CUI (C#), a MedGen CUI Novel (CN#), or a MedGEn UID (#). + if ($v =~ /^CN\d+/) { + print "$tag MedGenCUI:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n"; + } else { + # If a CUI (starts with 'C'), will be created twice: one for MedGen, one for UMLS + if ($v =~ /^C\d+/) { + print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n"; + } + print "$tag MedGen:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n"; } - print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n"; } } } print "\n"; } +my @ids = keys %th; +@ids = sort @ids; +foreach my $id (@ids) { + # Namespaces: Different ones based on if is a UMLS CUI (C#), a MedGen CUI Novel (CN#), or a MedGEn UID (#). + if ($id =~ /^CN\d+/) { + add_triples('MedGenCUI', $id); + } else { + # If a CUI (starts with 'C'), will be created twice: one for MedGen, one for UMLS + if ($id =~ /^C\d+/) { + add_triples('UMLS', $id); + } + add_triples('MedGen', $id); + } +} exit 0; diff --git a/src/mondo_mapping_status.py b/src/mondo_mapping_status.py index 14aef7b..c3eaeb8 100644 --- a/src/mondo_mapping_status.py +++ b/src/mondo_mapping_status.py @@ -12,9 +12,14 @@ MONDO_SSSOM_TSV = INPUT_DIR / 'mondo.sssom.tsv' MEDGEN_SSSOM_TSV = RELEASE_OUTDIR / 'medgen.sssom.tsv' # MEDGEN_PREFIXES: Some of these are old, some are new, some may not be used. -MEDGEN_PREFIXES = ['Medgen', 'MedGen', 'MEDGEN', 'Medgen_UID', 'MedGen_UID', 'UMLS', 'UMLS_CUI'] +# todo: If I couldn't convert SSSOM properly with MedGen_CUI, souldn't UMLS_CUI have a problem? though i think it's just coming from previous work in mondo maybe. it's not being used in this ingest +MEDGEN_PREFIXES = [ + 'Medgen', 'MedGen', 'MEDGEN', 'MedGenCUI', 'UMLS', 'UMLS_CUI', + # 'Medgen_UID', 'MedGen_UID', 'Medgen_CUI', 'MedGen_CUI', 'Medgen_CUI' +] CURIE = str +# TODO: Mappings can be considered skos:exactMatch def ids_prefixless(ids: Set[str]) -> Set[str]: """Remove prefix""" @@ -92,16 +97,19 @@ def medgen_mondo_mapping_status(mondo_predicate_filter: List[str] = None): file_suffix = '' if not mondo_predicate_filter \ else '-mondo-exacts-only' if mondo_predicate_filter == ['skos:exactMatch'] \ else '-custom' + # Read sources medgen_all_ids, medgen_in_medgen, medgen_in_mondo = \ read_mapping_sources(mondo_predicate_filter=mondo_predicate_filter) + # Special operations # - Inconsistent prefixes between what Mondo used before and will going forward. In this case, stripping prefixes # should be OK, at least for now. medgen_all_ids = ids_prefixless(medgen_all_ids) medgen_in_medgen = ids_prefixless(medgen_in_medgen) medgen_in_mondo = ids_prefixless(medgen_in_mondo) - # Report + + # Generate reports report_obs_medgen_in_mondo(medgen_in_mondo, medgen_in_medgen) report_existing_overlap(medgen_all_ids, medgen_in_medgen, medgen_in_mondo, file_suffix) From 916df0b2bb2a83d45ed386ac953b2f4d4b7d28e2 Mon Sep 17 00:00:00 2001 From: joeflack4 Date: Sun, 13 Aug 2023 19:02:06 -0400 Subject: [PATCH 3/3] - Update: Namespaces: (i) MedGen -> MEDGEN, (ii) MedGenCUI -> MEDGENCUI --- config/medgen.sssom-metadata.yml | 4 ++-- src/medgen2obo.pl | 30 ++++++++++++++++-------------- src/mondo_mapping_status.py | 2 +- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/config/medgen.sssom-metadata.yml b/config/medgen.sssom-metadata.yml index d4efde2..daeb4d9 100644 --- a/config/medgen.sssom-metadata.yml +++ b/config/medgen.sssom-metadata.yml @@ -4,8 +4,8 @@ curie_map: HP: http://purl.obolibrary.org/obo/HP_ MESH: http://identifiers.org/mesh/ MONDO: http://purl.obolibrary.org/obo/MONDO_ - MedGen: http://purl.obolibrary.org/obo/MedGen_ - MedGenCUI: http://purl.obolibrary.org/obo/MedGenCUI_ + MEDGEN: http://purl.obolibrary.org/obo/MEDGEN_ + MEDGENCUI: http://purl.obolibrary.org/obo/MEDGENCUI_ NCIT: http://purl.obolibrary.org/obo/NCIT_ OMIM: https://omim.org/entry/ Orphanet: http://www.orpha.net/ORDO/Orphanet_ diff --git a/src/medgen2obo.pl b/src/medgen2obo.pl index 76b6601..15d0210 100755 --- a/src/medgen2obo.pl +++ b/src/medgen2obo.pl @@ -84,7 +84,7 @@ chomp; my ($u,$c) = split(/\t/,$_); $uh{$c} = $u; - $th{$c}->{xrefs}->{"MedGen:$u"} = 1; + $th{$c}->{xrefs}->{"MEDGEN:$u"} = 1; } close(F); @@ -140,15 +140,16 @@ sub add_triples { $tag = 'equivalent_to:'; # This translates to owl:equivalentClass # $tag = 'xref:'; # want to get this to translate to skos:exactMatch, but got oboInOwl:hasDbXref instead } - # Namespaces: Different ones based on if is a UMLS CUI (C#), a MedGen CUI Novel (CN#), or a MedGEn UID (#). + # Namespaces: Different ones based on if is a UMLS CUI (C#), a MEDGEN CUI Novel (CN#), or a MedGEn UID (#). if ($v =~ /^CN\d+/) { - print "$tag MedGenCUI:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n"; + print "$tag MEDGENCUI:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n"; + # If a CUI (starts with 'C'), will be created twice: one for MEDGENCUI, one for UMLS + } elsif ($v =~ /^C\d+/) { + print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n"; + print "$tag MEDGENCUI:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n"; + # UID } else { - # If a CUI (starts with 'C'), will be created twice: one for MedGen, one for UMLS - if ($v =~ /^C\d+/) { - print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n"; - } - print "$tag MedGen:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n"; + print "$tag MEDGEN:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n"; } } } @@ -160,13 +161,14 @@ sub add_triples { foreach my $id (@ids) { # Namespaces: Different ones based on if is a UMLS CUI (C#), a MedGen CUI Novel (CN#), or a MedGEn UID (#). if ($id =~ /^CN\d+/) { - add_triples('MedGenCUI', $id); + add_triples('MEDGENCUI', $id); + # If a CUI (starts with 'C'), will be created twice: one for MEDGENCUI, one for UMLS + } elsif ($id =~ /^C\d+/) { + add_triples('UMLS', $id); + add_triples('MEDGENCUI', $id); + # UID } else { - # If a CUI (starts with 'C'), will be created twice: one for MedGen, one for UMLS - if ($id =~ /^C\d+/) { - add_triples('UMLS', $id); - } - add_triples('MedGen', $id); + add_triples('MEDGEN', $id); } } diff --git a/src/mondo_mapping_status.py b/src/mondo_mapping_status.py index c3eaeb8..fed8e32 100644 --- a/src/mondo_mapping_status.py +++ b/src/mondo_mapping_status.py @@ -14,7 +14,7 @@ # MEDGEN_PREFIXES: Some of these are old, some are new, some may not be used. # todo: If I couldn't convert SSSOM properly with MedGen_CUI, souldn't UMLS_CUI have a problem? though i think it's just coming from previous work in mondo maybe. it's not being used in this ingest MEDGEN_PREFIXES = [ - 'Medgen', 'MedGen', 'MEDGEN', 'MedGenCUI', 'UMLS', 'UMLS_CUI', + 'MEDGEN', 'Medgen', 'MedGen', 'MEDGENCUI', 'MedGenCUI', 'UMLS', 'UMLS_CUI', # 'Medgen_UID', 'MedGen_UID', 'Medgen_CUI', 'MedGen_CUI', 'Medgen_CUI' ] CURIE = str