From f461d52abf3c7eb4981c43ab0a63aca653a333db Mon Sep 17 00:00:00 2001
From: joeflack4 <joeflack4@gmail.com>
Date: Sun, 23 Jul 2023 17:10:03 -0400
Subject: [PATCH 1/3] - Update: new classes: duplicated some UMLS: classes as
 Medgen:, if they started with 'C' and a number. - Update: prefixes: In
 addition to new classes above, renamed UMLS prefix with Medgen for all other
 classes (which happen to all start with 'CN:') - Update: prefixes: Renamed
 prior MEDGEN: xref prefixes to MedGen_UID: These IDs don't start with C (CUI;
 Concept Unique Identifier) or CN (Common Name?). These are internal Medgen
 UIDs that are duplicative and not for clinical or analytical use. - Rename:
 bin/ -> src/ - Add: output/: For both release outputs and non-release. -
 Rename: release/ -> output/release/ - Add: mondo_mapping_status.py: For
 generating artefacts related to the reporting and management of mappings
 between Mondo and Medgen. - Add: Python dependency requirements files. - Add:
 run.sh: For running commands in ODK - Add: config/medgen.sssom-metadata.yml

---
 .gitignore                       |   6 +-
 README.md                        |   1 +
 config/medgen.sssom-metadata.yml |  21 ++++++
 makefile                         |  58 +++++++++++----
 requirements-unlocked.txt        |   1 +
 requirements.txt                 |  14 ++++
 run.sh                           |  85 ++++++++++++++++++++++
 {bin => src}/make_uid2cui.pl     |   0
 {bin => src}/medgen2obo.pl       |  44 +++++++++++-
 src/mondo_mapping_status.py      | 118 +++++++++++++++++++++++++++++++
 10 files changed, 331 insertions(+), 17 deletions(-)
 create mode 100644 config/medgen.sssom-metadata.yml
 create mode 100644 requirements-unlocked.txt
 create mode 100644 requirements.txt
 create mode 100644 run.sh
 rename {bin => src}/make_uid2cui.pl (100%)
 rename {bin => src}/medgen2obo.pl (72%)
 create mode 100644 src/mondo_mapping_status.py

diff --git a/.gitignore b/.gitignore
index 88619f4..e0419c6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,20 +1,22 @@
 # Standard
 dev/
 data/cache/
+output/
 __pycache__/
 .idea/
 .DS_Store
 .env
 
 # Specialized
+/*.json
 /*.obo
 /*.owl
 /*.tmp
 /*.tsv
 /fetch
 /ftp.ncbi.nlm.nih.gov/
-/release/
 *ignore/
+.ipynb_checkpoints/
 _archive/
+release/
 tmp/
-.ipynb_checkpoints/
diff --git a/README.md b/README.md
index f3939de..fe91d76 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ On MacOS, (3) and (4) should be available without the need for installation.
 
 ## Setup
 1. Give permission to run Perl: `chmod +x ./bin/*.pl`
+2. Install Python dependencies: `pip install -r requirements.txt`
 
 ## Running the ingest
 Run: `make all`
diff --git a/config/medgen.sssom-metadata.yml b/config/medgen.sssom-metadata.yml
new file mode 100644
index 0000000..48ffaf1
--- /dev/null
+++ b/config/medgen.sssom-metadata.yml
@@ -0,0 +1,21 @@
+creator_id: 0000-0002-2906-7319
+curie_map:
+  GTR: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GTR/
+  HP: http://purl.obolibrary.org/obo/HP_
+  MESH: http://identifiers.org/mesh/
+  MONDO: http://purl.obolibrary.org/obo/MONDO_
+  MedGen: http://purl.obolibrary.org/obo/Medgen_
+  MedGen_UID: http://purl.obolibrary.org/obo/Medgen_UID_
+  NCIT: http://purl.obolibrary.org/obo/NCIT_
+  OMIM: https://omim.org/entry/
+  Orphanet: http://www.orpha.net/ORDO/Orphanet_
+  SCTID: http://identifiers.org/snomedct/
+  UMLS: http://purl.obolibrary.org/obo/UMLS_
+  oboInOwl: http://www.geneontology.org/formats/oboInOwl#
+  owl: http://www.w3.org/2002/07/owl#
+  rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
+  rdfs: http://www.w3.org/2000/01/rdf-schema#
+  semapv: https://w3id.org/semapv/
+  skos: http://www.w3.org/2004/02/skos/core#
+  sssom: https://w3id.org/sssom/
+license: http://w3id.org/sssom/license/unspecified
diff --git a/makefile b/makefile
index f99e73f..347699a 100644
--- a/makefile
+++ b/makefile
@@ -2,38 +2,54 @@
 # Running `make all` will run the full pipeline. Note that if the FTP files have already been downloaded, it'll skip
 # that part. In order to force re-download, run `make all -B`.
 .DEFAULT_GOAL := all
-.PHONY: all build stage stage-%
+.PHONY: all build stage stage-% release-artefacts analysis-artefacts clean deploy-release
 
 OBO=http://purl.obolibrary.org/obo
 PRODUCTS=medgen-disease-extract.obo medgen-disease-extract.owl
 TODAY ?=$(shell date +%Y-%m-%d)
 VERSION=v$(TODAY)
 
-all: build stage
-build: $(PRODUCTS)
+all: build stage clean
+release-artefacts: $(PRODUCTS) medgen.sssom.tsv
+# analysis-artefacts runs more than just this file; that goal creates multiple files
+analysis-artefacts: medgen_terms_mapping_status.tsv
+build: release-artefacts analysis-artefacts
 stage: $(patsubst %, stage-%, $(PRODUCTS))
-	mv medgen.obo release/
-stage-%: % | release/
-	mv $< release/
+	mv medgen.obo output/release/
+	mv medgen.sssom.tsv output/release/
+stage-%: % | output/release/
+	mv $< output/release/
+clean:
+	rm medgen.obographs.json
+	rm uid2cui.tsv
+	rm *.obo
 
 # ----------------------------------------
-# ETL
+# Setup dirs
 # ----------------------------------------
-release/:
+tmp/input/:
+	mkdir -p $@
+output/:
+	mkdir -p $@
+output/release/:
 	mkdir -p $@
 
+# ----------------------------------------
+# ETL
+# ----------------------------------------
 ftp.ncbi.nlm.nih.gov:
 	wget -r -np ftp://ftp.ncbi.nlm.nih.gov/pub/medgen/ && touch $@
 
 uid2cui.tsv:
-	./bin/make_uid2cui.pl > $@
+	./src/make_uid2cui.pl > $@
 
 # ----------------------------------------
-# Hacky conversion to obo
+# Main artefacts
 # ----------------------------------------
+# Hacky conversion to obo ----------------
 # Relies on MGCONSO.RRF.gz etc being made by 'ftp.ncbi.nlm.nih.gov' step
 medgen.obo: ftp.ncbi.nlm.nih.gov uid2cui.tsv
-	./bin/medgen2obo.pl > $@.tmp && mv $@.tmp $@
+	./src/medgen2obo.pl > $@.tmp && mv $@.tmp $@
 
 # We only care about diseases for now
 # - NOTE: some cancers seem to appear under Neoplastic-Process
@@ -49,6 +65,13 @@ medgen-disease-extract.json: medgen-disease-extract.obo
 medgen-disease-extract.owl: medgen-disease-extract.obo
 	owltools $< -o $@
 
+# SSSOM ----------------------------------
+medgen.obographs.json:
+	robot convert -i medgen-disease-extract.owl -o $@
+
+medgen.sssom.tsv: medgen.obographs.json
+	sssom parse medgen.obographs.json -I obographs-json -m config/medgen.sssom-metadata.yml -o $@
+
 # ----------------------------------------
 # Cycles	
 # ----------------------------------------
@@ -59,6 +82,15 @@ medgen-disease-extract.owl: medgen-disease-extract.obo
 # ----------------------------------------
 # Devops
 # ----------------------------------------
-deploy-release: | release/
+deploy-release: | output/release/
 	@test $(VERSION)
-	gh release create $(VERSION) --notes "New release." --title "$(VERSION)" release/*
+	gh release create $(VERSION) --notes "New release." --title "$(VERSION)" output/release/*
+
+# ----------------------------------------
+# Mapping analysis
+# ----------------------------------------
+tmp/input/mondo.sssom.tsv: | tmp/input/
+	wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@
+
+output/medgen_terms_mapping_status.tsv output/obsoleted_medgen_terms_in_mondo.txt: | output/
+	python src/mondo_mapping_status.py
diff --git a/requirements-unlocked.txt b/requirements-unlocked.txt
new file mode 100644
index 0000000..fb6c7ed
--- /dev/null
+++ b/requirements-unlocked.txt
@@ -0,0 +1 @@
+pandas
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..01e7f87
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,14 @@
+distlib==0.3.6
+filelock==3.9.0
+numpy==1.25.1
+pandas==2.0.3
+pbr==5.11.1
+platformdirs==3.1.0
+python-dateutil==2.8.2
+pytz==2023.3
+six==1.16.0
+stevedore==5.0.0
+tzdata==2023.3
+virtualenv==20.20.0
+virtualenv-clone==0.5.7
+virtualenvwrapper==4.8.4
diff --git a/run.sh b/run.sh
new file mode 100644
index 0000000..11e1a06
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,85 @@
+#!/bin/sh
+# Wrapper script for docker.
+#
+# This is used primarily for wrapping the GNU Make workflow.
+# Instead of typing "make TARGET", type "./run.sh make TARGET".
+# This will run the make workflow within a docker container.
+#
+# The assumption is that you are working in the src/ontology folder;
+# we therefore map the whole repo (../..) to a docker volume.
+#
+# To use singularity instead of docker, please issue
+# export USE_SINGULARITY=<any-value>
+# before running this script.
+#
+# See README-editors.md for more details.
+
+if [ -f run.sh.conf ]; then
+    . ./run.sh.conf
+fi
+
+# Look for a GitHub token
+if [ -n "$GH_TOKEN" ]; then
+    :
+elif [ -f ../../.github/token.txt ]; then
+    GH_TOKEN=$(cat ../../.github/token.txt)
+elif [ -f $XDG_CONFIG_HOME/ontology-development-kit/github/token ]; then
+    GH_TOKEN=$(cat $XDG_CONFIG_HOME/ontology-development-kit/github/token)
+elif [ -f "$HOME/Library/Application Support/ontology-development-kit/github/token" ]; then
+    GH_TOKEN=$(cat "$HOME/Library/Application Support/ontology-development-kit/github/token")
+fi
+
+ODK_IMAGE=${ODK_IMAGE:-odkfull}
+TAG_IN_IMAGE=$(echo $ODK_IMAGE | awk -F':' '{ print $2 }')
+if [ -n "$TAG_IN_IMAGE" ]; then
+  # Override ODK_TAG env var if IMAGE already includes a tag
+  ODK_TAG=$TAG_IN_IMAGE
+  ODK_IMAGE=$(echo $ODK_IMAGE | awk -F':' '{ print $1 }')
+fi
+ODK_TAG=${ODK_TAG:-latest}
+ODK_JAVA_OPTS=${ODK_JAVA_OPTS:--Xmx20G}
+ODK_DEBUG=${ODK_DEBUG:-no}
+
+# Convert OWLAPI_* environment variables to the OWLAPI as Java options
+# See http://owlcs.github.io/owlapi/apidocs_4/org/semanticweb/owlapi/model/parameters/ConfigurationOptions.html
+# for a list of allowed options
+OWLAPI_OPTIONS_NAMESPACE=org.semanticweb.owlapi.model.parameters.ConfigurationOptions
+for owlapi_var in $(env | sed -n s/^OWLAPI_//p) ; do
+    ODK_JAVA_OPTS="$ODK_JAVA_OPTS -D$OWLAPI_OPTIONS_NAMESPACE.${owlapi_var%=*}=${owlapi_var#*=}"
+done
+
+TIMECMD=
+if [ x$ODK_DEBUG = xyes ]; then
+    # If you wish to change the format string, take care of using
+    # non-breaking spaces (U+00A0) instead of normal spaces, to
+    # prevent the shell from tokenizing the format string.
+    echo "Running ${IMAGE} with ${ODK_JAVA_OPTS} of memory for ROBOT and Java-based pipeline steps."
+    TIMECMD="/usr/bin/time -f ### DEBUG STATS ###\nElapsed time: %E\nPeak memory: %M kb"
+fi
+
+VOLUME_BIND=$PWD:/work
+WORK_DIR=/work
+
+if [ -n "$ODK_BINDS" ]; then
+    VOLUME_BIND="$VOLUME_BIND,$ODK_BINDS"
+fi
+
+if [ -n "$USE_SINGULARITY" ]; then
+
+    singularity exec --cleanenv $ODK_SINGULARITY_OPTIONS \
+        --env "ROBOT_JAVA_ARGS=$ODK_JAVA_OPTS,JAVA_OPTS=$ODK_JAVA_OPTS" \
+        --bind $VOLUME_BIND \
+        -W $WORK_DIR \
+        docker://obolibrary/$ODK_IMAGE:$ODK_TAG $TIMECMD "$@"
+else
+    BIND_OPTIONS="-v $(echo $VOLUME_BIND | sed 's/,/ -v /')"
+    docker run $ODK_DOCKER_OPTIONS $BIND_OPTIONS -w $WORK_DIR \
+        -e ROBOT_JAVA_ARGS="$ODK_JAVA_OPTS" -e JAVA_OPTS="$ODK_JAVA_OPTS" \
+        --rm -ti obolibrary/$ODK_IMAGE:$ODK_TAG $TIMECMD "$@"
+fi
+
+case "$@" in
+*update_repo*|*release*)
+    echo "Please remember to update your ODK image from time to time: https://oboacademy.github.io/obook/howto/odk-update/."
+    ;;
+esac
\ No newline at end of file
diff --git a/bin/make_uid2cui.pl b/src/make_uid2cui.pl
similarity index 100%
rename from bin/make_uid2cui.pl
rename to src/make_uid2cui.pl
diff --git a/bin/medgen2obo.pl b/src/medgen2obo.pl
similarity index 72%
rename from bin/medgen2obo.pl
rename to src/medgen2obo.pl
index b05b13f..77af286 100755
--- a/bin/medgen2obo.pl
+++ b/src/medgen2obo.pl
@@ -82,7 +82,7 @@
     chomp;
     my ($u,$c) = split(/\t/,$_);
     $uh{$c} = $u;
-    $th{$c}->{xrefs}->{"MEDGEN:$u"} = 1;
+    $th{$c}->{xrefs}->{"MedGen_UID:$u"} = 1;
 }
 close(F);
 
@@ -104,9 +104,49 @@
 my @ids = keys %th;
 @ids = sort @ids;
 foreach my $id (@ids) {
+    if ($id =~ /^C\d+/) {
+        # TODO: repurpose to func (this is instance 1/2)
+        my $h = $th{$id};
+        print "[Term]\n";
+        print "id: UMLS:$id\n";
+        print "name: $h->{name}\n";
+        foreach my $x (keys %{$h->{xrefs}}) {
+            $x =~ s@MSH:@MESH:@;
+            $x =~ s@NCI:@NCIT:@;
+            $x =~ s@SNOMEDCT_US:@SCTID:@;
+            print "xref: $x\n";
+        }
+        foreach (keys %{$ssh{$id} || {}}) {
+            my $ss = mk_subset($_);
+            print "subset: $ss\n";
+        }
+        foreach my $s (@{$h->{synonyms}}) {
+            my ($str, $x)= @$s;
+            $str = escq($str);
+            print "synonym: \"$str\" RELATED [$x]\n";
+        }
+        my $trelh = $rh{$id};
+        foreach my $rel (keys %{$trelh}) {
+            my $vh = $trelh->{$rel};
+            foreach my $v (keys %$vh) {
+                unless ($v eq $id) {
+                    my $tag = "relationship: $rel";
+                    if ($rel eq 'isa') {
+                        $tag = 'is_a:';
+                    }
+                    if ($rel eq 'mapped_to') {
+                        $tag = 'equivalent_to:';
+                    }
+                    print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
+                }
+            }
+        }
+        print "\n";
+    }
+    # TODO: repurpose to func (this is instance 2/2)
     my $h = $th{$id};
     print "[Term]\n";
-    print "id: UMLS:$id\n";
+    print "id: MedGen:$id\n";
     print "name: $h->{name}\n";
     foreach my $x (keys %{$h->{xrefs}}) {
         $x =~ s@MSH:@MESH:@;
diff --git a/src/mondo_mapping_status.py b/src/mondo_mapping_status.py
new file mode 100644
index 0000000..14aef7b
--- /dev/null
+++ b/src/mondo_mapping_status.py
@@ -0,0 +1,118 @@
+"""Mapping status between Medgen and Mondo"""
+from pathlib import Path
+from typing import List, Set, Tuple
+
+import pandas as pd
+
+SRC_DIR = Path(__file__).parent
+PROJECT_DIR = SRC_DIR.parent
+OUTDIR = PROJECT_DIR / 'output'
+RELEASE_OUTDIR = OUTDIR / 'release'
+INPUT_DIR = PROJECT_DIR / 'tmp' / 'input'
+MONDO_SSSOM_TSV = INPUT_DIR / 'mondo.sssom.tsv'
+MEDGEN_SSSOM_TSV = RELEASE_OUTDIR / 'medgen.sssom.tsv'
+# MEDGEN_PREFIXES: Some of these are old, some are new, some may not be used.
+MEDGEN_PREFIXES = ['Medgen', 'MedGen', 'MEDGEN', 'Medgen_UID', 'MedGen_UID', 'UMLS', 'UMLS_CUI']
+CURIE = str
+
+
+def ids_prefixless(ids: Set[str]) -> Set[str]:
+    """Remove prefix"""
+    return set([x.split(':')[1] for x in ids])
+
+
+def ids_drop_uids(ids: Set[CURIE]) -> Set[CURIE]:
+    """From a set of Medgen IDs, drop those that are UIDs"""
+    return set([x for x in ids if x.split(':')[1].startswith('C')])
+
+def read_mapping_sources(
+    mondo_predicate_filter: List[str] = None,
+    drop_uids=True
+) -> Tuple[Set[CURIE], Set[CURIE], Set[CURIE]]:
+    """Read data sources
+    :param drop_uids: drop UIDs from Medgen IDs. These are ones that don't start with CN or C, and are IDs that are used
+    only internally in Medgen and are not stable."""
+    medgen_df = pd.read_csv(MEDGEN_SSSOM_TSV, sep='\t', comment='#').fillna('')
+    # todo: move commented line to .ipynb
+    # preds = list(medgen_df['predicate_id'].unique())  # oboInOwl:hasDbXref, owl:equivalentClass
+    medgen_in_medgen: Set[CURIE] = set(list(medgen_df['subject_id']))
+
+    mondo_df = pd.read_csv(MONDO_SSSOM_TSV, sep='\t', comment='#').fillna('')  # n=72,902
+    mondo_df['prefix'] = mondo_df['object_id'].apply(lambda x: x.split(':')[0])
+    mondo_df = mondo_df[mondo_df['prefix'].isin(MEDGEN_PREFIXES)]  # n=16,627
+    del mondo_df['prefix']
+    # todo: move commented line to .ipynb
+    # preds = list(mondo_df['predicate_id'].unique())  # only skos:exactMatch
+    if mondo_predicate_filter:  # leaving for now; but has no effect because only skos:exactMatch exists
+        mondo_df = mondo_df[mondo_df['predicate_id'].isin(mondo_predicate_filter)]
+    medgen_in_mondo: Set[CURIE] = set(mondo_df['object_id'].tolist())
+
+    medgen_all_ids = medgen_in_medgen.union(medgen_in_mondo)
+
+    if drop_uids:
+        medgen_all_ids = ids_drop_uids(medgen_all_ids)
+        medgen_in_medgen = ids_drop_uids(medgen_in_medgen)
+        medgen_in_mondo = ids_drop_uids(medgen_in_mondo)
+
+    return medgen_all_ids, medgen_in_medgen, medgen_in_mondo
+
+def report_obs_medgen_in_mondo(medgen_in_mondo: Set[str], medgen_in_medgen: Set[str]):
+    """Obsoleted Medgen terms in Mondo"""
+    # obsoleted_medgen_terms_in_mondo.txt: get a list of obsolete Medgen terms that are still in Mondo
+    in_mondo_not_in_medgen = medgen_in_mondo.difference(medgen_in_medgen)
+    obs_medgen_in_mondo_df = pd.DataFrame()
+    obs_medgen_in_mondo_df['id'] = sorted([x for x in in_mondo_not_in_medgen])
+    obs_medgen_in_mondo_df = obs_medgen_in_mondo_df.sort_values(by='id')
+    obs_medgen_in_mondo_df.to_csv(OUTDIR / 'obsoleted_medgen_terms_in_mondo.txt', index=False, header=False)
+
+def report_existing_overlap(medgen_all_ids: Set[str], medgen_in_medgen: Set[str], medgen_in_mondo: Set[str], file_suffix: str):
+    """Get explicit, existing mapping status overlaps between Medgen and Mondo
+    These are mappings at the time before we began the Medgen ingest, and this was useful for analytical information
+    at the time, but we maybe should drop this because not using for curation. We're not keeping the previous
+    Mondo::Medgen mappings from Mondo."""
+    existing_overlap_df = pd.DataFrame()
+    existing_overlap_df['subject_id'] = list(medgen_all_ids)
+    existing_overlap_df['in_medgen'] = existing_overlap_df['subject_id'].isin(medgen_in_medgen)
+    existing_overlap_df['in_mondo'] = existing_overlap_df['subject_id'].isin(medgen_in_mondo)
+    existing_overlap_df['status'] = existing_overlap_df['subject_id'].apply(
+        lambda x:
+        'medgen' if x in medgen_in_medgen and x not in medgen_in_mondo else
+        'mondo' if x in medgen_in_mondo and x not in medgen_in_medgen else
+        'both')
+    existing_overlap_df = existing_overlap_df.sort_values(['status', 'subject_id', 'in_medgen', 'in_mondo'])
+    # todo: move to .ipynb
+    # tot_medgen = len(existing_overlap_df[existing_overlap_df['status'] == 'medgen'])  # n=66,224
+    # tot_mondo = len(existing_overlap_df[existing_overlap_df['status'] == 'mondo'])  # n=2,362
+    # tot_both = len(existing_overlap_df[existing_overlap_df['status'] == 'both'])  # n=14,263
+    existing_overlap_df.to_csv(OUTDIR / f'medgen_terms_mapping_status{file_suffix}.tsv', index=False, sep='\t')
+
+def medgen_mondo_mapping_status(mondo_predicate_filter: List[str] = None):
+    """Mapping status between Medgen and Mondo"""
+    # Vars
+    file_suffix = '' if not mondo_predicate_filter \
+        else '-mondo-exacts-only' if mondo_predicate_filter == ['skos:exactMatch'] \
+        else '-custom'
+    # Read sources
+    medgen_all_ids, medgen_in_medgen, medgen_in_mondo = \
+        read_mapping_sources(mondo_predicate_filter=mondo_predicate_filter)
+    # Special operations
+    # - Inconsistent prefixes between what Mondo used before and will going forward. In this case, stripping prefixes
+    # should be OK, at least for now.
+    medgen_all_ids = ids_prefixless(medgen_all_ids)
+    medgen_in_medgen = ids_prefixless(medgen_in_medgen)
+    medgen_in_mondo = ids_prefixless(medgen_in_mondo)
+    # Report
+    report_obs_medgen_in_mondo(medgen_in_mondo, medgen_in_medgen)
+    report_existing_overlap(medgen_all_ids, medgen_in_medgen, medgen_in_mondo, file_suffix)
+
+def run():
+    """Run reports"""
+    # # filters: could be set up if needed, but current Medgen & previous Mondo only have exactMatch
+    # filters = [None, ['skos:exactMatch']]
+    # for f in filters:
+    #     medgen_mondo_mapping_status(f)
+    medgen_mondo_mapping_status()
+
+
+if __name__ == '__main__':
+    run()

From d62a422102b05ed9535b7db1b2d8fe55aca0b0bd Mon Sep 17 00:00:00 2001
From: joeflack4 <joeflack4@gmail.com>
Date: Wed, 2 Aug 2023 16:25:52 -0400
Subject: [PATCH 2/3] - Update: medgen2obo.pl: (i) Abstracted adding of classes
 and their triples as a function, (ii) updated namespacing of classes based on
 what type of MedGen/UMLS identifier they are. - Update: Namespaces MedGen,
 MedGen_UID (removed), MedGenCUI - Bugfix: SSSOM metadata yaml had a typo
 preventing conversion - Bugfix: Makefile: (i) needed to rename a dependency,
 (ii) needed to run 'analyze' step after 'stage' - Update: Makefile:
 Simplified some goals - Bugfix: For UMLS CUIs (e.g. starts with C then #s),
 we chose to create duplicate classes with namespaces UMLS and MedGen. This
 change also duplicates all references to them (e.g. xrefs),
 e.g. MedGen:1 maps to MedGen:2 and UMLS:2.

---
 config/medgen.sssom-metadata.yml |  4 +-
 makefile                         | 12 ++---
 src/medgen2obo.pl                | 82 ++++++++++++++------------------
 src/mondo_mapping_status.py      | 12 ++++-
 4 files changed, 53 insertions(+), 57 deletions(-)

diff --git a/config/medgen.sssom-metadata.yml b/config/medgen.sssom-metadata.yml
index 48ffaf1..d4efde2 100644
--- a/config/medgen.sssom-metadata.yml
+++ b/config/medgen.sssom-metadata.yml
@@ -4,8 +4,8 @@ curie_map:
   HP: http://purl.obolibrary.org/obo/HP_
   MESH: http://identifiers.org/mesh/
   MONDO: http://purl.obolibrary.org/obo/MONDO_
-  MedGen: http://purl.obolibrary.org/obo/Medgen_
-  MedGen_UID: http://purl.obolibrary.org/obo/Medgen_UID_
+  MedGen: http://purl.obolibrary.org/obo/MedGen_
+  MedGenCUI: http://purl.obolibrary.org/obo/MedGenCUI_
   NCIT: http://purl.obolibrary.org/obo/NCIT_
   OMIM: https://omim.org/entry/
   Orphanet: http://www.orpha.net/ORDO/Orphanet_
diff --git a/makefile b/makefile
index 347699a..d68fb8f 100644
--- a/makefile
+++ b/makefile
@@ -2,18 +2,17 @@
 # Running `make all` will run the full pipeline. Note that if the FTP files have already been downloaded, it'll skip
 # that part. In order to force re-download, run `make all -B`.
 .DEFAULT_GOAL := all
-.PHONY: all build stage stage-% release-artefacts analysis-artefacts clean deploy-release
+.PHONY: all build stage stage-% analyze clean deploy-release
 
 OBO=http://purl.obolibrary.org/obo
 PRODUCTS=medgen-disease-extract.obo medgen-disease-extract.owl
 TODAY ?=$(shell date +%Y-%m-%d)
 VERSION=v$(TODAY)
 
-all: build stage clean
-release-artefacts: $(PRODUCTS) medgen.sssom.tsv
-# analysis-artefacts runs more than just this file; that goal creates multiple files
-analysis-artefacts: medgen_terms_mapping_status.tsv
-build: release-artefacts analysis-artefacts
+all: build stage clean analyze
+# analyze: depends on only one file, but the rule that builds that file creates multiple files
+analyze: output/medgen_terms_mapping_status.tsv
+build: $(PRODUCTS) medgen.sssom.tsv
 stage: $(patsubst %, stage-%, $(PRODUCTS))
 	mv medgen.obo output/release/
 	mv medgen.sssom.tsv output/release/
@@ -92,5 +91,6 @@ deploy-release: | output/release/
 tmp/input/mondo.sssom.tsv: | tmp/input/
 	wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@
 
+# A single run of the script below creates both of these files
 output/medgen_terms_mapping_status.tsv output/obsoleted_medgen_terms_in_mondo.txt: | output/
 	python src/mondo_mapping_status.py
diff --git a/src/medgen2obo.pl b/src/medgen2obo.pl
index 77af286..76b6601 100755
--- a/src/medgen2obo.pl
+++ b/src/medgen2obo.pl
@@ -1,6 +1,7 @@
 #!/usr/bin/perl
 use strict;
 
+# Vars
 my %th = ();
 my %rh = ();
 my %dh = ();
@@ -10,6 +11,7 @@
 
 our $PATH = "ftp.ncbi.nlm.nih.gov/pub/medgen";
 
+# Execution
 open(F,"gzip -dc $PATH/MGCONSO.RRF.gz|") || die;
 while(<F>) {
     next if m@^#@;
@@ -82,7 +84,7 @@
     chomp;
     my ($u,$c) = split(/\t/,$_);
     $uh{$c} = $u;
-    $th{$c}->{xrefs}->{"MedGen_UID:$u"} = 1;
+    $th{$c}->{xrefs}->{"MedGen:$u"} = 1;
 }
 close(F);
 
@@ -101,57 +103,18 @@
 }
 print "\n";
 
-my @ids = keys %th;
-@ids = sort @ids;
-foreach my $id (@ids) {
-    if ($id =~ /^C\d+/) {
-        # TODO: repurpose to func (this is instance 1/2)
-        my $h = $th{$id};
-        print "[Term]\n";
-        print "id: UMLS:$id\n";
-        print "name: $h->{name}\n";
-        foreach my $x (keys %{$h->{xrefs}}) {
-            $x =~ s@MSH:@MESH:@;
-            $x =~ s@NCI:@NCIT:@;
-            $x =~ s@SNOMEDCT_US:@SCTID:@;
-            print "xref: $x\n";
-        }
-        foreach (keys %{$ssh{$id} || {}}) {
-            my $ss = mk_subset($_);
-            print "subset: $ss\n";
-        }
-        foreach my $s (@{$h->{synonyms}}) {
-            my ($str, $x)= @$s;
-            $str = escq($str);
-            print "synonym: \"$str\" RELATED [$x]\n";
-        }
-        my $trelh = $rh{$id};
-        foreach my $rel (keys %{$trelh}) {
-            my $vh = $trelh->{$rel};
-            foreach my $v (keys %$vh) {
-                unless ($v eq $id) {
-                    my $tag = "relationship: $rel";
-                    if ($rel eq 'isa') {
-                        $tag = 'is_a:';
-                    }
-                    if ($rel eq 'mapped_to') {
-                        $tag = 'equivalent_to:';
-                    }
-                    print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
-                }
-            }
-        }
-        print "\n";
-    }
-    # TODO: repurpose to func (this is instance 2/2)
+sub add_triples {
+    my ($prefix, $id) = @_;
+
     my $h = $th{$id};
     print "[Term]\n";
-    print "id: MedGen:$id\n";
+    print "id: $prefix:$id\n";
     print "name: $h->{name}\n";
     foreach my $x (keys %{$h->{xrefs}}) {
         $x =~ s@MSH:@MESH:@;
         $x =~ s@NCI:@NCIT:@;
         $x =~ s@SNOMEDCT_US:@SCTID:@;
+        # TODO: change these to skos:exactMatch?
         print "xref: $x\n";
     }
     foreach (keys %{$ssh{$id} || {}}) {
@@ -173,14 +136,39 @@
                     $tag = 'is_a:';
                 }
                 if ($rel eq 'mapped_to') {
-                    $tag = 'equivalent_to:';
+                    # TODO: change these to skos:exactMatch?
+                    $tag = 'equivalent_to:';  # This translates to owl:equivalentClass
+                    # $tag = 'xref:';  # want to get this to translate to skos:exactMatch, but got oboInOwl:hasDbXref instead
+                }
+                # Namespaces: Different ones based on if is a UMLS CUI (C#), a MedGen CUI Novel (CN#), or a MedGEn UID (#).
+                if ($v =~ /^CN\d+/) {
+                    print "$tag MedGenCUI:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
+                } else {
+                    # If a CUI (starts with 'C'), will be created twice: one for MedGen, one for UMLS
+                    if ($v =~ /^C\d+/) {
+                        print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
+                    }
+                    print "$tag MedGen:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
                 }
-                print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
             }
         }
     }
     print "\n";
 }
+my @ids = keys %th;
+@ids = sort @ids;
+foreach my $id (@ids) {
+    # Namespaces: Different ones based on if is a UMLS CUI (C#), a MedGen CUI Novel (CN#), or a MedGEn UID (#).
+    if ($id =~ /^CN\d+/) {
+        add_triples('MedGenCUI', $id);
+    } else {
+        # If a CUI (starts with 'C'), will be created twice: one for MedGen, one for UMLS
+        if ($id =~ /^C\d+/) {
+            add_triples('UMLS', $id);
+        }
+        add_triples('MedGen', $id);
+    }
+}
 
 exit 0;
 
diff --git a/src/mondo_mapping_status.py b/src/mondo_mapping_status.py
index 14aef7b..c3eaeb8 100644
--- a/src/mondo_mapping_status.py
+++ b/src/mondo_mapping_status.py
@@ -12,9 +12,14 @@
 MONDO_SSSOM_TSV = INPUT_DIR / 'mondo.sssom.tsv'
 MEDGEN_SSSOM_TSV = RELEASE_OUTDIR / 'medgen.sssom.tsv'
 # MEDGEN_PREFIXES: Some of these are old, some are new, some may not be used.
-MEDGEN_PREFIXES = ['Medgen', 'MedGen', 'MEDGEN', 'Medgen_UID', 'MedGen_UID', 'UMLS', 'UMLS_CUI']
+# todo: If I couldn't convert SSSOM properly with MedGen_CUI, souldn't UMLS_CUI have a problem? though i think it's just coming from previous work in mondo maybe. it's not being used in this ingest
+MEDGEN_PREFIXES = [
+    'Medgen', 'MedGen', 'MEDGEN', 'MedGenCUI', 'UMLS', 'UMLS_CUI',
+    # 'Medgen_UID', 'MedGen_UID', 'Medgen_CUI', 'MedGen_CUI', 'Medgen_CUI'
+]
 CURIE = str
 
+# TODO: Mappings can be considered skos:exactMatch
 
 def ids_prefixless(ids: Set[str]) -> Set[str]:
     """Remove prefix"""
@@ -92,16 +97,19 @@ def medgen_mondo_mapping_status(mondo_predicate_filter: List[str] = None):
     file_suffix = '' if not mondo_predicate_filter \
         else '-mondo-exacts-only' if mondo_predicate_filter == ['skos:exactMatch'] \
         else '-custom'
+
     # Read sources
     medgen_all_ids, medgen_in_medgen, medgen_in_mondo = \
         read_mapping_sources(mondo_predicate_filter=mondo_predicate_filter)
+
     # Special operations
     # - Inconsistent prefixes between what Mondo used before and will going forward. In this case, stripping prefixes
     # should be OK, at least for now.
     medgen_all_ids = ids_prefixless(medgen_all_ids)
     medgen_in_medgen = ids_prefixless(medgen_in_medgen)
     medgen_in_mondo = ids_prefixless(medgen_in_mondo)
-    # Report
+
+    # Generate reports
     report_obs_medgen_in_mondo(medgen_in_mondo, medgen_in_medgen)
     report_existing_overlap(medgen_all_ids, medgen_in_medgen, medgen_in_mondo, file_suffix)
 

From 916df0b2bb2a83d45ed386ac953b2f4d4b7d28e2 Mon Sep 17 00:00:00 2001
From: joeflack4 <joeflack4@gmail.com>
Date: Sun, 13 Aug 2023 19:02:06 -0400
Subject: [PATCH 3/3] - Update: Namespaces: (i) MedGen -> MEDGEN, (ii)
 MedGenCUI -> MEDGENCUI

---
 config/medgen.sssom-metadata.yml |  4 ++--
 src/medgen2obo.pl                | 30 ++++++++++++++++--------------
 src/mondo_mapping_status.py      |  2 +-
 3 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/config/medgen.sssom-metadata.yml b/config/medgen.sssom-metadata.yml
index d4efde2..daeb4d9 100644
--- a/config/medgen.sssom-metadata.yml
+++ b/config/medgen.sssom-metadata.yml
@@ -4,8 +4,8 @@ curie_map:
   HP: http://purl.obolibrary.org/obo/HP_
   MESH: http://identifiers.org/mesh/
   MONDO: http://purl.obolibrary.org/obo/MONDO_
-  MedGen: http://purl.obolibrary.org/obo/MedGen_
-  MedGenCUI: http://purl.obolibrary.org/obo/MedGenCUI_
+  MEDGEN: http://purl.obolibrary.org/obo/MEDGEN_
+  MEDGENCUI: http://purl.obolibrary.org/obo/MEDGENCUI_
   NCIT: http://purl.obolibrary.org/obo/NCIT_
   OMIM: https://omim.org/entry/
   Orphanet: http://www.orpha.net/ORDO/Orphanet_
diff --git a/src/medgen2obo.pl b/src/medgen2obo.pl
index 76b6601..15d0210 100755
--- a/src/medgen2obo.pl
+++ b/src/medgen2obo.pl
@@ -84,7 +84,7 @@
     chomp;
     my ($u,$c) = split(/\t/,$_);
     $uh{$c} = $u;
-    $th{$c}->{xrefs}->{"MedGen:$u"} = 1;
+    $th{$c}->{xrefs}->{"MEDGEN:$u"} = 1;
 }
 close(F);
 
@@ -140,15 +140,16 @@ sub add_triples {
                     $tag = 'equivalent_to:';  # This translates to owl:equivalentClass
                     # $tag = 'xref:';  # want to get this to translate to skos:exactMatch, but got oboInOwl:hasDbXref instead
                 }
-                # Namespaces: Different ones based on if is a UMLS CUI (C#), a MedGen CUI Novel (CN#), or a MedGEn UID (#).
+                # Namespaces: Different ones based on if is a UMLS CUI (C#), a MEDGEN CUI Novel (CN#), or a MedGen UID (#).
                 if ($v =~ /^CN\d+/) {
-                    print "$tag MedGenCUI:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
+                    print "$tag MEDGENCUI:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
+                # If a CUI (starts with 'C'), will be created twice: one for MEDGENCUI, one for UMLS
+                } elsif ($v =~ /^C\d+/) {
+                    print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
+                    print "$tag MEDGENCUI:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
+                # UID
                 } else {
-                    # If a CUI (starts with 'C'), will be created twice: one for MedGen, one for UMLS
-                    if ($v =~ /^C\d+/) {
-                        print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
-                    }
-                    print "$tag MedGen:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
+                    print "$tag MEDGEN:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
                 }
             }
         }
@@ -160,13 +161,14 @@ sub add_triples {
 foreach my $id (@ids) {
     # Namespaces: Different ones based on if is a UMLS CUI (C#), a MedGen CUI Novel (CN#), or a MedGEn UID (#).
     if ($id =~ /^CN\d+/) {
-        add_triples('MedGenCUI', $id);
+        add_triples('MEDGENCUI', $id);
+    # If a CUI (starts with 'C'), will be created twice: one for MEDGENCUI, one for UMLS
+    } elsif ($id =~ /^C\d+/) {
+        add_triples('UMLS', $id);
+        add_triples('MEDGENCUI', $id);
+    # UID
     } else {
-        # If a CUI (starts with 'C'), will be created twice: one for MedGen, one for UMLS
-        if ($id =~ /^C\d+/) {
-            add_triples('UMLS', $id);
-        }
-        add_triples('MedGen', $id);
+        add_triples('MEDGEN', $id);
     }
 }
 
diff --git a/src/mondo_mapping_status.py b/src/mondo_mapping_status.py
index c3eaeb8..fed8e32 100644
--- a/src/mondo_mapping_status.py
+++ b/src/mondo_mapping_status.py
@@ -14,7 +14,7 @@
 # MEDGEN_PREFIXES: Some of these are old, some are new, some may not be used.
 # todo: If I couldn't convert SSSOM properly with MedGen_CUI, souldn't UMLS_CUI have a problem? though i think it's just coming from previous work in mondo maybe. it's not being used in this ingest
 MEDGEN_PREFIXES = [
-    'Medgen', 'MedGen', 'MEDGEN', 'MedGenCUI', 'UMLS', 'UMLS_CUI',
+    'MEDGEN', 'Medgen', 'MedGen', 'MEDGENCUI', 'MedGenCUI', 'UMLS', 'UMLS_CUI',
     # 'Medgen_UID', 'MedGen_UID', 'Medgen_CUI', 'MedGen_CUI', 'Medgen_CUI'
 ]
 CURIE = str