Skip to content

Commit

Permalink
- Update: new classes: duplicated some UMLS: classes as Medgen:, if t…
Browse files Browse the repository at this point in the history
…hey started with 'C' and a number.

- Update: prefixes: In addition to the new classes above, renamed the UMLS prefix to Medgen for all other classes (which all happen to start with 'CN:').
- Update: prefixes: Renamed prior MEDGEN: xref prefixes to Medgen_UID: These IDs don't start with C (CUI; Concept Unique Identifier) or CN (Common Name?). These are internal Medgen UIDs that are duplicative and not for clinical or analytical use.
- Rename: bin/ -> src/
- Add: output/: For both release outputs and non-release.
- Rename: release/ -> output/release/
- Add: mondo_mapping_status.py: For generating artefacts related to the reporting and management of mappings between Mondo and Medgen.
- Add: Python dependency requirements files.
- Add: run.sh: For running commands in ODK
- Add: config/medgen.sssom-metadata.yml
  • Loading branch information
joeflack4 committed Jul 24, 2023
1 parent a288faf commit f208076
Show file tree
Hide file tree
Showing 10 changed files with 318 additions and 15 deletions.
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
# Standard
dev/
data/cache/
output/
__pycache__/
.idea/
.DS_Store
.env

# Specialized
/*.json
/*.obo
/*.owl
/*.tmp
/*.tsv
/fetch
/ftp.ncbi.nlm.nih.gov/
/release/
*ignore/
.ipynb_checkpoints/
_archive/
release/
tmp/
.ipynb_checkpoints/
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ On MacOS, (3) and (4) should be available without the need for installation.

## Setup
1. Give permission to run Perl: `chmod +x ./src/*.pl`
2. Install Python dependencies: `pip install -r requirements.txt`

## Running the ingest
Run: `make all`
21 changes: 21 additions & 0 deletions config/medgen.sssom-metadata.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
creator_id: 0000-0002-2906-7319
curie_map:
GTR: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GTR/
HP: http://purl.obolibrary.org/obo/HP_
MESH: http://identifiers.org/mesh/
MONDO: http://purl.obolibrary.org/obo/MONDO_
Medgen: http://purl.obolibrary.org/obo/Medgen_
Medgen_UID: http://purl.obolibrary.org/obo/Medgen_UID_
NCIT: http://purl.obolibrary.org/obo/NCIT_
OMIM: https://omim.org/entry/
Orphanet: http://www.orpha.net/ORDO/Orphanet_
SCTID: http://identifiers.org/snomedct/
UMLS: http://purl.obolibrary.org/obo/UMLS_
oboInOwl: http://www.geneontology.org/formats/oboInOwl#
owl: http://www.w3.org/2002/07/owl#
rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
rdfs: http://www.w3.org/2000/01/rdf-schema#
semapv: https://w3id.org/semapv/
skos: http://www.w3.org/2004/02/skos/core#
sssom: https://w3id.org/sssom/
license: http://w3id.org/sssom/license/unspecified
46 changes: 35 additions & 11 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,31 +9,37 @@ PRODUCTS=medgen-disease-extract.obo medgen-disease-extract.owl
TODAY ?=$(shell date +%Y-%m-%d)
VERSION=v$(TODAY)

all: build stage
build: $(PRODUCTS)
all: build stage clean
build: $(PRODUCTS) medgen.sssom.tsv
stage: $(patsubst %, stage-%, $(PRODUCTS))
mv medgen.obo release/
stage-%: % | release/
mv $< release/
mv medgen.obo output/release/
mv medgen.sssom.tsv output/release/
stage-%: % | output/release/
mv $< output/release/
# Remove intermediate build artefacts left in the working directory.
# 'clean' is not a file, so declare it phony. Use 'rm -f' so that a file that
# is already gone (or a '*.obo' glob that matches nothing, e.g. after 'stage'
# has moved the .obo files away) does not abort 'make all'.
.PHONY: clean
clean:
	rm -f medgen.obographs.json uid2cui.tsv *.obo

# ----------------------------------------
# ETL
# ----------------------------------------
release/:
output/release/:
mkdir -p $@

ftp.ncbi.nlm.nih.gov:
wget -r -np ftp://ftp.ncbi.nlm.nih.gov/pub/medgen/ && touch $@

uid2cui.tsv:
./bin/make_uid2cui.pl > $@
./src/make_uid2cui.pl > $@

# ----------------------------------------
# Hacky conversion to obo
# Main artefacts
# ----------------------------------------
# Hacky conversion to obo ----------------
# Relies on MGCONSO.RRF.gz etc being made by 'ftp.ncbi.nlm.nih.gov' step
medgen.obo: ftp.ncbi.nlm.nih.gov uid2cui.tsv
./bin/medgen2obo.pl > $@.tmp && mv $@.tmp $@
./src/medgen2obo.pl > $@.tmp && mv $@.tmp $@

# We only care about diseases for now
# - NOTE: some cancers seem to appear under Neoplastic-Process
Expand All @@ -49,6 +55,13 @@ medgen-disease-extract.json: medgen-disease-extract.obo
medgen-disease-extract.owl: medgen-disease-extract.obo
owltools $< -o $@

# SSSOM ----------------------------------
# Convert the disease extract to OBO Graphs JSON. The OWL file is declared as
# a prerequisite so that this target can be built on its own, is rebuilt when
# the extract changes, and is ordered correctly under parallel make.
medgen.obographs.json: medgen-disease-extract.owl
	robot convert -i $< -o $@

# Derive the SSSOM mapping set from the OBO Graphs JSON, using the prefix map
# and metadata in config/medgen.sssom-metadata.yml.
medgen.sssom.tsv: medgen.obographs.json
	sssom parse $< -I obographs-json -m config/medgen.sssom-metadata.yml -o $@

# ----------------------------------------
# Cycles
# ----------------------------------------
Expand All @@ -59,6 +72,17 @@ medgen-disease-extract.owl: medgen-disease-extract.obo
# ----------------------------------------
# Devops
# ----------------------------------------
deploy-release: | release/
deploy-release: | output/release/
@test $(VERSION)
gh release create $(VERSION) --notes "New release." --title "$(VERSION)" release/*
gh release create $(VERSION) --notes "New release." --title "$(VERSION)" output/release/*

# ----------------------------------------
# Mapping analysis
# ----------------------------------------
tmp/input/:
	mkdir -p $@

# Download the current Mondo mapping set.
# - The directory is an order-only prerequisite (after '|'): it only has to
#   exist, and its changing mtime must not trigger a re-download.
# - Download to a temporary name and rename on success, so a failed or
#   interrupted wget never leaves a truncated file that make considers current.
tmp/input/mondo.sssom.tsv: | tmp/input/
	wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@.tmp && mv $@.tmp $@

# TODO: goals using mondo_mapping_status.py
1 change: 1 addition & 0 deletions requirements-unlocked.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pandas
14 changes: 14 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
distlib==0.3.6
filelock==3.9.0
numpy==1.25.1
pandas==2.0.3
pbr==5.11.1
platformdirs==3.1.0
python-dateutil==2.8.2
pytz==2023.3
six==1.16.0
stevedore==5.0.0
tzdata==2023.3
virtualenv==20.20.0
virtualenv-clone==0.5.7
virtualenvwrapper==4.8.4
85 changes: 85 additions & 0 deletions run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/bin/sh
# Wrapper script for docker.
#
# This is used primarily for wrapping the GNU Make workflow.
# Instead of typing "make TARGET", type "./run.sh make TARGET".
# This will run the make workflow within a docker container.
#
# The current working directory is mounted into the container as /work and
# used as the working directory there, so run this script from the directory
# that contains the makefile. Extra bind mounts can be given as a
# comma-separated list in ODK_BINDS.
#
# To use singularity instead of docker, please issue
# export USE_SINGULARITY=<any-value>
# before running this script.
#
# Environment variables read: GH_TOKEN, XDG_CONFIG_HOME, ODK_IMAGE, ODK_TAG,
# ODK_JAVA_OPTS, ODK_DEBUG, ODK_BINDS, ODK_DOCKER_OPTIONS,
# ODK_SINGULARITY_OPTIONS, USE_SINGULARITY and any OWLAPI_* variable.
#
# Exit status: the exit status of the container run, so that failures of the
# wrapped command are visible to the caller (e.g. CI).
#
# See README-editors.md for more details.

# Optional local overrides for any of the variables above.
if [ -f run.sh.conf ]; then
    . ./run.sh.conf
fi

# Look for a GitHub token: environment first, then well-known token files.
# NOTE(review): nothing below passes GH_TOKEN to the container explicitly -
# confirm whether it is expected to reach the containerised command.
if [ -n "$GH_TOKEN" ]; then
    :
elif [ -f ../../.github/token.txt ]; then
    GH_TOKEN=$(cat ../../.github/token.txt)
elif [ -f "$XDG_CONFIG_HOME/ontology-development-kit/github/token" ]; then
    GH_TOKEN=$(cat "$XDG_CONFIG_HOME/ontology-development-kit/github/token")
elif [ -f "$HOME/Library/Application Support/ontology-development-kit/github/token" ]; then
    GH_TOKEN=$(cat "$HOME/Library/Application Support/ontology-development-kit/github/token")
fi

ODK_IMAGE=${ODK_IMAGE:-odkfull}
TAG_IN_IMAGE=$(echo "$ODK_IMAGE" | awk -F':' '{ print $2 }')
if [ -n "$TAG_IN_IMAGE" ]; then
    # Override ODK_TAG env var if IMAGE already includes a tag
    ODK_TAG=$TAG_IN_IMAGE
    ODK_IMAGE=$(echo "$ODK_IMAGE" | awk -F':' '{ print $1 }')
fi
ODK_TAG=${ODK_TAG:-latest}
ODK_JAVA_OPTS=${ODK_JAVA_OPTS:--Xmx20G}
ODK_DEBUG=${ODK_DEBUG:-no}

# Convert OWLAPI_* environment variables to the OWLAPI as Java options
# See http://owlcs.github.io/owlapi/apidocs_4/org/semanticweb/owlapi/model/parameters/ConfigurationOptions.html
# for a list of allowed options
OWLAPI_OPTIONS_NAMESPACE=org.semanticweb.owlapi.model.parameters.ConfigurationOptions
for owlapi_var in $(env | sed -n s/^OWLAPI_//p) ; do
    ODK_JAVA_OPTS="$ODK_JAVA_OPTS -D$OWLAPI_OPTIONS_NAMESPACE.${owlapi_var%=*}=${owlapi_var#*=}"
done

# Remember what the user asked for before the argument list is modified, so
# that the reminder at the end matches on the user's command only.
REQUESTED_COMMAND="$*"

if [ "$ODK_DEBUG" = yes ]; then
    echo "Running obolibrary/${ODK_IMAGE}:${ODK_TAG} with ${ODK_JAVA_OPTS} of memory for ROBOT and Java-based pipeline steps."
    # Prepend the timing command to the positional parameters. Passing the
    # format as one quoted argument keeps it intact without having to embed
    # non-breaking spaces in it to dodge word splitting.
    set -- /usr/bin/time -f "### DEBUG STATS ###\nElapsed time: %E\nPeak memory: %M kb" "$@"
fi

VOLUME_BIND=$PWD:/work
WORK_DIR=/work

if [ -n "$ODK_BINDS" ]; then
    VOLUME_BIND="$VOLUME_BIND,$ODK_BINDS"
fi

if [ -n "$USE_SINGULARITY" ]; then
    # singularity accepts the comma-separated bind list as is.
    singularity exec --cleanenv $ODK_SINGULARITY_OPTIONS \
        --env "ROBOT_JAVA_ARGS=$ODK_JAVA_OPTS,JAVA_OPTS=$ODK_JAVA_OPTS" \
        --bind $VOLUME_BIND \
        -W $WORK_DIR \
        docker://obolibrary/$ODK_IMAGE:$ODK_TAG "$@"
    STATUS=$?
else
    # docker wants one -v option per bind; the 'g' flag converts every comma,
    # not just the first one, so that several ODK_BINDS entries work.
    # BIND_OPTIONS is intentionally left unquoted below so that it splits
    # into separate options.
    BIND_OPTIONS="-v $(echo "$VOLUME_BIND" | sed 's/,/ -v /g')"
    docker run $ODK_DOCKER_OPTIONS $BIND_OPTIONS -w $WORK_DIR \
        -e ROBOT_JAVA_ARGS="$ODK_JAVA_OPTS" -e JAVA_OPTS="$ODK_JAVA_OPTS" \
        --rm -ti obolibrary/$ODK_IMAGE:$ODK_TAG "$@"
    STATUS=$?
fi

case "$REQUESTED_COMMAND" in
    *update_repo*|*release*)
        echo "Please remember to update your ODK image from time to time: https://oboacademy.github.io/obook/howto/odk-update/."
        ;;
esac

# Without this the script would report the status of the 'case' above
# (always success) instead of the status of the wrapped command.
exit $STATUS
File renamed without changes.
44 changes: 42 additions & 2 deletions bin/medgen2obo.pl → src/medgen2obo.pl
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@
chomp;
my ($u,$c) = split(/\t/,$_);
$uh{$c} = $u;
$th{$c}->{xrefs}->{"MEDGEN:$u"} = 1;
$th{$c}->{xrefs}->{"Medgen_UID:$u"} = 1;
}
close(F);

Expand All @@ -104,9 +104,49 @@
my @ids = keys %th;
@ids = sort @ids;
foreach my $id (@ids) {
if ($id =~ /^C\d+/) {
# TODO: repurpose to func (this is instance 1/2)
my $h = $th{$id};
print "[Term]\n";
print "id: UMLS:$id\n";
print "name: $h->{name}\n";
foreach my $x (keys %{$h->{xrefs}}) {
$x =~ s@MSH:@MESH:@;
$x =~ s@NCI:@NCIT:@;
$x =~ s@SNOMEDCT_US:@SCTID:@;
print "xref: $x\n";
}
foreach (keys %{$ssh{$id} || {}}) {
my $ss = mk_subset($_);
print "subset: $ss\n";
}
foreach my $s (@{$h->{synonyms}}) {
my ($str, $x)= @$s;
$str = escq($str);
print "synonym: \"$str\" RELATED [$x]\n";
}
my $trelh = $rh{$id};
foreach my $rel (keys %{$trelh}) {
my $vh = $trelh->{$rel};
foreach my $v (keys %$vh) {
unless ($v eq $id) {
my $tag = "relationship: $rel";
if ($rel eq 'isa') {
$tag = 'is_a:';
}
if ($rel eq 'mapped_to') {
$tag = 'equivalent_to:';
}
print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
}
}
}
print "\n";
}
# TODO: repurpose to func (this is instance 2/2)
my $h = $th{$id};
print "[Term]\n";
print "id: UMLS:$id\n";
print "id: Medgen:$id\n";
print "name: $h->{name}\n";
foreach my $x (keys %{$h->{xrefs}}) {
$x =~ s@MSH:@MESH:@;
Expand Down
Loading

0 comments on commit f208076

Please sign in to comment.