Skip to content

Commit

Permalink
Merge pull request #7 from monarch-initiative/update1
Browse files Browse the repository at this point in the history
Medgen updates
  • Loading branch information
joeflack4 authored Aug 14, 2023
2 parents a288faf + 916df0b commit b9e8bc5
Show file tree
Hide file tree
Showing 10 changed files with 333 additions and 21 deletions.
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
# Standard
dev/
data/cache/
output/
__pycache__/
.idea/
.DS_Store
.env

# Specialized
/*.json
/*.obo
/*.owl
/*.tmp
/*.tsv
/fetch
/ftp.ncbi.nlm.nih.gov/
/release/
*ignore/
.ipynb_checkpoints/
_archive/
release/
tmp/
.ipynb_checkpoints/
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ On MacOS, (3) and (4) should be available without the need for installation.

## Setup
1. Give permission to run Perl: `chmod +x ./src/*.pl`
2. Install Python dependencies: `pip install -r requirements.txt`

## Running the ingest
Run: `make all`
21 changes: 21 additions & 0 deletions config/medgen.sssom-metadata.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
creator_id: 0000-0002-2906-7319
curie_map:
GTR: http://purl.obolibrary.org/obo/mondo/mappings/unknown_prefix/GTR/
HP: http://purl.obolibrary.org/obo/HP_
MESH: http://identifiers.org/mesh/
MONDO: http://purl.obolibrary.org/obo/MONDO_
MEDGEN: http://purl.obolibrary.org/obo/MEDGEN_
MEDGENCUI: http://purl.obolibrary.org/obo/MEDGENCUI_
NCIT: http://purl.obolibrary.org/obo/NCIT_
OMIM: https://omim.org/entry/
Orphanet: http://www.orpha.net/ORDO/Orphanet_
SCTID: http://identifiers.org/snomedct/
UMLS: http://purl.obolibrary.org/obo/UMLS_
oboInOwl: http://www.geneontology.org/formats/oboInOwl#
owl: http://www.w3.org/2002/07/owl#
rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
rdfs: http://www.w3.org/2000/01/rdf-schema#
semapv: https://w3id.org/semapv/
skos: http://www.w3.org/2004/02/skos/core#
sssom: https://w3id.org/sssom/
license: http://w3id.org/sssom/license/unspecified
58 changes: 45 additions & 13 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,38 +2,53 @@
# Running `make all` will run the full pipeline. Note that if the FTP files have already been downloaded, it'll skip
# that part. In order to force re-download, run `make all -B`.
.DEFAULT_GOAL := all
.PHONY: all build stage stage-%
.PHONY: all build stage stage-% analyze clean deploy-release

OBO=http://purl.obolibrary.org/obo
PRODUCTS=medgen-disease-extract.obo medgen-disease-extract.owl
TODAY ?=$(shell date +%Y-%m-%d)
VERSION=v$(TODAY)

all: build stage
build: $(PRODUCTS)
all: build stage clean analyze
# analyze: only one output file is listed here, but the rule that builds it creates multiple files
analyze: output/medgen_terms_mapping_status.tsv
build: $(PRODUCTS) medgen.sssom.tsv
stage: $(patsubst %, stage-%, $(PRODUCTS))
mv medgen.obo release/
stage-%: % | release/
mv $< release/
mv medgen.obo output/release/
mv medgen.sssom.tsv output/release/
stage-%: % | output/release/
mv $< output/release/
clean:
rm medgen.obographs.json
rm uid2cui.tsv
rm *.obo

# ----------------------------------------
# ETL
# Setup dirs
# ----------------------------------------
release/:
tmp/input/:
mkdir -p $@
output/:
mkdir -p $@
output/release/:
mkdir -p $@

# ----------------------------------------
# ETL
# ----------------------------------------
ftp.ncbi.nlm.nih.gov:
wget -r -np ftp://ftp.ncbi.nlm.nih.gov/pub/medgen/ && touch $@

uid2cui.tsv:
./bin/make_uid2cui.pl > $@
./src/make_uid2cui.pl > $@

# ----------------------------------------
# Hacky conversion to obo
# Main artefacts
# ----------------------------------------
# Hacky conversion to obo ----------------
# Relies on MGCONSO.RRF.gz etc being made by 'ftp.ncbi.nlm.nih.gov' step
medgen.obo: ftp.ncbi.nlm.nih.gov uid2cui.tsv
./bin/medgen2obo.pl > $@.tmp && mv $@.tmp $@
./src/medgen2obo.pl > $@.tmp && mv $@.tmp $@

# We only care about diseases for now
# - NOTE: some cancers seem to appear under Neoplastic-Process
Expand All @@ -49,6 +64,13 @@ medgen-disease-extract.json: medgen-disease-extract.obo
medgen-disease-extract.owl: medgen-disease-extract.obo
owltools $< -o $@

# SSSOM ----------------------------------
# Convert the disease extract to OBO Graphs JSON. The .owl file is declared as
# a prerequisite so that this target is (re)built after the extract exists or
# changes, instead of relying on the order in which goals happen to run.
medgen.obographs.json: medgen-disease-extract.owl
	robot convert -i $< -o $@

# Parse the OBO Graphs JSON into an SSSOM mapping table, using the curie map
# and metadata in config/medgen.sssom-metadata.yml.
medgen.sssom.tsv: medgen.obographs.json
	sssom parse $< -I obographs-json -m config/medgen.sssom-metadata.yml -o $@

# ----------------------------------------
# Cycles
# ----------------------------------------
Expand All @@ -59,6 +81,16 @@ medgen-disease-extract.owl: medgen-disease-extract.obo
# ----------------------------------------
# Devops
# ----------------------------------------
deploy-release: | release/
deploy-release: | output/release/
@test $(VERSION)
gh release create $(VERSION) --notes "New release." --title "$(VERSION)" release/*
gh release create $(VERSION) --notes "New release." --title "$(VERSION)" output/release/*

# ----------------------------------------
# Mapping analysis
# ----------------------------------------
tmp/input/mondo.sssom.tsv: | tmp/input/
wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@

# This rule creates multiple files, not just one
output/medgen_terms_mapping_status.tsv output/obsoleted_medgen_terms_in_mondo.txt: | output/
python src/mondo_mapping_status.py
1 change: 1 addition & 0 deletions requirements-unlocked.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pandas
14 changes: 14 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
distlib==0.3.6
filelock==3.9.0
numpy==1.25.1
pandas==2.0.3
pbr==5.11.1
platformdirs==3.1.0
python-dateutil==2.8.2
pytz==2023.3
six==1.16.0
stevedore==5.0.0
tzdata==2023.3
virtualenv==20.20.0
virtualenv-clone==0.5.7
virtualenvwrapper==4.8.4
85 changes: 85 additions & 0 deletions run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/bin/sh
# Wrapper script for docker.
#
# This is used primarily for wrapping the GNU Make workflow.
# Instead of typing "make TARGET", type "./run.sh make TARGET".
# This will run the make workflow within a docker container.
#
# This script maps the current working directory ($PWD) to /work inside
# the container, so run it from the directory that contains the makefile.
#
# To use singularity instead of docker, please issue
# export USE_SINGULARITY=<any-value>
# before running this script.
#
# See README-editors.md for more details.

# Source run.sh.conf from the current directory when it exists, so that any
# variables it sets are visible to the rest of this script.
if test -f run.sh.conf
then
    . ./run.sh.conf
fi

# Look for a GitHub token.
# Precedence: an already-set GH_TOKEN, then ../../.github/token.txt, then the
# ODK token file under the XDG config directory, then the macOS
# "Application Support" location. GH_TOKEN is left untouched if none is found.
#
# XDG_CONFIG_HOME falls back to $HOME/.config when unset or empty (the default
# named by the XDG Base Directory spec); without the fallback the lookup would
# probe a path directly under "/". Both paths are quoted so that directories
# containing whitespace do not break the `[ -f ... ]` tests.
xdg_token_file="${XDG_CONFIG_HOME:-$HOME/.config}/ontology-development-kit/github/token"
macos_token_file="$HOME/Library/Application Support/ontology-development-kit/github/token"
if [ -n "$GH_TOKEN" ]; then
    :   # keep the token supplied by the caller's environment
elif [ -f ../../.github/token.txt ]; then
    GH_TOKEN=$(cat ../../.github/token.txt)
elif [ -f "$xdg_token_file" ]; then
    GH_TOKEN=$(cat "$xdg_token_file")
elif [ -f "$macos_token_file" ]; then
    GH_TOKEN=$(cat "$macos_token_file")
fi

# Resolve the image name, tag, Java options and debug flag, applying defaults
# to whichever of them are unset or empty.
: "${ODK_IMAGE:=odkfull}"
TAG_IN_IMAGE=$(echo $ODK_IMAGE | awk -F: '{ print $2 }')
if test -n "$TAG_IN_IMAGE"
then
    # A tag embedded in ODK_IMAGE ("name:tag") takes precedence over ODK_TAG;
    # ODK_IMAGE is then reduced to the bare image name.
    ODK_TAG=$TAG_IN_IMAGE
    ODK_IMAGE=$(echo $ODK_IMAGE | awk -F: '{ print $1 }')
fi
: "${ODK_TAG:=latest}"
: "${ODK_JAVA_OPTS:=-Xmx20G}"
: "${ODK_DEBUG:=no}"

# Turn every OWLAPI_<NAME>=<value> environment variable into a
# -D<namespace>.<NAME>=<value> Java option appended to ODK_JAVA_OPTS.
# See http://owlcs.github.io/owlapi/apidocs_4/org/semanticweb/owlapi/model/parameters/ConfigurationOptions.html
# for a list of allowed options.
# The loop iterates over the shell's word-split output of `env`, so a value
# containing whitespace is seen as several separate words.
OWLAPI_OPTIONS_NAMESPACE=org.semanticweb.owlapi.model.parameters.ConfigurationOptions
for owlapi_var in $(env | sed -n 's/^OWLAPI_//p')
do
    owlapi_name=${owlapi_var%=*}
    owlapi_value=${owlapi_var#*=}
    ODK_JAVA_OPTS="$ODK_JAVA_OPTS -D$OWLAPI_OPTIONS_NAMESPACE.$owlapi_name=$owlapi_value"
done

# When ODK_DEBUG=yes, announce the image being run and wrap the container
# command in /usr/bin/time so elapsed time and peak memory are reported.
# TIMECMD stays empty otherwise.
TIMECMD=
if [ "$ODK_DEBUG" = yes ]; then
    # $TIMECMD is expanded unquoted on the container command line, so the
    # format string must not contain normal spaces or the shell would
    # tokenize it. Non-breaking spaces (U+00A0, UTF-8 bytes 0xC2 0xA0) are
    # used instead; they are generated explicitly here so they stay visible
    # in the source and survive editors that normalize whitespace.
    nbsp=$(printf '\302\240')
    echo "Running obolibrary/${ODK_IMAGE}:${ODK_TAG} with ${ODK_JAVA_OPTS} of memory for ROBOT and Java-based pipeline steps."
    TIMECMD="/usr/bin/time -f ###${nbsp}DEBUG${nbsp}STATS${nbsp}###\nElapsed${nbsp}time:${nbsp}%E\nPeak${nbsp}memory:${nbsp}%M${nbsp}kb"
fi

# The current directory is mounted at /work, which is also the working
# directory inside the container. Extra binds from ODK_BINDS, when set,
# are appended after a comma.
WORK_DIR=/work
VOLUME_BIND=$PWD:/work

if test -n "$ODK_BINDS"
then
    VOLUME_BIND="$VOLUME_BIND,$ODK_BINDS"
fi

# Run the requested command ("$@") inside the ODK image, through singularity
# when USE_SINGULARITY is set to any value and through docker otherwise.
# $TIMECMD and the *_OPTIONS variables are deliberately left unquoted so that
# they expand to zero or more separate words.
if [ -n "$USE_SINGULARITY" ]; then

    # singularity takes the comma-separated bind list as a single argument
    singularity exec --cleanenv $ODK_SINGULARITY_OPTIONS \
        --env "ROBOT_JAVA_ARGS=$ODK_JAVA_OPTS,JAVA_OPTS=$ODK_JAVA_OPTS" \
        --bind "$VOLUME_BIND" \
        -W $WORK_DIR \
        docker://obolibrary/$ODK_IMAGE:$ODK_TAG $TIMECMD "$@"
else
    # docker needs one -v per bind: turn EVERY comma into " -v " (the /g flag
    # matters once ODK_BINDS itself lists more than one bind).
    # BIND_OPTIONS is word-split below, so paths containing whitespace are
    # not supported in this branch.
    BIND_OPTIONS="-v $(echo $VOLUME_BIND | sed 's/,/ -v /g')"
    docker run $ODK_DOCKER_OPTIONS $BIND_OPTIONS -w $WORK_DIR \
        -e ROBOT_JAVA_ARGS="$ODK_JAVA_OPTS" -e JAVA_OPTS="$ODK_JAVA_OPTS" \
        --rm -ti obolibrary/$ODK_IMAGE:$ODK_TAG $TIMECMD "$@"
fi

# Print a reminder to update the ODK image whenever the command line
# mentions "release" or "update_repo".
case "$@" in
    *release* | *update_repo*)
        printf '%s\n' "Please remember to update your ODK image from time to time: https://oboacademy.github.io/obook/howto/odk-update/."
        ;;
esac
File renamed without changes.
42 changes: 36 additions & 6 deletions bin/medgen2obo.pl → src/medgen2obo.pl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/perl
use strict;

# Vars
my %th = ();
my %rh = ();
my %dh = ();
Expand All @@ -10,6 +11,7 @@

our $PATH = "ftp.ncbi.nlm.nih.gov/pub/medgen";

# Execution
open(F,"gzip -dc $PATH/MGCONSO.RRF.gz|") || die;
while(<F>) {
next if m@^#@;
Expand Down Expand Up @@ -101,17 +103,18 @@
}
print "\n";

my @ids = keys %th;
@ids = sort @ids;
foreach my $id (@ids) {
sub add_triples {
my ($prefix, $id) = @_;

my $h = $th{$id};
print "[Term]\n";
print "id: UMLS:$id\n";
print "id: $prefix:$id\n";
print "name: $h->{name}\n";
foreach my $x (keys %{$h->{xrefs}}) {
$x =~ s@MSH:@MESH:@;
$x =~ s@NCI:@NCIT:@;
$x =~ s@SNOMEDCT_US:@SCTID:@;
# TODO: change these to skos:exactMatch?
print "xref: $x\n";
}
foreach (keys %{$ssh{$id} || {}}) {
Expand All @@ -133,14 +136,41 @@
$tag = 'is_a:';
}
if ($rel eq 'mapped_to') {
$tag = 'equivalent_to:';
# TODO: change these to skos:exactMatch?
$tag = 'equivalent_to:'; # This translates to owl:equivalentClass
# $tag = 'xref:'; # want to get this to translate to skos:exactMatch, but got oboInOwl:hasDbXref instead
}
# Namespaces: the prefix depends on whether the ID is a UMLS CUI (C#), a novel MedGen CUI (CN#), or a MedGen UID (#).
if ($v =~ /^CN\d+/) {
print "$tag MEDGENCUI:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
# If a CUI (starts with 'C'), will be created twice: one for MEDGENCUI, one for UMLS
} elsif ($v =~ /^C\d+/) {
print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
print "$tag MEDGENCUI:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
# UID
} else {
print "$tag MEDGEN:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
}
print "$tag UMLS:$v {source=\"$vh->{$v}\"} ! $th{$v}->{name}\n";
}
}
}
print "\n";
}
my @ids = keys %th;
@ids = sort @ids;
foreach my $id (@ids) {
# Namespaces: the prefix depends on whether the ID is a UMLS CUI (C#), a novel MedGen CUI (CN#), or a MedGen UID (#).
if ($id =~ /^CN\d+/) {
add_triples('MEDGENCUI', $id);
# If a CUI (starts with 'C'), will be created twice: one for MEDGENCUI, one for UMLS
} elsif ($id =~ /^C\d+/) {
add_triples('UMLS', $id);
add_triples('MEDGENCUI', $id);
# UID
} else {
add_triples('MEDGEN', $id);
}
}

exit 0;

Expand Down
Loading

0 comments on commit b9e8bc5

Please sign in to comment.