Skip to content

Commit

Permalink
Python version of WorkflowFinder
Browse files Browse the repository at this point in the history
New "finders" structure
ToolFinder page structure update

Thanks @uwwint
  • Loading branch information
supernord committed Aug 23, 2022
1 parent e07e4cf commit 78a550a
Show file tree
Hide file tree
Showing 13 changed files with 2,261 additions and 2,056 deletions.
790 changes: 390 additions & 400 deletions docs/2_1_workflows.html

Large diffs are not rendered by default.

1,308 changes: 658 additions & 650 deletions docs/2_tools.html

Large diffs are not rendered by default.

787 changes: 387 additions & 400 deletions docs/5_attributions.html

Large diffs are not rendered by default.

784 changes: 387 additions & 397 deletions docs/index.html

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions finders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .common import Dataprovider
from .toolfinder import Tool, ToolDB, ToolMatrixDataProvider, ZeusDataProvider, MagnusDataProvider, QriscloudDataProvider, GadiDataProvider, if89DataProvider, GalaxyDataProvider, BiotoolsDataProvider, SetonixDataProvider
from .workflowfinder import Workflow, WorkflowDB, WorkflowHubSpaceDataProvider
163 changes: 163 additions & 0 deletions finders/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import datetime
from enum import Enum
from abc import abstractmethod
import pandas as pd
from typing import List
import numpy as np

class Dataprovider:
    """
    Class representing a data source, which enriches information about a tool.

    Subclasses are expected to override ``_query_remote`` (fill
    ``available_data``) and ``_render`` (turn one stored record into a dict).
    NOTE: the class does not use an ABC metaclass, so ``@abstractmethod`` is
    documentation only here; the base class can still be instantiated.
    """

    class FIELD_NAMES(Enum):
        """Field identifiers shared by all data providers."""

        # toolfinder
        REPOSITORY_URL = "FIELD_NAMES.REPOSITORY_URL"
        NAME = "FIELD_NAMES.NAME"
        TOOL_IDENTIFIER = "FIELD_NAMES.TOOL_IDENTIFIER"
        BIOTOOLS_ID = "FIELD_NAMES.BIOTOOLS_ID"
        INCLUSION = "FIELD_NAMES.INCLUSION"
        BIOCOMMONS_DOCUMENTATION_DESCRIPTION = "FIELD_NAMES.BIOCOMMONS_DOCUMENTATION_DESCRIPTION"
        BIOCOMMONS_DOCUMENTATION_LINK = "FIELD_NAMES.BIOCOMMONS_DOCUMENTATION_LINK"
        DESCRIPTION = "FIELD_NAMES.DESCRIPTION"
        LICENSE = "FIELD_NAMES.LICENSE"
        EDAM_TOPICS = "FIELD_NAMES.EDAM_TOPICS"
        PUBLICATIONS = "FIELD_NAMES.PUBLICATIONS"
        GALAXY_AUSTRALIA_LAUNCH_LINK = "FIELD_NAMES.GALAXY_AUSTRALIA_LAUNCH_LINK"
        NCI_GADI_VERSION = "FIELD_NAMES.NCI_GADI_VERSION"
        NCI_IF89_VERSION = "FIELD_NAMES.NCI_IF89_VERSION"
        PAWSEY_ZEUS_VERSION = "FIELD_NAMES.PAWSEY_ZEUS_VERSION"
        PAWSEY_MAGNUS_VERSION = "FIELD_NAMES.PAWSEY_MAGNUS_VERSION"
        QRISCLOUD_VERSION = "FIELD_NAMES.QRISCLOUD_VERSION"
        PAWSEY_SETONIX_VERSION = "FIELD_NAMES.PAWSEY_SETONIX_VERSION"
        # workflowfinder
        UPDATED_AT = "FIELD_NAMES.UPDATED_AT"
        PROJECTS = "FIELD_NAMES.PROJECTS"
        TITLE = "FIELD_NAMES.TITLE"
        TAGS = "FIELD_NAMES.TAGS"
        DOI = "FIELD_NAMES.DOI"
        EDAM_OPS = "FIELD_NAMES.EDAM_OPS"
        EDAM_TOP = "FIELD_NAMES.EDAM_TOP"
        URL = "FIELD_NAMES.URL"
        LAUNCH_LINK = "FIELD_NAMES.LAUNCH_LINK"
        GUIDE_LINK = "FIELD_NAMES.GUIDE_LINK"

    def __init__(self):
        self.identifier = ""
        # available_data is keyed by the toolid and should contain all
        # information received by this data provider
        self.available_data = {}
        self.unmatched_data = {}
        # datetime.min makes the very first query_remote() call hit the source
        self.last_queried = datetime.datetime.min

    def _save_unmatched_(self, key, data):
        """store *data* under *key* in ``unmatched_data``"""
        self.unmatched_data[key] = data

    def query_remote(self):
        """run ``_query_remote`` if the data is stale and record the query time"""
        if self._needs_querying():
            self._query_remote()
            self.last_queried = datetime.datetime.now()

    @abstractmethod
    def _query_remote(self):
        """
        _query_remote queries a remote data source, transforms the information
        received into internal identifiers to be later joined onto all
        available tools.
        """

    def _needs_querying(self):
        """needs re-querying if data is more than 7 days old"""
        age = datetime.datetime.now() - self.last_queried
        return age > datetime.timedelta(days=7)

    def get_information(self, uid):
        """return the record stored for *uid*, or ``None`` if there is none"""
        return self.available_data.get(uid)

    def __eq__(self, other):
        """Data provider are keyed by id"""
        if isinstance(other, Dataprovider):
            return self.identifier == other.identifier
        return False

    def __hash__(self):
        """
        hash on the identifier so that equal providers hash alike

        Defining ``__eq__`` alone would make instances unhashable. The
        identifier must not change while the provider sits in a set or is
        used as a dict key.
        """
        return hash(self.identifier)

    def render(self, tool):
        """
        public wrapper for _render

        Asks *tool* for the data it holds for this provider and renders it;
        returns an empty dict when the tool has no data for this provider.
        """
        data = tool.get_data(self)
        if data is None:
            return {}
        return self._render(data)

    @abstractmethod
    def _render(self, data):
        """render information"""

    def get_alt_ids(self):
        """return alternate-id mappings known to this provider (none by default)"""
        return {}


class DB:
    """represents the database for all known tools"""

    def __init__(self):
        # entries keyed by their id; filled by subclasses
        self.db = {}
        # providers in the order they were added
        self.dataprovider: List["Dataprovider"] = []
        # provider name -> {alternate id -> value returned by get_id_from_alt}
        self.alternateids = {}

    def _enrich(self, dataprovider: "Dataprovider"):
        """enrich the DB with what the dataprovider has queried from its datasource"""
        alt_ids = dataprovider.get_alt_ids()
        # https://stackoverflow.com/a/26853961 & https://www.python.org/dev/peps/pep-0584/
        # NOTE(review): this is a shallow merge - if two providers report the
        # same top-level key, the later mapping replaces the earlier one.
        self.alternateids = {**self.alternateids, **alt_ids}

        dataprovider.query_remote()

        for tool in self.db.values():
            tool.add_data(dataprovider)

    def get_id_from_alt(self, provider: str, unique_id: str):
        """
        look up what is registered for *unique_id* under *provider*

        The id is used exactly as given (no case normalisation). Returns an
        empty list when the provider or the id is unknown.
        """
        return self.alternateids.get(provider, {}).get(unique_id, [])

    def get_unmatched_ids(self, dp: "Dataprovider"):
        """
        return ``(dp.unmatched_data, ids)`` where *ids* is the set of keys in
        ``dp.available_data`` that are not keys of this DB
        """
        unmatched_ids = set(dp.available_data.keys()).difference(self.db)
        return dp.unmatched_data, unmatched_ids

    def add_provider(self, provider: "Dataprovider"):
        """add a dataprovider to the list of providers"""
        self.dataprovider.append(provider)
        self._enrich(provider)

    def get_data(self):
        """
        build one row per DB entry by merging what every provider renders

        Providers are applied in the order they were added, so on a key clash
        the provider added later wins.
        """
        rows = []
        for tool in self.db.values():
            row = {}
            for dp in self.dataprovider:
                row.update(dp.render(tool))
            rows.append(row)
        return pd.DataFrame(rows)

    @abstractmethod
    def get_formatted_table(self) -> pd.DataFrame:
        """return the presentation-ready table; to be provided by subclasses"""
150 changes: 4 additions & 146 deletions toolfinder/__init__.py → finders/toolfinder.py
Original file line number Diff line number Diff line change
@@ -1,99 +1,9 @@
from abc import abstractmethod
import datetime
from enum import Enum

import numpy as np
from .common import Dataprovider, DB
import pandas as pd
from typing import List
import requests
import json
import itertools

class Dataprovider:
"""
Class representing a data source, which enriches information about a tool.
"""
class FIELD_NAMES(Enum):
REPOSITORY_URL = "FIELD_NAMES.REPOSITORY_URL"
NAME = "FIELD_NAMES.NAME"
TOOL_IDENTIFIER = "FIELD_NAMES.TOOL_IDENTIFIER"
BIOTOOLS_ID = "FIELD_NAMES.BIOTOOLS_ID"
INCLUSION = "FIELD_NAMES.INCLUSION"
BIOCOMMONS_DOCUMENTATION_DESCRIPTION = "FIELD_NAMES.BIOCOMMONS_DOCUMENTATION_DESCRIPTION"
BIOCOMMONS_DOCUMENTATION_LINK = "FIELD_NAMES.BIOCOMMONS_DOCUMENTATION_LINK"
DESCRIPTION = "FIELD_NAMES.DESCRIPTION"
LICENSE = "FIELD_NAMES.LICENSE"
EDAM_TOPICS = "FIELD_NAMES.EDAM_TOPICS"
PUBLICATIONS = "FIELD_NAMES.PUBLICATIONS"
GALAXY_AUSTRALIA_LAUNCH_LINK = "FIELD_NAMES.GALAXY_AUSTRALIA_LAUNCH_LINK"
NCI_GADI_VERSION = "FIELD_NAMES.NCI_GADI_VERSION"
NCI_IF89_VERSION = "FIELD_NAMES.NCI_IF89_VERSION"
PAWSEY_ZEUS_VERSION = "FIELD_NAMES.PAWSEY_ZEUS_VERSION"
PAWSEY_MAGNUS_VERSION = "FIELD_NAMES.PAWSEY_MAGNUS_VERSION"
QRISCLOUD_VERSION = "FIELD_NAMES.QRISCLOUD_VERSION"
PAWSEY_SETONIX_VERSION = "FIELD_NAMES.PAWSEY_SETONIX_VERSION"

def __init__(self):
self.identifier = ""
"""available_data is keyed by the toolid and should contain all information received by this data provider"""
self.available_data = {}
self.unmatched_data = {}
self.last_queried = datetime.datetime.min

def _save_unmatched_(self, key, data):
self.unmatched_data[key] = data

def query_remote(self):
if self._needs_querying():
self._query_remote()
self.last_queried = datetime.datetime.now()

"""
_query_remote queries a remote data source, transforms the information received into internal identifiers to be later joined onto all available tools.
"""

@abstractmethod
def _query_remote(self):
pass

"""needs re-querying if data is more than 7 days old"""

def _needs_querying(self):
return (datetime.datetime.now() - self.last_queried) > datetime.timedelta(days=7)

"""get information """

def get_information(self, uid):
if uid in self.available_data:
return self.available_data[uid]
else:
return None

"""Data provider are keyed by id"""

def __eq__(self, other):
if isinstance(other, Dataprovider):
return self.identifier == other.identifier
else:
return False

"""public wrapper for _render"""

def render(self, tool):
data = tool.get_data(self)
if data is not None:
return self._render(data)
return {}

"""render information"""

@abstractmethod
def _render(self, data):
pass

def get_alt_ids(self):
return{}

import numpy as np

class ToolMatrixDataProvider(Dataprovider):
ID_BIO_TOOLS = "bio.tools"
Expand Down Expand Up @@ -130,7 +40,6 @@ def get_alt_ids(self):
retval[ToolMatrixDataProvider.ID_BIO_TOOLS][row.biotoolsID].append(toolID)
return retval


class ZeusDataProvider(Dataprovider):
def __init__(self, filename):
super().__init__()
Expand All @@ -154,7 +63,6 @@ def _query_remote(self):
def _render(self, data):
    """wrap *data* in a one-entry dict keyed by ``FIELD_NAMES.PAWSEY_ZEUS_VERSION``"""
    return {Dataprovider.FIELD_NAMES.PAWSEY_ZEUS_VERSION: data}


class MagnusDataProvider(Dataprovider):
def __init__(self, filename):
super().__init__()
Expand Down Expand Up @@ -424,65 +332,16 @@ def __eq__(self, other):
return False


class ToolDB:
class ToolDB(DB):
"""represents the database for all known tools"""

def __init__(self, tool_matrix_file):
    """
    create one ``Tool`` entry per ``toolID`` found in the tool matrix

    :param tool_matrix_file: spreadsheet handed to ``pandas.read_excel``;
        column labels are read from its third row (``header=2``)
    """
    # db, dataprovider and alternateids are set up by the base class
    super().__init__()
    data = pd.read_excel(tool_matrix_file, header=2)
    # Rows without a toolID are read as NaN. Skip them up front instead of
    # deleting a NaN key afterwards, which raised KeyError whenever the sheet
    # had no such row.
    for tool_id in data.toolID.dropna():
        self.db[tool_id] = Tool(tool_id)

"""enrich the DB with what the dataprovider has queried from its datasource"""

def _enrich(self, dataprovider: Dataprovider):

alt_ids = dataprovider.get_alt_ids()
# https://stackoverflow.com/a/26853961 & https://www.python.org/dev/peps/pep-0584/
self.alternateids = {**self.alternateids, **alt_ids}

dataprovider.query_remote()

for i in self.db:
tool = self.db[i]
tool.add_data(dataprovider)

def get_id_from_alt(self, provider:str, unique_id:str):
#unique_id = unique_id.lower()
if provider in self.alternateids:
if unique_id in self.alternateids[provider]:
return self.alternateids[provider][unique_id]
return []

def get_unmatched_ids(self, dp:Dataprovider):
a = dp.unmatched_data
unmatched_ids = set(dp.available_data.keys()).difference(self.db.keys())
return(a, unmatched_ids)


"""add a dataprovider to the list of providers"""

def add_provider(self, provider: Dataprovider):
self.dataprovider.append(provider)
self._enrich(provider)

def get_data(self):
data = []
for i in self.db:
line = []
for dp in self.dataprovider:
line.append(dp.render(self.db[i]))
result = {}
for element in line:
result.update(element)
data.append(result)

return pd.DataFrame(data)

def get_formatted_table(self):
import urllib
tool_table = self.get_data()
Expand Down Expand Up @@ -544,4 +403,3 @@ def get_formatted_table(self):
tool_line.append("")
formatted_list.append(tool_line)
return pd.DataFrame(formatted_list, columns=["Tool / workflow name","bio.tools link","Tool identifier (module name / bio.tools ID / placeholder)","Description","Topic (EDAM, if available)","Publications","BioContainers link","License","BioCommons Documentation","Galaxy Australia","NCI (Gadi)","NCI (if89)","Pawsey (Zeus)","Pawsey (Magnus)","Pawsey (Setonix)","QRIScloud / UQ-RCC (Flashlite, Awoonga, Tinaroo)"])

Loading

0 comments on commit 78a550a

Please sign in to comment.