Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add assertion method for edge presence #102

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@ ddi = {editable = true,git = "https://github.com/ddionrails/ddi.py.git",ref = "m
goodtables = "*"
pandas = "*"
pylint = "*"

[requires]
python_version = "3.6"
networkx = "*"

[pipenv]
allow_prereleases = true
452 changes: 217 additions & 235 deletions Pipfile.lock

Large diffs are not rendered by default.

10,377 changes: 1,821 additions & 8,556 deletions ddionrails/transformations.csv

Large diffs are not rendered by default.

172 changes: 172 additions & 0 deletions lib_py/clean_relations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
"""Create cleaned up relational files

There are two files that define relationships between variables and
between variables and questions.

* generations.csv defines relationships between variables
* logical_variables.csv defines relationships between questions and variables

These relationships are not "clean", they contain relationships with entities
that are not part of the actual data.
The unclean relations are needed to establish the relationship between questions
and variables.
The relationships are transitive.
Before we can throw away old relationships, we have to create the transitive closure
for all relations.
Otherwise we would lose relationships when we remove relationships to old variables.
"""

import copy
from typing import Set

import networkx
import pandas


class VariableGraph:
    """Directed graph of variable-to-variable relations.

    Nodes are ``(study, dataset, version, variable)`` tuples built from the
    ``input_*`` and ``output_*`` columns of the generations table.  Every row
    contributes one edge from its input node to its output node.  After
    filling, the graph is replaced by its transitive closure, so indirect
    relations show up as direct edges.
    """

    # Column order of the frame returned by get_transformations().
    _TRANSFORMATION_COLUMNS = (
        "origin_study_name",
        "origin_dataset_name",
        "origin_variable_name",
        "target_study_name",
        "target_dataset_name",
        "target_variable_name",
    )

    def __init__(self, generations: pandas.DataFrame, variables: Set[tuple], version):
        """Store the inputs; the graph itself is built lazily by ``fill()``.

        Args:
            generations: Table with the columns ``input_study``,
                ``input_dataset``, ``input_version``, ``input_variable`` and
                the matching ``output_*`` columns.
            variables: Set of ``(study, dataset, variable)`` tuples.  Members
                are compared against nodes with the version element removed,
                so plain strings would never match.
            version: Version value used to select origin nodes and to filter
                related nodes.
        """
        self._generations = generations
        self._variables = variables
        self._graph = networkx.DiGraph()
        self._filled = False
        self._version = version
        # Input nodes whose version equals ``version``; collected by fill().
        self._version_variables = set()

    @property
    def graph(self) -> "networkx.DiGraph":
        """Return a copy of the underlying networkx DiGraph.

        The graph is not filled implicitly; call ``fill()`` first.
        """
        return self._graph.copy()

    @property
    def filled(self):
        """Return boolean flag, if graph is already filled."""
        return self._filled

    def fill(self):
        """Fill the graph with the relations defined in the generations table.

        Does nothing when the graph has already been filled.
        """
        if self._filled:
            return
        for _, row in self._generations.iterrows():
            input_node = (
                row["input_study"],
                row["input_dataset"],
                row["input_version"],
                row["input_variable"],
            )
            output_node = (
                row["output_study"],
                row["output_dataset"],
                row["output_version"],
                row["output_variable"],
            )
            # Only input nodes of the requested version act as origins later.
            if input_node[2] == self._version:
                self._version_variables.add(input_node)

            self._graph.add_edge(input_node, output_node)
        # Make indirect relations reachable as direct neighbours.
        self._graph = networkx.algorithms.dag.transitive_closure(self._graph)
        self._filled = True

    def get_transformations(self):
        """Return the relations between variables of the configured version.

        For every input node of the configured version, each neighbour that
        is itself of that version and listed in ``variables`` yields one row.

        Returns:
            pandas.DataFrame with the origin and target study, dataset and
            variable names; empty (but with all columns) if nothing matches.
        """
        self.fill()
        version = self._version

        # Collect rows first and build the frame once: DataFrame.append was
        # removed in pandas 2.0 and copied the whole frame on every call.
        rows = []
        for node in self._version_variables:
            for related_node in self._graph.neighbors(node):
                if self._is_current_variable(related_node, version):
                    rows.append(self._create_transformations_row(node, related_node))

        return pandas.DataFrame(rows, columns=list(self._TRANSFORMATION_COLUMNS))

    def _is_current_variable(self, node, version):
        """Tell whether ``node`` has ``version`` and is a known variable."""
        if node[2] != version:
            return False
        return self._remove_version(node) in self._variables

    @classmethod
    def _create_transformations_row(cls, origin, target):
        """Concatenate the version-less origin and target nodes into one row."""
        return cls._remove_version(origin) + cls._remove_version(target)

    @staticmethod
    def _remove_version(node):
        """Drop the third element (the version) from a node tuple."""
        return node[0:2] + node[3:]

    def __add__(self, graph):
        """Combine with a QuestionsVariablesGraph by delegating to its ``+``.

        Raises:
            TypeError: If ``graph`` is not a QuestionsVariablesGraph.
        """
        if isinstance(graph, QuestionsVariablesGraph):
            return graph + self
        raise TypeError(
            "unsupported operand type(s) for +: "
            f"'{type(self).__name__}' and '{type(graph).__name__}'"
        )


class QuestionsVariablesGraph:
    """Directed graph linking question items to variables.

    Question nodes are ``(study, questionnaire, question, item)`` tuples and
    variable nodes are ``(study, dataset, variable)`` tuples; every row of
    the relations table contributes one question -> variable edge.
    """

    def __init__(self, question_to_variable_relations: pandas.DataFrame):
        """Store the relations table; the graph is built lazily by ``fill()``.

        Args:
            question_to_variable_relations: Table with the columns ``study``,
                ``questionnaire``, ``question``, ``item``, ``dataset`` and
                ``variable``.
        """
        self._filled = False
        self._relations_table = question_to_variable_relations
        self._graph = networkx.DiGraph()

    @property
    def graph(self):
        """Return a copy of the graph, filling it first if necessary."""
        self.fill()
        return self._graph.copy()

    @property
    def filled(self):
        """Return boolean flag, if graph is already filled."""
        return self._filled

    def fill(self):
        """Fill the graph with the relations defined in the relations table.

        Does nothing when the graph has already been filled, so repeated
        calls do not rebuild the edges or recompute the transitive closure.
        """
        if self._filled:
            return
        for _, row in self._relations_table.iterrows():
            question_node = (
                row["study"],
                row["questionnaire"],
                row["question"],
                row["item"],
            )
            variable_node = (
                row["study"],
                row["dataset"],
                row["variable"],
            )
            self._graph.add_edge(question_node, variable_node)
        self._graph = networkx.algorithms.dag.transitive_closure(self._graph)
        self._filled = True

    def __add__(self, graph):
        """Return a new graph composed of this graph and a VariableGraph.

        Both operands are filled if necessary, their graphs are composed and
        the transitive closure of the result is stored on a shallow copy of
        ``self``; neither operand's graph attribute is rebound.

        Raises:
            TypeError: If ``graph`` is not a VariableGraph.
        """
        if not isinstance(graph, VariableGraph):
            raise TypeError(
                "unsupported operand type(s) for +: "
                f"'{type(self).__name__}' and '{type(graph).__name__}'"
            )
        self.fill()
        graph.fill()
        # NOTE(review): variable nodes here are (study, dataset, variable)
        # while VariableGraph nodes also carry a version element, so the two
        # graphs only connect through identical tuples -- confirm the node
        # shapes are meant to line up.
        _graph = networkx.compose(graph.graph, self._graph)
        question_variables_graph = copy.copy(self)
        question_variables_graph._graph = networkx.transitive_closure(_graph)
        return question_variables_graph
4 changes: 4 additions & 0 deletions lib_py/test_data/metadata/generations.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
input_study,input_dataset,input_version,input_variable,output_study,output_dataset,output_version,output_variable
some-study,some-dataset,v30,some-variable,some-study,some-dataset,v35,some-other-variable
some-study,some-dataset,v35,some-other-variable,some-study,some-dataset,v35,final-variable
some-study,some-dataset,v35,some-irrelevant-variable,some-study,some-dataset,v35,final-variable
2 changes: 2 additions & 0 deletions lib_py/test_data/metadata/logical_variables.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
study,questionnaire,question,item,dataset,variable,concept
some-study,some-instrument,1,some-item,some-dataset,some-variable,
3 changes: 3 additions & 0 deletions lib_py/test_data/metadata/variables.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
study,dataset,version,variable,template_id,label,label_de,concept,type,description,description_de,case_sensitive_name,categories,categories_de
some-study,some-dataset,v35,final-variable,,english label,German label,some-concept,,,,
some-study,some-dataset,v35,some-other-variable,,english label,German label,some-concept,,,,
Empty file added lib_py/tests/__init__.py
Empty file.
Loading