Skip to content

Commit

Permalink
Add support for main dc schema.mcf. (#275)
Browse files Browse the repository at this point in the history
  • Loading branch information
keyurva authored Jan 30, 2024
1 parent 5f5668f commit 613c979
Show file tree
Hide file tree
Showing 11 changed files with 186 additions and 10 deletions.
1 change: 1 addition & 0 deletions run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ function run_main_dc_sample {

function run_all_samples {
run_sample
cd ..
run_main_dc_sample
}

Expand Down
56 changes: 56 additions & 0 deletions simple/sample/main_dc_output/schema.mcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
Node: dcid:custom/g/group_1
typeOf: StatVarGroup
name: "Parent Group"
specializationOf: dc/g/Root

Node: dcid:custom/g/group_2
typeOf: StatVarGroup
name: "Child Group 1"
specializationOf: custom/g/group_1

Node: dcid:custom/g/group_3
typeOf: StatVarGroup
name: "Child Group 2"
specializationOf: custom/g/group_1

Node: dcid:custom/g/Root
typeOf: StatVarGroup
name: "Custom Variables"
specializationOf: dc/g/Root

Node: dcid:var1
typeOf: StatisticalVariable
name: "Good var1 name"
description: "Good var1 description"
memberOf: custom/g/group_2
populationType: schema:Thing
measuredProperty: var1

Node: dcid:var2
typeOf: StatisticalVariable
name: "Good var2 name"
memberOf: custom/g/group_3
populationType: schema:Thing
measuredProperty: var2

Node: dcid:Variable_1
typeOf: StatisticalVariable
name: "Variable 1"
memberOf: custom/g/group_1
populationType: schema:Thing
measuredProperty: Variable_1

Node: dcid:Variable_2
typeOf: StatisticalVariable
name: "Variable 2"
memberOf: custom/g/Root
populationType: schema:Thing
measuredProperty: Variable_2

Node: dcid:Crime_Count
typeOf: StatisticalVariable
name: "Crime Count"
description: "Number of crimes"
memberOf: custom/g/Root
populationType: schema:Thing
measuredProperty: Crime_Count
10 changes: 10 additions & 0 deletions simple/sample/output/tables/triples.csv
Original file line number Diff line number Diff line change
Expand Up @@ -345,28 +345,38 @@ var1,memberOf,custom/g/group_2,""
var1,includedIn,c/p/1,""
var1,includedIn,c/p/2,""
var1,includedIn,c/s/1,""
var1,populationType,schema:Thing,""
var1,measuredProperty,var1,""
var2,typeOf,StatisticalVariable,""
var2,name,"","Good var2 name"
var2,memberOf,custom/g/group_3,""
var2,includedIn,c/p/1,""
var2,includedIn,c/p/2,""
var2,includedIn,c/s/1,""
var2,populationType,schema:Thing,""
var2,measuredProperty,var2,""
Variable_1,typeOf,StatisticalVariable,""
Variable_1,name,"","Variable 1"
Variable_1,memberOf,custom/g/group_1,""
Variable_1,includedIn,c/p/1,""
Variable_1,includedIn,c/s/1,""
Variable_1,populationType,schema:Thing,""
Variable_1,measuredProperty,Variable_1,""
Variable_2,typeOf,StatisticalVariable,""
Variable_2,name,"","Variable 2"
Variable_2,memberOf,custom/g/Root,""
Variable_2,includedIn,c/p/1,""
Variable_2,includedIn,c/s/1,""
Variable_2,populationType,schema:Thing,""
Variable_2,measuredProperty,Variable_2,""
Crime_Count,typeOf,StatisticalVariable,""
Crime_Count,name,"","Crime Count"
Crime_Count,description,"","Number of crimes"
Crime_Count,memberOf,custom/g/Root,""
Crime_Count,includedIn,c/p/1,""
Crime_Count,includedIn,c/s/1,""
Crime_Count,populationType,schema:Thing,""
Crime_Count,measuredProperty,Crime_Count,""
CrimeEvent,typeOf,Class,""
CrimeEvent,subClassOf,Event,""
CrimeEvent,name,"",CrimeEvent
Expand Down
46 changes: 41 additions & 5 deletions simple/stats/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,19 @@
_PREDICATE_SUB_CLASS_OF = "subClassOf"
_PREDICATE_OBSERVATION_DATE = "observationDate"
_PREDICATE_LOCATION = "location"
_PREDICATE_POPULATION_TYPE = "populationType"
_PREDICATE_MEASURED_PROPERTY = "measuredProperty"

_STATISTICAL_VARIABLE = "StatisticalVariable"
_STAT_VAR_GROUP = "StatVarGroup"
STATISTICAL_VARIABLE = "StatisticalVariable"
STAT_VAR_GROUP = "StatVarGroup"
_SOURCE = "Source"
_PROVENANCE = "Provenance"
_PROPERTY = "Property"
_CLASS = "Class"
_EVENT = "Event"
_THING = "schema:Thing"

_MCF_PREDICATE_BLOCKLIST = set([_PREDICATE_INCLUDED_IN])


@dataclass
Expand Down Expand Up @@ -69,8 +74,8 @@ def add_provenance(self, provenance: "Provenance") -> "StatVarGroup":

def triples(self) -> list[Triple]:
triples: list[Triple] = []
triples.append(
Triple(self.id, _PREDICATE_TYPE_OF, object_id=_STAT_VAR_GROUP))
triples.append(Triple(self.id, _PREDICATE_TYPE_OF,
object_id=STAT_VAR_GROUP))
triples.append(Triple(self.id, _PREDICATE_NAME, object_value=self.name))
triples.append(
Triple(self.id, _PREDICATE_SPECIALIZATION_OF, object_id=self.parent_id))
Expand Down Expand Up @@ -107,7 +112,7 @@ def add_provenance(self, provenance: "Provenance") -> "StatVar":
def triples(self) -> list[Triple]:
triples: list[Triple] = []
triples.append(
Triple(self.id, _PREDICATE_TYPE_OF, object_id=_STATISTICAL_VARIABLE))
Triple(self.id, _PREDICATE_TYPE_OF, object_id=STATISTICAL_VARIABLE))
triples.append(Triple(self.id, _PREDICATE_NAME, object_value=self.name))
if self.description:
triples.append(
Expand All @@ -122,6 +127,10 @@ def triples(self) -> list[Triple]:
for source_id in self.source_ids:
triples.append(
Triple(self.id, _PREDICATE_INCLUDED_IN, object_id=source_id))
triples.append(Triple(self.id, _PREDICATE_POPULATION_TYPE,
object_id=_THING))
triples.append(
Triple(self.id, _PREDICATE_MEASURED_PROPERTY, object_id=self.id))
return triples


Expand Down Expand Up @@ -289,3 +298,30 @@ def __post_init__(self):
raise ValueError(f"invalid period: {self.period}")
if self.method not in AggregationMethod._member_map_.values():
raise ValueError(f"invalid method: {self.method}")


@dataclass
class McfNode:
id: str
node_type: str = ""
properties: dict[str, str] = field(default_factory=lambda: defaultdict(dict))

def add_triple(self, triple: Triple) -> Self:
if triple.predicate in _MCF_PREDICATE_BLOCKLIST:
return self

if triple.predicate == _PREDICATE_TYPE_OF:
self.node_type = triple.object_id

if triple.object_id:
self.properties[triple.predicate] = triple.object_id
elif triple.object_value:
self.properties[triple.predicate] = f'"{triple.object_value}"'

return self

def to_mcf(self) -> str:
parts: list[str] = []
parts.append(f"Node: dcid:{self.id}")
parts.extend([f"{p}: {v}" for p, v in self.properties.items()])
return "\n".join(parts)
27 changes: 24 additions & 3 deletions simple/stats/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@
import pandas as pd
from pymysql.connections import Connection
from pymysql.cursors import Cursor
from stats.data import McfNode
from stats.data import Observation
from stats.data import STAT_VAR_GROUP
from stats.data import STATISTICAL_VARIABLE
from stats.data import Triple
from util.filehandler import create_file_handler
from util.filehandler import is_gcs_path
Expand Down Expand Up @@ -106,6 +109,9 @@
value: C:Table->value"""

OBSERVATIONS_TMCF_FILE_NAME = "observations.tmcf"
SCHEMA_MCF_FILE_NAME = "schema.mcf"

MCF_NODE_TYPES_ALLOWLIST = set([STATISTICAL_VARIABLE, STAT_VAR_GROUP])


class ImportStatus(Enum):
Expand Down Expand Up @@ -143,10 +149,12 @@ def __init__(self, db_params: dict) -> None:
assert MAIN_DC_OUTPUT_DIR in db_params

self.output_dir_fh = create_file_handler(db_params[MAIN_DC_OUTPUT_DIR])
# dcid to node dict
self.nodes: dict[str, McfNode] = {}

def insert_triples(self, triples: list[Triple]):
# TODO: generate schema mcf
pass
for triple in triples:
self._add_triple(triple)

def insert_observations(self, observations: list[Observation],
input_file_name: str):
Expand All @@ -161,9 +169,22 @@ def insert_import_info(self, status: ImportStatus):
pass

def commit_and_close(self):
# MCF
filtered = filter(lambda node: node.node_type in MCF_NODE_TYPES_ALLOWLIST,
self.nodes.values())
mcf = "\n\n".join(map(lambda node: node.to_mcf(), filtered))
self.output_dir_fh.make_file(SCHEMA_MCF_FILE_NAME).write_string(mcf)

# TMCF
self.output_dir_fh.make_file(OBSERVATIONS_TMCF_FILE_NAME).write_string(
OBSERVATIONS_TMCF)
pass

def _add_triple(self, triple: Triple):
node = self.nodes.get(triple.subject_id)
if not node:
node = McfNode(triple.subject_id)
self.nodes[triple.subject_id] = node
node.add_triple(triple)


class SqlDb(Db):
Expand Down
33 changes: 33 additions & 0 deletions simple/tests/stats/data_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import unittest

from stats.data import Event
from stats.data import McfNode
from stats.data import Provenance
from stats.data import StatVar
from stats.data import StatVarGroup
Expand Down Expand Up @@ -47,6 +48,8 @@ def test_sv_triples_basic(self):
expected = [
Triple(SV_ID1, "typeOf", object_id="StatisticalVariable"),
Triple(SV_ID1, "name", object_value=SV_NAME1),
Triple(SV_ID1, "populationType", object_id="schema:Thing"),
Triple(SV_ID1, "measuredProperty", object_id=SV_ID1),
]
self.assertListEqual(result, expected)

Expand All @@ -57,6 +60,8 @@ def test_sv_triples_with_description(self):
Triple(SV_ID1, "typeOf", object_id="StatisticalVariable"),
Triple(SV_ID1, "name", object_value=SV_NAME1),
Triple(SV_ID1, "description", object_value=SV_DESCRIPTION1),
Triple(SV_ID1, "populationType", object_id="schema:Thing"),
Triple(SV_ID1, "measuredProperty", object_id=SV_ID1),
]
self.assertListEqual(result, expected)

Expand All @@ -66,6 +71,8 @@ def test_sv_triples_with_nl_sentences(self):
expected = [
Triple(SV_ID1, "typeOf", object_id="StatisticalVariable"),
Triple(SV_ID1, "name", object_value=SV_NAME1),
Triple(SV_ID1, "populationType", object_id="schema:Thing"),
Triple(SV_ID1, "measuredProperty", object_id=SV_ID1),
]
self.assertListEqual(result, expected)

Expand All @@ -76,6 +83,8 @@ def test_sv_triples_with_group_id(self):
Triple(SV_ID1, "typeOf", object_id="StatisticalVariable"),
Triple(SV_ID1, "name", object_value=SV_NAME1),
Triple(SV_ID1, "memberOf", object_id=SVG_ID1),
Triple(SV_ID1, "populationType", object_id="schema:Thing"),
Triple(SV_ID1, "measuredProperty", object_id=SV_ID1),
]
self.assertListEqual(result, expected)

Expand Down Expand Up @@ -115,6 +124,8 @@ def test_sv_triples_all(self):
Triple(SV_ID1, "name", object_value=SV_NAME1),
Triple(SV_ID1, "description", object_value=SV_DESCRIPTION1),
Triple(SV_ID1, "memberOf", object_id=SVG_ID1),
Triple(SV_ID1, "populationType", object_id="schema:Thing"),
Triple(SV_ID1, "measuredProperty", object_id=SV_ID1),
]
self.assertListEqual(result, expected)

Expand Down Expand Up @@ -148,3 +159,25 @@ def test_event_triples(self):
Triple(EVENT_ID1, EVENT_PROP2_TYPE, object_value=EVENT_PROP2_VALUE1)
]
self.assertListEqual(result, expected)

def test_mcf_node(self):
triples: list[Triple] = [
Triple("sv1", "typeOf", object_id="StatisticalVariable"),
Triple("sv1", "name", object_value="sv name"),
Triple("sv1", "description", object_value="sv desc"),
Triple("sv1", "memberOf", object_id="svg1"),
Triple("sv1", "includedIn", object_id="prov1")
]

node = McfNode("sv1")
for triple in triples:
node.add_triple(triple)

expected = """
Node: dcid:sv1
typeOf: StatisticalVariable
name: "sv name"
description: "sv desc"
memberOf: svg1""".strip()

self.assertEqual(node.to_mcf(), expected)
8 changes: 6 additions & 2 deletions simple/tests/stats/db_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ def _compare_files(test: unittest.TestCase, output_path, expected_path):


_TRIPLES = [
Triple("sub1", "pred1", object_id="objid1"),
Triple("sub2", "pred2", object_value="objval1")
Triple("sub1", "typeOf", object_id="StatisticalVariable"),
Triple("sub1", "pred1", object_value="objval1")
]

_OBSERVATIONS = [
Expand Down Expand Up @@ -92,6 +92,8 @@ def test_main_dc_db(self):
"observations.csv")
tmcf_file = os.path.join(temp_dir, "observations.tmcf")
expected_tmcf_file = os.path.join(_EXPECTED_DIR, "observations.tmcf")
mcf_file = os.path.join(temp_dir, "schema.mcf")
expected_mcf_file = os.path.join(_EXPECTED_DIR, "schema.mcf")

db = create_db(create_main_dc_config(temp_dir))
db.insert_triples(_TRIPLES)
Expand All @@ -102,10 +104,12 @@ def test_main_dc_db(self):
if is_write_mode():
shutil.copy(observations_file, expected_observations_file)
shutil.copy(tmcf_file, expected_tmcf_file)
shutil.copy(mcf_file, expected_mcf_file)
return

_compare_files(self, observations_file, expected_observations_file)
_compare_files(self, tmcf_file, expected_tmcf_file)
_compare_files(self, mcf_file, expected_mcf_file)

@mock.patch.dict(os.environ, {})
def test_get_cloud_sql_config_from_env_empty(self):
Expand Down
3 changes: 3 additions & 0 deletions simple/tests/stats/test_data/db/expected/schema.mcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Node: dcid:sub1
typeOf: StatisticalVariable
pred1: "objval1"
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ Count_CrimeEvent,name,,Count_CrimeEvent
Count_CrimeEvent,memberOf,custom/g/Root,
Count_CrimeEvent,includedIn,c/p/1,
Count_CrimeEvent,includedIn,c/s/1,
Count_CrimeEvent,populationType,schema:Thing,
Count_CrimeEvent,measuredProperty,Count_CrimeEvent,
CrimeEvent,typeOf,Class,
CrimeEvent,subClassOf,Event,
CrimeEvent,name,,Crime Event
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ Crime_Event2_Count,name,,Number of Crime2 Events
Crime_Event2_Count,memberOf,custom/g/Root,
Crime_Event2_Count,includedIn,c/p/1,
Crime_Event2_Count,includedIn,c/s/1,
Crime_Event2_Count,populationType,schema:Thing,
Crime_Event2_Count,measuredProperty,Crime_Event2_Count,
CrimeEvent2,typeOf,Class,
CrimeEvent2,subClassOf,Event,
CrimeEvent2,name,,CrimeEvent2
Expand Down
Loading

0 comments on commit 613c979

Please sign in to comment.