Skip to content

Commit

Permalink
Use separate columns for standard observation properties. (#341)
Browse files Browse the repository at this point in the history
  • Loading branch information
keyurva authored Sep 30, 2024
1 parent 772c79e commit fab89e3
Show file tree
Hide file tree
Showing 23 changed files with 287 additions and 201 deletions.
70 changes: 52 additions & 18 deletions simple/stats/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file includes data model classes used across the simple importer."""

from collections import defaultdict
from dataclasses import dataclass
from dataclasses import field
from dataclasses import fields
from dataclasses import is_dataclass
from enum import StrEnum
import json
from typing import Self
from urllib.parse import urlparse

Expand Down Expand Up @@ -227,35 +230,66 @@ def triples(self) -> list[Triple]:
return triples


@dataclass
class ObservationProperties:
unit: str = ""
scaling_factor: str = ""
measurement_method: str = ""
observation_period: str = ""
# All custom properties other than the standard ones above go in this field.
properties: dict[str, str] = field(default_factory=dict)

@classmethod
def new(cls: Self, all_properties: dict[str, str] = {}) -> Self:
unit = all_properties.get(sc.PREDICATE_UNIT, "")
scaling_factor = all_properties.get(sc.PREDICATE_SCALING_FACTOR, "")
measurement_method = all_properties.get(sc.PREDICATE_MEASUREMENT_METHOD, "")
observation_period = all_properties.get(sc.PREDICATE_OBSERVATION_PERIOD, "")
custom_properties = {
p: v
for p, v in all_properties.items()
if p not in sc.STANDARD_OBSERVATION_PROPERTIES
}
return cls(unit, scaling_factor, measurement_method, observation_period,
custom_properties)


@dataclass
class Observation:
entity: str
variable: str
date: str
value: str
provenance: str
properties: dict[str, str] = field(default_factory=dict)

def __post_init__(self):
if not self.properties:
self.properties = {}
# Properties in the DB are stored as gzipped and base64 encoded strings.
# Convert it to json / dict so it is available as a dict in code.
elif isinstance(self.properties, str):
self.properties = base64_decode_and_gunzip_json(self.properties)

def properties_string(self) -> str:
if not self.properties:
return ""
return gzip_and_base64_encode_json(self.properties)
properties: ObservationProperties = field(
default_factory=ObservationProperties.new)

def db_tuple(self):
return (_strip_namespace(self.entity), _strip_namespace(self.variable),
self.date, self.value, _strip_namespace(self.provenance),
self.properties_string())


OBSERVATION_FIELD_NAMES = list(map(lambda x: x.name, fields(Observation)))
_strip_namespace(self.properties.unit),
self.properties.scaling_factor,
_strip_namespace(self.properties.measurement_method),
_strip_namespace(self.properties.observation_period),
json.dumps(self.properties.properties)
if self.properties.properties else "")


def _get_flattened_dataclass_field_names(cls) -> list[str]:
"""Flattens the field names from the specified class and any of it's nested member classes.
In practice, this is used to flatten the fields from the Observation and ObservationProperties classes
which is used to populate test CSVs.
"""
field_names: list[str] = []
for field in fields(cls):
if is_dataclass(field.type):
field_names.extend(_get_flattened_dataclass_field_names(field.type))
else:
field_names.append(field.name)
return field_names


OBSERVATION_FIELD_NAMES = _get_flattened_dataclass_field_names(Observation)


@dataclass
Expand Down
37 changes: 27 additions & 10 deletions simple/stats/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@

MAIN_DC_OUTPUT_DIR = "mainDcOutputDir"

_OBSERVATION_PROPERTY_COLUMNS = [
"unit", "scaling_factor", "measurement_method", "observation_period",
"properties"
]

_CREATE_TRIPLES_TABLE = """
create table if not exists triples (
subject_id varchar(255),
Expand All @@ -78,12 +83,16 @@
date varchar(255),
value varchar(255),
provenance varchar(255),
unit varchar(255),
scaling_factor varchar(255),
measurement_method varchar(255),
observation_period varchar(255),
properties TEXT
);
"""

_DELETE_OBSERVATIONS_STATEMENT = "delete from observations"
_INSERT_OBSERVATIONS_STATEMENT = "insert into observations values(?, ?, ?, ?, ?, ?)"
_INSERT_OBSERVATIONS_STATEMENT = "insert into observations values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"

_CREATE_KEY_VALUE_STORE_TABLE = """
create table if not exists key_value_store (
Expand Down Expand Up @@ -122,11 +131,17 @@

# Schema update statements.

# The properties column was not part of the observations table originally.
# This statement adds the column.
# Various property columns not part of the observations table originally.
# These statements add those columns.
# Neither sqlite nor mysql support an 'if not exists' statement for altering tables universally,
# so the code needs to check for existence separately before applying this statement.
_ALTER_OBSERVATIONS_TABLE_STATEMENT = "alter table observations add column properties TEXT;"
# so the code needs to check for existence separately before applying these statements.
_ALTER_OBSERVATIONS_TABLE_STATEMENTS = [
"alter table observations add column unit varchar(255);",
"alter table observations add column scaling_factor varchar(255);",
"alter table observations add column measurement_method varchar(255);",
"alter table observations add column observation_period varchar(255);",
"alter table observations add column properties text;"
]

OBSERVATIONS_TMCF = """Node: E:Table->E0
typeOf: dcs:StatVarObservation
Expand Down Expand Up @@ -376,14 +391,15 @@ def _schema_updates(self) -> None:
Add any sqlite schema updates here.
Ensure that all schema updates always check if the update is necessary before applying it.
"""
# Add properties column to observations table if it does not exist.
# Add property columns to observations table if it does not exist.
rows = self.fetch_all(_SQLITE_OBSERVATIONS_TABLE_INFO_STATEMENT)
existing_columns = set([columns[1] for columns in rows])
if "properties" not in existing_columns:
logging.info(
"properties column does not exist in the observations table. Altering table to add it."
f"properties column does not exist in the observations table. Altering table to the following property columns: {', '.join(_OBSERVATION_PROPERTY_COLUMNS)}"
)
self.execute(_ALTER_OBSERVATIONS_TABLE_STATEMENT)
for statement in _ALTER_OBSERVATIONS_TABLE_STATEMENTS:
self.cursor.execute(statement)

def _drop_indexes(self) -> None:
for index in _DB_INDEXES:
Expand Down Expand Up @@ -487,9 +503,10 @@ def _schema_updates(self) -> None:
properties_column_exists = rows is not None and len(rows) > 0
if not properties_column_exists:
logging.info(
"properties column does not exist in the observations table. Altering table to add it."
f"properties column does not exist in the observations table. Altering table to the following property columns: {', '.join(_OBSERVATION_PROPERTY_COLUMNS)}"
)
self.execute(_ALTER_OBSERVATIONS_TABLE_STATEMENT)
for statement in _ALTER_OBSERVATIONS_TABLE_STATEMENTS:
self.cursor.execute(statement)

def _drop_indexes(self) -> None:
for index in _DB_INDEXES:
Expand Down
4 changes: 3 additions & 1 deletion simple/stats/observations_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from stats import constants
from stats import schema_constants as sc
from stats.data import Observation
from stats.data import ObservationProperties
from stats.db import Db
from stats.importer import Importer
from stats.nodes import Nodes
Expand Down Expand Up @@ -115,7 +116,8 @@ def _write_observations(self) -> None:
)

provenance = self.nodes.provenance(self.input_file_name).id
obs_props = self.config.observation_properties(self.input_file_name)
obs_props = ObservationProperties.new(
self.config.observation_properties(self.input_file_name))

observations: list[Observation] = []
for _, row in observations_df.iterrows():
Expand Down
10 changes: 10 additions & 0 deletions simple/stats/schema_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,16 @@
PREDICATE_SEARCH_DESCRIPTION = "searchDescription"
PREDICATE_MEASURED_PROPERTY = "measuredProperty"
PREDICATE_DESCRIPTION = "description"
PREDICATE_UNIT = "unit"
PREDICATE_SCALING_FACTOR = "scalingFactor"
PREDICATE_MEASUREMENT_METHOD = "measurementMethod"
PREDICATE_OBSERVATION_PERIOD = "observationPeriod"

# The set of standard observation properties with first class support in our APIs and FE.
STANDARD_OBSERVATION_PROPERTIES: set[str] = {
PREDICATE_UNIT, PREDICATE_SCALING_FACTOR, PREDICATE_MEASUREMENT_METHOD,
PREDICATE_OBSERVATION_PERIOD
}

TYPE_STATISTICAL_VARIABLE = "StatisticalVariable"
TYPE_STATISTICAL_VARIABLE_GROUP = "StatVarGroup"
Expand Down
4 changes: 3 additions & 1 deletion simple/stats/variable_per_row_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from stats import constants
from stats import schema_constants as sc
from stats.data import Observation
from stats.data import ObservationProperties
from stats.db import Db
from stats.importer import Importer
from stats.nodes import Nodes
Expand Down Expand Up @@ -90,7 +91,8 @@ def _map_columns(self):

def _write_observations(self) -> None:
provenance = self.nodes.provenance(self.input_file_name).id
obs_props = self.config.observation_properties(self.input_file_name)
obs_props = ObservationProperties.new(
self.config.observation_properties(self.input_file_name))

observations: list[Observation] = []
for row in self.reader:
Expand Down
11 changes: 11 additions & 0 deletions simple/tests/stats/data_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@

import unittest

from stats.data import _get_flattened_dataclass_field_names
from stats.data import Event
from stats.data import McfNode
from stats.data import Observation
from stats.data import Provenance
from stats.data import StatVar
from stats.data import StatVarGroup
Expand Down Expand Up @@ -193,3 +195,12 @@ def test_mcf_node(self):
memberOf: svg1""".strip()

self.assertEqual(node.to_mcf(), expected)

def test_get_flattened_dataclass_field_names(self):
expected = [
"entity", "variable", "date", "value", "provenance", "unit",
"scaling_factor", "measurement_method", "observation_period",
"properties"
]
self.assertListEqual(_get_flattened_dataclass_field_names(Observation),
expected)
11 changes: 10 additions & 1 deletion simple/tests/stats/db_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from freezegun import freeze_time
from stats.data import Observation
from stats.data import ObservationProperties
from stats.data import Triple
from stats.db import create_db
from stats.db import create_main_dc_config
Expand All @@ -46,7 +47,15 @@
_OBSERVATIONS = [
Observation("e1", "v1", "2023", "123", "p1"),
Observation("e2", "v1", "2023", "456", "p1"),
Observation("e3", "v1", "2023", "789", "p1", {"prop1": "val1"})
Observation("e3",
"v1",
"2023",
"789",
"p1",
properties=ObservationProperties.new({
"unit": "USD",
"prop1": "val1"
}))
]

_KEY_VALUE = ("k1", "v1")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
entity,variable,date,value,provenance,properties
country/USA,Count_CrimeEvent,2023,3,c/p/1,
country/BRA,Count_CrimeEvent,2023,2,c/p/1,
country/CHN,Count_CrimeEvent,2023,2,c/p/1,
entity,variable,date,value,provenance,unit,scaling_factor,measurement_method,observation_period,properties
country/USA,Count_CrimeEvent,2023,3,c/p/1,,,,,
country/BRA,Count_CrimeEvent,2023,2,c/p/1,,,,,
country/CHN,Count_CrimeEvent,2023,2,c/p/1,,,,,
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
entity,variable,date,value,provenance,properties
country/USA,Crime_Event2_Count,2023-11-08,2,c/p/1,
country/BRA,Crime_Event2_Count,2023-11-08,2,c/p/1,
country/CHN,Crime_Event2_Count,2023-11-08,1,c/p/1,
country/CHN,Crime_Event2_Count,2023-09-17,1,c/p/1,
country/USA,Crime_Event2_Count,2023-08-02,1,c/p/1,
entity,variable,date,value,provenance,unit,scaling_factor,measurement_method,observation_period,properties
country/USA,Crime_Event2_Count,2023-11-08,2,c/p/1,,,,,
country/BRA,Crime_Event2_Count,2023-11-08,2,c/p/1,,,,,
country/CHN,Crime_Event2_Count,2023-11-08,1,c/p/1,,,,,
country/CHN,Crime_Event2_Count,2023-09-17,1,c/p/1,,,,,
country/USA,Crime_Event2_Count,2023-08-02,1,c/p/1,,,,,
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
entity,variable,date,value,provenance,properties
country/BRA,var1,2023,0.19,c/p/default,
country/JPN,var1,2023,0.21,c/p/default,
country/CHN,var1,2022,-123.456,c/p/default,
country/BRA,var2,2023,6,c/p/default,
country/JPN,var2,2023,56,c/p/default,
country/USA,var2,2023,66,c/p/default,
entity,variable,date,value,provenance,unit,scaling_factor,measurement_method,observation_period,properties
country/BRA,var1,2023,0.19,c/p/default,,,,,
country/JPN,var1,2023,0.21,c/p/default,,,,,
country/CHN,var1,2022,-123.456,c/p/default,,,,,
country/BRA,var2,2023,6,c/p/default,,,,,
country/JPN,var2,2023,56,c/p/default,,,,,
country/USA,var2,2023,66,c/p/default,,,,,
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
entity,variable,date,value,provenance,properties
country/BRA,var1,2023,0.19,c/p/default,H4sIAAAAAAAC/6tWKs3LLFGyUlAKDXZR0lFQyk8qTi0qSyzJzM8LSC3KzE8ByQUYRirVAgAA6g2KKwAAAA==
country/JPN,var1,2023,0.21,c/p/default,H4sIAAAAAAAC/6tWKs3LLFGyUlAKDXZR0lFQyk8qTi0qSyzJzM8LSC3KzE8ByQUYRirVAgAA6g2KKwAAAA==
country/CHN,var1,2022,-123.456,c/p/default,H4sIAAAAAAAC/6tWKs3LLFGyUlAKDXZR0lFQyk8qTi0qSyzJzM8LSC3KzE8ByQUYRirVAgAA6g2KKwAAAA==
country/BRA,var2,2023,6,c/p/default,H4sIAAAAAAAC/6tWKs3LLFGyUlAKDXZR0lFQyk8qTi0qSyzJzM8LSC3KzE8ByQUYRirVAgAA6g2KKwAAAA==
country/JPN,var2,2023,56,c/p/default,H4sIAAAAAAAC/6tWKs3LLFGyUlAKDXZR0lFQyk8qTi0qSyzJzM8LSC3KzE8ByQUYRirVAgAA6g2KKwAAAA==
country/USA,var2,2023,66,c/p/default,H4sIAAAAAAAC/6tWKs3LLFGyUlAKDXZR0lFQyk8qTi0qSyzJzM8LSC3KzE8ByQUYRirVAgAA6g2KKwAAAA==
entity,variable,date,value,provenance,unit,scaling_factor,measurement_method,observation_period,properties
country/BRA,var1,2023,0.19,c/p/default,USD,,,P1Y,"{""customProperty1"": ""customValue1""}"
country/JPN,var1,2023,0.21,c/p/default,USD,,,P1Y,"{""customProperty1"": ""customValue1""}"
country/CHN,var1,2022,-123.456,c/p/default,USD,,,P1Y,"{""customProperty1"": ""customValue1""}"
country/BRA,var2,2023,6,c/p/default,USD,,,P1Y,"{""customProperty1"": ""customValue1""}"
country/JPN,var2,2023,56,c/p/default,USD,,,P1Y,"{""customProperty1"": ""customValue1""}"
country/USA,var2,2023,66,c/p/default,USD,,,P1Y,"{""customProperty1"": ""customValue1""}"
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
"entityType": "Dummy",
"observationProperties": {
"unit": "USD",
"observationPeriod": "P1Y"
"observationPeriod": "P1Y",
"customProperty1": "customValue1"
}
}
}
Expand Down
Loading

0 comments on commit fab89e3

Please sign in to comment.