Skip to content

Commit

Permalink
Use column names conforming to a valid id pattern as SV ids. (#237)
Browse files Browse the repository at this point in the history
  • Loading branch information
keyurva authored Oct 24, 2023
1 parent 30281a9 commit 4a74f91
Show file tree
Hide file tree
Showing 11 changed files with 54 additions and 41 deletions.
2 changes: 1 addition & 1 deletion simple/sample/output/observations_countries.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dcid,date,custom/statvar_1,custom/statvar_2
dcid,date,var1,var2
country/AFG,2023,0.19,6
country/YEM,2023,0.21,56
country/AGO,2023,0.29,6
Expand Down
2 changes: 1 addition & 1 deletion simple/sample/output/observations_geoids.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
dcid,date,custom/statvar_3,custom/statvar_4
dcid,date,custom/statvar_1,custom/statvar_2
geoId/01,2021,555,666
geoId/122,2022,321,123456
2 changes: 1 addition & 1 deletion simple/sample/output/observations_latlng.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
dcid,date,custom/statvar_1,custom/statvar_2
dcid,date,var1,var2
country/USA,2021,555,666
country/IND,2022,321,123
2 changes: 1 addition & 1 deletion simple/sample/output/observations_powerplants.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dcid,date,custom/statvar_1,custom/statvar_2
dcid,date,var1,var2
dc/000qxlm93vn93,2023,0.19,6
dc/5c7tz3lbln3p,2023,0.21,56
dc/8zmh7ctlkbsc4,2023,0.29,6
Expand Down
2 changes: 1 addition & 1 deletion simple/sample/output/observations_s2cells.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
dcid,date,custom/statvar_1,custom/statvar_2
dcid,date,var1,var2
s2CellId/0x80982b0000000000,2021,555,666
s2CellId/0x3be7c90000000000,2022,321,123
2 changes: 1 addition & 1 deletion simple/sample/output/observations_wikidataids.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
dcid,date,custom/statvar_1,custom/statvar_2
dcid,date,var1,var2
country/USA,2021,555,666
country/IND,2022,321,123
2 changes: 1 addition & 1 deletion simple/sample/output/process/debug_resolve_powerplants.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
input,dcid,link
BAZ BAR,*UNRESOLVED*,
FOO BAR,*UNRESOLVED*,
BAZ BAR,*UNRESOLVED*,
Suzlon Project,dc/000qxlm93vn93,https://datacommons.org/browser/dc/000qxlm93vn93
Crete Energy Venture,dc/5c7tz3lbln3p,https://datacommons.org/browser/dc/5c7tz3lbln3p
Watchtower Educational Center,dc/8zmh7ctlkbsc4,https://datacommons.org/browser/dc/8zmh7ctlkbsc4
Expand Down
28 changes: 14 additions & 14 deletions simple/sample/output/process/report.json
Original file line number Diff line number Diff line change
@@ -1,37 +1,37 @@
{
"status": "SUCCESS",
"startTime": "2023-10-23 14:43:10.816166",
"lastUpdate": "2023-10-23 14:43:17.797630",
"startTime": "2023-10-23 17:54:52.376362",
"lastUpdate": "2023-10-23 17:55:02.055825",
"importFiles": {
"countries.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 14:43:10.816848",
"lastUpdate": "2023-10-23 14:43:11.638482"
"startTime": "2023-10-23 17:54:52.377555",
"lastUpdate": "2023-10-23 17:54:53.589711"
},
"geoids.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 14:43:11.641372",
"lastUpdate": "2023-10-23 14:43:11.644468"
"startTime": "2023-10-23 17:54:53.593646",
"lastUpdate": "2023-10-23 17:54:53.596484"
},
"latlng.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 14:43:11.645726",
"lastUpdate": "2023-10-23 14:43:12.729606"
"startTime": "2023-10-23 17:54:53.597627",
"lastUpdate": "2023-10-23 17:54:54.592548"
},
"powerplants.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 14:43:12.732431",
"lastUpdate": "2023-10-23 14:43:17.590325"
"startTime": "2023-10-23 17:54:54.595093",
"lastUpdate": "2023-10-23 17:55:01.839713"
},
"s2cells.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 14:43:17.591729",
"lastUpdate": "2023-10-23 14:43:17.593462"
"startTime": "2023-10-23 17:55:01.841094",
"lastUpdate": "2023-10-23 17:55:01.842973"
},
"wikidataids.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 14:43:17.594229",
"lastUpdate": "2023-10-23 14:43:17.794165"
"startTime": "2023-10-23 17:55:01.843903",
"lastUpdate": "2023-10-23 17:55:02.051317"
}
}
}
22 changes: 11 additions & 11 deletions simple/sample/output/triples.csv
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@ custom/g/group_2,specializationOf,custom/g/group_1,
custom/g/group_3,typeOf,StatVarGroup,
custom/g/group_3,name,,Child Group 2
custom/g/group_3,specializationOf,custom/g/group_1,
var1,typeOf,StatisticalVariable,
var1,name,,Good var1 name
var1,description,,Good var1 description
var1,memberOf,custom/g/group_2,
var2,typeOf,StatisticalVariable,
var2,name,,Good var2 name
var2,memberOf,custom/g/group_3,
custom/statvar_1,typeOf,StatisticalVariable,
custom/statvar_1,name,,Good var1 name
custom/statvar_1,description,,Good var1 description
custom/statvar_1,memberOf,custom/g/group_2,
custom/statvar_1,name,,Variable 1
custom/statvar_1,memberOf,custom/g/group_1,
custom/statvar_2,typeOf,StatisticalVariable,
custom/statvar_2,name,,Good var2 name
custom/statvar_2,memberOf,custom/g/group_3,
custom/statvar_3,typeOf,StatisticalVariable,
custom/statvar_3,name,,Variable 1
custom/statvar_3,memberOf,custom/g/group_1,
custom/statvar_4,typeOf,StatisticalVariable,
custom/statvar_4,name,,Variable 2
custom/statvar_4,memberOf,dc/g/Root,
custom/statvar_2,name,,Variable 2
custom/statvar_2,memberOf,dc/g/Root,
15 changes: 14 additions & 1 deletion simple/stats/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from dataclasses import dataclass
import logging
import re

import pandas as pd
from stats.config import Config
Expand All @@ -25,6 +26,10 @@
_CUSTOM_SV_ID_PREFIX = "custom/statvar_"
_CUSTOM_GROUP_ID_PREFIX = "custom/g/group_"
_ROOT_GROUP_ID = "dc/g/Root"
# Pattern used to check whether a string is a valid SV ID.
# Note that slashes ("/") are intentionally not allowed here
# since they can be confusing for custom DCs.
_SV_ID_PATTERN = r"^[A-Za-z0-9_]+$"


class Nodes:
Expand All @@ -35,6 +40,8 @@ def __init__(self, config: Config) -> None:
self.variables: dict[str, StatVar] = {}
# Dictionary of SVGs from SVG path to SVG
self.groups: dict[str, StatVarGroup] = {}
# Used to generate SV IDs
self._sv_generated_id_count = 0

def variable(self, sv_column_name: str) -> StatVar:
if sv_column_name in self.variables:
Expand All @@ -44,14 +51,20 @@ def variable(self, sv_column_name: str) -> StatVar:
group = self.group(var_cfg.group_path)
group_id = group.id if group else _ROOT_GROUP_ID
self.variables[sv_column_name] = StatVar(
f"{_CUSTOM_SV_ID_PREFIX}{len(self.variables) + 1}",
self._sv_id(sv_column_name),
var_cfg.name,
description=var_cfg.description,
nl_sentences=var_cfg.nl_sentences,
group_id=group_id,
)
return self.variables[sv_column_name]

def _sv_id(self, sv_column_name: str) -> str:
    """Return the SV ID to use for the given column name.

    A column name that fully matches _SV_ID_PATTERN is used as the ID
    unchanged. Otherwise the generated-ID counter is incremented and the
    ID is _CUSTOM_SV_ID_PREFIX followed by the new counter value.
    """
    name_is_valid_id = re.fullmatch(_SV_ID_PATTERN, sv_column_name) is not None
    if not name_is_valid_id:
        # Only names that cannot serve as IDs consume a generated number.
        self._sv_generated_id_count += 1
        return f"{_CUSTOM_SV_ID_PREFIX}{self._sv_generated_id_count}"
    return sv_column_name

def group(self, group_path: str) -> StatVarGroup | None:
if not group_path:
return None
Expand Down
16 changes: 8 additions & 8 deletions simple/tests/stats/nodes_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,37 +132,37 @@
object_id="custom/g/group_2",
),
Triple(
"custom/statvar_3",
"var3",
"typeOf",
object_id="StatisticalVariable",
),
Triple(
"custom/statvar_3",
"var3",
"name",
object_value="Var 3 Name",
),
Triple(
"custom/statvar_3",
"var3",
"description",
object_value="Var 3 Description",
),
Triple(
"custom/statvar_3",
"var3",
"memberOf",
object_id="custom/g/group_3",
),
Triple(
"custom/statvar_4",
"custom/statvar_3",
"typeOf",
object_id="StatisticalVariable",
),
Triple(
"custom/statvar_4",
"custom/statvar_3",
"name",
object_value="Variable with no config",
),
Triple(
"custom/statvar_4",
"custom/statvar_3",
"memberOf",
object_id="dc/g/Root",
),
Expand All @@ -187,7 +187,7 @@ def test_variable_with_config(self):
self.assertEqual(
sv,
StatVar(
"custom/statvar_1",
"var3",
"Var 3 Name",
description="Var 3 Description",
nl_sentences=["Sentence 1", "Sentence 2"],
Expand Down

0 comments on commit 4a74f91

Please sign in to comment.