Skip to content

Commit

Permalink
Rename SV column names to SV ids. (#236)
Browse files Browse the repository at this point in the history
  • Loading branch information
keyurva authored Oct 24, 2023
1 parent bbf9d00 commit 30281a9
Show file tree
Hide file tree
Showing 17 changed files with 433 additions and 194 deletions.
2 changes: 1 addition & 1 deletion simple/sample/output/observations_countries.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dcid,date,var1,var2
dcid,date,custom/statvar_1,custom/statvar_2
country/AFG,2023,0.19,6
country/YEM,2023,0.21,56
country/AGO,2023,0.29,6
Expand Down
2 changes: 1 addition & 1 deletion simple/sample/output/observations_geoids.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
dcid,date,Variable 1,Variable 2
dcid,date,custom/statvar_3,custom/statvar_4
geoId/01,2021,555,666
geoId/122,2022,321,123456
2 changes: 1 addition & 1 deletion simple/sample/output/observations_latlng.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
dcid,date,var1,var2
dcid,date,custom/statvar_1,custom/statvar_2
country/USA,2021,555,666
country/IND,2022,321,123
2 changes: 1 addition & 1 deletion simple/sample/output/observations_powerplants.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dcid,date,var1,var2
dcid,date,custom/statvar_1,custom/statvar_2
dc/000qxlm93vn93,2023,0.19,6
dc/5c7tz3lbln3p,2023,0.21,56
dc/8zmh7ctlkbsc4,2023,0.29,6
Expand Down
2 changes: 1 addition & 1 deletion simple/sample/output/observations_s2cells.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
dcid,date,var1,var2
dcid,date,custom/statvar_1,custom/statvar_2
s2CellId/0x80982b0000000000,2021,555,666
s2CellId/0x3be7c90000000000,2022,321,123
2 changes: 1 addition & 1 deletion simple/sample/output/observations_wikidataids.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
dcid,date,var1,var2
dcid,date,custom/statvar_1,custom/statvar_2
country/USA,2021,555,666
country/IND,2022,321,123
2 changes: 1 addition & 1 deletion simple/sample/output/process/debug_resolve_powerplants.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
input,dcid,link
FOO BAR,*UNRESOLVED*,
BAZ BAR,*UNRESOLVED*,
FOO BAR,*UNRESOLVED*,
Suzlon Project,dc/000qxlm93vn93,https://datacommons.org/browser/dc/000qxlm93vn93
Crete Energy Venture,dc/5c7tz3lbln3p,https://datacommons.org/browser/dc/5c7tz3lbln3p
Watchtower Educational Center,dc/8zmh7ctlkbsc4,https://datacommons.org/browser/dc/8zmh7ctlkbsc4
Expand Down
28 changes: 14 additions & 14 deletions simple/sample/output/process/report.json
Original file line number Diff line number Diff line change
@@ -1,37 +1,37 @@
{
"status": "SUCCESS",
"startTime": "2023-10-23 10:50:22.644858",
"lastUpdate": "2023-10-23 10:50:30.534262",
"startTime": "2023-10-23 14:43:10.816166",
"lastUpdate": "2023-10-23 14:43:17.797630",
"importFiles": {
"countries.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 10:50:22.645832",
"lastUpdate": "2023-10-23 10:50:23.689136"
"startTime": "2023-10-23 14:43:10.816848",
"lastUpdate": "2023-10-23 14:43:11.638482"
},
"geoids.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 10:50:23.693980",
"lastUpdate": "2023-10-23 10:50:23.697093"
"startTime": "2023-10-23 14:43:11.641372",
"lastUpdate": "2023-10-23 14:43:11.644468"
},
"latlng.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 10:50:23.698691",
"lastUpdate": "2023-10-23 10:50:24.975047"
"startTime": "2023-10-23 14:43:11.645726",
"lastUpdate": "2023-10-23 14:43:12.729606"
},
"powerplants.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 10:50:24.977544",
"lastUpdate": "2023-10-23 10:50:30.267462"
"startTime": "2023-10-23 14:43:12.732431",
"lastUpdate": "2023-10-23 14:43:17.590325"
},
"s2cells.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 10:50:30.268803",
"lastUpdate": "2023-10-23 10:50:30.270341"
"startTime": "2023-10-23 14:43:17.591729",
"lastUpdate": "2023-10-23 14:43:17.593462"
},
"wikidataids.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 10:50:30.271283",
"lastUpdate": "2023-10-23 10:50:30.529614"
"startTime": "2023-10-23 14:43:17.594229",
"lastUpdate": "2023-10-23 14:43:17.794165"
}
}
}
18 changes: 9 additions & 9 deletions simple/sample/output/triples.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@ custom/g/group_3,typeOf,StatVarGroup,
custom/g/group_3,name,,Child Group 2
custom/g/group_3,specializationOf,custom/g/group_1,
custom/statvar_1,typeOf,StatisticalVariable,
custom/statvar_1,name,,Variable 1
custom/statvar_1,memberOf,custom/g/group_1,
custom/statvar_1,name,,Good var1 name
custom/statvar_1,description,,Good var1 description
custom/statvar_1,memberOf,custom/g/group_2,
custom/statvar_2,typeOf,StatisticalVariable,
custom/statvar_2,name,,Variable 2
custom/statvar_2,memberOf,dc/g/Root,
custom/statvar_2,name,,Good var2 name
custom/statvar_2,memberOf,custom/g/group_3,
custom/statvar_3,typeOf,StatisticalVariable,
custom/statvar_3,name,,Good var1 name
custom/statvar_3,description,,Good var1 description
custom/statvar_3,memberOf,custom/g/group_2,
custom/statvar_3,name,,Variable 1
custom/statvar_3,memberOf,custom/g/group_1,
custom/statvar_4,typeOf,StatisticalVariable,
custom/statvar_4,name,,Good var2 name
custom/statvar_4,memberOf,custom/g/group_3,
custom/statvar_4,name,,Variable 2
custom/statvar_4,memberOf,dc/g/Root,
79 changes: 15 additions & 64 deletions simple/stats/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,71 +38,22 @@ class Config:
def __init__(self, data: dict) -> None:
self.data = data

def variables_and_groups(
self, sv_names: list[str]) -> (list[StatVar], list[StatVarGroup]):
groups = self._groups(sv_names)
variables = self._variables(sv_names, groups=groups)
return list(variables.values()), list(groups.values())

def _groups(self, sv_names: list[str]):
# Group Path to SVGroup
groups: dict[str, StatVarGroup] = {}

for sv_name in sv_names:
group_path = self._variable_group(sv_name)
if group_path:
tokens = group_path.split("/")
for index in range(len(tokens)):
path = "/".join(tokens[:index + 1])
if path not in groups:
id = f"{_CUSTOM_GROUP_ID_PREFIX}{len(groups) + 1}"
name = tokens[index]
parent_path = "" if "/" not in path else path[:path.rindex("/")]
parent_id = groups[
parent_path].id if parent_path in groups else _ROOT_GROUP_ID
groups[path] = StatVarGroup(id, name, parent_id)

return groups

def _variables(self,
sv_names: list[str],
groups: dict[str, StatVarGroup] = None):
if not groups:
groups = self._groups(sv_names)

# Variable column name to SV
svs: dict[str, StatVar] = {}

for sv_name in sv_names:
if sv_name in svs:
continue

var = self._variable(sv_name)
id = f"{_CUSTOM_SV_ID_PREFIX}{len(svs) + 1}"
name = var.get(_NAME_FIELD, sv_name)
group_path = self._variable_group(sv_name)
group_id = groups[
group_path].id if group_path and group_path in groups else _ROOT_GROUP_ID
svs[sv_name] = StatVar(id,
name,
description=var.get(_DESCRIPTION_FIELD, ""),
nl_sentences=var.get(_NL_SENTENCES_FIELD, []),
group_id=group_id)

return svs
def variable(self, variable_name: str) -> StatVar:
  """Returns the configured StatVar for the given variable name.

  The returned StatVar carries an empty id — ids are assigned later by the
  node registry. Any field missing from the config falls back to a default
  (the variable name itself for `name`, empty values otherwise).
  """
  all_vars_cfg = self.data.get(_VARIABLES_FIELD, {})
  var_cfg = all_vars_cfg.get(variable_name, {})
  return StatVar(
      "",
      var_cfg.get(_NAME_FIELD, variable_name),
      description=var_cfg.get(_DESCRIPTION_FIELD, ""),
      nl_sentences=var_cfg.get(_NL_SENTENCES_FIELD, []),
      group_path=var_cfg.get(_GROUP_FIELD, ""),
  )

def entity_type(self, input_file_name: str) -> str:
return self.data.get(_INPUT_FILES_FIELD,
{}).get(input_file_name,
{}).get(_ENTITY_TYPE_FIELD, "")
return (self.data.get(_INPUT_FILES_FIELD,
{}).get(input_file_name,
{}).get(_ENTITY_TYPE_FIELD, ""))

def ignore_columns(self, input_file_name: str) -> str:
return self.data.get(_INPUT_FILES_FIELD,
{}).get(input_file_name,
{}).get(_IGNORE_COLUMNS_FIELD, [])

def _variable(self, variable_name: str) -> dict:
return self.data.get(_VARIABLES_FIELD, {}).get(variable_name, {})

def _variable_group(self, variable_name: str) -> str:
return self._variable(variable_name).get(_GROUP_FIELD, "")
return (self.data.get(_INPUT_FILES_FIELD,
{}).get(input_file_name,
{}).get(_IGNORE_COLUMNS_FIELD, []))
1 change: 1 addition & 0 deletions simple/stats/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class StatVar:
description: str = ""
nl_sentences: list[str] = field(default_factory=list)
group_id: str = ""
group_path: str = ""

def triples(self) -> list[Triple]:
triples: list[Triple] = []
Expand Down
25 changes: 17 additions & 8 deletions simple/stats/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import pandas as pd
from stats import constants
from stats.nodes import Nodes
from stats.reporter import FileImportReporter
from util.filehandler import FileHandler

Expand All @@ -33,18 +34,19 @@ def __init__(
observations_fh: FileHandler,
debug_resolve_fh: FileHandler,
reporter: FileImportReporter,
nodes: Nodes,
entity_type: str,
ignore_columns: list[str] = list(),
) -> None:
self.input_fh = input_fh
self.observations_fh = observations_fh
self.debug_resolve_fh = debug_resolve_fh
self.reporter = reporter
self.nodes = nodes
self.entity_type = entity_type
self.ignore_columns = ignore_columns
self.df = pd.DataFrame()
self.debug_resolve_df = None
self.sv_names = []

def do_import(self) -> None:
self.reporter.report_started()
Expand All @@ -54,7 +56,6 @@ def do_import(self) -> None:
self._sanitize_values()
self._resolve_entities()
self._rename_columns()
self._extract_sv_names()
self.reporter.report_success()
except Exception as e:
self.reporter.report_failure(str(e))
Expand Down Expand Up @@ -82,12 +83,20 @@ def _sanitize_values(self):
self.df = self.df.astype({self.df.columns[1]: str})

def _rename_columns(self) -> None:
df = self.df
df.columns.values[0] = constants.COLUMN_DCID
df.columns.values[1] = constants.COLUMN_DATE

def _extract_sv_names(self):
self.sv_names = list(self.df.columns[2:])
renamed = {}
# Rename dcid and date columns
renamed[self.df.columns[0]] = constants.COLUMN_DCID
renamed[self.df.columns[1]] = constants.COLUMN_DATE

# Rename SV columns to their IDs
sv_column_names = self.df.columns[2:]
sv_ids = [
self.nodes.variable(sv_column_name).id
for sv_column_name in sv_column_names
]
renamed.update({col: id for col, id in zip(sv_column_names, sv_ids)})

self.df = self.df.rename(columns=renamed)

def _resolve_entities(self) -> None:
df = self.df
Expand Down
85 changes: 85 additions & 0 deletions simple/stats/nodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Copyright 2023 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
import logging

import pandas as pd
from stats.config import Config
from stats.data import StatVar
from stats.data import StatVarGroup
from stats.data import Triple
from util.filehandler import FileHandler

_CUSTOM_SV_ID_PREFIX = "custom/statvar_"
_CUSTOM_GROUP_ID_PREFIX = "custom/g/group_"
_ROOT_GROUP_ID = "dc/g/Root"


class Nodes:
  """Registry that assigns and caches custom DCIDs for stat vars and groups.

  Ids are handed out sequentially in the order nodes are first requested,
  e.g. "custom/statvar_1", "custom/g/group_2", ...
  """

  def __init__(self, config: Config) -> None:
    self.config = config
    # SV column name -> StatVar (with its assigned custom id).
    self.variables: dict[str, StatVar] = {}
    # Full group path (e.g. "Parent/Child") -> StatVarGroup.
    self.groups: dict[str, StatVarGroup] = {}

  def variable(self, sv_column_name: str) -> StatVar:
    """Returns the StatVar for a column, creating and caching it on first use."""
    cached = self.variables.get(sv_column_name)
    if cached is not None:
      return cached

    var_cfg = self.config.variable(sv_column_name)
    parent_group = self.group(var_cfg.group_path)
    sv = StatVar(
        f"{_CUSTOM_SV_ID_PREFIX}{len(self.variables) + 1}",
        var_cfg.name,
        description=var_cfg.description,
        nl_sentences=var_cfg.nl_sentences,
        group_id=parent_group.id if parent_group else _ROOT_GROUP_ID,
    )
    self.variables[sv_column_name] = sv
    return sv

  def group(self, group_path: str) -> StatVarGroup | None:
    """Returns the StatVarGroup for a path, creating missing ancestors.

    Returns None for an empty path (the variable then hangs off the root).
    """
    if not group_path:
      return None
    cached = self.groups.get(group_path)
    if cached is not None:
      return cached

    # Walk the path one token at a time so every ancestor level gets a node.
    parent_id = _ROOT_GROUP_ID
    prefix_tokens: list[str] = []
    for token in group_path.split("/"):
      prefix_tokens.append(token)
      path = "/".join(prefix_tokens)
      node = self.groups.get(path)
      if node is None:
        node = StatVarGroup(f"{_CUSTOM_GROUP_ID_PREFIX}{len(self.groups) + 1}",
                            token, parent_id)
        self.groups[path] = node
      # The node at this level parents whatever comes next.
      parent_id = node.id

    return self.groups[group_path]

  def triples(self, triples_fh: FileHandler | None = None) -> list[Triple]:
    """Collects triples for all groups then all variables.

    If a file handler is given, the triples are also written out as CSV.
    """
    triples: list[Triple] = []
    all_nodes = list(self.groups.values()) + list(self.variables.values())
    for node in all_nodes:
      triples.extend(node.triples())

    if triples_fh:
      logging.info("Writing %s triples to: %s", len(triples), str(triples_fh))
      triples_fh.write_string(pd.DataFrame(triples).to_csv(index=False))

    return triples
Loading

0 comments on commit 30281a9

Please sign in to comment.