Skip to content

Commit

Permalink
Rename SV column names to SV ids. (#236)
Browse files Browse the repository at this point in the history
  • Loading branch information
keyurva authored Oct 24, 2023
1 parent bbf9d00 commit 30281a9
Show file tree
Hide file tree
Showing 17 changed files with 433 additions and 194 deletions.
2 changes: 1 addition & 1 deletion simple/sample/output/observations_countries.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dcid,date,var1,var2
dcid,date,custom/statvar_1,custom/statvar_2
country/AFG,2023,0.19,6
country/YEM,2023,0.21,56
country/AGO,2023,0.29,6
Expand Down
2 changes: 1 addition & 1 deletion simple/sample/output/observations_geoids.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
dcid,date,Variable 1,Variable 2
dcid,date,custom/statvar_3,custom/statvar_4
geoId/01,2021,555,666
geoId/122,2022,321,123456
2 changes: 1 addition & 1 deletion simple/sample/output/observations_latlng.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
dcid,date,var1,var2
dcid,date,custom/statvar_1,custom/statvar_2
country/USA,2021,555,666
country/IND,2022,321,123
2 changes: 1 addition & 1 deletion simple/sample/output/observations_powerplants.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dcid,date,var1,var2
dcid,date,custom/statvar_1,custom/statvar_2
dc/000qxlm93vn93,2023,0.19,6
dc/5c7tz3lbln3p,2023,0.21,56
dc/8zmh7ctlkbsc4,2023,0.29,6
Expand Down
2 changes: 1 addition & 1 deletion simple/sample/output/observations_s2cells.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
dcid,date,var1,var2
dcid,date,custom/statvar_1,custom/statvar_2
s2CellId/0x80982b0000000000,2021,555,666
s2CellId/0x3be7c90000000000,2022,321,123
2 changes: 1 addition & 1 deletion simple/sample/output/observations_wikidataids.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
dcid,date,var1,var2
dcid,date,custom/statvar_1,custom/statvar_2
country/USA,2021,555,666
country/IND,2022,321,123
2 changes: 1 addition & 1 deletion simple/sample/output/process/debug_resolve_powerplants.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
input,dcid,link
FOO BAR,*UNRESOLVED*,
BAZ BAR,*UNRESOLVED*,
FOO BAR,*UNRESOLVED*,
Suzlon Project,dc/000qxlm93vn93,https://datacommons.org/browser/dc/000qxlm93vn93
Crete Energy Venture,dc/5c7tz3lbln3p,https://datacommons.org/browser/dc/5c7tz3lbln3p
Watchtower Educational Center,dc/8zmh7ctlkbsc4,https://datacommons.org/browser/dc/8zmh7ctlkbsc4
Expand Down
28 changes: 14 additions & 14 deletions simple/sample/output/process/report.json
Original file line number Diff line number Diff line change
@@ -1,37 +1,37 @@
{
"status": "SUCCESS",
"startTime": "2023-10-23 10:50:22.644858",
"lastUpdate": "2023-10-23 10:50:30.534262",
"startTime": "2023-10-23 14:43:10.816166",
"lastUpdate": "2023-10-23 14:43:17.797630",
"importFiles": {
"countries.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 10:50:22.645832",
"lastUpdate": "2023-10-23 10:50:23.689136"
"startTime": "2023-10-23 14:43:10.816848",
"lastUpdate": "2023-10-23 14:43:11.638482"
},
"geoids.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 10:50:23.693980",
"lastUpdate": "2023-10-23 10:50:23.697093"
"startTime": "2023-10-23 14:43:11.641372",
"lastUpdate": "2023-10-23 14:43:11.644468"
},
"latlng.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 10:50:23.698691",
"lastUpdate": "2023-10-23 10:50:24.975047"
"startTime": "2023-10-23 14:43:11.645726",
"lastUpdate": "2023-10-23 14:43:12.729606"
},
"powerplants.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 10:50:24.977544",
"lastUpdate": "2023-10-23 10:50:30.267462"
"startTime": "2023-10-23 14:43:12.732431",
"lastUpdate": "2023-10-23 14:43:17.590325"
},
"s2cells.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 10:50:30.268803",
"lastUpdate": "2023-10-23 10:50:30.270341"
"startTime": "2023-10-23 14:43:17.591729",
"lastUpdate": "2023-10-23 14:43:17.593462"
},
"wikidataids.csv": {
"status": "SUCCESS",
"startTime": "2023-10-23 10:50:30.271283",
"lastUpdate": "2023-10-23 10:50:30.529614"
"startTime": "2023-10-23 14:43:17.594229",
"lastUpdate": "2023-10-23 14:43:17.794165"
}
}
}
18 changes: 9 additions & 9 deletions simple/sample/output/triples.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@ custom/g/group_3,typeOf,StatVarGroup,
custom/g/group_3,name,,Child Group 2
custom/g/group_3,specializationOf,custom/g/group_1,
custom/statvar_1,typeOf,StatisticalVariable,
custom/statvar_1,name,,Variable 1
custom/statvar_1,memberOf,custom/g/group_1,
custom/statvar_1,name,,Good var1 name
custom/statvar_1,description,,Good var1 description
custom/statvar_1,memberOf,custom/g/group_2,
custom/statvar_2,typeOf,StatisticalVariable,
custom/statvar_2,name,,Variable 2
custom/statvar_2,memberOf,dc/g/Root,
custom/statvar_2,name,,Good var2 name
custom/statvar_2,memberOf,custom/g/group_3,
custom/statvar_3,typeOf,StatisticalVariable,
custom/statvar_3,name,,Good var1 name
custom/statvar_3,description,,Good var1 description
custom/statvar_3,memberOf,custom/g/group_2,
custom/statvar_3,name,,Variable 1
custom/statvar_3,memberOf,custom/g/group_1,
custom/statvar_4,typeOf,StatisticalVariable,
custom/statvar_4,name,,Good var2 name
custom/statvar_4,memberOf,custom/g/group_3,
custom/statvar_4,name,,Variable 2
custom/statvar_4,memberOf,dc/g/Root,
79 changes: 15 additions & 64 deletions simple/stats/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,71 +38,22 @@ class Config:
def __init__(self, data: dict) -> None:
self.data = data

def variables_and_groups(
self, sv_names: list[str]) -> (list[StatVar], list[StatVarGroup]):
groups = self._groups(sv_names)
variables = self._variables(sv_names, groups=groups)
return list(variables.values()), list(groups.values())

def _groups(self, sv_names: list[str]):
# Group Path to SVGroup
groups: dict[str, StatVarGroup] = {}

for sv_name in sv_names:
group_path = self._variable_group(sv_name)
if group_path:
tokens = group_path.split("/")
for index in range(len(tokens)):
path = "/".join(tokens[:index + 1])
if path not in groups:
id = f"{_CUSTOM_GROUP_ID_PREFIX}{len(groups) + 1}"
name = tokens[index]
parent_path = "" if "/" not in path else path[:path.rindex("/")]
parent_id = groups[
parent_path].id if parent_path in groups else _ROOT_GROUP_ID
groups[path] = StatVarGroup(id, name, parent_id)

return groups

def _variables(self,
sv_names: list[str],
groups: dict[str, StatVarGroup] = None):
if not groups:
groups = self._groups(sv_names)

# Variable column name to SV
svs: dict[str, StatVar] = {}

for sv_name in sv_names:
if sv_name in svs:
continue

var = self._variable(sv_name)
id = f"{_CUSTOM_SV_ID_PREFIX}{len(svs) + 1}"
name = var.get(_NAME_FIELD, sv_name)
group_path = self._variable_group(sv_name)
group_id = groups[
group_path].id if group_path and group_path in groups else _ROOT_GROUP_ID
svs[sv_name] = StatVar(id,
name,
description=var.get(_DESCRIPTION_FIELD, ""),
nl_sentences=var.get(_NL_SENTENCES_FIELD, []),
group_id=group_id)

return svs
def variable(self, variable_name: str) -> StatVar:
  """Returns the configured StatVar for the given variable name.

  The returned StatVar carries an empty id — ids are assigned later by the
  node registry. Any field missing from the config falls back to a default
  (the variable name itself for `name`, empty values otherwise).
  """
  all_vars_cfg = self.data.get(_VARIABLES_FIELD, {})
  var_cfg = all_vars_cfg.get(variable_name, {})
  return StatVar(
      "",
      var_cfg.get(_NAME_FIELD, variable_name),
      description=var_cfg.get(_DESCRIPTION_FIELD, ""),
      nl_sentences=var_cfg.get(_NL_SENTENCES_FIELD, []),
      group_path=var_cfg.get(_GROUP_FIELD, ""),
  )

def entity_type(self, input_file_name: str) -> str:
return self.data.get(_INPUT_FILES_FIELD,
{}).get(input_file_name,
{}).get(_ENTITY_TYPE_FIELD, "")
return (self.data.get(_INPUT_FILES_FIELD,
{}).get(input_file_name,
{}).get(_ENTITY_TYPE_FIELD, ""))

def ignore_columns(self, input_file_name: str) -> str:
return self.data.get(_INPUT_FILES_FIELD,
{}).get(input_file_name,
{}).get(_IGNORE_COLUMNS_FIELD, [])

def _variable(self, variable_name: str) -> dict:
return self.data.get(_VARIABLES_FIELD, {}).get(variable_name, {})

def _variable_group(self, variable_name: str) -> str:
return self._variable(variable_name).get(_GROUP_FIELD, "")
return (self.data.get(_INPUT_FILES_FIELD,
{}).get(input_file_name,
{}).get(_IGNORE_COLUMNS_FIELD, []))
1 change: 1 addition & 0 deletions simple/stats/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class StatVar:
description: str = ""
nl_sentences: list[str] = field(default_factory=list)
group_id: str = ""
group_path: str = ""

def triples(self) -> list[Triple]:
triples: list[Triple] = []
Expand Down
25 changes: 17 additions & 8 deletions simple/stats/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import pandas as pd
from stats import constants
from stats.nodes import Nodes
from stats.reporter import FileImportReporter
from util.filehandler import FileHandler

Expand All @@ -33,18 +34,19 @@ def __init__(
observations_fh: FileHandler,
debug_resolve_fh: FileHandler,
reporter: FileImportReporter,
nodes: Nodes,
entity_type: str,
ignore_columns: list[str] = list(),
) -> None:
self.input_fh = input_fh
self.observations_fh = observations_fh
self.debug_resolve_fh = debug_resolve_fh
self.reporter = reporter
self.nodes = nodes
self.entity_type = entity_type
self.ignore_columns = ignore_columns
self.df = pd.DataFrame()
self.debug_resolve_df = None
self.sv_names = []

def do_import(self) -> None:
self.reporter.report_started()
Expand All @@ -54,7 +56,6 @@ def do_import(self) -> None:
self._sanitize_values()
self._resolve_entities()
self._rename_columns()
self._extract_sv_names()
self.reporter.report_success()
except Exception as e:
self.reporter.report_failure(str(e))
Expand Down Expand Up @@ -82,12 +83,20 @@ def _sanitize_values(self):
self.df = self.df.astype({self.df.columns[1]: str})

def _rename_columns(self) -> None:
df = self.df
df.columns.values[0] = constants.COLUMN_DCID
df.columns.values[1] = constants.COLUMN_DATE

def _extract_sv_names(self):
self.sv_names = list(self.df.columns[2:])
renamed = {}
# Rename dcid and date columns
renamed[self.df.columns[0]] = constants.COLUMN_DCID
renamed[self.df.columns[1]] = constants.COLUMN_DATE

# Rename SV columns to their IDs
sv_column_names = self.df.columns[2:]
sv_ids = [
self.nodes.variable(sv_column_name).id
for sv_column_name in sv_column_names
]
renamed.update({col: id for col, id in zip(sv_column_names, sv_ids)})

self.df = self.df.rename(columns=renamed)

def _resolve_entities(self) -> None:
df = self.df
Expand Down
85 changes: 85 additions & 0 deletions simple/stats/nodes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Copyright 2023 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
import logging

import pandas as pd
from stats.config import Config
from stats.data import StatVar
from stats.data import StatVarGroup
from stats.data import Triple
from util.filehandler import FileHandler

_CUSTOM_SV_ID_PREFIX = "custom/statvar_"
_CUSTOM_GROUP_ID_PREFIX = "custom/g/group_"
_ROOT_GROUP_ID = "dc/g/Root"


class Nodes:
  """Registry that assigns and caches custom DCIDs for stat vars and groups.

  Ids are handed out sequentially in the order nodes are first requested,
  e.g. "custom/statvar_1", "custom/g/group_2", ...
  """

  def __init__(self, config: Config) -> None:
    self.config = config
    # SV column name -> StatVar (with its assigned custom id).
    self.variables: dict[str, StatVar] = {}
    # Full group path (e.g. "Parent/Child") -> StatVarGroup.
    self.groups: dict[str, StatVarGroup] = {}

  def variable(self, sv_column_name: str) -> StatVar:
    """Returns the StatVar for a column, creating and caching it on first use."""
    cached = self.variables.get(sv_column_name)
    if cached is not None:
      return cached

    var_cfg = self.config.variable(sv_column_name)
    parent_group = self.group(var_cfg.group_path)
    sv = StatVar(
        f"{_CUSTOM_SV_ID_PREFIX}{len(self.variables) + 1}",
        var_cfg.name,
        description=var_cfg.description,
        nl_sentences=var_cfg.nl_sentences,
        group_id=parent_group.id if parent_group else _ROOT_GROUP_ID,
    )
    self.variables[sv_column_name] = sv
    return sv

  def group(self, group_path: str) -> StatVarGroup | None:
    """Returns the StatVarGroup for a path, creating missing ancestors.

    Returns None for an empty path (the variable then hangs off the root).
    """
    if not group_path:
      return None
    cached = self.groups.get(group_path)
    if cached is not None:
      return cached

    # Walk the path one token at a time so every ancestor level gets a node.
    parent_id = _ROOT_GROUP_ID
    prefix_tokens: list[str] = []
    for token in group_path.split("/"):
      prefix_tokens.append(token)
      path = "/".join(prefix_tokens)
      node = self.groups.get(path)
      if node is None:
        node = StatVarGroup(f"{_CUSTOM_GROUP_ID_PREFIX}{len(self.groups) + 1}",
                            token, parent_id)
        self.groups[path] = node
      # The node at this level parents whatever comes next.
      parent_id = node.id

    return self.groups[group_path]

  def triples(self, triples_fh: FileHandler | None = None) -> list[Triple]:
    """Collects triples for all groups then all variables.

    If a file handler is given, the triples are also written out as CSV.
    """
    triples: list[Triple] = []
    all_nodes = list(self.groups.values()) + list(self.variables.values())
    for node in all_nodes:
      triples.extend(node.triples())

    if triples_fh:
      logging.info("Writing %s triples to: %s", len(triples), str(triples_fh))
      triples_fh.write_string(pd.DataFrame(triples).to_csv(index=False))

    return triples
Loading

0 comments on commit 30281a9

Please sign in to comment.