Skip to content

Commit

Permalink
Create RSI importer for simple entities. (#214)
Browse files Browse the repository at this point in the history
  • Loading branch information
keyurva authored Sep 18, 2023
1 parent c1f693f commit c08abdc
Show file tree
Hide file tree
Showing 10 changed files with 340 additions and 0 deletions.
4 changes: 4 additions & 0 deletions simple/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.env/
.vscode/
__pycache__/
.data/
17 changes: 17 additions & 0 deletions simple/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
absl-py==1.4.0
certifi==2023.7.22
charset-normalizer==3.2.0
idna==3.4
importlib-metadata==6.8.0
numpy==1.25.2
pandas==2.1.0
platformdirs==3.10.0
python-dateutil==2.8.2
pytz==2023.3.post1
requests==2.31.0
six==1.16.0
tomli==2.0.1
tzdata==2023.3
urllib3==2.0.4
yapf==0.40.1
zipp==3.16.2
25 changes: 25 additions & 0 deletions simple/stats/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Simple Stats Importer

Outputs `observations.csv` for simple stats to be used by [RSI sqlite][sqlite].

[sqlite]: https://github.com/datacommonsorg/mixer/tree/a768446c56095aa23add8c59cf6a0630a17a726b/internal/sqlite

## Usage

```shell
python3 main.py
```

It reads input CSVs from the `.data/input` folder and writes `observations.csv` to the `.data/output` folder.

The first 2 columns of input CSVs should be place names (or more generically _entity_ names) and observation periods respectively. Each subsequent column should be for each individual statvar. A sample input CSV can be found [here](sample/input.csv).

The output `observations.csv` can be imported directly into sqlite. A sample output CSV can be found [here](sample/observations.csv).

## Other options

To see all parameters and overrides supported by the script:

```shell
python3 main.py --help
```
28 changes: 28 additions & 0 deletions simple/stats/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright 2023 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

# Defaults.
# Input and output folders are sub-folders of a single data directory. The
# paths are relative, so they resolve against the current working directory.
DEFAULT_DATA_DIR = ".data"
DEFAULT_INPUT_DIR = os.path.join(DEFAULT_DATA_DIR, "input")
DEFAULT_OUTPUT_DIR = os.path.join(DEFAULT_DATA_DIR, "output")

# File name of the observations CSV.
OBSERVATIONS_FILE_NAME = "observations.csv"

# Observations CSV columns.
COLUMN_DCID = "dcid"
COLUMN_VARIABLE = "variable"
COLUMN_DATE = "date"
COLUMN_VALUE = "value"
106 changes: 106 additions & 0 deletions simple/stats/importer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Copyright 2023 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
from absl import logging
import pandas as pd
import constants

# For importing util
_CODEDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(1, os.path.join(_CODEDIR, "../"))

from util import dc_client as dc


# TODO: Add support for units.
class SimpleStatsImporter:
  """Converts wide-format stats CSVs into one long-format observations CSV.

  Every file in `input_dir` is read as a CSV whose first column holds entity
  names, whose second column holds observation dates and whose remaining
  columns each hold the values of one variable. Entity names are resolved to
  DCIDs via `dc.resolve_entities`; rows whose entity cannot be resolved are
  dropped. The result is written to `output_dir` with the columns
  dcid, variable, date, value.
  """

  def __init__(self, input_dir: str, output_dir: str,
               entity_type: str) -> None:
    """Stores the import configuration; no I/O happens here.

    Args:
      input_dir: Directory whose files are read as input CSVs.
      output_dir: Directory the observations CSV is written to.
      entity_type: Entity type passed to the resolver (may be None/empty).
    """
    self.input_dir = input_dir
    self.output_dir = output_dir
    self.observations_file = os.path.join(output_dir,
                                          constants.OBSERVATIONS_FILE_NAME)
    self.entity_type = entity_type
    self.df = pd.DataFrame()

  def do_import(self) -> None:
    """Runs the whole pipeline: read, resolve, unpivot and write."""
    self._init()
    self._read_csvs()
    self._rename_columns()
    self._resolve_entities()
    self._unpivot_variables()
    self._reorder_columns()
    self._write_csv()

  def _init(self):
    """Creates the output directory if it does not exist yet."""
    os.makedirs(self.output_dir, exist_ok=True)

  def _read_csvs(self) -> None:
    """Loads every file of the input directory into `self.df`."""
    # Sorted so the output row order does not depend on the arbitrary order
    # in which the OS lists the directory.
    files = sorted(
        os.path.join(self.input_dir, filename)
        for filename in os.listdir(self.input_dir))
    frames = [pd.read_csv(file) for file in files]
    # Concatenate once (not once per file) and renumber the rows: without
    # ignore_index every file restarts at label 0, so rows from different
    # files would share index labels.
    if frames:
      df = pd.concat(frames, ignore_index=True)
    else:
      df = pd.DataFrame()
    logging.info("Read %s rows.", df.index.size)
    self.df = df

  def _rename_columns(self) -> None:
    """Renames the first two columns to the dcid and date column names.

    Raises:
      ValueError: If the input has fewer than two columns (for example when
        the input directory contains no data).
    """
    df = self.df
    if len(df.columns) < 2:
      raise ValueError(
          f"Expected at least 2 columns (entity, date) in the input CSVs of "
          f"{self.input_dir}, found {len(df.columns)}.")
    # Assign a fresh column list instead of mutating `df.columns.values`:
    # writing into an Index's backing array in place is not supported.
    df.columns = [
        constants.COLUMN_DCID,
        constants.COLUMN_DATE,
        *df.columns[2:],
    ]

  def _resolve_entities(self) -> None:
    """Replaces entity names in the first column by their resolved DCIDs.

    Rows whose entity name is missing or could not be resolved are dropped.
    """
    df = self.df
    # get first (0th) column
    names = df.iloc[:, 0]
    # Resolve each distinct name only once.
    entities = names.dropna().unique().tolist()
    logging.info("Resolving %s entities of type %s.", len(entities),
                 self.entity_type)
    if entities:
      dcids = dc.resolve_entities(entities=entities,
                                  entity_type=self.entity_type)
    else:
      dcids = {}
    logging.info("Resolved %s of %s entities.", len(dcids), len(entities))
    unresolved_list = [entity for entity in entities if entity not in dcids]
    if unresolved_list:
      logging.warning("# unresolved entities which will be dropped: %s",
                      len(unresolved_list))
      logging.warning("Dropped entities: %s", unresolved_list)
    # Names absent from `dcids` map to NaN, which marks the rows to drop.
    resolved = names.map(dcids)
    # Filter by position (boolean array) rather than by index label so that
    # duplicated labels can never cause unrelated rows to be removed.
    keep = resolved.notna().to_numpy()
    df = df[keep].copy()
    df.isetitem(0, resolved[keep].to_numpy())
    self.df = df

  def _unpivot_variables(self) -> None:
    """Melts the variable columns into (variable, value) rows, dropping NaNs."""
    self.df = self.df.melt(
        id_vars=[constants.COLUMN_DCID, constants.COLUMN_DATE],
        var_name=constants.COLUMN_VARIABLE,
        value_name=constants.COLUMN_VALUE,
    ).dropna()

  def _reorder_columns(self) -> None:
    """Puts the columns in the order dcid, variable, date, value."""
    self.df = self.df.reindex(columns=[
        constants.COLUMN_DCID,
        constants.COLUMN_VARIABLE,
        constants.COLUMN_DATE,
        constants.COLUMN_VALUE,
    ])

  def _write_csv(self) -> None:
    """Writes the observations to the output file without the index."""
    logging.info("Writing %s observations to: %s", self.df.index.size,
                 self.observations_file)
    self.df.to_csv(self.observations_file, index=False)
48 changes: 48 additions & 0 deletions simple/stats/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright 2023 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys
from absl import flags
from absl import app
import constants
from importer import SimpleStatsImporter

# For importing util
_CODEDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(1, os.path.join(_CODEDIR, "../"))

from util import dc_client as dc

# Command-line flags; their parsed values are read through FLAGS.
FLAGS = flags.FLAGS

flags.DEFINE_enum(
    name="entity_type",
    default=None,
    enum_values=["Country", "City"],
    help="The type of entities in the CSV.",
)
flags.DEFINE_string(
    name="input_dir",
    default=constants.DEFAULT_INPUT_DIR,
    help="The input directory.",
)
flags.DEFINE_string(
    name="output_dir",
    default=constants.DEFAULT_OUTPUT_DIR,
    help="The output directory.",
)


def main(_):
  """Builds the importer from the parsed flags and runs the import."""
  options = {
      "input_dir": FLAGS.input_dir,
      "output_dir": FLAGS.output_dir,
      "entity_type": FLAGS.entity_type,
  }
  SimpleStatsImporter(**options).do_import()


if __name__ == "__main__":
  # absl parses the command-line flags and then calls main.
  app.run(main)
15 changes: 15 additions & 0 deletions simple/stats/sample/input.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
place,year,var1,var2
Afghanistan,2023,0.19,6
Yemen,2023,0.21,56
Angola,2023,0.29,6
Zambia,2023,0.31,34
Zimbabwe,2023,0.37,76
Albania,2023,0.50,34
Cabo Verde,2023,0.50,97
Algeria,2023,0.52,92
West Bank and Gaza,2023,0.53,64
Andorra,2023,0.76,9
American Samoa,2023,#N/A,34
Anguilla,2023,#N/A,42
Wallis and Futuna Islands,2023,#N/A,75
Western Sahara,2023,#N/A,65
21 changes: 21 additions & 0 deletions simple/stats/sample/observations.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
dcid,variable,date,value
country/AFG,var1,2023,0.19
country/YEM,var1,2023,0.21
country/AGO,var1,2023,0.29
country/ZMB,var1,2023,0.31
country/ZWE,var1,2023,0.37
country/ALB,var1,2023,0.5
country/DZA,var1,2023,0.52
country/AND,var1,2023,0.76
country/AFG,var2,2023,6.0
country/YEM,var2,2023,56.0
country/AGO,var2,2023,6.0
country/ZMB,var2,2023,34.0
country/ZWE,var2,2023,76.0
country/ALB,var2,2023,34.0
country/DZA,var2,2023,92.0
country/AND,var2,2023,9.0
country/ASM,var2,2023,34.0
country/AIA,var2,2023,42.0
country/WLF,var2,2023,75.0
country/ESH,var2,2023,65.0
13 changes: 13 additions & 0 deletions simple/util/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright 2023 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
63 changes: 63 additions & 0 deletions simple/util/dc_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright 2023 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Data Commons REST API Client.
"""

import os
import requests

# Name of the environment variable the API key is read from.
_KEY_ENV = "DC_API_KEY"


def get_api_key():
  """Returns the API key from the environment, or "" when it is not set."""
  return os.getenv(_KEY_ENV, "")


# REST API endpoint root; request paths (e.g. "/v2/resolve") are appended to it.
_API_ROOT = "https://api.datacommons.org"


# See: https://docs.datacommons.org/api/rest/v2/resolve
def resolve_entities(entities: list[str],
                     entity_type: str = None) -> dict[str, str]:
  """Resolves entity descriptions (names) to DCIDs via the v2 resolve API.

  Args:
    entities: The names to resolve. Duplicates are sent only once.
    entity_type: Optional type used to narrow the resolution through a
      `typeOf` filter; when empty or None no type filter is applied.

  Returns:
    A dict from each name that was resolved to the DCID of its first
    candidate. Names without a candidate are absent from the dict.

  Raises:
    Exception: Propagated from `post` when the request fails.
  """
  # Deduplicate while keeping first-seen order; the result is keyed by name,
  # so repeating a name in the request adds nothing.
  nodes = list(dict.fromkeys(entities or []))
  if not nodes:
    # Nothing to resolve: skip the network round trip entirely.
    return {}

  type_of = f"{{typeOf:{entity_type}}}" if entity_type else ""
  data = {
      "nodes": nodes,
      "property": f"<-description{type_of}->dcid",
  }
  response = post(path="/v2/resolve", data=data)

  resolved: dict[str, str] = {}
  for entity in response.get("entities", []):
    node = entity.get("node", "")
    candidates = entity.get("candidates", [])
    # Only the first candidate is used.
    dcid = candidates[0].get("dcid", "") if candidates else ""
    if node and dcid:
      resolved[node] = dcid

  return resolved


def post(path: str, data=None) -> dict:
  """Sends a JSON POST request to the API and returns the decoded response.

  Args:
    path: Request path appended to the API root, e.g. "/v2/resolve".
    data: JSON-serializable request body. Defaults to an empty object.

  Returns:
    The decoded JSON body of the response.

  Raises:
    Exception: If the response status code is not 200. The message holds the
      status code, the reason and the server-provided error message (or the
      raw response text when the body carries no such message).
  """
  # A None sentinel avoids sharing one mutable default dict across calls.
  if data is None:
    data = {}
  url = _API_ROOT + path
  headers = {"Content-Type": "application/json"}
  api_key = get_api_key()
  if api_key:
    headers["x-api-key"] = api_key
  resp = requests.post(url, json=data, headers=headers)
  if resp.status_code != 200:
    # Error bodies are not guaranteed to be JSON objects with a "message"
    # key; a failure while building the message must not mask the HTTP error.
    try:
      payload = resp.json()
    except ValueError:
      payload = None
    message = payload.get("message") if isinstance(payload, dict) else None
    if not message:
      message = resp.text
    raise Exception(f'{resp.status_code}: {resp.reason}\n{message}')
  return resp.json()

0 comments on commit c08abdc

Please sign in to comment.