Skip to content

Commit

Permalink
Add top level parent ID in Atlas classifications
Browse files Browse the repository at this point in the history
  • Loading branch information
bleonard33 committed Oct 20, 2021
1 parent 03d4947 commit cc78284
Show file tree
Hide file tree
Showing 9 changed files with 8,500 additions and 8,439 deletions.
65 changes: 49 additions & 16 deletions classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,33 @@ def load(path):
return Classification.from_csv(path)


def parent_code_table_to_parent_id_table(df, hierarchy, top_level=None):
    """Convert a classification table keyed by parent_code into one keyed by
    parent_id (and, optionally, top_parent_code into top_parent_id).

    Each row's ``parent_code`` is looked up among the rows that sit at the
    row's parent level; the index label of the matching row becomes
    ``parent_id``. When ``top_level`` is given, ``top_parent_code`` is looked
    up the same way among the rows at ``top_level`` and stored as
    ``top_parent_id``.

    Args:
        df: DataFrame with ``code``, ``level`` and ``parent_code`` columns,
            plus ``top_parent_code`` when ``top_level`` is given. The frame
            passed in is left unmodified.
        hierarchy: object whose ``parent`` attribute can be handed to
            ``Series.map`` to translate a level into its parent level.
        top_level: optional name of the level that holds the top-level
            parents. When falsy, no ``top_parent_id`` column is added.

    Returns:
        A new DataFrame without ``parent_code`` / ``parent_level`` and with a
        ``parent_id`` column (NaN where no parent row matched). When
        ``top_level`` is given it also carries ``top_parent_id``.

    Raises:
        KeyError: if ``top_level`` is given but ``df`` has no
            ``top_parent_code`` column.
    """

    # reset_index() exposes every row's index label as a column; that label
    # is what is used as the id of the row.
    code_table = df[["code", "level"]].reset_index()
    code_table.columns = ["parent_id", "parent_code", "parent_level"]

    # float("nan") replaces the ``pd.np.nan`` alias, which no longer exists in
    # current pandas releases.
    parent_level = df["level"].map(hierarchy.parent).fillna(value=float("nan"))

    # assign() works on a copy, so the caller's frame does not silently gain a
    # "parent_level" column.
    result = (
        df.assign(parent_level=parent_level)
        .merge(code_table, on=["parent_level", "parent_code"], how="left")
        .drop(["parent_code", "parent_level"], axis=1)
    )

    if top_level:
        if "top_parent_code" not in result.columns:
            raise KeyError(
                "top_level was given but the table has no 'top_parent_code' column"
            )

        # Only entries at the top level are candidates for top_parent_id.
        top_code_table = code_table.loc[
            code_table["parent_level"] == top_level, ["parent_id", "parent_code"]
        ].rename(
            columns={
                "parent_id": "top_parent_id",
                "parent_code": "top_parent_code",
            }
        )
        result = result.merge(top_code_table, on="top_parent_code", how="left")

    return result


def ordered_table_to_parent_code_table(df, hierarchy):

Expand All @@ -56,32 +70,40 @@ def traversal_iteration(x):
return df


def repeated_table_to_parent_id_table(df, hierarchy, level_fields={}):
def repeated_table_to_parent_id_table(df, hierarchy, level_fields={}, top_level=None):
"""
Convert from the "merged" table format to a parent_id format, e.g.
level1 level0
cats animals
dogs animals
cod fish
salmon fish
level2 level1 level0
cats mammals animals
dogs mammals animals
cod fish animals
salmon fish animals
into:
code level name
cats level1 Cats
code level name parent_code top_parent_code
cats level2 Cats mammals animals
and to do that, specify level_fields=
{
"level0": [],
"level1": []
"level1": [],
"level2": []
}
and, optionally top_level="level0"
"""

# Check there is a code and name field for every entry in the hierarchy
for level in hierarchy:
for field_name in level_fields[level]:
assert field_name in df.columns, "Missing field: {}".format(field_name)
assert field_name in df.columns, f"Missing field: {field_name}"

# If top_level is defined, assert that there is a code column for it
assert (
f"{top_level}_code" in df.columns
), f"Missing top level field: {top_level}_code"

# Check there are no duplicate codes for the same country + dept + muni
# etc.
Expand All @@ -99,6 +121,9 @@ def repeated_table_to_parent_id_table(df, hierarchy, level_fields={}):

row_dict = {"code": code, "level": level, "parent_code": parent_codes[-1]}

if top_level:
row_dict["top_parent_code"] = row[f"{top_level}_code"]

for field in level_fields[level]:

# Strip _section from the end
Expand All @@ -113,7 +138,6 @@ def repeated_table_to_parent_id_table(df, hierarchy, level_fields={}):
new_df = pd.DataFrame(new_table)
new_df = new_df[~new_df.duplicated()]
new_df = new_df.reset_index(drop=True)
# new_df.level = new_df.level.astype("category")

return new_df

Expand All @@ -132,7 +156,7 @@ def sort_by_code_and_level(parent_code_table, hierarchy):
return parent_code_table


def spread_out_entries(parent_id_table, level_starts, hierarchy):
def spread_out_entries(parent_id_table, level_starts, hierarchy, top_level=None):
"""Given an id table, shift down ids so that the ids for each level group
starts at the given id. This allows us to leave gap ids in case we need to
add more."""
Expand All @@ -152,6 +176,9 @@ def spread_out_entries(parent_id_table, level_starts, hierarchy):
assert (
"new_parent_id" not in parent_id_table.columns
), "You already have a level named 'new_parent_id', please get rid of it."
assert (
"new_top_parent_id" not in parent_id_table.columns
), "You already have a level named 'new_top_parent_id', please get rid of it."

level_counts = parent_id_table.level.value_counts().to_dict()
for i, level in enumerate(hierarchy):
Expand Down Expand Up @@ -193,6 +220,12 @@ def spread_out_entries(parent_id_table, level_starts, hierarchy):
parent_id_table.level == level, "new_parent_id"
] = new_parent_ids

# Set new top parent ids
if top_level:
parent_id_table["top_parent_id"] = (
parent_id_table["top_parent_id"] + level_starts[top_level]
)

# Make sure there aren't any gaps left
assert parent_id_table.new_index.isnull().any() == False

Expand Down
18 changes: 14 additions & 4 deletions product/HS/IntlAtlas/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,15 @@
Classification,
)

TOP_LEVEL = "section"


def get_hs_services(file="./in/Services_Hierarchy.csv", hierarchy=None):
    """Load the services hierarchy CSV and shift its ids into the services
    id ranges.

    Args:
        file: path of the services hierarchy CSV; every column is read as str.
        hierarchy: optional Hierarchy describing the level order. When omitted,
            one is built from the levels of the services id ranges below
            (section, 2digit, 4digit, 6digit).

    Returns:
        Whatever ``spread_out_entries`` returns for the services table.
    """
    # Spread out services similarly to each set of exports but buffered further
    service_starts = {"section": 10, "2digit": 400, "4digit": 4000, "6digit": 11000}

    if hierarchy is None:
        # Do not depend on the ``h`` global: it is only bound when this file
        # is executed as a script, so an imported call would hit a NameError.
        hierarchy = Hierarchy(list(service_starts))

    services = pd.read_csv(file, encoding="utf-8", dtype="str")
    # Start at 0; spread_out_entries adds the start id of the top level to it.
    services["top_parent_id"] = 0

    return spread_out_entries(
        services, service_starts, hierarchy, top_level=TOP_LEVEL
    )


if __name__ == "__main__":
Expand All @@ -31,13 +34,17 @@ def get_hs_services(file="./in/Services_Hierarchy.csv"):
fields = {"section": [], "2digit": [], "4digit": [], "6digit": []}

h = Hierarchy(["section", "2digit", "4digit", "6digit"])
parent_code_table = repeated_table_to_parent_id_table(hierarchy, h, fields)
parent_code_table = repeated_table_to_parent_id_table(
hierarchy, h, level_fields=fields, top_level=TOP_LEVEL
)
parent_code_table = parent_code_table.merge(names, on=["code", "level"])

# Sort by level order (not necessarily alphabetical)
parent_code_table = sort_by_code_and_level(parent_code_table, h)

parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)
parent_id_table = parent_code_table_to_parent_id_table(
parent_code_table, h, top_level=TOP_LEVEL
)
parent_id_table["name"] = parent_id_table.name_en

parent_id_table = parent_id_table[
Expand All @@ -50,13 +57,16 @@ def get_hs_services(file="./in/Services_Hierarchy.csv"):
"name_short_en",
"name_short_es",
"parent_id",
"top_parent_id",
]
]

# Decide what id each level should start from
# Put ample space between each range of ids
level_starts = {"section": 0, "2digit": 100, "4digit": 650, "6digit": 5000}
parent_id_table = spread_out_entries(parent_id_table, level_starts, h)
parent_id_table = spread_out_entries(
parent_id_table, level_starts, h, top_level=TOP_LEVEL
)

# Append services to table
services = get_hs_services()
Expand Down
Loading

0 comments on commit cc78284

Please sign in to comment.