Add top level parent ID in Atlas classifications

cid-harvard · Oct 20, 2021 · cc78284 · cc78284
1 parent 03d4947
commit cc78284
Show file tree

Hide file tree

Showing 9 changed files with 8,500 additions and 8,439 deletions.
diff --git a/classification.py b/classification.py
@@ -24,19 +24,33 @@ def load(path):
     return Classification.from_csv(path)
 
 
-def parent_code_table_to_parent_id_table(df, hierarchy):
+def parent_code_table_to_parent_id_table(df, hierarchy, top_level=None):
     """From a classification that has parent_code, go to one that has
-    parent_id."""
+    parent_id (and, when top_level is given, from top_parent_code to top_parent_id)."""
 
     code_table = df[["code", "level"]].reset_index()
     code_table.columns = ["parent_id", "parent_code", "parent_level"]
 
     df["parent_level"] = df["level"].map(hierarchy.parent).fillna(value=pd.np.nan)
-
-    return df.merge(code_table, on=["parent_level", "parent_code"], how="left").drop(
+    df = df.merge(code_table, on=["parent_level", "parent_code"], how="left").drop(
         ["parent_code", "parent_level"], axis=1
     )
 
+    if top_level:
+        top_code_table = code_table[code_table.parent_level == top_level]
+        top_code_table.columns = [
+            "top_parent_id",
+            "top_parent_code",
+            "top_parent_level",
+        ]
+        df = df.merge(
+            top_code_table[["top_parent_id", "top_parent_code"]],
+            on="top_parent_code",
+            how="left",
+        )
+
+    return df
+
 
 def ordered_table_to_parent_code_table(df, hierarchy):
 
@@ -56,32 +70,40 @@ def traversal_iteration(x):
     return df
 
 
-def repeated_table_to_parent_id_table(df, hierarchy, level_fields={}):
+def repeated_table_to_parent_id_table(df, hierarchy, level_fields={}, top_level=None):
     """
     Convert from the "merged" table format to a parent_id format, e.g.
 
-      level1  level0
-      cats    animals
-      dogs    animals
-      cod     fish
-      salmon  fish
+      level2    level1      level0
+      cats      mammals     animals
+      dogs      mammals     animals
+      cod       fish        animals
+      salmon    fish        animals
 
     into:
 
-      code    level   name
-      cats    level1  Cats
+      code    level   name  parent_code top_parent_code
+      cats    level2  Cats  mammals     animals
 
     and to do that, specify level_fields=
       {
           "level0": [],
-          "level1": []
+          "level1": [],
+          "level2": []
       }
+
+    and, optionally, top_level="level0"
     """
 
     # Check there is a code and name field for every entry in the hierarchy
     for level in hierarchy:
         for field_name in level_fields[level]:
-            assert field_name in df.columns, "Missing field: {}".format(field_name)
+            assert field_name in df.columns, f"Missing field: {field_name}"
+
+    # Assert a code column exists for top_level (NOTE: runs even if it is None)
+    assert (
+        f"{top_level}_code" in df.columns
+    ), f"Missing top level field: {top_level}_code"
 
     # Check there are no duplicate codes for the same country + dept + muni
     # etc.
@@ -99,6 +121,9 @@ def repeated_table_to_parent_id_table(df, hierarchy, level_fields={}):
 
             row_dict = {"code": code, "level": level, "parent_code": parent_codes[-1]}
 
+            if top_level:
+                row_dict["top_parent_code"] = row[f"{top_level}_code"]
+
             for field in level_fields[level]:
 
                 # Strip _section from the end
@@ -113,7 +138,6 @@ def repeated_table_to_parent_id_table(df, hierarchy, level_fields={}):
     new_df = pd.DataFrame(new_table)
     new_df = new_df[~new_df.duplicated()]
     new_df = new_df.reset_index(drop=True)
-    # new_df.level = new_df.level.astype("category")
 
     return new_df
 
@@ -132,7 +156,7 @@ def sort_by_code_and_level(parent_code_table, hierarchy):
     return parent_code_table
 
 
-def spread_out_entries(parent_id_table, level_starts, hierarchy):
+def spread_out_entries(parent_id_table, level_starts, hierarchy, top_level=None):
     """Given an id table, shift down ids so that the ids for each level group
     starts at the given id. This allows us to leave gap ids in case we need to
     add more."""
@@ -152,6 +176,9 @@ def spread_out_entries(parent_id_table, level_starts, hierarchy):
     assert (
         "new_parent_id" not in parent_id_table.columns
     ), "You already have a level named 'new_parent_id', please get rid of it."
+    assert (
+        "new_top_parent_id" not in parent_id_table.columns
+    ), "You already have a level named 'new_top_parent_id', please get rid of it."
 
     level_counts = parent_id_table.level.value_counts().to_dict()
     for i, level in enumerate(hierarchy):
@@ -193,6 +220,12 @@ def spread_out_entries(parent_id_table, level_starts, hierarchy):
                 parent_id_table.level == level, "new_parent_id"
             ] = new_parent_ids
 
+    # Set new top parent ids
+    if top_level:
+        parent_id_table["top_parent_id"] = (
+            parent_id_table["top_parent_id"] + level_starts[top_level]
+        )
+
     # Make sure there aren't any gaps left
     assert parent_id_table.new_index.isnull().any() == False
 

diff --git a/product/HS/IntlAtlas/clean.py b/product/HS/IntlAtlas/clean.py
@@ -11,12 +11,15 @@
     Classification,
 )
 
+TOP_LEVEL = "section"
+
 
 def get_hs_services(file="./in/Services_Hierarchy.csv"):
     services = pd.read_csv(file, encoding="utf-8", dtype="str")
+    services["top_parent_id"] = 0
     # Spread out services similarly to each set of exports but buffered further
     service_starts = {"section": 10, "2digit": 400, "4digit": 4000, "6digit": 11000}
-    return spread_out_entries(services, service_starts, h)
+    return spread_out_entries(services, service_starts, h, top_level=TOP_LEVEL)
 
 
 if __name__ == "__main__":
@@ -31,13 +34,17 @@ def get_hs_services(file="./in/Services_Hierarchy.csv"):
     fields = {"section": [], "2digit": [], "4digit": [], "6digit": []}
 
     h = Hierarchy(["section", "2digit", "4digit", "6digit"])
-    parent_code_table = repeated_table_to_parent_id_table(hierarchy, h, fields)
+    parent_code_table = repeated_table_to_parent_id_table(
+        hierarchy, h, level_fields=fields, top_level=TOP_LEVEL
+    )
     parent_code_table = parent_code_table.merge(names, on=["code", "level"])
 
     # Sort by level order (not necessarily alphabetical)
     parent_code_table = sort_by_code_and_level(parent_code_table, h)
 
-    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)
+    parent_id_table = parent_code_table_to_parent_id_table(
+        parent_code_table, h, top_level=TOP_LEVEL
+    )
     parent_id_table["name"] = parent_id_table.name_en
 
     parent_id_table = parent_id_table[
@@ -50,13 +57,16 @@ def get_hs_services(file="./in/Services_Hierarchy.csv"):
             "name_short_en",
             "name_short_es",
             "parent_id",
+            "top_parent_id",
         ]
     ]
 
     # Decide what id each level should start from
     # Put ample space between each range of ids
     level_starts = {"section": 0, "2digit": 100, "4digit": 650, "6digit": 5000}
-    parent_id_table = spread_out_entries(parent_id_table, level_starts, h)
+    parent_id_table = spread_out_entries(
+        parent_id_table, level_starts, h, top_level=TOP_LEVEL
+    )
 
     # Append services to table
     services = get_hs_services()