Add Peru HS classification.

cid-harvard · Apr 1, 2016 · 77d146a · 77d146a
1 parent 41c1b00
commit 77d146a
Show file tree

Hide file tree

Showing 8 changed files with 4,046 additions and 1 deletion.
diff --git a/product/HS/Peru_Datlas/README.md b/product/HS/Peru_Datlas/README.md
@@ -0,0 +1,3 @@
+This is the HS product classification (revision uncertain, possibly from the 2000s) for Peru, with names generously translated by Ryan.
+
+The spreadsheet is [here](https://docs.google.com/spreadsheets/d/1BV4vOUhGdB1boAux4JkHjoanTezSdvWEkRcanAnwzYw/edit#gid=1029116973).
diff --git a/product/HS/Peru_Datlas/Tupfile b/product/HS/Peru_Datlas/Tupfile
@@ -0,0 +1,3 @@
+CLEAN = PYTHONPATH=../../../ python2.7 -B clean.py
+# Every file under in/ is an input; running clean.py writes both outputs.
+: in/* |> $(CLEAN) |> out/products_peru_datlas.csv out/products_peru_datlas.dta
diff --git a/product/HS/Peru_Datlas/clean.py b/product/HS/Peru_Datlas/clean.py
@@ -0,0 +1,48 @@
+import pandas as pd
+
+from classification import (Hierarchy, repeated_table_to_parent_id_table,
+                            parent_code_table_to_parent_id_table,
+                            Classification)
+
+if __name__ == "__main__":
+    names = pd.read_table("./in/HS_hierarchy_master - Names.tsv",
+                          encoding="utf-8", dtype={"code": str})
+
+    hierarchy = pd.read_table("./in/HS_hierarchy_master - Hierarchy.tsv",
+                              encoding="utf-8",
+                              dtype={
+                                  "4digit": str,
+                                  "2digit": str,
+                                  "section": str,
+                                  "atlas_section": str,
+                              })
+    hierarchy.columns = ["4digit_code", "2digit_code", "section_code", "atlas_section"]
+    hierarchy["name_4digit"] = None
+    hierarchy["name_2digit"] = None
+    hierarchy["name_section"] = None
+
+
+    fields = {
+        "4digit": ["name_4digit"],
+        "2digit": ["name_2digit"],
+        "section": ["name_section"]
+    }
+
+    h = Hierarchy(["section", "2digit", "4digit"])
+    parent_code_table = repeated_table_to_parent_id_table(hierarchy, h, fields)
+
+    parent_code_table.code = parent_code_table.code.astype(str)
+
+    parent_code_table = parent_code_table.merge(names, on=["code", "level"])
+
+    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)
+    parent_id_table.name = parent_id_table.name_en
+
+    parent_id_table = parent_id_table[["code", "name", "level", "name_en",
+                                       "name_es", "name_short_es",
+                                       "name_short_en", "parent_id"]]
+
+    c = Classification(parent_id_table, h)
+
+    c.to_csv("out/products_peru_datlas.csv")
+    c.to_stata("out/products_peru_datlas.dta")
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		This is the HS product classification (revision uncertain, possibly from the 2000s) for Peru, with names generously translated by Ryan.

		The spreadsheet is [here](https://docs.google.com/spreadsheets/d/1BV4vOUhGdB1boAux4JkHjoanTezSdvWEkRcanAnwzYw/edit#gid=1029116973).
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		CLEAN = PYTHONPATH=../../../ python2.7 -B clean.py

		: in/* |> $(CLEAN) |> out/products_peru_datlas.csv out/products_peru_datlas.dta