Add NAICS mexico 2007

cid-harvard · Jul 10, 2015 · 460008d · 460008d
1 parent 9a00870
commit 460008d
Show file tree

Hide file tree

Showing 8 changed files with 2,158 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -12,6 +12,7 @@ type|name|adaptation|localized name|version|description|link
 ----|----|----------|--------------|-------|-----------|----
 industry|ISIC|Colombia|CIIU 4 A.C.|4.0|ISIC 4.0, colombian version.|[here](industry/ISIC/Colombia)
 industry|ISIC|Colombia|CIIU 3 A.C.|3.0|ISIC 3.0, colombian version.|[here](industry/ISIC/Colombia)
+industry|NAICS|Mexico|SCIAN Mexico|2007|NAICS 2007, mexican version with translations.|[here](industry/NAICS/Mexico)
 product|HS|International|Harmonized System|1992|Harmonized system, as used by the Atlas of Economic Complexity.|[here](product/HS/Atlas)
 location|DANE Divipola |||2015-03-31|Colombian administrative regions, from DANE.|[here](location/Colombia/DANE)
 location|INEGI Catálogo Único de Claves de Áreas Geoestadísticas |||MAY2015|Mexican administrative regions, from INEGI.|[here](location/Mexico/INEGI)

diff --git a/industry/NAICS/Mexico/README.md b/industry/NAICS/Mexico/README.md
@@ -0,0 +1,6 @@
+Original data from INEGI: Sistema de Clasificación Industrial de América del Norte
+
+
+- Main page: http://www.inegi.org.mx/est/contenidos/proyectos/SCIAN/presentacion.aspx
+- [2013 Table](http://www.inegi.org.mx/est/contenidos/proyectos/SCIAN/presentacion.aspx?_file=/est/contenidos/proyectos/SCIAN/doc/est_ingles.xlsx)
+- [2007 Table](http://www.inegi.org.mx/sistemas/scian/contenidos/ESTRUCTURA%20SCIAN%20M%C9XICO%202007%20TRADUCCI%D3N%20INGL%C9S.xlsx)
diff --git a/industry/NAICS/Mexico/Tupfile b/industry/NAICS/Mexico/Tupfile
@@ -0,0 +1,3 @@
+CLEAN = PYTHONPATH=../../../ python2.7 -B clean.py
+
+: in/est_ingles_2007.xlsx |> $(CLEAN) %f |> out/industries_mexico_scian_2007.csv
diff --git a/industry/NAICS/Mexico/clean.py b/industry/NAICS/Mexico/clean.py
@@ -0,0 +1,74 @@
+#!/usr/bin/python
+# vim: set fileencoding=utf8 :
+"""Clean the INEGI SCIAN (NAICS Mexico) 2007 table into a classification CSV.
+
+Usage: clean.py <input.xlsx>
+
+Reads the first sheet of the given workbook, strips the country markers
+embedded in the names, assigns each code a hierarchy level based on its
+length, and writes out/industries_mexico_scian_2007.csv.
+"""
+
+import re
+import sys
+
+import pandas as pd
+
+from classification import (Hierarchy, ordered_table_to_parent_code_table,
+                            parent_code_table_to_parent_id_table,
+                            Classification)
+
+if __name__ == "__main__":
+
+    if len(sys.argv) != 2:
+        sys.exit("usage: clean.py <input.xlsx>")
+    filename = sys.argv[1]
+
+    # Read the workbook named on the command line (the Tupfile passes it as
+    # %f) rather than a path hard-coded here.
+    df = pd.read_excel(filename, sheetname=0)
+    df.columns = ["code", "name_spanish", "code2", "name_english"]
+
+    # The sheet has two code columns; they must match before one is dropped.
+    assert df.code.equals(df.code2)
+    df = df[["code", "name_spanish", "name_english"]]
+
+    df.code = df.code.astype(str)
+
+    # Backslashes are doubled so the patterns are the same regexes as before
+    # without relying on "\." being passed through as an unknown escape.
+    regexes = {"mex": u"MÉX\\.", "can": u"CAN\\.", "usa": u"EE\\.UU\\."}
+
+    # Remove weird "Mex" "Can" identifiers within the name fields, recording
+    # which rows carried each one in a boolean tag_<lang>_<country> column.
+    for lang, column in (("en", "name_english"), ("sp", "name_spanish")):
+        for name, regex in regexes.items():
+            tag = "tag_" + lang + "_" + name
+            selected_rows = df[column].str.contains(regex)
+            df[tag] = False
+            df.loc[selected_rows, tag] = True
+            df[column] = df[column].map(lambda x: re.sub(regex, "", x))
+
+    # Replace trailing comma and space
+    df.name_spanish = df.name_spanish.str.replace(", $", "")
+    df.name_english = df.name_english.str.replace(", $", "")
+
+    levels = ["twodigit", "threedigit", "fourdigit", "fivedigit", "sixdigit"]
+    h = Hierarchy(levels)
+
+    # A code's level is determined by its length: 2 digits up to 6 digits.
+    code_length = df.code.str.len()
+    for digits, level in enumerate(levels, 2):
+        df.loc[code_length == digits, "level"] = level
+
+    # Only the Spanish name goes into the output classification.
+    df = df[["code", "name_spanish", "level"]]
+    df.columns = ["code", "name", "level"]
+
+    parent_code_table = ordered_table_to_parent_code_table(df, h)
+    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)
+
+    c = Classification(parent_id_table, h)
+
+    c.to_csv("out/industries_mexico_scian_2007.csv")
diff --git a/industry/NAICS/Mexico/in/est_ingles.xlsx b/industry/NAICS/Mexico/in/est_ingles.xlsx
diff --git a/industry/NAICS/Mexico/in/est_ingles_2007.xlsx b/industry/NAICS/Mexico/in/est_ingles_2007.xlsx
diff --git a/industry/NAICS/Mexico/out/industries_mexico_scian_2007.csv b/industry/NAICS/Mexico/out/industries_mexico_scian_2007.csv
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="linnaeus",
-    version="v0.0.8",
+    version="v0.0.9",
     author="Mali Akmanalp <Harvard CID>",
     description=("Harvard CID's classification tools."),
     url="http://github.com/cid-harvard/classifications/",
@@ -14,6 +14,7 @@
         '': [
             'industry/ISIC/Colombia/out/isic_ac_3.0.csv',
             'industry/ISIC/Colombia/out/isic_ac_4.0.csv',
+            'industry/NAICS/Mexico/out/industries_mexico_scian_2007.csv',
             'product/HS/Atlas/out/hs92_atlas.csv',
             'location/Colombia/DANE/out/locations_colombia_dane.csv',
             'location/Mexico/INEGI/out/locations_mexico_inegi.csv',
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		CLEAN = PYTHONPATH=../../../ python2.7 -B clean.py

		: in/est_ingles_2007.xlsx \|> $(CLEAN) %f \|> out/industries_mexico_scian_2007.csv