-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
2,158 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
Original data from INEGI: Sistema de Clasificación Industrial de América del Norte | ||
|
||
|
||
Main page: http://www.inegi.org.mx/est/contenidos/proyectos/SCIAN/presentacion.aspx | ||
[2013 Table](http://www.inegi.org.mx/est/contenidos/proyectos/SCIAN/presentacion.aspx?_file=/est/contenidos/proyectos/SCIAN/doc/est_ingles.xlsx) | ||
[2007 Table](http://www.inegi.org.mx/sistemas/scian/contenidos/ESTRUCTURA%20SCIAN%20M%C9XICO%202007%20TRADUCCI%D3N%20INGL%C9S.xlsx) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Command that runs the cleaning script: Python 2.7 with bytecode writing
# disabled (-B) and PYTHONPATH set three directories up (presumably so the
# project-local `classification` module can be imported -- confirm).
CLEAN = PYTHONPATH=../../../ python2.7 -B clean.py | ||
|
||
# Build rule: feed the 2007 SCIAN workbook to $(CLEAN) (%f expands to the
# input file name) and declare the generated CSV as the rule's output.
: in/est_ingles_2007.xlsx |> $(CLEAN) %f |> out/industries_mexico_scian_2007.csv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#!/usr/bin/python | ||
# vim: set fileencoding=utf8 : | ||
|
||
import pandas as pd | ||
|
||
from classification import (Hierarchy, ordered_table_to_parent_code_table, | ||
parent_code_table_to_parent_id_table, | ||
Classification) | ||
|
||
import re | ||
import sys | ||
|
||
# Regular expressions for the country markers that appear inside the name
# fields, keyed by the suffix used for the corresponding boolean tag column.
COUNTRY_TAG_REGEXES = {"mex": u"MÉX\\.", "can": u"CAN\\.", "usa": u"EE\\.UU\\."}

# Hierarchy level names, ordered from the shortest (2-character) code to the
# longest (6-character) code.
LEVEL_NAMES = ["twodigit", "threedigit", "fourdigit", "fivedigit", "sixdigit"]


def strip_country_tags(df, column, tag_prefix):
    """Remove country markers from a name column, recording them as flags.

    For every entry of COUNTRY_TAG_REGEXES a boolean column named
    ``tag_prefix + key`` is added to ``df`` telling whether the marker was
    present in ``df[column]``; the marker is then deleted from the text.
    Finally a trailing ", " (which is what remains when a marker was the last
    item of a comma separated name) is removed.

    ``df`` is modified in place. Every value of ``df[column]`` must be a
    string: missing values make ``re.sub`` raise ``TypeError``.
    """
    for key, regex in COUNTRY_TAG_REGEXES.items():
        df[tag_prefix + key] = df[column].str.contains(regex)
        # Bind `regex` as a default so the lambda does not depend on the
        # loop variable's later values.
        df[column] = df[column].map(
            lambda text, regex=regex: re.sub(regex, "", text))

    # re.sub is used instead of Series.str.replace so the pattern is treated
    # as a regular expression regardless of the installed pandas version.
    df[column] = df[column].map(lambda text: re.sub(", $", "", text))


def level_for_codes(codes):
    """Return the hierarchy level name for each code, based on its length.

    ``codes`` is a Series of strings. Lengths 2..6 map to the entries of
    LEVEL_NAMES in order; any other length yields a missing value.
    """
    level_by_length = dict(zip(range(2, 2 + len(LEVEL_NAMES)), LEVEL_NAMES))
    return codes.str.len().map(level_by_length)


if __name__ == "__main__":

    # Explicit check instead of `assert`, which is skipped under `python -O`.
    if len(sys.argv) != 2:
        sys.exit("usage: clean.py <scian_structure.xlsx>")
    filename = sys.argv[1]

    # Read the workbook given on the command line (previously the argument
    # was parsed but a hard-coded path was read instead). The first sheet is
    # the pandas default, so no sheet keyword is passed; this avoids the
    # `sheetname` / `sheet_name` rename between pandas versions.
    df = pd.read_excel(filename)
    df.columns = ["code", "name_spanish", "code2", "name_english"]

    # The sheet carries the code twice (once per language); both copies must
    # agree before the duplicate is dropped.
    assert df.code.equals(df.code2)
    df = df[["code", "name_spanish", "name_english"]]

    df.code = df.code.astype(str)

    # Remove weird "Mex" "Can" identifiers within the name fields
    strip_country_tags(df, "name_english", "tag_en_")
    strip_country_tags(df, "name_spanish", "tag_sp_")

    h = Hierarchy(LEVEL_NAMES)

    # NOTE(review): the level is derived purely from the code's string
    # length -- confirm the sheet has no range-style codes (e.g. "31-33").
    df["level"] = level_for_codes(df.code)

    # Only the Spanish name goes into the output; the English name and the
    # tag columns computed above are dropped here.
    df = df[["code", "name_spanish", "level"]]
    df.columns = ["code", "name", "level"]

    parent_code_table = ordered_table_to_parent_code_table(df, h)
    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

    c = Classification(parent_id_table, h)

    c.to_csv("out/industries_mexico_scian_2007.csv")
Binary file not shown.
Binary file not shown.
2,085 changes: 2,085 additions & 0 deletions
2,085
industry/NAICS/Mexico/out/industries_mexico_scian_2007.csv
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters