Skip to content

Commit

Permalink
Merge pull request #10 from cid-harvard/feature-industry-data
Browse files Browse the repository at this point in the history
Industry complexity variables plus industry / year data.
  • Loading branch information
makmanalp committed Jun 16, 2015
2 parents 2fcb3de + f1adfc4 commit d8ea40d
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 150 deletions.
4 changes: 2 additions & 2 deletions colombia/api_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ class Meta:
class DepartmentIndustryYearSchema(ma.Schema):

class Meta:
fields = ("employment", "wages", "department_id", "industry_id",
"year")
fields = ("employment", "wages", "rca", "distance", "cog", "coi",
"department_id", "industry_id", "year")


class DepartmentSchema(ma.Schema):
Expand Down
26 changes: 26 additions & 0 deletions colombia/data/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class DepartmentProductYear(BaseModel, IDMixin):

import_value = db.Column(db.Integer)
export_value = db.Column(db.Integer)

export_rca = db.Column(db.Integer)
density = db.Column(db.Float)
cog = db.Column(db.Float)
Expand Down Expand Up @@ -62,6 +63,18 @@ class ProductYear(BaseModel, IDMixin):
pci_rank = db.Column(db.Integer)


class IndustryYear(BaseModel, IDMixin):
    """One record per industry and year, stored in ``industry_year``."""

    __tablename__ = "industry_year"

    # Integer foreign key pointing at Industry.id.
    industry_id = db.Column(db.Integer, db.ForeignKey(Industry.id))
    year = db.Column(db.Integer)

    # ORM relationship to the Industry model.
    industry = db.relationship(Industry)

    # Float complexity value for this industry/year.
    # NOTE(review): presumably the industry complexity index (the "pci"
    # field of the source data) -- confirm against the importer.
    complexity = db.Column(db.Float)


class DepartmentIndustryYear(BaseModel, IDMixin):

__tablename__ = "department_industry_year"
Expand All @@ -75,3 +88,16 @@ class DepartmentIndustryYear(BaseModel, IDMixin):

employment = db.Column(db.Integer)
wages = db.Column(db.Integer)

rca = db.Column(db.Integer)
density = db.Column(db.Float)
cog = db.Column(db.Float)
coi = db.Column(db.Float)

@hybrid_property
def distance(self):
    """Return ``1.0 - density`` for this row."""
    # NOTE(review): if density can be None (no NOT NULL constraint is
    # visible on the column), this subtraction would raise -- confirm.
    return 1.0 - self.density

@distance.expression
def distance(cls):
    """Class-level form of ``distance``: ``1.0 - density``, labelled "distance"."""
    return (1.0 - cls.density).label("distance")
172 changes: 25 additions & 147 deletions colombia/import.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,16 @@ def inner(line):
return inner


def make_iy(industry_map):
    """Return a function that converts one industry/year row into a model.

    industry_map maps the industry code found in a row's "i" field to the
    object that should be assigned to ``IndustryYear.industry``.

    The returned function takes a single row (anything indexable by "i",
    "year" and "pci") and returns a new, unsaved ``models.IndustryYear``.
    A KeyError is raised if the row's industry code is not in industry_map.
    """
    def inner(line):
        iy = models.IndustryYear()
        iy.industry = industry_map[line["i"]]
        iy.year = int(line["year"])
        # The IndustryYear model declares a ``complexity`` column and no
        # ``pci`` column; assigning to ``iy.pci`` would only set a plain
        # instance attribute that is never written to the database.
        iy.complexity = line["pci"]
        return iy
    return inner


def process_cpy(cpy, product_map, department_map):
"""Take a dataframe and return
Expand All @@ -82,51 +92,6 @@ def process_cpy(cpy, product_map, department_map):
return [cy_out, py_out, cpy_out]


def process_department(dept):
    """Turn a department table into ``models.Department`` objects.

    dept is a dataframe with at least the columns "department_code" and
    "department_name". Rows are grouped by "department_code" and reduced
    with ``first()``, so each code contributes one row; every such row is
    converted into a Department whose aggregation is "department".

    Returns the result of applying the conversion row-wise.
    """
    def make_department(line):
        department = models.Department()
        department.aggregation = "department"
        department.name = line["department_name"]
        department.code = line["department_code"]
        return department

    per_code = dept.groupby("department_code").first()
    per_code = per_code.reset_index()
    wanted = per_code[["department_name", "department_code"]]

    return wanted.apply(make_department, axis=1)


def process_product(prod):
    """Split a product table into section, 2-digit and 4-digit products.

    prod is a dataframe with the columns "code", "name", "name_en",
    "name_es" and "community_id". Rows are classified by the length of
    their code string: 3 characters is a section, 2 is a two-digit
    product, 4 is a four-digit product.

    Returns a tuple ``(sections, two_digit, four_digit)`` of lists of
    ``models.HSProduct``. Four-digit products additionally get
    section_code / section_name / section_name_es filled in from the
    section row whose code equals the product's community_id.
    """
    table = prod[["code", "name", "name_en", "name_es", "community_id"]]
    code_length = table.code.str.len()

    section = table[code_length == 3]
    two_digit = table[code_length == 2]
    four_digit = table[code_length == 4]

    def product_maker(aggregation):
        def make_product(line):
            product = models.HSProduct()
            product.code = line["code"]
            product.name = line["name"]
            product.en = line["name_en"]
            product.es = line["name_es"]
            product.aggregation = aggregation

            # Only four-digit products carry section information.
            if aggregation != "4digit":
                return product

            product.section_code = str(line["community_id"])
            own_section = section[section.code == product.section_code]
            # values[0] raises IndexError when no section row matches.
            product.section_name = own_section.name_en.values[0]
            product.section_name_es = own_section.name_es.values[0]
            return product
        return make_product

    section_products = list(section.apply(product_maker("section"), axis=1))
    two_digit_products = list(two_digit.apply(product_maker("2digit"), axis=1))
    four_digit_products = list(four_digit.apply(product_maker("4digit"), axis=1))

    return (section_products, two_digit_products, four_digit_products)


# Taken from ecomplexity_from_cepii_xx_dollar.dta
# Sample: [u'department', u'hs4', u'peso', u'dollar', u'__000001', u'M',
# u'density', u'eci', u'pci', u'diversity', u'ubiquity', u'coi', u'cog',
Expand Down Expand Up @@ -277,100 +242,6 @@ def test_process_cpy(self):
self.assertEquals(py[2].year, 1998)
self.assertEquals(py[2].pci, 1)

def test_process_department(self):
    """Check aggregation, name and code of the three sample departments."""
    raw = """
department_code department_name municipality_code municipality_name city rural midsize pop_2012 nbi
08 Atlántico 08849 Usiacurí FALSE TRUE FALSE 9238 43.27979231
11 "Bogotá, D.C." 11001 "Bogotá, D.C." TRUE FALSE FALSE 7571345 9.20300877
13 Bolívar 13001 Cartagena TRUE FALSE FALSE 967051 26.01059996"""

    frame = pd.read_table(StringIO(raw), encoding="utf-8",
                          dtype={"department_code": np.object})
    departments = process_department(frame)
    db.session.add_all(departments)
    db.session.commit()

    # TODO population, gdp

    # (name, code) expected at positions 0, 1 and 2 of the result.
    expected = [
        (u"Atlántico", "08"),
        (u"Bogotá, D.C.", "11"),
        (u"Bolívar", "13"),
    ]
    for position, (name, code) in enumerate(expected):
        department = departments[position]
        self.assertEquals(department.aggregation, "department")
        self.assertEquals(department.name, name)
        self.assertEquals(department.code, code)

def test_process_product(self):
    """Check process_product on two sections, two 2-digit and two 4-digit rows.

    Verifies the size of each returned list and, for every product, its
    names, code, aggregation level and section fields.
    """

    data = """
code name name_en name_es community_id
0101 Live horses, asses, mules or hinnies Horses Caballos 106
0102 Live bovine animals Bovines Bovinos 116
106 Animal & Animal Products Animal & Animal Products NULL 106
116 Vegetable Products Vegetable Products NULL 116
04 Dairy, Eggs, Honey, & Ed. Products Dairy, Honey, & Ed. Prod. Productos lácteos, la miel, y Ed. Prod. 106
06 Live Trees & Other Plants Trees & Plants  Árboles y Plantas 116"""

    data = pd.read_table(StringIO(data), encoding="utf-8",
                         dtype={"code": np.object})
    section, two_digit, four_digit = process_product(data)

    db.session.add_all(section)
    db.session.add_all(two_digit)
    db.session.add_all(four_digit)
    db.session.commit()

    # Previously a bare ``len(...) == 2`` expression whose result was
    # discarded; it must be an assertion to actually check the size.
    self.assertEquals(len(four_digit), 2)
    self.assertEquals(four_digit[0].name, "Live horses, asses, mules or hinnies")
    self.assertEquals(four_digit[0].en, "Horses")
    self.assertEquals(four_digit[0].es, "Caballos")
    self.assertEquals(four_digit[0].code, "0101")
    self.assertEquals(four_digit[0].aggregation, "4digit")
    self.assertEquals(four_digit[0].section_code, "106")
    self.assertEquals(four_digit[0].section_name, "Animal & Animal Products")
    self.assertEquals(four_digit[1].name, "Live bovine animals")
    self.assertEquals(four_digit[1].en, "Bovines")
    self.assertEquals(four_digit[1].es, "Bovinos")
    self.assertEquals(four_digit[1].code, "0102")
    self.assertEquals(four_digit[1].aggregation, "4digit")
    self.assertEquals(four_digit[1].section_code, "116")
    self.assertEquals(four_digit[1].section_name, "Vegetable Products")

    self.assertEquals(len(two_digit), 2)
    self.assertEquals(two_digit[0].name, "Dairy, Eggs, Honey, & Ed. Products")
    self.assertEquals(two_digit[0].en, "Dairy, Honey, & Ed. Prod.")
    self.assertEquals(two_digit[0].es, "Productos lácteos, la miel, y Ed. Prod.")
    self.assertEquals(two_digit[0].code, "04")
    self.assertEquals(two_digit[0].aggregation, "2digit")
    self.assertEquals(two_digit[0].section_code, None)
    self.assertEquals(two_digit[0].section_name, None)
    self.assertEquals(two_digit[1].name, "Live Trees & Other Plants")
    self.assertEquals(two_digit[1].en, " Trees & Plants")
    self.assertEquals(two_digit[1].es, " Árboles y Plantas")
    self.assertEquals(two_digit[1].code, "06")
    self.assertEquals(two_digit[1].aggregation, "2digit")
    self.assertEquals(two_digit[1].section_code, None)
    self.assertEquals(two_digit[1].section_name, None)

    self.assertEquals(len(section), 2)
    self.assertEquals(section[0].name, "Animal & Animal Products")
    self.assertEquals(section[0].en, "Animal & Animal Products")
    self.assertEquals(section[0].es, None)
    self.assertEquals(section[0].code, "106")
    self.assertEquals(section[0].aggregation, "section")
    self.assertEquals(section[0].section_code, None)
    self.assertEquals(section[0].section_name, None)
    self.assertEquals(section[1].name, "Vegetable Products")
    self.assertEquals(section[1].en, "Vegetable Products")
    self.assertEquals(section[1].es, None)
    self.assertEquals(section[1].code, "116")
    self.assertEquals(section[1].aggregation, "section")
    self.assertEquals(section[1].section_code, None)
    self.assertEquals(section[1].section_name, None)


if __name__ == "__main__":
Expand Down Expand Up @@ -442,28 +313,35 @@ def parse_dpy(dpy_file, translation_table):


# Department - industry - year
df = pd.read_stata("/Users/makmana/ciddata/PILA_andres/COL_PILA_ciy_2008-2012_rev3_stata13.dta")
df = df[["year", "r", "i", "E_yir", "W_yir"]]
df = pd.read_stata("/Users/makmana/ciddata/PILA_andres/COL_PILA_ecomp-E_yir_2008-2012_rev3_dpto.dta")
df = df[["year", "r", "i", "E_yir", "W_yir", "rca", "density", "cog", "coi", "pci"]]
df = df[df.i != "."]

df = df.merge(industry_classification.table, left_on="i",
right_on="code", how="inner")

df["d"] = df.r.str[:2]
df.groupby(["year", "d", "i"]).agg({"E_yir": np.sum,
"W_yir": np.sum})

def make_diy():
def inner(line):
dpy = models.DepartmentIndustryYear()
dpy.industry = industry_map[line["i"]]
dpy.department = location_map[line["d"]]
dpy.department = location_map[line["r"]]
dpy.year = line["year"]
dpy.employment = line["E_yir"]
dpy.wages = line["W_yir"]

dpy.rca = line["rca"]
dpy.density = line["density"]
dpy.cog = line["cog"]
dpy.coi = line["coi"]

return dpy
return inner
cpy_out = df.apply(make_diy(), axis=1)
db.session.add_all(cpy_out)
db.session.commit()

iy = df.groupby(["i", "year"])[["pci"]].first().reset_index()
iy_out = iy.apply(make_iy(industry_map), axis=1)
db.session.add_all(iy_out)


db.session.commit()
2 changes: 1 addition & 1 deletion colombia/models.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .metadata.models import Metadata, HSProduct, Location, Industry
from .data.models import DepartmentProductYear, DepartmentYear, ProductYear, DepartmentIndustryYear
from .data.models import DepartmentProductYear, DepartmentYear, ProductYear, DepartmentIndustryYear, IndustryYear

0 comments on commit d8ea40d

Please sign in to comment.