Merge pull request #10 from cid-harvard/feature-industry-data

Industry complexity variables plus industry / year data.
cid-harvard · Jun 16, 2015 · d8ea40d · d8ea40d
2 parents 2fcb3de + f1adfc4
commit d8ea40d
Show file tree

Hide file tree

Showing 4 changed files with 54 additions and 150 deletions.
diff --git a/colombia/api_schemas.py b/colombia/api_schemas.py
@@ -29,8 +29,8 @@ class Meta:
 class DepartmentIndustryYearSchema(ma.Schema):
 
     class Meta:
-        fields = ("employment", "wages", "department_id", "industry_id",
-                  "year")
+        fields = ("employment", "wages", "rca", "distance", "cog", "coi",
+                  "department_id", "industry_id", "year")
 
 
 class DepartmentSchema(ma.Schema):

diff --git a/colombia/data/models.py b/colombia/data/models.py
@@ -21,6 +21,7 @@ class DepartmentProductYear(BaseModel, IDMixin):
 
     import_value = db.Column(db.Integer)
     export_value = db.Column(db.Integer)
+
     export_rca = db.Column(db.Integer)
     density = db.Column(db.Float)
     cog = db.Column(db.Float)
@@ -62,6 +63,18 @@ class ProductYear(BaseModel, IDMixin):
     pci_rank = db.Column(db.Integer)
 
 
+class IndustryYear(BaseModel, IDMixin):
+    # One record per industry per year; the only measure stored here is `complexity`.
+    __tablename__ = "industry_year"
+    # Key columns: foreign key to Industry.id plus the year. No uniqueness constraint on the pair is declared in this class.
+    industry_id = db.Column(db.Integer, db.ForeignKey(Industry.id))
+    year = db.Column(db.Integer)
+    # ORM relationship to the Industry row referenced by `industry_id`.
+    industry = db.relationship(Industry)
+    # NOTE(review): presumably the industry complexity index taken from the source data's `pci` column - confirm against the importer.
+    complexity = db.Column(db.Float)
+
+
 class DepartmentIndustryYear(BaseModel, IDMixin):
 
     __tablename__ = "department_industry_year"
@@ -75,3 +88,16 @@ class DepartmentIndustryYear(BaseModel, IDMixin):
 
     employment = db.Column(db.Integer)
     wages = db.Column(db.Integer)
+
+    rca = db.Column(db.Integer)
+    density = db.Column(db.Float)
+    cog = db.Column(db.Float)
+    coi = db.Column(db.Float)
+
+    @hybrid_property
+    def distance(self):
+        """Instance-level distance: ``1 - density``.
+
+        ``density`` is declared without ``nullable=False``, so a loaded row
+        may hold ``None``. Return ``None`` in that case instead of raising
+        ``TypeError``, matching the NULL that the SQL expression yields.
+        """
+        if self.density is None:
+            return None
+        return 1.0 - self.density
+
+    @distance.expression
+    def distance(cls):
+        """Query-level form: the SQL expression ``1.0 - density``, labelled ``"distance"``."""
+        return (1.0 - cls.density).label("distance")
diff --git a/colombia/import.py b/colombia/import.py
@@ -66,6 +66,16 @@ def inner(line):
     return inner
 
 
+def make_iy(industry_map):
+    """Return a row converter that builds ``models.IndustryYear`` objects.
+
+    :param industry_map: mapping from a row's ``"i"`` value to the industry
+        object to attach; a missing key raises ``KeyError``.
+    :returns: a function taking one row (indexable by ``"i"``, ``"year"``
+        and ``"pci"``) and returning a new, unsaved ``IndustryYear``.
+    """
+    def inner(line):
+        iy = models.IndustryYear()
+        iy.industry = industry_map[line["i"]]
+        iy.year = int(line["year"])
+        # The model's mapped column is ``complexity``. Assigning to ``iy.pci``
+        # would only set a plain Python attribute that is never written to
+        # the ``industry_year`` table, silently dropping the value.
+        iy.complexity = line["pci"]
+        return iy
+    return inner
+
+
 def process_cpy(cpy, product_map, department_map):
     """Take a dataframe and return
 
@@ -82,51 +92,6 @@ def process_cpy(cpy, product_map, department_map):
     return [cy_out, py_out, cpy_out]
 
 
-def process_department(dept):
-    department_data = dept.groupby("department_code")\
-        .first().reset_index()\
-        [["department_name", "department_code"]]
-
-    def make_department(line):
-        d = models.Department()
-        d.aggregation = "department"
-        d.name = line["department_name"]
-        d.code = line["department_code"]
-        return d
-
-    return department_data.apply(make_department, axis=1)
-
-
-def process_product(prod):
-    d = prod[["code", "name", "name_en", "name_es", "community_id"]]
-
-    four_digit = d[d.code.str.len() == 4]
-    two_digit = d[d.code.str.len() == 2]
-    section = d[d.code.str.len() == 3]
-
-    def product_maker(aggregation):
-        def make_product(line):
-            d = models.HSProduct()
-            d.code = line["code"]
-            d.name = line["name"]
-            d.en = line["name_en"]
-            d.es = line["name_es"]
-            d.aggregation = aggregation
-            if aggregation == "4digit":
-                d.section_code = str(line["community_id"])
-                community_index = section.code == d.section_code
-                d.section_name = section[community_index].name_en.values[0]
-                d.section_name_es = section[community_index].name_es.values[0]
-            return d
-        return make_product
-
-    return (
-        list(section.apply(product_maker("section"), axis=1)),
-        list(two_digit.apply(product_maker("2digit"), axis=1)),
-        list(four_digit.apply(product_maker("4digit"), axis=1))
-    )
-
-
 # Taken from ecomplexity_from_cepii_xx_dollar.dta
 # Sample: [u'department', u'hs4', u'peso', u'dollar', u'__000001', u'M',
 # u'density', u'eci', u'pci', u'diversity', u'ubiquity', u'coi', u'cog',
@@ -277,100 +242,6 @@ def test_process_cpy(self):
         self.assertEquals(py[2].year, 1998)
         self.assertEquals(py[2].pci, 1)
 
-    def test_process_department(self):
-        data = """
-department_code	department_name	municipality_code	municipality_name	city	rural	midsize	pop_2012	nbi
-08	Atlántico	08849	Usiacurí	FALSE	TRUE	FALSE	9238	43.27979231
-11	"Bogotá, D.C."	11001	"Bogotá, D.C."	TRUE	FALSE	FALSE	7571345	9.20300877
-13	Bolívar	13001	Cartagena	TRUE	FALSE	FALSE	967051	26.01059996"""
-
-        data = pd.read_table(StringIO(data), encoding="utf-8",
-                             dtype={"department_code": np.object})
-        d = process_department(data)
-        db.session.add_all(d)
-        db.session.commit()
-
-        # TODO population, gdp
-
-        self.assertEquals(d[0].aggregation, "department")
-        self.assertEquals(d[0].name, u"Atlántico")
-        self.assertEquals(d[0].code, "08")
-
-        self.assertEquals(d[1].aggregation, "department")
-        self.assertEquals(d[1].name, u"Bogotá, D.C.")
-        self.assertEquals(d[1].code, "11")
-
-        self.assertEquals(d[2].aggregation, "department")
-        self.assertEquals(d[2].name, u"Bolívar")
-        self.assertEquals(d[2].code, "13")
-
-    def test_process_product(self):
-
-        data = """
-code	name	name_en	name_es	community_id
-0101	Live horses, asses, mules or hinnies	Horses	Caballos	106
-0102	Live bovine animals	Bovines	Bovinos	116
-106	Animal & Animal Products	Animal & Animal Products	NULL	106
-116	Vegetable Products	Vegetable Products	NULL	116
-04	Dairy, Eggs, Honey, & Ed. Products	Dairy, Honey, & Ed. Prod.	Productos lácteos, la miel, y Ed. Prod.	106
-06	Live Trees & Other Plants	 Trees & Plants	 Árboles y Plantas	116"""
-
-        data = pd.read_table(StringIO(data), encoding="utf-8",
-                             dtype={"code": np.object})
-        section, two_digit, four_digit = process_product(data)
-
-        db.session.add_all(section)
-        db.session.add_all(two_digit)
-        db.session.add_all(four_digit)
-        db.session.commit()
-
-        len(four_digit) == 2
-        self.assertEquals(four_digit[0].name, "Live horses, asses, mules or hinnies")
-        self.assertEquals(four_digit[0].en, "Horses")
-        self.assertEquals(four_digit[0].es, "Caballos")
-        self.assertEquals(four_digit[0].code, "0101")
-        self.assertEquals(four_digit[0].aggregation, "4digit")
-        self.assertEquals(four_digit[0].section_code, "106")
-        self.assertEquals(four_digit[0].section_name, "Animal & Animal Products")
-        self.assertEquals(four_digit[1].name, "Live bovine animals")
-        self.assertEquals(four_digit[1].en, "Bovines")
-        self.assertEquals(four_digit[1].es, "Bovinos")
-        self.assertEquals(four_digit[1].code, "0102")
-        self.assertEquals(four_digit[1].aggregation, "4digit")
-        self.assertEquals(four_digit[1].section_code, "116")
-        self.assertEquals(four_digit[1].section_name, "Vegetable Products")
-
-        len(two_digit) == 2
-        self.assertEquals(two_digit[0].name, "Dairy, Eggs, Honey, & Ed. Products")
-        self.assertEquals(two_digit[0].en, "Dairy, Honey, & Ed. Prod.")
-        self.assertEquals(two_digit[0].es, "Productos lácteos, la miel, y Ed. Prod.")
-        self.assertEquals(two_digit[0].code, "04")
-        self.assertEquals(two_digit[0].aggregation, "2digit")
-        self.assertEquals(two_digit[0].section_code, None)
-        self.assertEquals(two_digit[0].section_name, None)
-        self.assertEquals(two_digit[1].name, "Live Trees & Other Plants")
-        self.assertEquals(two_digit[1].en, " Trees & Plants")
-        self.assertEquals(two_digit[1].es, " Árboles y Plantas")
-        self.assertEquals(two_digit[1].code, "06")
-        self.assertEquals(two_digit[1].aggregation, "2digit")
-        self.assertEquals(two_digit[1].section_code, None)
-        self.assertEquals(two_digit[1].section_name, None)
-
-        len(section) == 2
-        self.assertEquals(section[0].name, "Animal & Animal Products")
-        self.assertEquals(section[0].en, "Animal & Animal Products")
-        self.assertEquals(section[0].es, None)
-        self.assertEquals(section[0].code, "106")
-        self.assertEquals(section[0].aggregation, "section")
-        self.assertEquals(section[0].section_code, None)
-        self.assertEquals(section[0].section_name, None)
-        self.assertEquals(section[1].name, "Vegetable Products")
-        self.assertEquals(section[1].en, "Vegetable Products")
-        self.assertEquals(section[1].es, None)
-        self.assertEquals(section[1].code, "116")
-        self.assertEquals(section[1].aggregation, "section")
-        self.assertEquals(section[1].section_code, None)
-        self.assertEquals(section[1].section_name, None)
 
 
 if __name__ == "__main__":
@@ -442,28 +313,35 @@ def parse_dpy(dpy_file, translation_table):
 
 
             # Department - industry - year
-            df = pd.read_stata("/Users/makmana/ciddata/PILA_andres/COL_PILA_ciy_2008-2012_rev3_stata13.dta")
-            df = df[["year", "r", "i", "E_yir", "W_yir"]]
+            df = pd.read_stata("/Users/makmana/ciddata/PILA_andres/COL_PILA_ecomp-E_yir_2008-2012_rev3_dpto.dta")
+            df = df[["year", "r", "i", "E_yir", "W_yir", "rca", "density", "cog", "coi", "pci"]]
             df = df[df.i != "."]
 
             df = df.merge(industry_classification.table, left_on="i",
                           right_on="code", how="inner")
 
-            df["d"] = df.r.str[:2]
-            df.groupby(["year", "d", "i"]).agg({"E_yir": np.sum,
-                                                "W_yir": np.sum})
-
             def make_diy():
                 def inner(line):
                     dpy = models.DepartmentIndustryYear()
                     dpy.industry = industry_map[line["i"]]
-                    dpy.department = location_map[line["d"]]
+                    dpy.department = location_map[line["r"]]
                     dpy.year = line["year"]
                     dpy.employment = line["E_yir"]
                     dpy.wages = line["W_yir"]
+
+                    dpy.rca = line["rca"]
+                    dpy.density = line["density"]
+                    dpy.cog = line["cog"]
+                    dpy.coi = line["coi"]
+
                     return dpy
                 return inner
             cpy_out = df.apply(make_diy(), axis=1)
             db.session.add_all(cpy_out)
-            db.session.commit()
 
+            iy = df.groupby(["i", "year"])[["pci"]].first().reset_index()
+            iy_out = iy.apply(make_iy(industry_map), axis=1)
+            db.session.add_all(iy_out)
+
+
+            db.session.commit()
diff --git a/colombia/models.py b/colombia/models.py
@@ -1,2 +1,2 @@
 from .metadata.models import Metadata, HSProduct, Location, Industry
-from .data.models import DepartmentProductYear, DepartmentYear, ProductYear, DepartmentIndustryYear
+from .data.models import DepartmentProductYear, DepartmentYear, ProductYear, DepartmentIndustryYear, IndustryYear