Skip to content

Commit

Permalink
Merge pull request #34 from cid-harvard/feature/msa-2digit-industry
Browse files Browse the repository at this point in the history
Ingest data for MSA 2digit industry (divisions)
  • Loading branch information
makmanalp committed Nov 17, 2015
2 parents 284b123 + be9b210 commit cf19bd5
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 0 deletions.
59 changes: 59 additions & 0 deletions colombia/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,65 @@ def hook_industry4digit_msa(df):
}
}


def hook_industry2digit_msa(df):
    """Clean the MSA x industry frame before it is merged.

    Steps, in order:

    1. Keep only the first row for each (location, industry, year) triple.
    2. Drop rows whose ``location`` is null.
    3. Turn ``location`` into a string code: cast to int, left-pad with
       zeros to five characters, then append a trailing ``"0"``.

    The input frame is not modified; a new frame is returned.

    NOTE(review): the trailing "0" presumably aligns these codes with the
    MSA codes in the location classification -- confirm against that table.
    """
    df = df.drop_duplicates(["location", "industry", "year"])
    # Take an explicit copy of the filtered rows so the column assignment
    # below writes to an independent frame rather than to a slice (which is
    # what triggers pandas' SettingWithCopyWarning).
    df = df[df["location"].notnull()].copy()
    df["location"] = df["location"].astype(int).astype(str).str.zfill(5) + "0"
    return df

# Dataset definition: MSA x industry ("division" level) x year.
industry2digit_msa = {
    # Wrapped in a lambda so the HDF file is only read when this is called.
    "read_function": lambda: pd.read_hdf(
        prefix_path("Industries/industries_msa.hdf"), "data"
    ),
    "hook_pre_merge": hook_industry2digit_msa,
    # Column name in the source file -> column name used downstream.
    # NOTE(review): the source columns are named "d3" although this dataset
    # is registered at the "division" (2-digit) level -- confirm that the
    # file really holds division-level rows.
    "field_mapping": {
        "msa_code": "location",
        "d3_code": "industry",
        "year": "year",
        "msa_d3_est": "num_establishments",
        "msa_d3_wage": "wages",
        "msa_d3_wagemonth": "monthly_wages",
        "msa_d3_emp": "employment",
        "msa_d3_rca": "rca",
        "msa_d3_distance_ps_pred1": "distance",
        "msa_d3_cog_ps_pred1": "cog",
        "all_d3_pci": "complexity",
    },
    # Which classification (and which level within it) each code column
    # refers to.
    "classification_fields": {
        "location": {
            "classification": location_classification,
            "level": "msa",
        },
        "industry": {
            "classification": industry_classification,
            "level": "division",
        },
    },
    "digit_padding": {
        "industry": 2,
        "location": 5,
    },
    "facet_fields": ["location", "industry", "year"],
    # Group-by key tuple -> {output column: aggregation function}.
    "facets": {
        ("industry_id", "year"): dict(
            dict.fromkeys(
                ("wages", "monthly_wages", "employment", "num_establishments"),
                sum_group,
            ),
            complexity=first,
        ),
        ("location_id", "industry_id", "year"): dict.fromkeys(
            (
                "wages",
                "monthly_wages",
                "employment",
                "num_establishments",
                "distance",
                "cog",
                "rca",
            ),
            first,
        ),
    },
}

occupation2digit_industry2digit = {
"read_function": lambda: pd.read_stata(prefix_path("Vacancies/Vacancies_do010_2d-Ind_X_4d-Occ.dta")),
"field_mapping": {
Expand Down
9 changes: 9 additions & 0 deletions colombia/import.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
industry4digit_msa, industry2digit_department,
industry4digit_municipality,
trade4digit_rcpy_municipality,
industry2digit_msa,
trade4digit_rcpy_department, trade4digit_rcpy_msa,
trade4digit_rcpy_country, population,
gdp_nominal_department, gdp_real_department,
Expand Down Expand Up @@ -199,6 +200,14 @@
df.to_sql("department_industry_year", db.engine, index=False,
chunksize=10000, if_exists="append")

# MSA - two digit industry - year
ret = process_dataset(industry2digit_msa)

# Pull the (location, industry, year) facet, flatten its index back into
# columns, and tag every row with the industry level before appending to the
# msa_industry_year table.
# NOTE(review): the "level" column presumably distinguishes these rows from
# other industry levels stored in the same table -- confirm.
msa_division_key = ('location_id', 'industry_id', 'year')
df = ret[msa_division_key].reset_index().assign(level="division")
df.to_sql(
    "msa_industry_year",
    db.engine,
    index=False,
    chunksize=10000,
    if_exists="append",
)

# MSA - industry - year
ret = process_dataset(industry4digit_msa)

Expand Down

0 comments on commit cf19bd5

Please sign in to comment.