Skip to content

Commit

Permalink
Replace requests with urllib and further clean up pandas (#112)
Browse files Browse the repository at this point in the history
* Replace requests with urllib and further clean up pandas

* Fix all the unittest
  • Loading branch information
shifucun authored Jan 28, 2020
1 parent 5956370 commit 6d9181a
Show file tree
Hide file tree
Showing 14 changed files with 253 additions and 361 deletions.
1 change: 0 additions & 1 deletion datacommons/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
from collections import defaultdict

import datacommons.utils as utils
import requests

# ----------------------------- WRAPPER FUNCTIONS -----------------------------

Expand Down
4 changes: 0 additions & 4 deletions datacommons/examples/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ py_binary(
srcs=["core.py"],
deps=[
"//datacommons:datacommons",
requirement("pandas"),
]
)

Expand All @@ -14,7 +13,6 @@ py_binary(
srcs=["places.py"],
deps=[
"//datacommons:datacommons",
requirement("pandas"),
]
)

Expand All @@ -23,7 +21,6 @@ py_binary(
srcs=["populations.py"],
deps=[
"//datacommons:datacommons",
requirement("pandas"),
]
)

Expand All @@ -32,6 +29,5 @@ py_binary(
srcs=["query.py"],
deps=[
"//datacommons:datacommons",
requirement("pandas"),
]
)
39 changes: 3 additions & 36 deletions datacommons/examples/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,26 +21,21 @@
from __future__ import print_function

import datacommons as dc
import pandas as pd

import datacommons.utils as utils


def main():
# Set the dcid to be that of Santa Clara County.
dcids = ['geoId/06085']

# Print all incoming and outgoing properties from Santa Clara County.
utils._print_header('Property Labels for Santa Clara County')
print('Property Labels for Santa Clara County')
in_labels = dc.get_property_labels(dcids)
out_labels = dc.get_property_labels(dcids, out=False)
print('> Printing properties for {}'.format(dcids))
print('> Incoming properties: {}'.format(in_labels))
print('> Outgoing properties: {}'.format(out_labels))

# Print all property values for "containedInPlace" for Santa Clara County.
utils._print_header(
'Property Values for "containedInPlace" of Santa Clara County')
print('Property Values for "containedInPlace" of Santa Clara County')
prop_vals = dc.get_property_values(
dcids, 'containedInPlace', out=False, value_type='City')
print('> Cities contained in {}'.format(dcids))
Expand All @@ -49,41 +44,13 @@ def main():
print(' - {}'.format(city_dcid))

# Print the first 10 triples associated with Santa Clara County
utils._print_header('Triples for Santa Clara County')
print('Triples for Santa Clara County')
triples = dc.get_triples(dcids)
for dcid in dcids:
print('> Triples for {}'.format(dcid))
for s, p, o in triples[dcid][:5]:
print(' - ("{}", {}, "{}")'.format(s, p, o))

# get_property_values can be easily used to populate Pandas DataFrames. First
# create a DataFrame with some data.
utils._print_header('Initialize the DataFrame')
pd_frame = pd.DataFrame({'county': ['geoId/06085', 'geoId/24031']})
print(pd_frame)

# Get the names for the given counties.
utils._print_header('Get County Names')
pd_frame['county_name'] = pd_frame['county'].map(
dc.get_property_values(pd_frame['county'], 'name'))
pd_frame = pd_frame.explode('county_name')
print(pd_frame)

# Get the cities contained in these counties.
utils._print_header('Get Contained Cities')
pd_frame['city'] = pd_frame['county'].map(
dc.get_property_values(
pd_frame['county'], 'containedInPlace', out=False, value_type='City'))
pd_frame = pd_frame.explode('city')
print(pd_frame)

# Get the names for each city.
utils._print_header('Get City Names')
pd_frame['city_name'] = pd_frame['city'].map(
dc.get_property_values(pd_frame['city'], 'name'))
pd_frame = pd_frame.explode('city_name')
print(pd_frame)


if __name__ == '__main__':
main()
21 changes: 1 addition & 20 deletions datacommons/examples/places.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,42 +21,23 @@
from __future__ import print_function

import datacommons as dc
import pandas as pd

import datacommons.utils as utils


def main():
# Create a list of dcids for Santa Clara and Montgomery County.
sc, mc = 'geoId/06085', 'geoId/24031'
dcids = [sc, mc]

# Get all CensusTracts in these two counties.
utils._print_header('Get Census Tracts')
print('Get Census Tracts')
tracts = dc.get_places_in(dcids, 'CensusTract')
if sc in tracts:
print('> 10 CensusTracts in Santa Clara County')
for dcid in tracts[sc][:10]:
print(' - {}'.format(dcid))
print()
if mc in tracts:
print('> 10 CensusTracts in Montgomery County')
for dcid in tracts[mc][:10]:
print(' - {}'.format(dcid))

# We perform the same task using a Pandas DataFrame. First, initialize a
# DataFrame with Santa Clara and Montgomery County.
utils._print_header('Initialize the DataFrame')
pd_frame = pd.DataFrame({'county': ['geoId/06085', 'geoId/24031']})
print(pd_frame)

# Get all CensusTracts in these two counties.
utils._print_header('Get Census Tracts')
pd_frame['tracts'] = pd_frame['county'].map(
dc.get_places_in(pd_frame['county'], 'CensusTract'))
pd_frame = pd_frame.explode('tracts')
print(pd_frame)


if __name__ == '__main__':
main()
45 changes: 3 additions & 42 deletions datacommons/examples/populations.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,7 @@
from __future__ import print_function

import datacommons as dc
import pandas as pd
import pprint

import datacommons.utils as utils

import json


Expand All @@ -35,16 +31,14 @@ def main():
dcids = [ca, ky, md]

# Get the population of all employed individuals in the above states.
utils._print_header('Get Populations for All Employed Individuals')
print('Get Populations for All Employed Individuals')
employed = dc.get_populations(dcids, 'Person', constraining_properties={
'employment': 'BLS_Employed'})
print('> Printing all populations of employed individuals\n')
print(json.dumps(employed, indent=2))

# Get the count for all male / females for the above states in 2016
utils._print_header('Get Population Counts for Employed Individuals in Maryland')
print('Get Population Counts for Employed Individuals in Maryland')
pop_dcids = [employed[md]]
print('> Requesting observations for {} in December 2018\n'.format(pop_dcids))
obs = dc.get_observations(pop_dcids,
'count',
'measuredValue',
Expand All @@ -53,41 +47,8 @@ def main():
measurement_method='BLSSeasonallyAdjusted')
print(json.dumps(obs, indent=2))

# We perform the same workflow using a Pandas DataFrame. First, initialize a
# DataFrame with Santa Clara and Montgomery County.
utils._print_header('Initialize the DataFrame')
pd_frame = pd.DataFrame({'state': ['geoId/06', 'geoId/21', 'geoId/24']})
pd_frame['state_name'] = pd_frame['state'].map(
dc.get_property_values(pd_frame['state'], 'name'))
pd_frame = pd_frame.explode('state_name').reset_index(drop=True)

# Get populations for employed individuals
utils._print_header('Add Population and Observation to DataFrame')
pd_frame['employed_pop'] = pd_frame['state'].map(dc.get_populations(
pd_frame['state'],
'Person',
constraining_properties={'employment': 'BLS_Employed'}))

# Add the observation for employed individuals
pd_frame['employed_count'] = pd_frame['employed_pop'].map(
dc.get_observations(
pd_frame['employed_pop'],
'count',
'measuredValue',
'2018-12',
observation_period='P1M',
measurement_method='BLSSeasonallyAdjusted'))
print(pd_frame)

# Final dataframe. Use the convenience function "clean_frame" to convert
# columns to numerical types.
utils._print_header('Final Data Frame')
pd_frame = pd_frame.dropna().reset_index(drop=True)
print(pd_frame)


# Get all population and observation data of Mountain View.
utils._print_header('Get Mountain View population and observation')
print('Get Mountain View population and observation')
popobs = dc.get_pop_obs("geoId/0649670")
pprint.pprint(popobs)

Expand Down
2 changes: 0 additions & 2 deletions datacommons/places.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@

import datacommons.utils as utils

import requests


def get_places_in(dcids, place_type):
""" Returns :obj:`Place`s contained in :code:`dcids` of type
Expand Down
21 changes: 18 additions & 3 deletions datacommons/populations.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,22 @@

import datacommons.utils as utils

import requests

def _flatten_results(result, default_value=None):
""" Formats results to map to a single value or default value if empty. """
for k in list(result):
v = result[k]
if len(v) > 1:
raise ValueError(
'Expected one result, but more returned for "{}": {}'.format(k, v))
if len(v) == 1:
result[k] = v[0]
else:
if default_value is not None:
result[k] = default_value
else:
del result[k]
return result


def get_populations(dcids, population_type, constraining_properties={}):
Expand Down Expand Up @@ -96,7 +111,7 @@ def get_populations(dcids, population_type, constraining_properties={}):
payload, 'population', must_exist=dcids)

# Drop empty results while flattening
return utils._flatten_results(result)
return _flatten_results(result)


def get_observations(dcids,
Expand Down Expand Up @@ -184,7 +199,7 @@ def get_observations(dcids,
# Drop empty results by calling _flatten_results without default_value, then
# coerce the type to float if possible.
typed_results = {}
for k, v in utils._flatten_results(result).items():
for k, v in _flatten_results(result).items():
try:
typed_results[k] = float(v)
except ValueError:
Expand Down
30 changes: 20 additions & 10 deletions datacommons/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@

from datacommons.utils import _API_ROOT, _API_ENDPOINTS, _ENV_VAR_API_KEY

import json
import os
import requests
import urllib.request

# ----------------------------- WRAPPER FUNCTIONS -----------------------------

Expand Down Expand Up @@ -88,17 +89,26 @@ def query(query_string, select=None):
if not os.environ.get(_ENV_VAR_API_KEY, None):
raise ValueError(
'Request error: Must set an API key before using the API!')
url = _API_ROOT + _API_ENDPOINTS['query']
res = requests.post(url, json={'sparql': query_string}, headers={
'x-api-key': os.environ[_ENV_VAR_API_KEY]
})

# Verify then store the results.
if res.status_code != 200:
req_url = _API_ROOT + _API_ENDPOINTS['query']

headers = {
'x-api-key': os.environ[_ENV_VAR_API_KEY],
'Content-Type': 'application/json'
}
req = urllib.request.Request(
req_url,
data=json.dumps({'sparql': query_string}).encode("utf-8"),
headers=headers)

try:
res = urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
raise ValueError(
'Response error: An HTTP {} code was returned by the mixer. Printing '
'response\n\n{}'.format(res.status_code , res.text))
res_json = res.json()
'response\n\n{}'.format(e.code, e.read()))

# Verify then store the results.
res_json = json.loads(res.read())

# Iterate through the query results
header = res_json['header']
Expand Down
Loading

0 comments on commit 6d9181a

Please sign in to comment.