Skip to content

Commit

Permalink
Replace requests with urllib and further clean up pandas (#112)
Browse files Browse the repository at this point in the history
* Replace requests with urllib and further clean up pandas

* Fix all the unittest
  • Loading branch information
shifucun authored Jan 28, 2020
1 parent 5956370 commit 6d9181a
Show file tree
Hide file tree
Showing 14 changed files with 253 additions and 361 deletions.
1 change: 0 additions & 1 deletion datacommons/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
from collections import defaultdict

import datacommons.utils as utils
import requests

# ----------------------------- WRAPPER FUNCTIONS -----------------------------

Expand Down
4 changes: 0 additions & 4 deletions datacommons/examples/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ py_binary(
srcs=["core.py"],
deps=[
"//datacommons:datacommons",
requirement("pandas"),
]
)

Expand All @@ -14,7 +13,6 @@ py_binary(
srcs=["places.py"],
deps=[
"//datacommons:datacommons",
requirement("pandas"),
]
)

Expand All @@ -23,7 +21,6 @@ py_binary(
srcs=["populations.py"],
deps=[
"//datacommons:datacommons",
requirement("pandas"),
]
)

Expand All @@ -32,6 +29,5 @@ py_binary(
srcs=["query.py"],
deps=[
"//datacommons:datacommons",
requirement("pandas"),
]
)
39 changes: 3 additions & 36 deletions datacommons/examples/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,26 +21,21 @@
from __future__ import print_function

import datacommons as dc
import pandas as pd

import datacommons.utils as utils


def main():
# Set the dcid to be that of Santa Clara County.
dcids = ['geoId/06085']

# Print all incoming and outgoing properties from Santa Clara County.
utils._print_header('Property Labels for Santa Clara County')
print('Property Labels for Santa Clara County')
in_labels = dc.get_property_labels(dcids)
out_labels = dc.get_property_labels(dcids, out=False)
print('> Printing properties for {}'.format(dcids))
print('> Incoming properties: {}'.format(in_labels))
print('> Outgoing properties: {}'.format(out_labels))

# Print all property values for "containedInPlace" for Santa Clara County.
utils._print_header(
'Property Values for "containedInPlace" of Santa Clara County')
print('Property Values for "containedInPlace" of Santa Clara County')
prop_vals = dc.get_property_values(
dcids, 'containedInPlace', out=False, value_type='City')
print('> Cities contained in {}'.format(dcids))
Expand All @@ -49,41 +44,13 @@ def main():
print(' - {}'.format(city_dcid))

# Print the first 10 triples associated with Santa Clara County
utils._print_header('Triples for Santa Clara County')
print('Triples for Santa Clara County')
triples = dc.get_triples(dcids)
for dcid in dcids:
print('> Triples for {}'.format(dcid))
for s, p, o in triples[dcid][:5]:
print(' - ("{}", {}, "{}")'.format(s, p, o))

# get_property_values can be easily used to populate Pandas DataFrames. First
# create a DataFrame with some data.
utils._print_header('Initialize the DataFrame')
pd_frame = pd.DataFrame({'county': ['geoId/06085', 'geoId/24031']})
print(pd_frame)

# Get the names for the given counties.
utils._print_header('Get County Names')
pd_frame['county_name'] = pd_frame['county'].map(
dc.get_property_values(pd_frame['county'], 'name'))
pd_frame = pd_frame.explode('county_name')
print(pd_frame)

# Get the cities contained in these counties.
utils._print_header('Get Contained Cities')
pd_frame['city'] = pd_frame['county'].map(
dc.get_property_values(
pd_frame['county'], 'containedInPlace', out=False, value_type='City'))
pd_frame = pd_frame.explode('city')
print(pd_frame)

# Get the names for each city.
utils._print_header('Get City Names')
pd_frame['city_name'] = pd_frame['city'].map(
dc.get_property_values(pd_frame['city'], 'name'))
pd_frame = pd_frame.explode('city_name')
print(pd_frame)


if __name__ == '__main__':
main()
21 changes: 1 addition & 20 deletions datacommons/examples/places.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,42 +21,23 @@
from __future__ import print_function

import datacommons as dc
import pandas as pd

import datacommons.utils as utils


def main():
# Create a list of dcids for Santa Clara and Montgomery County.
sc, mc = 'geoId/06085', 'geoId/24031'
dcids = [sc, mc]

# Get all CensusTracts in these two counties.
utils._print_header('Get Census Tracts')
print('Get Census Tracts')
tracts = dc.get_places_in(dcids, 'CensusTract')
if sc in tracts:
print('> 10 CensusTracts in Santa Clara County')
for dcid in tracts[sc][:10]:
print(' - {}'.format(dcid))
print()
if mc in tracts:
print('> 10 CensusTracts in Montgomery County')
for dcid in tracts[mc][:10]:
print(' - {}'.format(dcid))

# We perform the same task using a Pandas DataFrame. First, initialize a
# DataFrame with Santa Clara and Montgomery County.
utils._print_header('Initialize the DataFrame')
pd_frame = pd.DataFrame({'county': ['geoId/06085', 'geoId/24031']})
print(pd_frame)

# Get all CensusTracts in these two counties.
utils._print_header('Get Census Tracts')
pd_frame['tracts'] = pd_frame['county'].map(
dc.get_places_in(pd_frame['county'], 'CensusTract'))
pd_frame = pd_frame.explode('tracts')
print(pd_frame)


if __name__ == '__main__':
main()
45 changes: 3 additions & 42 deletions datacommons/examples/populations.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,7 @@
from __future__ import print_function

import datacommons as dc
import pandas as pd
import pprint

import datacommons.utils as utils

import json


Expand All @@ -35,16 +31,14 @@ def main():
dcids = [ca, ky, md]

# Get the population of all employed individuals in the above states.
utils._print_header('Get Populations for All Employed Individuals')
print('Get Populations for All Employed Individuals')
employed = dc.get_populations(dcids, 'Person', constraining_properties={
'employment': 'BLS_Employed'})
print('> Printing all populations of employed individuals\n')
print(json.dumps(employed, indent=2))

# Get the count for all male / females for the above states in 2016
utils._print_header('Get Population Counts for Employed Individuals in Maryland')
print('Get Population Counts for Employed Individuals in Maryland')
pop_dcids = [employed[md]]
print('> Requesting observations for {} in December 2018\n'.format(pop_dcids))
obs = dc.get_observations(pop_dcids,
'count',
'measuredValue',
Expand All @@ -53,41 +47,8 @@ def main():
measurement_method='BLSSeasonallyAdjusted')
print(json.dumps(obs, indent=2))

# We perform the same workflow using a Pandas DataFrame. First, initialize a
# DataFrame with Santa Clara and Montgomery County.
utils._print_header('Initialize the DataFrame')
pd_frame = pd.DataFrame({'state': ['geoId/06', 'geoId/21', 'geoId/24']})
pd_frame['state_name'] = pd_frame['state'].map(
dc.get_property_values(pd_frame['state'], 'name'))
pd_frame = pd_frame.explode('state_name').reset_index(drop=True)

# Get populations for employed individuals
utils._print_header('Add Population and Observation to DataFrame')
pd_frame['employed_pop'] = pd_frame['state'].map(dc.get_populations(
pd_frame['state'],
'Person',
constraining_properties={'employment': 'BLS_Employed'}))

# Add the observation for employed individuals
pd_frame['employed_count'] = pd_frame['employed_pop'].map(
dc.get_observations(
pd_frame['employed_pop'],
'count',
'measuredValue',
'2018-12',
observation_period='P1M',
measurement_method='BLSSeasonallyAdjusted'))
print(pd_frame)

# Final dataframe. Use the convenience function "clean_frame" to convert
# columns to numerical types.
utils._print_header('Final Data Frame')
pd_frame = pd_frame.dropna().reset_index(drop=True)
print(pd_frame)


# Get all population and observation data of Mountain View.
utils._print_header('Get Mountain View population and observation')
print('Get Mountain View population and observation')
popobs = dc.get_pop_obs("geoId/0649670")
pprint.pprint(popobs)

Expand Down
2 changes: 0 additions & 2 deletions datacommons/places.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@

import datacommons.utils as utils

import requests


def get_places_in(dcids, place_type):
""" Returns :obj:`Place`s contained in :code:`dcids` of type
Expand Down
21 changes: 18 additions & 3 deletions datacommons/populations.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,22 @@

import datacommons.utils as utils

import requests

def _flatten_results(result, default_value=None):
""" Formats results to map to a single value or default value if empty. """
for k in list(result):
v = result[k]
if len(v) > 1:
raise ValueError(
'Expected one result, but more returned for "{}": {}'.format(k, v))
if len(v) == 1:
result[k] = v[0]
else:
if default_value is not None:
result[k] = default_value
else:
del result[k]
return result


def get_populations(dcids, population_type, constraining_properties={}):
Expand Down Expand Up @@ -96,7 +111,7 @@ def get_populations(dcids, population_type, constraining_properties={}):
payload, 'population', must_exist=dcids)

# Drop empty results while flattening
return utils._flatten_results(result)
return _flatten_results(result)


def get_observations(dcids,
Expand Down Expand Up @@ -184,7 +199,7 @@ def get_observations(dcids,
# Drop empty results by calling _flatten_results without default_value, then
# coerce the type to float if possible.
typed_results = {}
for k, v in utils._flatten_results(result).items():
for k, v in _flatten_results(result).items():
try:
typed_results[k] = float(v)
except ValueError:
Expand Down
30 changes: 20 additions & 10 deletions datacommons/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@

from datacommons.utils import _API_ROOT, _API_ENDPOINTS, _ENV_VAR_API_KEY

import json
import os
import requests
import urllib.request

# ----------------------------- WRAPPER FUNCTIONS -----------------------------

Expand Down Expand Up @@ -88,17 +89,26 @@ def query(query_string, select=None):
if not os.environ.get(_ENV_VAR_API_KEY, None):
raise ValueError(
'Request error: Must set an API key before using the API!')
url = _API_ROOT + _API_ENDPOINTS['query']
res = requests.post(url, json={'sparql': query_string}, headers={
'x-api-key': os.environ[_ENV_VAR_API_KEY]
})

# Verify then store the results.
if res.status_code != 200:
req_url = _API_ROOT + _API_ENDPOINTS['query']

headers = {
'x-api-key': os.environ[_ENV_VAR_API_KEY],
'Content-Type': 'application/json'
}
req = urllib.request.Request(
req_url,
data=json.dumps({'sparql': query_string}).encode("utf-8"),
headers=headers)

try:
res = urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
raise ValueError(
'Response error: An HTTP {} code was returned by the mixer. Printing '
'response\n\n{}'.format(res.status_code , res.text))
res_json = res.json()
'response\n\n{}'.format(e.code, e.read()))

# Verify then store the results.
res_json = json.loads(res.read())

# Iterate through the query results
header = res_json['header']
Expand Down
Loading

0 comments on commit 6d9181a

Please sign in to comment.