__version__ = "2.3.0"

# Schema fields that default to False (rather than None) when an event does
# not carry them.
BOOLEAN_FIELDS = {"flags_is_amp"}


def _schema_keys(schema):
    """Return the field names described by ``schema`` as a list.

    Each entry may be a plain key or a field record (a dict carrying the
    name under ``"key"``, as the entries of ``schema.SCHEMA`` do).
    """
    return [
        entry["key"] if isinstance(entry, dict) else entry for entry in schema
    ]


def normalize_keys(input_event_dict, schema=None):
    """Conform events to public schema: correct keys and proper value types.

    The input dictionary is not modified; a new dictionary is returned.

    @param input_event_dict: A dictionary containing Parse.ly pixel events
    @param schema: Optional iterable describing the schema to normalize the
        event keys against. Entries may be key names or field records
        (dicts with a "key" item). If not specified (or empty), this
        defaults to the most recent parsely_raw_data schema.
    @return: A dictionary with exactly one item per schema key, plus
        "schema_version". Keys missing from the event are set to None, or to
        False when listed in BOOLEAN_FIELDS. Event keys that are not in the
        schema are dropped.
    """
    if not schema:
        # The ``schema`` parameter shadows the ``schema`` submodule inside
        # this function, so the module is imported here under an alias.
        from . import schema as _schema_module

        schema = _schema_module.SCHEMA

    # Work on a shallow copy so the caller's dictionary is left untouched.
    events = dict(input_event_dict)

    # fix value types: a dict of share URLs becomes a list of its values,
    # and an empty dict becomes None
    share_urls = events.get("metadata.share_urls")
    if isinstance(share_urls, dict):
        events["metadata.share_urls"] = list(share_urls.values()) or None

    # replace all "."s in the key with "_"
    events = {key.replace(".", "_"): value for key, value in events.items()}

    # emit only public schema items
    # ensure all columns are available and null when needed
    # account for all boolean schema defined fields as this is
    # parsely_raw_data specific
    event_dict = {}
    for key in _schema_keys(schema):
        if key in events:
            event_dict[key] = events[key]
        else:
            event_dict[key] = False if key in BOOLEAN_FIELDS else None

    # Always stamp the package version, overriding any incoming value.
    event_dict["schema_version"] = __version__

    return event_dict
100644 --- a/parsely_raw_data/schema.py +++ b/parsely_raw_data/schema.py @@ -1,7 +1,5 @@ from __future__ import print_function -import json - from tabulate import tabulate """ @@ -78,6 +76,7 @@ {"key": "ref_query", "ex": "", "type": str}, {"key": "ref_scheme", "ex": "http", "type": str, "size": 64}, {"key": "referrer", "ex": "http://mashable.com/", "type": str}, + {"key": "schema_version", "ex": "2.3.0", "type": str, "size": 64}, {"key": "session", "ex": True, "type": bool}, {"key": "session_id", "ex": 6, "type": int, "available_with_field": "session"}, {"key": "session_initial_referrer", "ex": "http://mashable.com/", "type": str, "available_with_field": "session"}, @@ -146,7 +145,6 @@ {"key": "visitor_site_id", "ex": "ab94fd31-a207-4010-8a25-fb4788207b82", "type": str, "size": 128, "req": True, "available_with_field": "visitor"} ] - def mk_sample_event(): sample = {} for record in SCHEMA: diff --git a/requirements.txt b/requirements.txt index 0bdeac1..1cf4b95 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ tablib xlsxwriter tabulate oauth2client +pytest diff --git a/setup.cfg b/setup.cfg index 0395844..e0ca01b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ logging-clear-handlers = 1 verbosity = 2 detailed-errors = 1 -[pytest] +[tool:pytest] norecursedirs = build docs/_build *.egg .tox *.venv requirements/ addopts = # Shows a line for every test diff --git a/setup.py b/setup.py index 9777ede..68bb3fe 100644 --- a/setup.py +++ b/setup.py @@ -80,7 +80,8 @@ def run_setup(): author='Emmett Butler', author_email='support@parsely.com', url='https://github.com/Parsely/parsely_raw_data', - description='Utilities for accessing raw Parse.ly data', + description='Utilities for accessing the Parse.ly Data Pipeline', + long_description='Utilities for accessing the Parse.ly Data Pipeline', keywords='parsely s3 kinesis redshift firehose bigquery', license='Apache License 2.0', packages=find_packages(), diff --git a/tests/test_basic.py 
from six import text_type

import parsely_raw_data

# Input event handed to normalize_keys in the tests below.
test_view = {
    "action": "pageview",
    "apikey": "example.com",
    "channel": "website",
    "display": True,
    "display_avail_height": 877,
    "display_avail_width": 1436,
    "display_pixel_depth": 24,
    "display_total_height": 900,
    "display_total_width": 1440,
    "event_id": "0x32bc4058da8233e6ddd86ba0a8920586",
    "ip_city": "Toronto",
    "ip_continent": "NA",
    "ip_country": "CA",
    "ip_lat": 43.7124,
    "ip_lon": -79.3644,
    "ip_postal": "M4G",
    "ip_subdivision": "ON",
    "ip_timezone": "America/Toronto",
    "ip_market_nielsen": None,
    "ip_market_doubleclick": None,
    "ip_market_name": None,
    "metadata": False,
    "metadata_authors": None,
    "metadata_canonical_url": None,
    "metadata_custom_metadata": None,
    "metadata_data_source": None,
    "metadata_duration": None,
    "metadata_full_content_word_count": None,
    "metadata_image_url": None,
    "metadata_page_type": None,
    "metadata_post_id": None,
    "metadata_pub_date_tmsp": None,
    "metadata_save_date_tmsp": None,
    "metadata_section": None,
    "metadata_share_urls": None,
    "metadata_tags": None,
    "metadata_thumb_url": None,
    "metadata_title": None,
    "metadata_urls": None,
    "pageload_id": None,
    "pageview_id": None,
    "ref_category": "internal",
    "ref_clean": "http://www.example.com/article-123",
    "ref_domain": "example.com",
    "ref_fragment": "",
    "ref_netloc": "www.example.com",
    "ref_params": "",
    "ref_path": "/article-123",
    "ref_query": "",
    "ref_scheme": "http",
    "referrer": "http://www.example.com/article-123",
    "schema_version": parsely_raw_data.__version__,
    "session": True,
    "session_id": 5,
    "session_initial_referrer": "https://www.google.ca/",
    "session_initial_url": "http://www.example.com/",
    "session_last_session_timestamp": "1_470_045_600_000",
    "session_timestamp": "1_471_428_000_000",
    "slot": False,
    "sref_category": "search",
    "sref_clean": "https://www.google.ca/",
    "sref_domain": "google",
    "sref_fragment": "",
    "sref_netloc": "www.google.ca",
    "sref_params": "",
    "sref_path": "/",
    "sref_query": "",
    "sref_scheme": "https",
    "surl_clean": "http://www.example.com/",
    "surl_domain": "example.com",
    "surl_fragment": "",
    "surl_netloc": "www.example.com",
    "surl_params": "",
    "surl_path": "/",
    "surl_query": "",
    "surl_scheme": "http",
    "surl_utm_campaign": None,
    "surl_utm_content": None,
    "surl_utm_medium": None,
    "surl_utm_source": None,
    "surl_utm_term": None,
    "timestamp_info": True,
    "timestamp_info_nginx_ms": "1_429_707_722_000",
    "timestamp_info_override_ms": None,
    "timestamp_info_pixel_ms": None,
    "ts_action": "2015-04-22 13:02:02",
    "ts_session_current": "2016-08-17 10:00:00",
    "ts_session_last": "2016-08-01 10:00:00",
    "ua_browser": "Chrome",
    "ua_browserversion": "52.0.2743",
    "ua_device": "Other",
    "ua_devicetouchcapable": False,
    "ua_devicetype": "desktop",
    "ua_os": "Mac OS X",
    "ua_osversion": "10.10.5",
    "url": "http://www.example.com/",
    "url_clean": "http://www.example.com/",
    "url_domain": "example.com",
    "url_fragment": "",
    "url_netloc": "www.example.com",
    "url_params": "",
    "url_path": "/",
    "url_query": "",
    "url_scheme": "http",
    "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
    "version": 1,
    "visitor": True,
    "visitor_ip": "184.149.39.120",
    "visitor_network_id": "",
    "visitor_site_id": "e71604df-a912-455d-aaf3-a9c72a6dd86c",
}

# Expected normalized event: every item of test_view, plus the fields that
# test_view does not carry, each at the value the test expects for it.
expected_view = dict(
    test_view,
    campaign_id=None,
    engaged_time_inc=None,
    extra_data=None,
    flags_is_amp=False,
    ua_devicebrand=None,
    ua_devicemodel=None,
    utm_campaign=None,
    utm_content=None,
    utm_medium=None,
    utm_source=None,
    utm_term=None,
    videostart_id=None,
)


def _public_schema_keys():
    """Collect the "key" of every record in the package SCHEMA as text."""
    return {text_type(record["key"]) for record in parsely_raw_data.schema.SCHEMA}


def test_normalize_keys_empty_dict():
    """An event carrying a single field normalizes to one item per schema key."""
    schema_keys = _public_schema_keys()
    sparse_event = {"event_id": "1234"}

    result = dict(parsely_raw_data.normalize_keys(sparse_event, schema=schema_keys))

    assert len(result.keys()) == len(schema_keys)


def test_normalize_keys_partially_complete_dict():
    """A partially populated event normalizes to the full expected event."""
    result = parsely_raw_data.normalize_keys(test_view, schema=_public_schema_keys())

    assert result == expected_view


def test_basic():
    """Placeholder test; performs no checks."""


if __name__ == "__main__":
    test_basic()