From bfd601e9a3d8a76d675f1ccbae2fedae3aed09a7 Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Tue, 23 Jul 2019 15:11:30 -0600 Subject: [PATCH 01/18] Changing normalize keys from enrichments to parselyrawdata --- parsely_raw_data/schema.py | 43 +++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/parsely_raw_data/schema.py b/parsely_raw_data/schema.py index e2610c4..9d76709 100644 --- a/parsely_raw_data/schema.py +++ b/parsely_raw_data/schema.py @@ -1,6 +1,7 @@ from __future__ import print_function -import json +import re +from six import iteritems, text_type from tabulate import tabulate @@ -146,7 +147,47 @@ {"key": "visitor_site_id", "ex": "ab94fd31-a207-4010-8a25-fb4788207b82", "type": str, "size": 128, "req": True, "available_with_field": "visitor"} ] +def _get_public_dpl_schema(): + """Get set of public schema keys for use in normalize_keys()""" + global PUBLIC_DPL_SCHEMA_KEYS + if not PUBLIC_DPL_SCHEMA_KEYS: + public_schema_keys = {text_type(field["key"]) for field in SCHEMA} + PUBLIC_DPL_SCHEMA_KEYS = public_schema_keys + + return PUBLIC_DPL_SCHEMA_KEYS + + +def normalize_keys(r, schema): + """Conform events to public schema: correct keys and proper value types.""" + schema = schema or _get_public_dpl_schema() + event_dict = {} + with open('parsely_raw_data/__init__.py') as version_file: + version = re.search(r"""__version__\s+=\s+(['"])(?P.+?)\1""", + version_file.read()).group('version') + + # fix value types + if r.get("metadata.share_urls") is not None and isinstance( + r["metadata.share_urls"], dict + ): + r["metadata.share_urls"] = list(r["metadata.share_urls"].values()) or None + + # emit only public schema items + for key, val in iteritems(r): + key = key.replace(".", "_") + if key in schema: + event_dict[key]=val + + # ensure all columns are available and null when needed + for key in schema: + if key not in r.keys(): + event_dict[key] = None + + event_dict[version] = version + + return event_dict + + return def mk_sample_event(): sample = {} for record in SCHEMA: From e56415fb532286aca22e748d180f3988d9fde018 Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Tue, 23 Jul 2019 16:20:51 -0600 Subject: [PATCH 02/18] Wrong file location --- parsely_raw_data/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsely_raw_data/schema.py b/parsely_raw_data/schema.py index 9d76709..ad2fc50 100644 --- a/parsely_raw_data/schema.py +++ b/parsely_raw_data/schema.py @@ -162,7 +162,7 @@ def normalize_keys(r, schema): """Conform events to public schema: correct keys and proper value types.""" schema = schema or _get_public_dpl_schema() event_dict = {} - with open('parsely_raw_data/__init__.py') as version_file: + with open('__init__.py') as version_file: version = re.search(r"""__version__\s+=\s+(['"])(?P.+?)\1""", version_file.read()).group('version') From b658110d11f16ba92ec7a73c5eaab05669b44cb0 Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Wed, 24 Jul 2019 10:36:01 -0600 Subject: [PATCH 03/18] relocating normalize keys) --- parsely_raw_data/__init__.py | 28 +++++++++++++++++++++++ parsely_raw_data/schema.py | 44 ------------------------------------ 2 files changed, 28 insertions(+), 44 deletions(-) diff --git a/parsely_raw_data/__init__.py b/parsely_raw_data/__init__.py index 5597b55..04a5d64 100644 --- a/parsely_raw_data/__init__.py +++ b/parsely_raw_data/__init__.py @@ -17,6 +17,7 @@ __version__ = '2.3.0.dev0' from . import bigquery, docgen, redshift, s3, samples, schema, stream, utils +from six import iteritems __all__ = [ 'bigquery', @@ -28,3 +29,30 @@ 'stream', 'utils', ] + +def normalize_keys(r, schema): + """Conform events to public schema: correct keys and proper value types.""" + schema = schema or schema.SCHEMA + event_dict = {} + version =__version__ + + # fix value types + if r.get("metadata.share_urls") is not None and isinstance( + r["metadata.share_urls"], dict + ): + r["metadata.share_urls"] = list(r["metadata.share_urls"].values()) or None + + # emit only public schema items + for key, val in iteritems(r): + key = key.replace(".", "_") + if key in schema: + event_dict[key]=val + + # ensure all columns are available and null when needed + for key in schema: + if key not in r.keys(): + event_dict[key] = None + + event_dict[version] = version + + return event_dict diff --git a/parsely_raw_data/schema.py b/parsely_raw_data/schema.py index ad2fc50..cc9302d 100644 --- a/parsely_raw_data/schema.py +++ b/parsely_raw_data/schema.py @@ -1,8 +1,5 @@ from __future__ import print_function -import re -from six import iteritems, text_type - from tabulate import tabulate """ @@ -147,47 +144,6 @@ {"key": "visitor_site_id", "ex": "ab94fd31-a207-4010-8a25-fb4788207b82", "type": str, "size": 128, "req": True, "available_with_field": "visitor"} ] -def _get_public_dpl_schema(): - """Get set of public schema keys for use in normalize_keys()""" - global PUBLIC_DPL_SCHEMA_KEYS - - if not PUBLIC_DPL_SCHEMA_KEYS: - public_schema_keys = {text_type(field["key"]) for field in SCHEMA} - PUBLIC_DPL_SCHEMA_KEYS = public_schema_keys - - return PUBLIC_DPL_SCHEMA_KEYS - - -def normalize_keys(r, schema): - """Conform events to public schema: correct keys and proper value types.""" - schema = schema or _get_public_dpl_schema() - event_dict = {} - with open('__init__.py') as version_file: - version = re.search(r"""__version__\s+=\s+(['"])(?P.+?)\1""", - version_file.read()).group('version') - - # fix value types - if r.get("metadata.share_urls") is not None and isinstance( - r["metadata.share_urls"], dict - ): - r["metadata.share_urls"] = list(r["metadata.share_urls"].values()) or None - - # emit only public schema items - for key, val in iteritems(r): - key = key.replace(".", "_") - if key in schema: - event_dict[key]=val - - # ensure all columns are available and null when needed - for key in schema: - if key not in r.keys(): - event_dict[key] = None - - event_dict[version] = version - - return event_dict - - return def mk_sample_event(): sample = {} for record in SCHEMA: From 3f9eaac01b14250ffd9c3b54c1d040212688a7d5 Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Wed, 24 Jul 2019 10:48:34 -0600 Subject: [PATCH 04/18] relocating normalize keys --- parsely_raw_data/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parsely_raw_data/__init__.py b/parsely_raw_data/__init__.py index 04a5d64..602d186 100644 --- a/parsely_raw_data/__init__.py +++ b/parsely_raw_data/__init__.py @@ -49,9 +49,9 @@ def normalize_keys(r, schema): event_dict[key]=val # ensure all columns are available and null when needed - for key in schema: - if key not in r.keys(): - event_dict[key] = None + # for key in schema: + # if key not in r.keys(): + # event_dict[key] = None event_dict[version] = version From 2993f84615e9148ee327a046a7fed44934af1260 Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Wed, 24 Jul 2019 11:18:57 -0600 Subject: [PATCH 05/18] relocating normalize keys --- parsely_raw_data/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/parsely_raw_data/__init__.py b/parsely_raw_data/__init__.py index 602d186..fbe5a74 100644 --- a/parsely_raw_data/__init__.py +++ b/parsely_raw_data/__init__.py @@ -46,12 +46,12 @@ def normalize_keys(r, schema): for key, val in iteritems(r): key = key.replace(".", "_") if key in schema: - event_dict[key]=val + event_dict[key] = val # ensure all columns are available and null when needed - # for key in schema: - # if key not in r.keys(): - # event_dict[key] = None + for key in schema: + if key not in event_dict.keys(): + event_dict[key] = None event_dict[version] = version From 099e629c55eaf81115117093ef1a4ac183b35b66 Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Wed, 24 Jul 2019 12:00:23 -0600 Subject: [PATCH 06/18] relocating normalize keys --- parsely_raw_data/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsely_raw_data/__init__.py b/parsely_raw_data/__init__.py index fbe5a74..81989a2 100644 --- a/parsely_raw_data/__init__.py +++ b/parsely_raw_data/__init__.py @@ -53,6 +53,6 @@ def normalize_keys(r, schema): if key not in event_dict.keys(): event_dict[key] = None - event_dict[version] = version + event_dict["version"] = version return event_dict From d47ccef4f37b52066d85a6a1fdc8ffdcbc266d60 Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Wed, 24 Jul 2019 12:14:50 -0600 Subject: [PATCH 07/18] Adding schema_version column --- parsely_raw_data/__init__.py | 2 +- parsely_raw_data/schema.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/parsely_raw_data/__init__.py b/parsely_raw_data/__init__.py index 81989a2..01e4471 100644 --- a/parsely_raw_data/__init__.py +++ b/parsely_raw_data/__init__.py @@ -53,6 +53,6 @@ def normalize_keys(r, schema): if key not in event_dict.keys(): event_dict[key] = None - event_dict["version"] = version + event_dict["schema_version"] = version return event_dict diff --git a/parsely_raw_data/schema.py b/parsely_raw_data/schema.py index cc9302d..ffcdad1 100644 --- a/parsely_raw_data/schema.py +++ b/parsely_raw_data/schema.py @@ -76,6 +76,7 @@ {"key": "ref_query", "ex": "", "type": str}, {"key": "ref_scheme", "ex": "http", "type": str, "size": 64}, {"key": "referrer", "ex": "http://mashable.com/", "type": str}, + {"key": "schema_version", "ex": "2.3.0", "type": str, "size": 64}, {"key": "session", "ex": True, "type": bool}, {"key": "session_id", "ex": 6, "type": int, "available_with_field": "session"}, {"key": "session_initial_referrer", "ex": "http://mashable.com/", "type": str, "available_with_field": "session"}, From 9cda050fed0a587baaf5a5bd70d91ae6e2e2311d Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Wed, 28 Aug 2019 14:26:15 -0600 Subject: [PATCH 08/18] Adding flags_is_amp False statement and fixing pytest travis error --- parsely_raw_data/__init__.py | 12 ++++++++++-- requirements.txt | 1 + tox.ini | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/parsely_raw_data/__init__.py b/parsely_raw_data/__init__.py index 01e4471..2857ebc 100644 --- a/parsely_raw_data/__init__.py +++ b/parsely_raw_data/__init__.py @@ -14,7 +14,7 @@ limitations under the License. """ -__version__ = '2.3.0.dev0' +__version__ = '2.3.0.dev1' from . import bigquery, docgen, redshift, s3, samples, schema, stream, utils from six import iteritems @@ -30,6 +30,10 @@ 'utils', ] +BOOLEAN_FIELDS = [ + "flags_is_amp", +] + def normalize_keys(r, schema): """Conform events to public schema: correct keys and proper value types.""" schema = schema or schema.SCHEMA @@ -49,9 +53,13 @@ def normalize_keys(r, schema): event_dict[key] = val # ensure all columns are available and null when needed + # account for all boolean schema defined fields as this is parsely_raw_data specific for key in schema: if key not in event_dict.keys(): - event_dict[key] = None + if key in BOOLEAN_FIELDS: + event_dict[key] = False + else: + event_dict[key] = None event_dict["schema_version"] = version diff --git a/requirements.txt b/requirements.txt index 0bdeac1..1cf4b95 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ tablib xlsxwriter tabulate oauth2client +pytest diff --git a/tox.ini b/tox.ini index 3dd9209..81623be 100644 --- a/tox.ini +++ b/tox.ini @@ -5,4 +5,4 @@ envlist = py27, py34, py35, pypy commands = pip install -r test-requirements.txt pip install -e . - py.test {posargs} + pytest {posargs} From ee24ae3c8c002e9b5e0cb56950a39382877d62e5 Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Wed, 28 Aug 2019 14:59:46 -0600 Subject: [PATCH 09/18] adding tool:pytest to setup cfg --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 0395844..e0ca01b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ logging-clear-handlers = 1 verbosity = 2 detailed-errors = 1 -[pytest] +[tool:pytest] norecursedirs = build docs/_build *.egg .tox *.venv requirements/ addopts = # Shows a line for every test From 3719bb91c0861288db581f0404f98e51e2416a9a Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Thu, 29 Aug 2019 16:19:53 -0600 Subject: [PATCH 10/18] Adding normalize_keys tests --- parsely_raw_data/__init__.py | 3 +- tests/test_basic.py | 256 +++++++++++++++++++++++++++++++++++ 2 files changed, 258 insertions(+), 1 deletion(-) diff --git a/parsely_raw_data/__init__.py b/parsely_raw_data/__init__.py index 2857ebc..f2db1fe 100644 --- a/parsely_raw_data/__init__.py +++ b/parsely_raw_data/__init__.py @@ -14,7 +14,7 @@ limitations under the License. """ -__version__ = '2.3.0.dev1' +__version__ = '2.3.0' from . import bigquery, docgen, redshift, s3, samples, schema, stream, utils from six import iteritems @@ -34,6 +34,7 @@ "flags_is_amp", ] + def normalize_keys(r, schema): """Conform events to public schema: correct keys and proper value types.""" schema = schema or schema.SCHEMA diff --git a/tests/test_basic.py b/tests/test_basic.py index 71bd79a..e12002f 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -1,8 +1,264 @@ +from six import text_type + import parsely_raw_data +def test_normalize_keys( ): + # Return schema keys + public_schema_keys = {text_type(field["key"]) for field in parsely_raw_data.schema.SCHEMA} + + # test passing empty dict yields full dict with empty rows + event_empty_dict = { + "event_id": "1234" + } + normalized_dict = dict(parsely_raw_data.normalize_keys(event_empty_dict, schema=public_schema_keys)) + assert len(normalized_dict.keys()) > 2 + + # test passing event dict yield all expected fields in schema + expected_view = { + "action": "pageview", + "apikey": "example.com", + "campaign_id": None, + "channel": "website", + "display": True, + "display_avail_height": 877, + "display_avail_width": 1436, + "display_pixel_depth": 24, + "display_total_height": 900, + "display_total_width": 1440, + "engaged_time_inc": None, + "extra_data": None, + "flags_is_amp": False, + "event_id": "0x32bc4058da8233e6ddd86ba0a8920586", + "ip_city": "Toronto", + "ip_continent": "NA", + "ip_country": "CA", + "ip_lat": 43.7124, + "ip_lon": -79.3644, + "ip_postal": "M4G", + "ip_subdivision": "ON", + "ip_timezone": "America/Toronto", + "ip_market_nielsen": None, + "ip_market_doubleclick": None, + "ip_market_name": None, + "metadata": False, + "metadata_authors": None, + "metadata_canonical_url": None, + "metadata_custom_metadata": None, + "metadata_data_source": None, + "metadata_duration": None, + "metadata_full_content_word_count": None, + "metadata_image_url": None, + "metadata_page_type": None, + "metadata_post_id": None, + "metadata_pub_date_tmsp": None, + "metadata_save_date_tmsp": None, + "metadata_section": None, + "metadata_share_urls": None, + "metadata_tags": None, + "metadata_thumb_url": None, + "metadata_title": None, + "metadata_urls": None, + "pageload_id": None, + "pageview_id": None, + "ref_category": "internal", + "ref_clean": "http://www.example.com/article-123", + "ref_domain": "example.com", + "ref_fragment": "", + "ref_netloc": "www.example.com", + "ref_params": "", + "ref_path": "/article-123", + "ref_query": "", + "ref_scheme": "http", + "referrer": "http://www.example.com/article-123", + "schema_version": parsely_raw_data.__version__, + "session": True, + "session_id": 5, + "session_initial_referrer": "https://www.google.ca/", + "session_initial_url": "http://www.example.com/", + "session_last_session_timestamp": 1_470_045_600_000, + "session_timestamp": 1_471_428_000_000, + "slot": False, + "sref_category": "search", + "sref_clean": "https://www.google.ca/", + "sref_domain": "google", + "sref_fragment": "", + "sref_netloc": "www.google.ca", + "sref_params": "", + "sref_path": "/", + "sref_query": "", + "sref_scheme": "https", + "surl_clean": "http://www.example.com/", + "surl_domain": "example.com", + "surl_fragment": "", + "surl_netloc": "www.example.com", + "surl_params": "", + "surl_path": "/", + "surl_query": "", + "surl_scheme": "http", + "surl_utm_campaign": None, + "surl_utm_content": None, + "surl_utm_medium": None, + "surl_utm_source": None, + "surl_utm_term": None, + "timestamp_info": True, + "timestamp_info_nginx_ms": 1_429_707_722_000, + "timestamp_info_override_ms": None, + "timestamp_info_pixel_ms": None, + "ts_action": "2015-04-22 13:02:02", + "ts_session_current": "2016-08-17 10:00:00", + "ts_session_last": "2016-08-01 10:00:00", + "ua_browser": "Chrome", + "ua_browserversion": "52.0.2743", + "ua_device": "Other", + "ua_devicebrand": None, + "ua_devicemodel": None, + "ua_devicetouchcapable": False, + "ua_devicetype": "desktop", + "ua_os": "Mac OS X", + "ua_osversion": "10.10.5", + "url": "http://www.example.com/", + "url_clean": "http://www.example.com/", + "url_domain": "example.com", + "url_fragment": "", + "url_netloc": "www.example.com", + "url_params": "", + "url_path": "/", + "url_query": "", + "url_scheme": "http", + "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36", + "utm_campaign": None, + "utm_content": None, + "utm_medium": None, + "utm_source": None, + "utm_term": None, + "videostart_id": None, + "version": 1, + "visitor": True, + "visitor_ip": "184.149.39.120", + "visitor_network_id": "", + "visitor_site_id": "e71604df-a912-455d-aaf3-a9c72a6dd86c", + } + test_view = { + "action": "pageview", + "apikey": "example.com", + "channel": "website", + "display": True, + "display_avail_height": 877, + "display_avail_width": 1436, + "display_pixel_depth": 24, + "display_total_height": 900, + "display_total_width": 1440, + "event_id": "0x32bc4058da8233e6ddd86ba0a8920586", + "ip_city": "Toronto", + "ip_continent": "NA", + "ip_country": "CA", + "ip_lat": 43.7124, + "ip_lon": -79.3644, + "ip_postal": "M4G", + "ip_subdivision": "ON", + "ip_timezone": "America/Toronto", + "ip_market_nielsen": None, + "ip_market_doubleclick": None, + "ip_market_name": None, + "metadata": False, + "metadata_authors": None, + "metadata_canonical_url": None, + "metadata_custom_metadata": None, + "metadata_data_source": None, + "metadata_duration": None, + "metadata_full_content_word_count": None, + "metadata_image_url": None, + "metadata_page_type": None, + "metadata_post_id": None, + "metadata_pub_date_tmsp": None, + "metadata_save_date_tmsp": None, + "metadata_section": None, + "metadata_share_urls": None, + "metadata_tags": None, + "metadata_thumb_url": None, + "metadata_title": None, + "metadata_urls": None, + "pageload_id": None, + "pageview_id": None, + "ref_category": "internal", + "ref_clean": "http://www.example.com/article-123", + "ref_domain": "example.com", + "ref_fragment": "", + "ref_netloc": "www.example.com", + "ref_params": "", + "ref_path": "/article-123", + "ref_query": "", + "ref_scheme": "http", + "referrer": "http://www.example.com/article-123", + "schema_version": parsely_raw_data.__version__, + "session": True, + "session_id": 5, + "session_initial_referrer": "https://www.google.ca/", + "session_initial_url": "http://www.example.com/", + "session_last_session_timestamp": 1_470_045_600_000, + "session_timestamp": 1_471_428_000_000, + "slot": False, + "sref_category": "search", + "sref_clean": "https://www.google.ca/", + "sref_domain": "google", + "sref_fragment": "", + "sref_netloc": "www.google.ca", + "sref_params": "", + "sref_path": "/", + "sref_query": "", + "sref_scheme": "https", + "surl_clean": "http://www.example.com/", + "surl_domain": "example.com", + "surl_fragment": "", + "surl_netloc": "www.example.com", + "surl_params": "", + "surl_path": "/", + "surl_query": "", + "surl_scheme": "http", + "surl_utm_campaign": None, + "surl_utm_content": None, + "surl_utm_medium": None, + "surl_utm_source": None, + "surl_utm_term": None, + "timestamp_info": True, + "timestamp_info_nginx_ms": 1_429_707_722_000, + "timestamp_info_override_ms": None, + "timestamp_info_pixel_ms": None, + "ts_action": "2015-04-22 13:02:02", + "ts_session_current": "2016-08-17 10:00:00", + "ts_session_last": "2016-08-01 10:00:00", + "ua_browser": "Chrome", + "ua_browserversion": "52.0.2743", + "ua_device": "Other", + "ua_devicetouchcapable": False, + "ua_devicetype": "desktop", + "ua_os": "Mac OS X", + "ua_osversion": "10.10.5", + "url": "http://www.example.com/", + "url_clean": "http://www.example.com/", + "url_domain": "example.com", + "url_fragment": "", + "url_netloc": "www.example.com", + "url_params": "", + "url_path": "/", + "url_query": "", + "url_scheme": "http", + "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36", + "version": 1, + "visitor": True, + "visitor_ip": "184.149.39.120", + "visitor_network_id": "", + "visitor_site_id": "e71604df-a912-455d-aaf3-a9c72a6dd86c", + } + + normalized_dict = dict(parsely_raw_data.normalize_keys(test_view, schema=public_schema_keys)) + assert normalized_dict == expected_view + + def test_basic(): pass + if __name__ == "__main__": test_basic() From 8f33cf9adcd6217f11b68e43009e003174eda328 Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Wed, 4 Sep 2019 12:40:36 -0600 Subject: [PATCH 11/18] Cleaning up normalize keys dict --- parsely_raw_data/__init__.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/parsely_raw_data/__init__.py b/parsely_raw_data/__init__.py index f2db1fe..1f71e7e 100644 --- a/parsely_raw_data/__init__.py +++ b/parsely_raw_data/__init__.py @@ -35,20 +35,23 @@ ] -def normalize_keys(r, schema): - """Conform events to public schema: correct keys and proper value types.""" +def normalize_keys(event_dict, schema=None): + """Conform events to public schema: correct keys and proper value types. + + @param event_dict: A dictionary containing Parse.ly pixel events + @param schema: Optional parameter containing the schema to normalize the event_dict keys against + IF not specified, this will default to the most recent parsely_raw_data schema + """ schema = schema or schema.SCHEMA - event_dict = {} - version =__version__ # fix value types - if r.get("metadata.share_urls") is not None and isinstance( - r["metadata.share_urls"], dict + if event_dict.get("metadata.share_urls") is not None and isinstance( + event_dict["metadata.share_urls"], dict ): - r["metadata.share_urls"] = list(r["metadata.share_urls"].values()) or None + event_dict["metadata.share_urls"] = list(event_dict["metadata.share_urls"].values()) or None # emit only public schema items - for key, val in iteritems(r): + for key, val in iteritems(event_dict): key = key.replace(".", "_") if key in schema: event_dict[key] = val @@ -62,6 +65,6 @@ def normalize_keys(r, schema): else: event_dict[key] = None - event_dict["schema_version"] = version + event_dict["schema_version"] = __version__ return event_dict From 1545b8448669c1e1cf4dd5772ba2d88a9bb693e4 Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Wed, 4 Sep 2019 12:45:12 -0600 Subject: [PATCH 12/18] Renaming event_dict fields for readibility and black --- parsely_raw_data/__init__.py | 37 ++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/parsely_raw_data/__init__.py b/parsely_raw_data/__init__.py index 1f71e7e..5bc5b78 100644 --- a/parsely_raw_data/__init__.py +++ b/parsely_raw_data/__init__.py @@ -14,44 +14,45 @@ limitations under the License. """ -__version__ = '2.3.0' +__version__ = "2.3.0" from . import bigquery, docgen, redshift, s3, samples, schema, stream, utils from six import iteritems __all__ = [ - 'bigquery', - 'docgen', - 'redshift', - 's3', - 'samples', - 'schema', - 'stream', - 'utils', + "bigquery", + "docgen", + "redshift", + "s3", + "samples", + "schema", + "stream", + "utils", ] -BOOLEAN_FIELDS = [ - "flags_is_amp", -] +BOOLEAN_FIELDS = ["flags_is_amp"] -def normalize_keys(event_dict, schema=None): +def normalize_keys(input_event_dict, schema=None): """Conform events to public schema: correct keys and proper value types. - @param event_dict: A dictionary containing Parse.ly pixel events + @param input_event_dict: A dictionary containing Parse.ly pixel events @param schema: Optional parameter containing the schema to normalize the event_dict keys against IF not specified, this will default to the most recent parsely_raw_data schema """ + event_dict = {} schema = schema or schema.SCHEMA # fix value types - if event_dict.get("metadata.share_urls") is not None and isinstance( - event_dict["metadata.share_urls"], dict + if input_event_dict.get("metadata.share_urls") is not None and isinstance( + input_event_dict["metadata.share_urls"], dict ): - event_dict["metadata.share_urls"] = list(event_dict["metadata.share_urls"].values()) or None + input_event_dict["metadata.share_urls"] = ( + list(input_event_dict["metadata.share_urls"].values()) or None + ) # emit only public schema items - for key, val in iteritems(event_dict): + for key, val in iteritems(input_event_dict): key = key.replace(".", "_") if key in schema: event_dict[key] = val From 82fce88669d062e1914bec091189e221ce787c14 Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Wed, 4 Sep 2019 12:52:17 -0600 Subject: [PATCH 13/18] Updating tests for backwards compatibility with 2.7 via travis --- tests/test_basic.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_basic.py b/tests/test_basic.py index e12002f..6d42ce0 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -76,8 +76,8 @@ def test_normalize_keys( ): "session_id": 5, "session_initial_referrer": "https://www.google.ca/", "session_initial_url": "http://www.example.com/", - "session_last_session_timestamp": 1_470_045_600_000, - "session_timestamp": 1_471_428_000_000, + "session_last_session_timestamp": "1_470_045_600_000", + "session_timestamp": "1_471_428_000_000", "slot": False, "sref_category": "search", "sref_clean": "https://www.google.ca/", @@ -102,7 +102,7 @@ def test_normalize_keys( ): "surl_utm_source": None, "surl_utm_term": None, "timestamp_info": True, - "timestamp_info_nginx_ms": 1_429_707_722_000, + "timestamp_info_nginx_ms": "1_429_707_722_000", "timestamp_info_override_ms": None, "timestamp_info_pixel_ms": None, "ts_action": "2015-04-22 13:02:02", @@ -196,8 +196,8 @@ def test_normalize_keys( ): "session_id": 5, "session_initial_referrer": "https://www.google.ca/", "session_initial_url": "http://www.example.com/", - "session_last_session_timestamp": 1_470_045_600_000, - "session_timestamp": 1_471_428_000_000, + "session_last_session_timestamp": "1_470_045_600_000", + "session_timestamp": "1_471_428_000_000", "slot": False, "sref_category": "search", "sref_clean": "https://www.google.ca/", @@ -222,7 +222,7 @@ def test_normalize_keys( ): "surl_utm_source": None, "surl_utm_term": None, "timestamp_info": True, - "timestamp_info_nginx_ms": 1_429_707_722_000, + "timestamp_info_nginx_ms": "1_429_707_722_000", "timestamp_info_override_ms": None, "timestamp_info_pixel_ms": None, "ts_action": "2015-04-22 13:02:02", From e5d264e98f62e1a2b857748d10ad332e4c1bb2df Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Wed, 4 Sep 2019 13:41:05 -0600 Subject: [PATCH 14/18] Cleaning up schema loop and key character replacement --- parsely_raw_data/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/parsely_raw_data/__init__.py b/parsely_raw_data/__init__.py index 5bc5b78..0f014b5 100644 --- a/parsely_raw_data/__init__.py +++ b/parsely_raw_data/__init__.py @@ -51,20 +51,20 @@ def normalize_keys(input_event_dict, schema=None): list(input_event_dict["metadata.share_urls"].values()) or None ) - # emit only public schema items - for key, val in iteritems(input_event_dict): - key = key.replace(".", "_") - if key in schema: - event_dict[key] = val + # replace all "."s in the key with "_" + input_event_dict = {x.replace(',', '_'): v for x, v in input_event_dict.items()} + # emit only public schema items # ensure all columns are available and null when needed # account for all boolean schema defined fields as this is parsely_raw_data specific for key in schema: - if key not in event_dict.keys(): + if key not in input_event_dict.keys(): if key in BOOLEAN_FIELDS: event_dict[key] = False else: event_dict[key] = None + else: + event_dict[key] = input_event_dict[key] event_dict["schema_version"] = __version__ From 00356b31832443fe4f1be42b00945d089ac940c1 Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Thu, 5 Sep 2019 14:37:04 -0600 Subject: [PATCH 15/18] Updating test formats --- parsely_raw_data/__init__.py | 2 +- tests/test_basic.py | 483 ++++++++++++++++++----------------- 2 files changed, 245 insertions(+), 240 deletions(-) diff --git a/parsely_raw_data/__init__.py b/parsely_raw_data/__init__.py index 0f014b5..3c1aabf 100644 --- a/parsely_raw_data/__init__.py +++ b/parsely_raw_data/__init__.py @@ -58,7 +58,7 @@ def normalize_keys(input_event_dict, schema=None): # ensure all columns are available and null when needed # account for all boolean schema defined fields as this is parsely_raw_data specific for key in schema: - if key not in input_event_dict.keys(): + if key not in input_event_dict: if key in BOOLEAN_FIELDS: event_dict[key] = False else: diff --git a/tests/test_basic.py b/tests/test_basic.py index 6d42ce0..ca64af2 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -2,8 +2,246 @@ import parsely_raw_data +expected_view = { + "action": "pageview", + "apikey": "example.com", + "campaign_id": None, + "channel": "website", + "display": True, + "display_avail_height": 877, + "display_avail_width": 1436, + "display_pixel_depth": 24, + "display_total_height": 900, + "display_total_width": 1440, + "engaged_time_inc": None, + "extra_data": None, + "flags_is_amp": False, + "event_id": "0x32bc4058da8233e6ddd86ba0a8920586", + "ip_city": "Toronto", + "ip_continent": "NA", + "ip_country": "CA", + "ip_lat": 43.7124, + "ip_lon": -79.3644, + "ip_postal": "M4G", + "ip_subdivision": "ON", + "ip_timezone": "America/Toronto", + "ip_market_nielsen": None, + "ip_market_doubleclick": None, + "ip_market_name": None, + "metadata": False, + "metadata_authors": None, + "metadata_canonical_url": None, + "metadata_custom_metadata": None, + "metadata_data_source": None, + "metadata_duration": None, + "metadata_full_content_word_count": None, + "metadata_image_url": None, + "metadata_page_type": None, + "metadata_post_id": None, + "metadata_pub_date_tmsp": None, + "metadata_save_date_tmsp": None, + "metadata_section": None, + "metadata_share_urls": None, + "metadata_tags": None, + "metadata_thumb_url": None, + "metadata_title": None, + "metadata_urls": None, + "pageload_id": None, + "pageview_id": None, + "ref_category": "internal", + "ref_clean": "http://www.example.com/article-123", + "ref_domain": "example.com", + "ref_fragment": "", + "ref_netloc": "www.example.com", + "ref_params": "", + "ref_path": "/article-123", + "ref_query": "", + "ref_scheme": "http", + "referrer": "http://www.example.com/article-123", + "schema_version": parsely_raw_data.__version__, + "session": True, + "session_id": 5, + "session_initial_referrer": "https://www.google.ca/", + "session_initial_url": "http://www.example.com/", + "session_last_session_timestamp": "1_470_045_600_000", + "session_timestamp": "1_471_428_000_000", + "slot": False, + "sref_category": "search", + "sref_clean": "https://www.google.ca/", + "sref_domain": "google", + "sref_fragment": "", + "sref_netloc": "www.google.ca", + "sref_params": "", + "sref_path": "/", + "sref_query": "", + "sref_scheme": "https", + "surl_clean": "http://www.example.com/", + "surl_domain": "example.com", + "surl_fragment": "", + "surl_netloc": "www.example.com", + "surl_params": "", + "surl_path": "/", + "surl_query": "", + "surl_scheme": "http", + "surl_utm_campaign": None, + "surl_utm_content": None, + "surl_utm_medium": None, + "surl_utm_source": None, + "surl_utm_term": None, + "timestamp_info": True, + "timestamp_info_nginx_ms": "1_429_707_722_000", + "timestamp_info_override_ms": None, + "timestamp_info_pixel_ms": None, + "ts_action": "2015-04-22 13:02:02", + "ts_session_current": "2016-08-17 10:00:00", + "ts_session_last": "2016-08-01 10:00:00", + "ua_browser": "Chrome", + "ua_browserversion": "52.0.2743", + "ua_device": "Other", + "ua_devicebrand": None, + "ua_devicemodel": None, + "ua_devicetouchcapable": False, + "ua_devicetype": "desktop", + "ua_os": "Mac OS X", + "ua_osversion": "10.10.5", + "url": "http://www.example.com/", + "url_clean": "http://www.example.com/", + "url_domain": "example.com", + "url_fragment": "", + "url_netloc": "www.example.com", + "url_params": "", + "url_path": "/", + "url_query": "", + "url_scheme": "http", + "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36", + "utm_campaign": None, + "utm_content": None, + "utm_medium": None, + "utm_source": None, + "utm_term": None, + "videostart_id": None, + "version": 1, + "visitor": True, + "visitor_ip": "184.149.39.120", + "visitor_network_id": "", + "visitor_site_id": "e71604df-a912-455d-aaf3-a9c72a6dd86c", +} -def test_normalize_keys( ): +test_view = { + "action": "pageview", + "apikey": "example.com", + "channel": "website", + "display": True, + "display_avail_height": 877, + "display_avail_width": 1436, + "display_pixel_depth": 24, + "display_total_height": 900, + "display_total_width": 1440, + "event_id": "0x32bc4058da8233e6ddd86ba0a8920586", + "ip_city": "Toronto", + "ip_continent": "NA", + "ip_country": "CA", + "ip_lat": 43.7124, + "ip_lon": -79.3644, + "ip_postal": "M4G", + "ip_subdivision": "ON", + "ip_timezone": "America/Toronto", + "ip_market_nielsen": None, + "ip_market_doubleclick": None, + "ip_market_name": None, + "metadata": False, + "metadata_authors": None, + "metadata_canonical_url": None, + "metadata_custom_metadata": None, + "metadata_data_source": None, + "metadata_duration": None, + "metadata_full_content_word_count": None, + "metadata_image_url": None, + "metadata_page_type": None, + "metadata_post_id": None, + "metadata_pub_date_tmsp": None, + "metadata_save_date_tmsp": None, + "metadata_section": None, + "metadata_share_urls": None, + "metadata_tags": None, + "metadata_thumb_url": None, + "metadata_title": None, + "metadata_urls": None, + "pageload_id": None, + "pageview_id": None, + "ref_category": "internal", + "ref_clean": "http://www.example.com/article-123", + "ref_domain": "example.com", + "ref_fragment": "", + "ref_netloc": "www.example.com", + "ref_params": "", + "ref_path": "/article-123", + "ref_query": "", + "ref_scheme": "http", + "referrer": "http://www.example.com/article-123", + "schema_version": parsely_raw_data.__version__, + "session": True, + "session_id": 5, + "session_initial_referrer": "https://www.google.ca/", + "session_initial_url": "http://www.example.com/", + "session_last_session_timestamp": "1_470_045_600_000", + "session_timestamp": "1_471_428_000_000", + "slot": False, + "sref_category": "search", + "sref_clean": "https://www.google.ca/", + "sref_domain": "google", + "sref_fragment": "", + "sref_netloc": "www.google.ca", + "sref_params": "", + "sref_path": "/", + "sref_query": "", + "sref_scheme": "https", + "surl_clean": "http://www.example.com/", + "surl_domain": "example.com", + "surl_fragment": "", + "surl_netloc": "www.example.com", + "surl_params": "", + "surl_path": "/", + "surl_query": "", + "surl_scheme": "http", + "surl_utm_campaign": None, + "surl_utm_content": None, + "surl_utm_medium": None, + "surl_utm_source": None, + "surl_utm_term": None, + "timestamp_info": True, + "timestamp_info_nginx_ms": "1_429_707_722_000", + "timestamp_info_override_ms": None, + "timestamp_info_pixel_ms": None, + "ts_action": "2015-04-22 13:02:02", + "ts_session_current": "2016-08-17 10:00:00", + "ts_session_last": "2016-08-01 10:00:00", + "ua_browser": "Chrome", + "ua_browserversion": "52.0.2743", + "ua_device": "Other", + "ua_devicetouchcapable": False, + "ua_devicetype": "desktop", + "ua_os": "Mac OS X", + "ua_osversion": "10.10.5", + "url": "http://www.example.com/", + "url_clean": "http://www.example.com/", + "url_domain": "example.com", + "url_fragment": "", + "url_netloc": "www.example.com", + "url_params": "", + "url_path": "/", + "url_query": "", + "url_scheme": "http", + "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36", + "version": 1, + "visitor": True, + "visitor_ip": "184.149.39.120", + "visitor_network_id": "", + "visitor_site_id": "e71604df-a912-455d-aaf3-a9c72a6dd86c", +} + + +def test_normalize_keys_empty_dict(): # Return schema keys public_schema_keys = {text_type(field["key"]) for field in parsely_raw_data.schema.SCHEMA} @@ -14,245 +252,12 @@ def test_normalize_keys( ): normalized_dict = dict(parsely_raw_data.normalize_keys(event_empty_dict, schema=public_schema_keys)) assert len(normalized_dict.keys()) > 2 - # test passing event dict yield all expected fields in schema - expected_view = { - "action": "pageview", - "apikey": "example.com", - "campaign_id": None, - "channel": "website", - "display": True, - "display_avail_height": 877, - "display_avail_width": 1436, - "display_pixel_depth": 24, - "display_total_height": 900, - "display_total_width": 1440, - "engaged_time_inc": None, - "extra_data": None, - "flags_is_amp": False, - "event_id": "0x32bc4058da8233e6ddd86ba0a8920586", - "ip_city": "Toronto", - "ip_continent": "NA", - "ip_country": "CA", - "ip_lat": 43.7124, - "ip_lon": -79.3644, - "ip_postal": "M4G", - "ip_subdivision": "ON", - "ip_timezone": "America/Toronto", - "ip_market_nielsen": None, - "ip_market_doubleclick": None, - "ip_market_name": None, - "metadata": False, - "metadata_authors": None, - "metadata_canonical_url": None, - "metadata_custom_metadata": None, - "metadata_data_source": None, - "metadata_duration": None, - "metadata_full_content_word_count": None, - "metadata_image_url": None, - "metadata_page_type": None, - "metadata_post_id": None, - "metadata_pub_date_tmsp": None, - "metadata_save_date_tmsp": None, - "metadata_section": None, - "metadata_share_urls": None, - "metadata_tags": None, - "metadata_thumb_url": None, - "metadata_title": None, - "metadata_urls": None, - "pageload_id": None, - "pageview_id": None, - "ref_category": "internal", - "ref_clean": "http://www.example.com/article-123", - "ref_domain": "example.com", - "ref_fragment": "", - "ref_netloc": "www.example.com", - "ref_params": "", - "ref_path": "/article-123", - "ref_query": "", - "ref_scheme": "http", - "referrer": "http://www.example.com/article-123", - "schema_version": parsely_raw_data.__version__, - "session": True, - "session_id": 5, - "session_initial_referrer": "https://www.google.ca/", - "session_initial_url": "http://www.example.com/", - "session_last_session_timestamp": "1_470_045_600_000", - "session_timestamp": "1_471_428_000_000", - "slot": False, - "sref_category": "search", - "sref_clean": "https://www.google.ca/", - "sref_domain": "google", - "sref_fragment": "", - "sref_netloc": "www.google.ca", - "sref_params": "", - "sref_path": "/", - "sref_query": "", - "sref_scheme": "https", - "surl_clean": "http://www.example.com/", - "surl_domain": "example.com", - "surl_fragment": "", - "surl_netloc": "www.example.com", - "surl_params": "", - "surl_path": "/", - "surl_query": "", - "surl_scheme": "http", - "surl_utm_campaign": None, - "surl_utm_content": None, - "surl_utm_medium": None, - "surl_utm_source": None, - "surl_utm_term": None, - "timestamp_info": True, - "timestamp_info_nginx_ms": "1_429_707_722_000", - "timestamp_info_override_ms": None, - "timestamp_info_pixel_ms": None, - "ts_action": "2015-04-22 13:02:02", - "ts_session_current": "2016-08-17 10:00:00", - "ts_session_last": "2016-08-01 10:00:00", - "ua_browser": "Chrome", - "ua_browserversion": "52.0.2743", - "ua_device": "Other", - "ua_devicebrand": None, - "ua_devicemodel": None, - "ua_devicetouchcapable": False, - "ua_devicetype": "desktop", - "ua_os": "Mac OS X", - "ua_osversion": "10.10.5", - "url": "http://www.example.com/", - "url_clean": "http://www.example.com/", - "url_domain": "example.com", - "url_fragment": "", - "url_netloc": "www.example.com", - "url_params": "", - "url_path": "/", - "url_query": "", - "url_scheme": "http", - "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36", - "utm_campaign": None, - "utm_content": None, - "utm_medium": None, - "utm_source": None, - "utm_term": None, - "videostart_id": None, - "version": 1, - "visitor": True, - "visitor_ip": "184.149.39.120", - "visitor_network_id": "", - "visitor_site_id": "e71604df-a912-455d-aaf3-a9c72a6dd86c", - } - test_view = { - "action": "pageview", - "apikey": "example.com", - "channel": "website", - "display": True, - "display_avail_height": 877, - "display_avail_width": 1436, - "display_pixel_depth": 24, - "display_total_height": 900, - "display_total_width": 1440, - "event_id": "0x32bc4058da8233e6ddd86ba0a8920586", - "ip_city": "Toronto", - "ip_continent": "NA", - "ip_country": "CA", - "ip_lat": 43.7124, - "ip_lon": -79.3644, - "ip_postal": "M4G", - "ip_subdivision": "ON", - "ip_timezone": "America/Toronto", - "ip_market_nielsen": None, - "ip_market_doubleclick": None, - "ip_market_name": None, - "metadata": False, - "metadata_authors": None, - "metadata_canonical_url": None, - "metadata_custom_metadata": None, - "metadata_data_source": None, - "metadata_duration": None, - "metadata_full_content_word_count": None, - "metadata_image_url": None, - "metadata_page_type": None, - "metadata_post_id": None, - "metadata_pub_date_tmsp": None, - "metadata_save_date_tmsp": None, - "metadata_section": None, - "metadata_share_urls": None, - "metadata_tags": None, - "metadata_thumb_url": None, - "metadata_title": None, - "metadata_urls": None, - "pageload_id": None, - "pageview_id": None, - "ref_category": "internal", - "ref_clean": "http://www.example.com/article-123", - "ref_domain": "example.com", - "ref_fragment": "", - "ref_netloc": "www.example.com", - "ref_params": "", - "ref_path": "/article-123", - "ref_query": "", - "ref_scheme": "http", - "referrer": "http://www.example.com/article-123", - "schema_version": parsely_raw_data.__version__, - "session": True, - "session_id": 5, - "session_initial_referrer": "https://www.google.ca/", - "session_initial_url": "http://www.example.com/", - "session_last_session_timestamp": "1_470_045_600_000", - "session_timestamp": "1_471_428_000_000", - "slot": False, - "sref_category": "search", - "sref_clean": "https://www.google.ca/", - "sref_domain": "google", - "sref_fragment": "", - "sref_netloc": "www.google.ca", - "sref_params": "", - "sref_path": "/", - "sref_query": "", - "sref_scheme": "https", - "surl_clean": "http://www.example.com/", - "surl_domain": "example.com", - "surl_fragment": "", - "surl_netloc": "www.example.com", - "surl_params": "", - "surl_path": "/", - "surl_query": "", - "surl_scheme": "http", - "surl_utm_campaign": None, - "surl_utm_content": None, - "surl_utm_medium": None, - "surl_utm_source": None, - "surl_utm_term": None, - "timestamp_info": True, - "timestamp_info_nginx_ms": "1_429_707_722_000", - "timestamp_info_override_ms": None, - "timestamp_info_pixel_ms": None, - "ts_action": "2015-04-22 13:02:02", - "ts_session_current": "2016-08-17 10:00:00", - "ts_session_last": "2016-08-01 10:00:00", - "ua_browser": "Chrome", - "ua_browserversion": "52.0.2743", - "ua_device": "Other", - "ua_devicetouchcapable": False, - "ua_devicetype": "desktop", - "ua_os": "Mac OS X", - "ua_osversion": "10.10.5", - "url": "http://www.example.com/", - "url_clean": "http://www.example.com/", - "url_domain": "example.com", - "url_fragment": "", - "url_netloc": "www.example.com", - "url_params": "", - "url_path": "/", - "url_query": "", - "url_scheme": "http", - "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36", - "version": 1, - "visitor": True, - "visitor_ip": "184.149.39.120", - "visitor_network_id": "", - "visitor_site_id": "e71604df-a912-455d-aaf3-a9c72a6dd86c", - } - normalized_dict = dict(parsely_raw_data.normalize_keys(test_view, schema=public_schema_keys)) +def test_normalize_keys_partially_complete_dict(): + public_schema_keys = {text_type(field["key"]) for field in parsely_raw_data.schema.SCHEMA} + + # test passing event dict yield all expected fields in schema + normalized_dict = parsely_raw_data.normalize_keys(test_view, schema=public_schema_keys) assert normalized_dict == expected_view From 8fa88c5474d7ace34a6d0afd74e69a64438ec3ec Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Thu, 5 Sep 2019 14:48:47 -0600 Subject: [PATCH 16/18] Updating test formats --- parsely_raw_data/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsely_raw_data/__init__.py b/parsely_raw_data/__init__.py index 3c1aabf..d4af058 100644 --- a/parsely_raw_data/__init__.py +++ b/parsely_raw_data/__init__.py @@ -30,7 +30,7 @@ "utils", ] -BOOLEAN_FIELDS = ["flags_is_amp"] +BOOLEAN_FIELDS = {"flags_is_amp"} def normalize_keys(input_event_dict, schema=None): From 0fca3b68e179dbe1bd6d42fc218a27f43af181e3 Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Fri, 6 Sep 2019 11:39:25 -0600 Subject: [PATCH 17/18] Adding test specific schema length --- tests/test_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_basic.py b/tests/test_basic.py index ca64af2..6d2cc8f 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -250,7 +250,7 @@ def test_normalize_keys_empty_dict(): "event_id": "1234" } normalized_dict = dict(parsely_raw_data.normalize_keys(event_empty_dict, schema=public_schema_keys)) - assert len(normalized_dict.keys()) > 2 + assert len(normalized_dict.keys()) == len(public_schema_keys) def test_normalize_keys_partially_complete_dict(): From 0e7322e09eb9b18ceeb104e58178c21a37076a89 Mon Sep 17 00:00:00 2001 From: rachelannelise Date: Tue, 10 Sep 2019 14:50:19 -0600 Subject: [PATCH 18/18] Adding pypi description and fixing typo causing casterisk build to fail --- parsely_raw_data/__init__.py | 2 +- setup.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/parsely_raw_data/__init__.py b/parsely_raw_data/__init__.py index d4af058..5c66005 100644 --- a/parsely_raw_data/__init__.py +++ b/parsely_raw_data/__init__.py @@ -52,7 +52,7 @@ def normalize_keys(input_event_dict, schema=None): ) # replace all "."s in the key with "_" - input_event_dict = {x.replace(',', '_'): v for x, v in input_event_dict.items()} + input_event_dict = {x.replace('.', '_'): v for x, v in input_event_dict.items()} # emit only public schema items # ensure all columns are available and null when needed diff --git a/setup.py b/setup.py index 9777ede..68bb3fe 100644 --- a/setup.py +++ b/setup.py @@ -80,7 +80,8 @@ def run_setup(): author='Emmett Butler', author_email='support@parsely.com', url='https://github.com/Parsely/parsely_raw_data', - description='Utilities for accessing raw Parse.ly data', + description='Utilities for accessing the Parse.ly Data Pipeline', + long_description='Utilities for accessing the Parse.ly Data Pipeline', keywords='parsely s3 kinesis redshift firehose bigquery', license='Apache License 2.0', packages=find_packages(),