From b9d5fa759cfe2aa0994ba7bfb0927eeacf5974f4 Mon Sep 17 00:00:00 2001 From: Jeremy Cohen Date: Fri, 24 May 2019 13:01:49 -0400 Subject: [PATCH 1/4] Fix pass through cols --- .../adapters/default/page_views/snowplow_page_views.sql | 2 +- .../adapters/default/page_views/snowplow_web_events.sql | 2 +- macros/adapters/default/sessions/snowplow_sessions.sql | 2 +- .../adapters/default/sessions/snowplow_sessions_tmp.sql | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/macros/adapters/default/page_views/snowplow_page_views.sql b/macros/adapters/default/page_views/snowplow_page_views.sql index 959b6bd..1a11fef 100644 --- a/macros/adapters/default/page_views/snowplow_page_views.sql +++ b/macros/adapters/default/page_views/snowplow_page_views.sql @@ -307,7 +307,7 @@ prep as ( {%- for column in var('snowplow:pass_through_columns') %} , a.{{column}} - {% endfor -%} + {% endfor %} from web_events_fixed as a inner join web_events_time as b on a.page_view_id = b.page_view_id diff --git a/macros/adapters/default/page_views/snowplow_web_events.sql b/macros/adapters/default/page_views/snowplow_web_events.sql index 5e73c2b..81d6aaa 100644 --- a/macros/adapters/default/page_views/snowplow_web_events.sql +++ b/macros/adapters/default/page_views/snowplow_web_events.sql @@ -128,7 +128,7 @@ prep as ( {%- for column in var('snowplow:pass_through_columns') %} , ev.{{column}} - {% endfor -%} + {% endfor %} from events as ev inner join web_page_context as wp on ev.event_id = wp.root_id diff --git a/macros/adapters/default/sessions/snowplow_sessions.sql b/macros/adapters/default/sessions/snowplow_sessions.sql index 0d85344..b98de1f 100644 --- a/macros/adapters/default/sessions/snowplow_sessions.sql +++ b/macros/adapters/default/sessions/snowplow_sessions.sql @@ -107,7 +107,7 @@ stitched as ( {%- for column in var('snowplow:pass_through_columns') %} , first_{{column}} , last_{{column}} - {% endfor -%} + {% endfor %} from snowplow_sessions as s left outer join id_map as id on s.user_snowplow_domain_id = id.domain_userid diff --git a/macros/adapters/default/sessions/snowplow_sessions_tmp.sql b/macros/adapters/default/sessions/snowplow_sessions_tmp.sql index bd91a6c..82d1e96 100644 --- a/macros/adapters/default/sessions/snowplow_sessions_tmp.sql +++ b/macros/adapters/default/sessions/snowplow_sessions_tmp.sql @@ -66,12 +66,12 @@ prep AS ( sum(time_engaged_in_s) as time_engaged_in_s, max(case when last_page_view_in_session = 1 then page_url else null end) - as exit_page_url, + as exit_page_url {%- for column in var('snowplow:pass_through_columns') %} - max(case when last_page_view_in_session = 1 then {{column}} else null end) + , max(case when last_page_view_in_session = 1 then {{column}} else null end) as {{column}} - {% endfor -%} + {% endfor %} from web_page_views @@ -204,7 +204,7 @@ sessions as ( {%- for column in var('snowplow:pass_through_columns') %} , a.{{column}} as first_{{column}} , b.{{column}} as last_{{column}} - {% endfor -%} + {% endfor %} from web_page_views as a inner join prep as b on a.session_id = b.session_id From bf9cf6a225c6b9a4f4df6e0ee07116127d12679f Mon Sep 17 00:00:00 2001 From: Jeremy Cohen Date: Fri, 24 May 2019 13:02:03 -0400 Subject: [PATCH 2/4] Clean up lookml --- lookml/bigquery/snowplow_sessions.view.lkml | 2 +- lookml/snowplow_sessions.view.lkml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lookml/bigquery/snowplow_sessions.view.lkml b/lookml/bigquery/snowplow_sessions.view.lkml index 24e48b4..7762704 100644 --- a/lookml/bigquery/snowplow_sessions.view.lkml +++ b/lookml/bigquery/snowplow_sessions.view.lkml @@ -1,5 +1,5 @@ view: snowplow_sessions { - sql_table_name: dbt_jthandy.snowplow_sessions ;; + sql_table_name: analytics.snowplow_sessions ;; #Session identifying information dimension: session_id { diff --git a/lookml/snowplow_sessions.view.lkml b/lookml/snowplow_sessions.view.lkml index adca055..1e4bc3c 100644 --- a/lookml/snowplow_sessions.view.lkml +++ b/lookml/snowplow_sessions.view.lkml @@ -4,6 +4,7 @@ view: snowplow_sessions { #Session identifying information dimension: session_id { type: string + primary_key: yes sql: ${TABLE}.session_id ;; } From e76cfebdb90af96ad41837f7fa19d7ef91a141f5 Mon Sep 17 00:00:00 2001 From: Jeremy Cohen Date: Fri, 24 May 2019 13:02:39 -0400 Subject: [PATCH 3/4] Clean up readme --- README.md | 143 +++++++++++++++++++--------------------------------- pg_udfs.sql | 67 ++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 92 deletions(-) create mode 100644 pg_udfs.sql diff --git a/README.md b/README.md index 5c17698..8174bbf 100644 --- a/README.md +++ b/README.md @@ -1,32 +1,33 @@ -# snowplow data models +# Snowplow sessionization -dbt data models for snowplow analytics. Adapted from Snowplow's [web model](https://github.com/snowplow/web-data-model) +dbt data models for sessionizing Snowplow data. Adapted from Snowplow's [web model](https://github.com/snowplow/web-data-model). -### models ### +### Models ### -The primary data models contained in this package are described below. While other models exist, -they are primarily used to build the two primary models listed here. +The primary ouputs of this package are **page views** and **sessions**. There are +several intermediate models tasked with created these two. | model | description | |-------|-------------| | snowplow_page_views | Contains a list of pageviews with scroll depth, view timing, and optionally useragent and performance data. | -| snowplow_sessions | Contains a rollup of page views indexed by cookie id (`user_snowplow_domain_id`) | +| snowplow_sessions | Contains a rollup of page views indexed by cookie id (`domain_sessionid`) | ![snowplow graph](/etc/snowplow_graph.png) -### installation ### +### Installation ### -- add the following lines to the bottom of your `dbt_project.yml` file: +- Include the following in your `packages.yml` file: ```YAML -repositories: - - https://github.com/fishtown-analytics/snowplow.git +packages: + - package: fishtown-analytics/snowplow + version: 0.7.3 ``` -- run `dbt deps`. +- Run `dbt deps` to install the package. -### configuration ### +## Configuration ### -The [variables](http://dbt.readthedocs.io/en/master/guide/context-variables/#arbitrary-configuration-variables) needed to configure this package are as follows: +The [variables](https://docs.getdbt.com/docs/using-variables) needed to configure this package are as follows: | variable | information | required | |----------|-------------|:--------:| @@ -36,8 +37,10 @@ The [variables](http://dbt.readthedocs.io/en/master/guide/context-variables/#arb |snowplow:context:web_page|Schema and table for web page context|Yes| |snowplow:context:performance_timing|Schema and table for perf timing context, or `false` if none is present|Yes| |snowplow:context:useragent|Schema and table for useragent context, or `false` if none is available|Yes| +|snowplow:pass_through_columns|Additional columns for inclusion in final models|Yes| + +An example `dbt_project.yml` configuration: -An example `dbt_project.yml` configuration is provided below ```yml # dbt_project.yml @@ -52,94 +55,50 @@ models: 'snowplow:context:web_page': "{{ ref('sp_base_web_page_context') }}" 'snowplow:context:performance_timing': false 'snowplow:context:useragent': false + 'snowplow:pass_through_columns': [] base: optional: enabled: false page_views: optional: enabled: false +``` -... +### Database support -repositories: - - "git@github.com:fishtown-analytics/snowplow.git" -``` +* Redshift +* Snowflake +* BigQuery +* Postgres, with the creation of [these UDFs](pg_udfs.sql) in your database -### database support - -These models were written for Redshift, but have also been tested with Postgres. If you're using Postgres, create the following UDFs in your database. - -```sql -create function convert_timezone( - in_tzname text, - out_tzname text, - in_t timestamptz - ) returns timestamptz -as $$ -declare -begin - return in_t at time zone out_tzname at time zone in_tzname; -end; -$$ language plpgsql; -``` +These models were originally written for Redshift, with added support for +Snowflake and BigQuery. To run on Postgres, create [these UDFs] in your database. -```sql -create or replace function datediff( - units varchar(30), - start_t timestamp, - end_t timestamp) returns int -as $$ -declare - diff_interval interval; - diff int = 0; - years_diff int = 0; -begin - if units in ('yy', 'yyyy', 'year', 'mm', 'm', 'month') then - years_diff = date_part('year', end_t) - date_part('year', start_t); - - if units in ('yy', 'yyyy', 'year') then - -- sql server does not count full years passed (only difference between year parts) - return years_diff; - else - -- if end month is less than start month it will subtracted - return years_diff * 12 + (date_part('month', end_t) - date_part('month', start_t)); - end if; - end if; - - -- Minus operator returns interval 'DDD days HH:MI:SS' - diff_interval = end_t - start_t; - - diff = diff + date_part('day', diff_interval); - - if units in ('wk', 'ww', 'week') then - diff = diff/7; - return diff; - end if; - - if units in ('dd', 'd', 'day') then - return diff; - end if; - - diff = diff * 24 + date_part('hour', diff_interval); - - if units in ('hh', 'hour') then - return diff; - end if; - - diff = diff * 60 + date_part('minute', diff_interval); - - if units in ('mi', 'n', 'minute') then - return diff; - end if; - - diff = diff * 60 + date_part('second', diff_interval); - - return diff; -end; -$$ language plpgsql; -``` +### Contributions ### + +Additional contributions to this package are very welcome! Please create issues +or open PRs against `master`. -### contribution ### +Much of tracking can be the Wild West. Snowplow's canonical event model is a major +asset in our ability to perform consistent analysis atop predictably structured +data, but any detailed implementation is bound to diverge. -Additional contributions to this repo are very welcome! Please submit PRs to master. All PRs should only include functionality that is contained within all snowplow deployments; no implementation-specific details should be included. +To that end, we aim to keep this package rooted in a garden-variety Snowplow web +deployment. All PRs should seek to add or improve functionality that is contained +within a plurality of snowplow deployments. + +If you need to change implementation-specific details, you have two avenues: + +* Override models from this package with versions that feature your custom logic. +Create a model with the same name locally (e.g. `snowplow_id_map`) and disable the `snowplow` +package's version in `dbt_project.yml`: + +```yml +snowplow: + ... + identification: + snowplow_id_map: + enabled: false +``` +* Fork this repository :) diff --git a/pg_udfs.sql b/pg_udfs.sql new file mode 100644 index 0000000..4a6eb75 --- /dev/null +++ b/pg_udfs.sql @@ -0,0 +1,67 @@ +-- convert_timezone +create function convert_timezone( + in_tzname text, + out_tzname text, + in_t timestamptz + ) returns timestamptz +as $$ +declare +begin + return in_t at time zone out_tzname at time zone in_tzname; +end; +$$ language plpgsql; + +-- datediff +create or replace function datediff( + units varchar(30), + start_t timestamp, + end_t timestamp) returns int +as $$ +declare + diff_interval interval; + diff int = 0; + years_diff int = 0; +begin + if units in ('yy', 'yyyy', 'year', 'mm', 'm', 'month') then + years_diff = date_part('year', end_t) - date_part('year', start_t); + + if units in ('yy', 'yyyy', 'year') then + -- sql server does not count full years passed (only difference between year parts) + return years_diff; + else + -- if end month is less than start month it will subtracted + return years_diff * 12 + (date_part('month', end_t) - date_part('month', start_t)); + end if; + end if; + + -- Minus operator returns interval 'DDD days HH:MI:SS' + diff_interval = end_t - start_t; + + diff = diff + date_part('day', diff_interval); + + if units in ('wk', 'ww', 'week') then + diff = diff/7; + return diff; + end if; + + if units in ('dd', 'd', 'day') then + return diff; + end if; + + diff = diff * 24 + date_part('hour', diff_interval); + + if units in ('hh', 'hour') then + return diff; + end if; + + diff = diff * 60 + date_part('minute', diff_interval); + + if units in ('mi', 'n', 'minute') then + return diff; + end if; + + diff = diff * 60 + date_part('second', diff_interval); + + return diff; +end; +$$ language plpgsql; \ No newline at end of file From 8e8956845569b8ddb4db97555a2873b2900c8ad6 Mon Sep 17 00:00:00 2001 From: Jeremy Cohen Date: Mon, 27 May 2019 11:46:20 -0400 Subject: [PATCH 4/4] Resolve review comments --- README.md | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 8174bbf..ee9cdbe 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ dbt data models for sessionizing Snowplow data. Adapted from Snowplow's [web mod ### Models ### The primary ouputs of this package are **page views** and **sessions**. There are -several intermediate models tasked with created these two. +several intermediate models used to create these two models. | model | description | |-------|-------------| @@ -37,7 +37,7 @@ The [variables](https://docs.getdbt.com/docs/using-variables) needed to configur |snowplow:context:web_page|Schema and table for web page context|Yes| |snowplow:context:performance_timing|Schema and table for perf timing context, or `false` if none is present|Yes| |snowplow:context:useragent|Schema and table for useragent context, or `false` if none is available|Yes| -|snowplow:pass_through_columns|Additional columns for inclusion in final models|Yes| +|snowplow:pass_through_columns|Additional columns for inclusion in final models|No| An example `dbt_project.yml` configuration: @@ -69,11 +69,7 @@ models: * Redshift * Snowflake * BigQuery -* Postgres, with the creation of [these UDFs](pg_udfs.sql) in your database - -These models were originally written for Redshift, with added support for -Snowflake and BigQuery. To run on Postgres, create [these UDFs] in your database. - +* Postgres, with the creation of [these UDFs](pg_udfs.sql) ### Contributions ###