Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat/incremental-models #316

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/run_unit_tests_on_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ jobs:
pip install dbt-core
pip install dbt-bigquery
pip install pytest
pip install pyarrow

- name: Run tests
run: python -m pytest .
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -171,4 +171,6 @@ cython_debug/
.idea/

# mac pc specific - system configuration files
.DS_Store
.DS_Store

.local*
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,14 @@ vars:

## Optional Variables

### Exclude User Id models
Many websites do not implement a log-in feature, which leads to an undefined (or null) `user_id` in GA4 data. Although this is not recommended, you can disable the `stg_ga4__user_id_mapping` and `fct_ga4__user_ids` models by setting `is_user_id_implemented` to `false` (it defaults to `true`).
```
vars:
ga4:
is_user_id_implemented: false
```

### Query Parameter Exclusions

Setting `query_parameter_exclusions` will remove query string parameters from the `page_location` and `page_referrer` fields for all downstream processing. Original parameters are captured in the `original_page_location` and `original_page_referrer` fields. Ex:
Expand Down
30 changes: 30 additions & 0 deletions dbt_project.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ seed-paths: ["seeds"]
macro-paths: ["macros"]
snapshot-paths: ["snapshots"]

profile: "sado_analytics_services"

target-path: "target" # directory which will store compiled SQL files
clean-targets: # directories to be removed by `dbt clean`
- "target"
Expand All @@ -18,3 +20,31 @@ models:
+materialized: view
marts:
+materialized: table
vars:
ga4:
source_project: "agile-scheme-394814"
property_ids: [336884118]
start_date: "20231001"
static_incremental_days: 2
query_parameter_exclusions: ["gclid", "fbclid", "_ga"]
query_parameter_extraction: ["gclid", "fbclid", "keyword"]
conversion_events: ["purchase", "message", "call", "generate_lead", "store_visit"]
user_properties:
- user_property_name: "phone"
value_type: "string_value"
- user_property_name: "address"
value_type: "string_value"
- user_property_name: "email"
value_type: "string_value"
- user_property_name: "name"
value_type: "string_value"
derived_user_properties:
- event_parameter: "page_location"
user_property_name: "most_recent_page_location"
value_type: "string_value"
- event_parameter: "store"
user_property_name: "visited_branch"
value_type: "string_value"
- event_parameter: "location"
user_property_name: "searched_branch"
value_type: "string_value"
71 changes: 65 additions & 6 deletions models/marts/core/dim_ga4__client_keys.sql
Original file line number Diff line number Diff line change
@@ -1,9 +1,57 @@
-- Mart for dimensions related to user devices (based on client_key)
-- Built incrementally with the merge strategy, keyed on client_key and partitioned by day on last_seen_at.
-- merge_update_columns limits what a matched row may overwrite to the listed last_* columns;
-- the first_* columns are therefore kept as originally inserted.
-- NOTE(review): any column produced by this model that is not listed below (for example user-property
-- columns joined further down) will presumably never be refreshed for existing client keys - confirm this is intended.
{{
config(
materialized = 'incremental',
incremental_strategy = 'merge',
unique_key = ['client_key'],
tags = ["incremental"],
partition_by={
"field": "last_seen_at",
"data_type": "timestamp",
"granularity": "day"
},
on_schema_change = 'sync_all_columns',
merge_update_columns = [
'last_geo_continent',
'last_geo_country',
'last_geo_region',
'last_geo_city',
'last_geo_sub_continent',
'last_geo_metro',
'last_device_category',
'last_device_mobile_brand_name',
'last_device_mobile_model_name',
'last_device_mobile_marketing_name',
'last_device_mobile_os_hardware_model',
'last_device_operating_system',
'last_device_operating_system_version',
'last_device_vendor_id',
'last_device_advertising_id',
'last_device_language',
'last_device_is_limited_ad_tracking',
'last_device_time_zone_offset_seconds',
'last_device_browser',
'last_device_browser_version',
'last_device_web_info_browser',
'last_device_web_info_browser_version',
'last_device_web_info_hostname',
'last_user_campaign',
'last_user_medium',
'last_user_source',
'last_seen_at',
'last_page_location',
'last_page_hostname',
'last_page_referrer',
],
)
}}

with include_first_last_events as (
    -- First/last event attributes per client_key. On incremental runs only clients whose
    -- last_seen_at date falls inside the trailing static_incremental_days window (default 3) are re-read.
    select first_last_events.*
    from {{ ref('stg_ga4__client_key_first_last_events') }} as first_last_events
    {% if is_incremental() %}
    where date(first_last_events.last_seen_at) >= date_sub(current_date, interval {{ var('static_incremental_days', 3) | int }} day)
    {% endif %}
),
include_first_last_page_views as (
select
Expand All @@ -13,21 +61,32 @@ include_first_last_page_views as (
first_last_page_views.first_page_referrer,
first_last_page_views.last_page_location,
first_last_page_views.last_page_hostname,
first_last_page_views.last_page_referrer
first_last_page_views.last_page_referrer,
from include_first_last_events
left join {{ref('stg_ga4__client_key_first_last_pageviews')}} as first_last_page_views using (client_key)
{% if is_incremental() %}
where date(first_last_page_views.last_seen_at) >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day)
{% endif %}
),
include_user_properties as (
    -- Adds the optional user-property column sets to every client_key row.
    -- Each set is selected and joined only when its project variable
    -- (derived_user_properties / user_properties) is configured; otherwise the
    -- CTE is a pass-through of include_first_last_page_views.
    select
        p.*
        {% if var('derived_user_properties', false) %}
        , dup.* except (last_updated, client_key)
        {% endif %}
        {% if var('user_properties', false) %}
        , up.* except (last_updated, client_key)
        {% endif %}
    from include_first_last_page_views as p
    {% if var('derived_user_properties', false) %}
    -- If derived user properties have been assigned as variables, join them on the client_key.
    -- Left join (not inner) so client keys without any derived property stay in the dimension.
    left join {{ ref('stg_ga4__derived_user_properties') }} as dup using (client_key)
    {% endif %}
    {% if var('user_properties', false) %}
    -- If user properties have been assigned as variables, join them on the client_key.
    -- Left join (not inner) so client keys without any user property stay in the dimension.
    left join {{ ref('stg_ga4__user_properties') }} as up using (client_key)
    {% endif %}
)
Expand Down
39 changes: 33 additions & 6 deletions models/marts/core/dim_ga4__sessions.sql
Original file line number Diff line number Diff line change
@@ -1,16 +1,40 @@
-- Incremental model that uses the insert_overwrite strategy on daily session_partition_date partitions.
-- NOTE(review): unique_key and merge_exclude_columns are merge-strategy options; combined with
-- insert_overwrite they presumably have no effect. Confirm which strategy is intended and drop
-- whichever settings do not apply.
{{
config(
materialized = 'incremental',
incremental_strategy = 'insert_overwrite',
tags = ["incremental"],
on_schema_change = 'sync_all_columns',
unique_key = ['session_key'],
partition_by={
"field": "session_partition_date",
"data_type": "date",
"granularity": "day"
},
merge_exclude_columns= [
'session_partition_date'
]
)
}}


-- Dimension table for sessions based on the first event that isn't session_start or first_visit.
with session_first_event as
(
select *
from {{ref('stg_ga4__events')}}
where event_name != 'first_visit'
and event_name != 'session_start'
qualify row_number() over(partition by session_key order by event_timestamp) = 1
select e.*
from {{ref('stg_ga4__events')}} e
inner join {{ref("stg_ga4__sessions_first_last_pageviews")}} pv
on e.session_key = pv.session_key and e.event_date_dt = date(pv.first_page_view_event_time)
where e.event_name != 'first_visit'
and e.event_name != 'session_start'
{% if is_incremental() %}
and e.event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day)
{% endif %}
qualify row_number() over(partition by e.session_key order by event_timestamp) = 1
),
session_start_dims as (
select
session_key,
event_date_dt as session_start_date,
event_date_dt as session_partition_date,
event_timestamp as session_start_timestamp,
page_path as landing_page_path,
page_location as landing_page,
Expand Down Expand Up @@ -59,6 +83,9 @@ join_traffic_source as (
sessions_traffic_sources.session_source_category
from session_start_dims
left join {{ref('stg_ga4__sessions_traffic_sources')}} sessions_traffic_sources using (session_key)
{% if is_incremental() %}
where sessions_traffic_sources.session_partition_date >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day)
{% endif %}
),
include_session_properties as (
select
Expand Down
4 changes: 3 additions & 1 deletion models/marts/core/fct_ga4__client_keys.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ select
client_key,
stream_id,
min(session_start_timestamp) as first_seen_timestamp,
min(session_start_date) as first_seen_start_date,
max(session_start_timestamp) as last_seen_session_timestamp,
min(session_start_date) as first_seen_date,
max(session_start_date) as last_seen_date,
sum(count_pageviews) as count_pageviews,
sum(is_session_engaged) as count_engaged_sessions,
sum(sum_event_value_in_usd) as sum_event_value_in_usd,
Expand Down
13 changes: 5 additions & 8 deletions models/marts/core/fct_ga4__pages.sql
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
{% set partitions_to_replace = ['current_date'] %}
{% for i in range(var('static_incremental_days')) %}
{% set partitions_to_replace = partitions_to_replace.append('date_sub(current_date, interval ' + (i+1)|string + ' day)') %}
{% endfor %}
{{
config(
materialized = 'incremental',
incremental_strategy = 'insert_overwrite',
incremental_strategy = 'merge',
unique_key = ['event_date_dt', 'stream_id' , 'page_location'],
tags = ["incremental"],
partition_by={
"field": "event_date_dt",
"data_type": "date",
"granularity": "day"
},
partitions = partitions_to_replace
on_schema_change = 'sync_all_columns',
)
}}

Expand All @@ -29,7 +26,7 @@ with page_view as (
sum(entrances) as entrances,
from {{ref('stg_ga4__event_page_view')}}
{% if is_incremental() %}
where event_date_dt in ({{ partitions_to_replace | join(',') }})
where event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3)}} day)
{% endif %}
group by 1,2,3,4,5
), page_engagement as (
Expand All @@ -54,7 +51,7 @@ from {{ref('stg_ga4__event_page_view')}}
count(event_name) as scroll_events
from {{ref('stg_ga4__event_scroll')}}
{% if is_incremental() %}
where event_date_dt in ({{ partitions_to_replace | join(',') }})
where event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3)}} day)
{% endif %}
group by 1,2
)
Expand Down
17 changes: 17 additions & 0 deletions models/marts/core/fct_ga4__sessions.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,18 @@
-- Stay mindful of performance/cost when using this model. Making this model partitioned on date is not possible because there's no way to create a single record per session AND partition on date.
-- NOTE(review): the statement above conflicts with the config below, which now partitions by day on
-- session_start_date and merges on (session_key, client_key). Confirm that a session spanning midnight
-- still resolves to exactly one row per key, then update or remove the sentence above.
{{
config(
materialized = 'incremental',
incremental_strategy = 'merge',
unique_key = ['session_key','client_key'],
tags = ["incremental"],
partition_by={
"field": "session_start_date",
"data_type": "date",
"granularity": "day"
},
on_schema_change = 'sync_all_columns',
)
}}

select
client_key,
Expand All @@ -18,5 +32,8 @@ select
{% endfor %}
{% endif %}
from {{ref('fct_ga4__sessions_daily')}}
{% if is_incremental() %}
where session_partition_date >= date_sub(current_date, interval {{var('static_incremental_days',3)}} day)
{% endif %}
group by 1,2,3

13 changes: 5 additions & 8 deletions models/marts/core/fct_ga4__sessions_daily.sql
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
{% set partitions_to_replace = ['current_date'] %}
{% for i in range(var('static_incremental_days')) %}
{% set partitions_to_replace = partitions_to_replace.append('date_sub(current_date, interval ' + (i+1)|string + ' day)') %}
{% endfor %}
{{
config(
materialized = 'incremental',
incremental_strategy = 'insert_overwrite',
incremental_strategy = 'merge',
tags = ["incremental"],
partition_by={
"field": "session_partition_date",
"data_type": "date",
"granularity": "day"
},
partitions = partitions_to_replace
unique_key = ['session_key','session_partition_key'],
on_schema_change = 'sync_all_columns'
)
}}

Expand All @@ -34,7 +31,7 @@ with session_metrics as (
from {{ref('stg_ga4__events')}}
where session_key is not null
{% if is_incremental() %}
and event_date_dt in ({{ partitions_to_replace | join(',') }})
and event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day)
{% endif %}
group by 1,2,3,4
)
Expand All @@ -46,7 +43,7 @@ with session_metrics as (
select * from {{ref('stg_ga4__session_conversions_daily')}}
where 1=1
{% if is_incremental() %}
and session_partition_date in ({{ partitions_to_replace | join(',') }})
and session_partition_date >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day)
{% endif %}
),
join_metrics_and_conversions as (
Expand Down
12 changes: 11 additions & 1 deletion models/marts/core/fct_ga4__user_ids.sql
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@

-- This model is built only when the is_user_id_implemented project variable is truthy (defaults to true).
{{ config(enabled = var('is_user_id_implemented', true)) }}


with user_id_mapped as (
select
client_keys.*,
Expand All @@ -16,7 +24,9 @@ select
stream_id,
max(is_user_id) as is_user_id,
min(first_seen_timestamp) as first_seen_timestamp,
min(first_seen_start_date) as first_seen_start_date,
max(last_seen_session_timestamp) as last_seen_session_timestamp,
min(first_seen_date) as first_seen_date,
max(last_seen_date) as last_seen_date,
sum(count_pageviews) as count_pageviews,
sum(count_engaged_sessions) as count_engaged_sessions,
sum(sum_event_value_in_usd) as sum_event_value_in_usd,
Expand Down
Loading
Loading