From f1a81624e357639c34a6c15254a65f405a82f854 Mon Sep 17 00:00:00 2001 From: Chien Le Date: Wed, 27 Mar 2024 11:24:45 +0700 Subject: [PATCH 1/9] first commit --- .gitignore | 5 ++++- dbt_project.yml | 20 -------------------- 2 files changed, 4 insertions(+), 21 deletions(-) delete mode 100644 dbt_project.yml diff --git a/.gitignore b/.gitignore index 84063871..64648eef 100644 --- a/.gitignore +++ b/.gitignore @@ -171,4 +171,7 @@ cython_debug/ .idea/ #mac pc specific - system configuratio files -.DS_Store \ No newline at end of file +.DS_Store + +.local* +dbt_project.yml \ No newline at end of file diff --git a/dbt_project.yml b/dbt_project.yml deleted file mode 100644 index b758f5e5..00000000 --- a/dbt_project.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: 'ga4' -version: '6.0.1' -config-version: 2 -model-paths: ["models"] -analysis-paths: ["analyses"] -test-paths: ["tests"] -seed-paths: ["seeds"] -macro-paths: ["macros"] -snapshot-paths: ["snapshots"] - -target-path: "target" # directory which will store compiled SQL files -clean-targets: # directories to be removed by `dbt clean` - - "target" - - "dbt_packages" - -models: - ga4: - +materialized: view - marts: - +materialized: table From fb6b3a50a99da37c8600bd2ae98a0315eb07bb49 Mon Sep 17 00:00:00 2001 From: Chien Le Date: Wed, 27 Mar 2024 17:37:32 +0700 Subject: [PATCH 2/9] feat: convert ga4 models to incremental --- .gitattributes | 3 + .github/pull_request_template.md | 10 + .github/workflows/run_unit_tests_on_pr.yml | 37 + .gitignore | 176 ++++ LICENSE | 21 + README.md | 335 +++++++ TODO.md | 35 + analyses/.gitkeep | 0 dbt_project.yml | 22 + macros/.gitkeep | 0 macros/base_select.sql | 162 ++++ macros/combine_property_data.sql | 38 + macros/create_custom_event.sql | 15 + macros/default_channel_grouping.sql | 148 ++++ macros/stage_custom_parameters.sql | 6 + macros/unnest_key.sql | 20 + macros/unpack_struct.sql | 5 + macros/url_parsing.sql | 23 + models/marts/core/core.yml | 23 + 
models/marts/core/dim_ga4__client_keys.sql | 85 ++ models/marts/core/dim_ga4__sessions.sql | 95 ++ models/marts/core/dim_ga4__sessions_daily.sql | 175 ++++ models/marts/core/dim_ga4__sessions_daily.yml | 26 + models/marts/core/fct_ga4__client_keys.sql | 20 + models/marts/core/fct_ga4__pages.sql | 77 ++ models/marts/core/fct_ga4__pages.yml | 13 + models/marts/core/fct_ga4__sessions.sql | 39 + models/marts/core/fct_ga4__sessions.yml | 18 + models/marts/core/fct_ga4__sessions_daily.sql | 66 ++ models/marts/core/fct_ga4__sessions_daily.yml | 19 + models/marts/core/fct_ga4__user_ids.sql | 34 + models/marts/core/fct_ga4__user_ids.yml | 11 + models/staging/base/base_ga4__events.sql | 36 + models/staging/base/base_ga4__events.yml | 47 + .../staging/events/stg_ga4__event_click.sql | 26 + .../events/stg_ga4__event_file_download.sql | 24 + .../events/stg_ga4__event_first_visit.sql | 17 + .../events/stg_ga4__event_page_view.sql | 20 + .../events/stg_ga4__event_page_view.yml | 10 + .../staging/events/stg_ga4__event_scroll.sql | 14 + .../staging/events/stg_ga4__event_scroll.yml | 8 + .../events/stg_ga4__event_session_start.sql | 15 + .../events/stg_ga4__event_session_start.yml | 5 + .../events/stg_ga4__event_user_engagement.sql | 15 + .../events/stg_ga4__event_video_complete.sql | 23 + .../events/stg_ga4__event_video_start.sql | 23 + .../stg_ga4__event_view_search_results.sql | 18 + models/staging/recommended_events/README.md | 40 + .../stg_ga4__event_add_payment_info.sql | 23 + .../stg_ga4__event_add_shipping_info.sql | 22 + .../stg_ga4__event_add_to_cart.sql | 22 + .../stg_ga4__event_add_to_wishlist.sql | 22 + .../stg_ga4__event_begin_checkout.sql | 21 + .../stg_ga4__event_generate_lead.sql | 21 + .../stg_ga4__event_login.sql | 19 + .../stg_ga4__event_purchase.sql | 33 + .../stg_ga4__event_purchase.yml | 5 + .../stg_ga4__event_purchase_deduplicated.sql | 27 + .../stg_ga4__event_refund.sql | 35 + .../stg_ga4__event_remove_from_cart.sql | 22 + .../stg_ga4__event_search.sql | 20 
+ .../stg_ga4__event_select_item.sql | 20 + .../stg_ga4__event_select_promotion.sql | 20 + .../stg_ga4__event_share.sql | 23 + .../stg_ga4__event_sign_up.sql | 19 + .../stg_ga4__event_view_cart.sql | 20 + .../stg_ga4__event_view_item.sql | 22 + .../stg_ga4__event_view_item_list.sql | 20 + .../stg_ga4__event_view_promotion.sql | 20 + models/staging/src_ga4.yml | 16 + .../stg_ga4__client_key_first_last_events.sql | 133 +++ .../stg_ga4__client_key_first_last_events.yml | 10 + ...g_ga4__client_key_first_last_pageviews.sql | 63 ++ ...g_ga4__client_key_first_last_pageviews.yml | 10 + .../stg_ga4__derived_session_properties.sql | 32 + .../stg_ga4__derived_session_properties.yml | 11 + ..._ga4__derived_session_properties_daily.sql | 49 ++ ..._ga4__derived_session_properties_daily.yml | 11 + .../stg_ga4__derived_user_properties.sql | 42 + .../stg_ga4__derived_user_properties.yml | 10 + models/staging/stg_ga4__event_items.sql | 38 + models/staging/stg_ga4__event_items.yml | 5 + .../stg_ga4__event_to_query_string_params.sql | 24 + .../stg_ga4__event_to_query_string_params.yml | 6 + models/staging/stg_ga4__events.sql | 100 +++ models/staging/stg_ga4__events.yml | 20 + models/staging/stg_ga4__page_conversions.sql | 11 + models/staging/stg_ga4__page_engaged_time.sql | 32 + models/staging/stg_ga4__page_engaged_time.yml | 10 + .../stg_ga4__session_conversions_daily.sql | 38 + .../stg_ga4__session_conversions_daily.yml | 12 + ...stg_ga4__sessions_first_last_pageviews.sql | 36 + .../stg_ga4__sessions_traffic_sources.sql | 55 ++ .../stg_ga4__sessions_traffic_sources.yml | 17 + ...tg_ga4__sessions_traffic_sources_daily.sql | 86 ++ ...tg_ga4__sessions_traffic_sources_daily.yml | 17 + ..._traffic_sources_last_non_direct_daily.sql | 80 ++ ..._traffic_sources_last_non_direct_daily.yml | 24 + models/staging/stg_ga4__user_id_mapping.sql | 29 + models/staging/stg_ga4__user_id_mapping.yml | 12 + models/staging/stg_ga4__user_properties.sql | 89 ++ models/staging/stg_ga4__user_properties.yml | 
10 + packages.yml | 3 + seeds/.gitkeep | 0 seeds/ga4_source_categories.csv | 820 ++++++++++++++++++ snapshots/.gitkeep | 0 tests/.gitkeep | 0 tests/page_location_with_gclid_is_cpc.sql | 15 + unit_tests/.env.example | 1 + unit_tests/README.md | 26 + unit_tests/conftest.py | 32 + unit_tests/requirements.txt | 2 + .../test_macro_default_channel_grouping.py | 302 +++++++ .../test_macro_exclude_query_parameters.py | 52 ++ ...est_macro_extract_query_parameter_value.py | 54 ++ unit_tests/test_stg_Ga4__user_id_mapping.py | 43 + ...est_stg_ga4__derived_session_properties.py | 74 ++ .../test_stg_ga4__derived_user_properties.py | 74 ++ ...t_stg_ga4__event_to_query_string_params.py | 45 + unit_tests/test_stg_ga4__events.example | 45 + unit_tests/test_stg_ga4__events.todo | 43 + unit_tests/test_stg_ga4__page_conversions.py | 37 + ...test_stg_ga4__session_conversions_daily.py | 42 + ...s_traffic_sources_last_non_direct_daily.py | 40 + .../test_stg_ga4__users_first_last_events.py | 35 + 125 files changed, 5477 insertions(+) create mode 100644 .gitattributes create mode 100644 .github/pull_request_template.md create mode 100644 .github/workflows/run_unit_tests_on_pr.yml create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 TODO.md create mode 100644 analyses/.gitkeep create mode 100644 dbt_project.yml create mode 100644 macros/.gitkeep create mode 100644 macros/base_select.sql create mode 100644 macros/combine_property_data.sql create mode 100644 macros/create_custom_event.sql create mode 100644 macros/default_channel_grouping.sql create mode 100644 macros/stage_custom_parameters.sql create mode 100644 macros/unnest_key.sql create mode 100644 macros/unpack_struct.sql create mode 100644 macros/url_parsing.sql create mode 100644 models/marts/core/core.yml create mode 100644 models/marts/core/dim_ga4__client_keys.sql create mode 100644 models/marts/core/dim_ga4__sessions.sql create mode 100644 
models/marts/core/dim_ga4__sessions_daily.sql create mode 100644 models/marts/core/dim_ga4__sessions_daily.yml create mode 100644 models/marts/core/fct_ga4__client_keys.sql create mode 100644 models/marts/core/fct_ga4__pages.sql create mode 100644 models/marts/core/fct_ga4__pages.yml create mode 100644 models/marts/core/fct_ga4__sessions.sql create mode 100644 models/marts/core/fct_ga4__sessions.yml create mode 100644 models/marts/core/fct_ga4__sessions_daily.sql create mode 100644 models/marts/core/fct_ga4__sessions_daily.yml create mode 100644 models/marts/core/fct_ga4__user_ids.sql create mode 100644 models/marts/core/fct_ga4__user_ids.yml create mode 100644 models/staging/base/base_ga4__events.sql create mode 100644 models/staging/base/base_ga4__events.yml create mode 100644 models/staging/events/stg_ga4__event_click.sql create mode 100644 models/staging/events/stg_ga4__event_file_download.sql create mode 100644 models/staging/events/stg_ga4__event_first_visit.sql create mode 100644 models/staging/events/stg_ga4__event_page_view.sql create mode 100644 models/staging/events/stg_ga4__event_page_view.yml create mode 100644 models/staging/events/stg_ga4__event_scroll.sql create mode 100644 models/staging/events/stg_ga4__event_scroll.yml create mode 100644 models/staging/events/stg_ga4__event_session_start.sql create mode 100644 models/staging/events/stg_ga4__event_session_start.yml create mode 100644 models/staging/events/stg_ga4__event_user_engagement.sql create mode 100644 models/staging/events/stg_ga4__event_video_complete.sql create mode 100644 models/staging/events/stg_ga4__event_video_start.sql create mode 100644 models/staging/events/stg_ga4__event_view_search_results.sql create mode 100644 models/staging/recommended_events/README.md create mode 100644 models/staging/recommended_events/stg_ga4__event_add_payment_info.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_add_shipping_info.sql create mode 100644 
models/staging/recommended_events/stg_ga4__event_add_to_cart.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_add_to_wishlist.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_begin_checkout.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_generate_lead.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_login.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_purchase.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_purchase.yml create mode 100644 models/staging/recommended_events/stg_ga4__event_purchase_deduplicated.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_refund.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_remove_from_cart.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_search.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_select_item.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_select_promotion.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_share.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_sign_up.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_view_cart.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_view_item.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_view_item_list.sql create mode 100644 models/staging/recommended_events/stg_ga4__event_view_promotion.sql create mode 100644 models/staging/src_ga4.yml create mode 100644 models/staging/stg_ga4__client_key_first_last_events.sql create mode 100644 models/staging/stg_ga4__client_key_first_last_events.yml create mode 100644 models/staging/stg_ga4__client_key_first_last_pageviews.sql create mode 100644 models/staging/stg_ga4__client_key_first_last_pageviews.yml create mode 100644 
models/staging/stg_ga4__derived_session_properties.sql create mode 100644 models/staging/stg_ga4__derived_session_properties.yml create mode 100644 models/staging/stg_ga4__derived_session_properties_daily.sql create mode 100644 models/staging/stg_ga4__derived_session_properties_daily.yml create mode 100644 models/staging/stg_ga4__derived_user_properties.sql create mode 100644 models/staging/stg_ga4__derived_user_properties.yml create mode 100644 models/staging/stg_ga4__event_items.sql create mode 100644 models/staging/stg_ga4__event_items.yml create mode 100644 models/staging/stg_ga4__event_to_query_string_params.sql create mode 100644 models/staging/stg_ga4__event_to_query_string_params.yml create mode 100644 models/staging/stg_ga4__events.sql create mode 100644 models/staging/stg_ga4__events.yml create mode 100644 models/staging/stg_ga4__page_conversions.sql create mode 100644 models/staging/stg_ga4__page_engaged_time.sql create mode 100644 models/staging/stg_ga4__page_engaged_time.yml create mode 100644 models/staging/stg_ga4__session_conversions_daily.sql create mode 100644 models/staging/stg_ga4__session_conversions_daily.yml create mode 100644 models/staging/stg_ga4__sessions_first_last_pageviews.sql create mode 100644 models/staging/stg_ga4__sessions_traffic_sources.sql create mode 100644 models/staging/stg_ga4__sessions_traffic_sources.yml create mode 100644 models/staging/stg_ga4__sessions_traffic_sources_daily.sql create mode 100644 models/staging/stg_ga4__sessions_traffic_sources_daily.yml create mode 100644 models/staging/stg_ga4__sessions_traffic_sources_last_non_direct_daily.sql create mode 100644 models/staging/stg_ga4__sessions_traffic_sources_last_non_direct_daily.yml create mode 100644 models/staging/stg_ga4__user_id_mapping.sql create mode 100644 models/staging/stg_ga4__user_id_mapping.yml create mode 100644 models/staging/stg_ga4__user_properties.sql create mode 100644 models/staging/stg_ga4__user_properties.yml create mode 100644 packages.yml 
create mode 100644 seeds/.gitkeep create mode 100644 seeds/ga4_source_categories.csv create mode 100644 snapshots/.gitkeep create mode 100644 tests/.gitkeep create mode 100644 tests/page_location_with_gclid_is_cpc.sql create mode 100644 unit_tests/.env.example create mode 100644 unit_tests/README.md create mode 100644 unit_tests/conftest.py create mode 100644 unit_tests/requirements.txt create mode 100644 unit_tests/test_macro_default_channel_grouping.py create mode 100644 unit_tests/test_macro_exclude_query_parameters.py create mode 100644 unit_tests/test_macro_extract_query_parameter_value.py create mode 100644 unit_tests/test_stg_Ga4__user_id_mapping.py create mode 100644 unit_tests/test_stg_ga4__derived_session_properties.py create mode 100644 unit_tests/test_stg_ga4__derived_user_properties.py create mode 100644 unit_tests/test_stg_ga4__event_to_query_string_params.py create mode 100644 unit_tests/test_stg_ga4__events.example create mode 100644 unit_tests/test_stg_ga4__events.todo create mode 100644 unit_tests/test_stg_ga4__page_conversions.py create mode 100644 unit_tests/test_stg_ga4__session_conversions_daily.py create mode 100644 unit_tests/test_stg_ga4__sessions_traffic_sources_last_non_direct_daily.py create mode 100644 unit_tests/test_stg_ga4__users_first_last_events.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..d0e2b9d1 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +# Auto detect text files and perform LF normalization +* text=auto +*.sql linguist-detectable=true diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..d33751fe --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,10 @@ +## Description & motivation + + +## Checklist +- [ ] I have verified that these changes work locally +- [ ] I have updated the README.md (if applicable) +- [ ] I have added tests & descriptions to my models (and macros if applicable) +- [ ] I have run `dbt 
test` and `python -m pytest .` to validate existing tests diff --git a/.github/workflows/run_unit_tests_on_pr.yml b/.github/workflows/run_unit_tests_on_pr.yml new file mode 100644 index 00000000..6a2b6dd4 --- /dev/null +++ b/.github/workflows/run_unit_tests_on_pr.yml @@ -0,0 +1,37 @@ +name: Run Unit Tests on Pull Request + +on: [pull_request_target,workflow_dispatch] +env: + BIGQUERY_PROJECT: ${{ secrets.BIGQUERY_PROJECT }} + +jobs: + pytest_run_all: + name: Pytest Run All + runs-on: ubuntu-latest + defaults: + run: + working-directory: ./unit_tests + steps: + - name: Check out + uses: actions/checkout@v3 + with: + ref: ${{ github.event.pull_request.head.sha }} + + - uses: actions/setup-python@v1 + with: + python-version: "3.11.x" + + - name: Authenticate using service account + run: 'echo "$KEYFILE" > ./dbt-service-account.json' + shell: bash + env: + KEYFILE: ${{secrets.GCP_BIGQUERY_USER_KEYFILE}} + + - name: Install dependencies + run: | + pip install dbt-core + pip install dbt-bigquery + pip install pytest + + - name: Run tests + run: python -m pytest . diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..5752af4d --- /dev/null +++ b/.gitignore @@ -0,0 +1,176 @@ +.vscode +target/ +dbt_packages/ +logs/ + +google-cloud-sdk/ +unit_tests/.env +unit_tests/__pycache__ +unit_tests/.pytest_cache +unit_tests/dbt-service-account.json + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +#mac pc specific - system configuratio files +.DS_Store + +.local* \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..8faf81e4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Adam Ribaudo + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 00000000..5cf39293 --- /dev/null +++ b/README.md @@ -0,0 +1,335 @@ +# GA4 DBT Package + +This [dbt](https://www.getdbt.com/) package connects to an exported GA4 dataset and provides useful transformations as well as report-ready dimensional models that can be used to build reports. + +Features include: +- Flattened models to access common events and event parameters such as `page_view`, `session_start`, and `purchase` +- Conversion of sharded event tables into a single partitioned table +- Incremental loading of GA4 data into your staging tables +- Page, session and user dimensional models with conversion counts +- Last non-direct session attribution +- Simple methods for accessing query parameters (like UTM params) or filtering query parameters (like click IDs) +- Support for custom event parameters & user properties +- Mapping from source/medium to default channel grouping + +# Models + +| model | description | +|-------|-------------| +| stg_ga4__events | Contains cleaned event data that is enhanced with useful event and session keys. 
| +| stg_ga4__event_* | 1 model per event (ex: page_view, purchase) which flattens event parameters specific to that event | +| stg_ga4__event_items | Contains item data associated with e-commerce events (Purchase, add to cart, etc) | +| stg_ga4__event_to_query_string_params | Mapping between each event and any query parameters & values that were contained in the event's `page_location` field | +| stg_ga4__user_properties | Finds the most recent occurrence of specified user_properties for each user | +| stg_ga4__derived_user_properties | Finds the most recent occurrence of specific event_params value and assigns them to a client_key. Derived user properties are specified as variables (see documentation below) | +| stg_ga4__derived_session_properties | Finds the most recent occurrence of specific event_params or user_properties value and assigns them to a session's session_key. Derived session properties are specified as variables (see documentation below) | +| stg_ga4__session_conversions_daily | Produces daily counts of conversions per session. The list of conversion events to include is configurable (see documentation below) | +| stg_ga4__sessions_traffic_sources | Finds the first source, medium, campaign, content, paid search term (from UTM tracking), and default channel grouping for each session. | +| stg_ga4__sessions_traffic_sources_daily | Same data as stg_ga4__sessions_traffic_sources, but partitioned by day to allow for efficient loading and querying of data. | +| stg_ga4__sessions_traffic_sources_last_non_direct_daily | Finds the last non-direct source attributed to each session within a 30-day lookback window. Assumes each session is contained within a day. | +| dim_ga4__client_keys | Dimension table for user devices as indicated by client_keys. Contains attributes such as first and last page viewed.| +| dim_ga4__sessions | Dimension table for sessions which contains useful attributes such as geography, device information, and acquisition data. 
Can be expensive to run on large installs (see `dim_ga4__sessions_daily`) | +| dim_ga4__sessions_daily | Query-optimized session dimension table that is incremental and partitioned on date. Assumes that each partition is contained within a single day | +| fct_ga4__pages | Fact table for pages which aggregates common page metrics by date, stream_id and page_location. | +| fct_ga4__sessions_daily | Fact table for session metrics, partitioned by date. A single session may span multiple rows given that sessions can span multiple days. | +| fct_ga4__sessions | Fact table that aggregates session metrics across days. This table is not partitioned, so be mindful of performance/cost when querying. | + +# Seeds + +| seed file | description | +|-----------|-------------| +| ga4_source_categories.csv| Google's mapping between `source` and `source_category`. Downloaded from https://support.google.com/analytics/answer/9756891?hl=en | + +Be sure to run `dbt seed` before you run `dbt run`. + +# Installation & Configuration +## Install from DBT Package Hub +To pull the latest stable release along with minor updates, add the following to your `packages.yml` file: + +``` +packages: + - package: Velir/ga4 + version: [">=6.0.0", "<6.1.0"] +``` + +## Install From main branch on GitHub + +To install the latest code (may be unstable), add the following to your `packages.yml` file: + +``` +packages: + - git: "https://github.com/Velir/dbt-ga4.git" +``` + +## Install From Local Directory + +1. Clone this repository to a folder in the same parent directory as your DBT project +2. Update your project's `packages.yml` to include a reference to this package: + +``` +packages: + - local: ../dbt-ga4 +``` +## Required Variables + +This package assumes that you have an existing DBT project with a BigQuery profile and a BigQuery GCP instance available with GA4 event data loaded. Source data is defined using the `source_project` and `property_ids` variables below. 
The `static_incremental_days` variable defines how many days' worth of data to reprocess during incremental runs. + +``` +vars: + ga4: + source_project: "my_source_gcp_project" # Project that contains raw GA4 data + property_ids: [11111111] # Array of properties to process + start_date: "YYYYMMDD" # Earliest date to load + static_incremental_days: 3 # Number of days to scan and reprocess on each run +``` + +## Required Variables (Multi-Project Instance) + +When processing multiple properties at a time, the required variables change slightly. See [Multi-Property Support](#multi-property-support) section for details on configuring multiple GA4 properties as a source. + +``` +vars: + ga4: + source_project: "my_source_gcp_project" # Project that contains raw GA4 data + combined_dataset: "my_combined_data" # Dataset where multi-property data is cloned + property_ids: [11111111,2222222] # Array of properties to process + start_date: "YYYYMMDD" # Earliest date to load + static_incremental_days: 3 # Number of days to scan and reprocess on each run +``` + +## Optional Variables + +### Query Parameter Exclusions + +Setting `query_parameter_exclusions` will remove query string parameters from the `page_location` and `page_referrer` fields for all downstream processing. Original parameters are captured in the `original_page_location` and `original_page_referrer` fields. Ex: + +``` +vars: + ga4: + query_parameter_exclusions: ["gclid","fbclid","_ga"] +``` + +### Query Parameter Extraction + +Setting `query_parameter_extraction` will extract query string parameters from the `page_location` field into new columns. This can be used to extract advertising click IDs into columns that can be joined with advertising data sets. Ex: + +``` +vars: + ga4: + query_parameter_extraction: ["gclid","fbclid","keyword"] +``` + + +### Custom Parameters + +Within GA4, you can add custom parameters to any event. 
These custom parameters will be picked up by this package if they are defined as variables within your `dbt_project.yml` file using the following syntax: + +``` +[event name]_custom_parameters + - name: "[name of custom parameter]" + value_type: "[string_value|int_value|float_value|double_value]" +``` + +For example: + +``` +vars: + ga4: + page_view_custom_parameters: + - name: "clean_event" + value_type: "string_value" + - name: "country_code" + value_type: "int_value" +``` + +You can optionally rename the output column: + +``` +vars: + ga4: + page_view_custom_parameters: + - name: "country_code" + value_type: "int_value" + rename_to: "country" +``` + +If there are custom parameters you need on all events, you can define defaults using `default_custom_parameters`, for example: + +``` +vars: + ga4: + default_custom_parameters: + - name: "country_code" + value_type: "int_value" +``` + +### User Properties + +User properties are provided by GA4 in the `user_properties` repeated field. The most recent user property for each user will be extracted and included in the `dim_ga4__users` model by configuring the `user_properties` variable in your project as follows: + +``` +vars: + ga4: + user_properties: + - user_property_name: "membership_level" + value_type: "int_value" + - user_property_name: "account_status" + value_type: "string_value" +``` + +### Derived User Properties + +Derived user properties are different from "User Properties" in that they are derived from event parameters. This provides additional flexibility in allowing users to turn any event parameter into a user property. + +Derived User Properties are included in the `dim_ga4__users` model and contain the latest event parameter value per user. 
+ +``` +derived_user_properties: + - event_parameter: "[your event parameter]" + user_property_name: "[a unique name for the derived user property]" + value_type: "[string_value|int_value|float_value|double_value]" +``` + +For example: + +``` +vars: + ga4: + derived_user_properties: + - event_parameter: "page_location" + user_property_name: "most_recent_page_location" + value_type: "string_value" + - event_parameter: "another_event_param" + user_property_name: "most_recent_param" + value_type: "string_value" +``` + +### Derived Session Properties + +Derived session properties are similar to derived user properties, but on a per-session basis, for properties that change slowly over time. This provides additional flexibility in allowing users to turn any event parameter into a session property. + +Derived Session Properties are included in the `dim_ga4__sessions` and `dim_ga4__sessions_daily` models and contain the latest event parameter or user property value per session. + +``` +derived_session_properties: + - event_parameter: "[your event parameter]" + session_property_name: "[a unique name for the derived session property]" + value_type: "[string_value|int_value|float_value|double_value]" + - user_property: "[your user property key]" + session_property_name: "[a unique name for the derived session property]" + value_type: "[string_value|int_value|float_value|double_value]" +``` + +For example: + +``` +vars: + ga4: + derived_session_properties: + - event_parameter: "page_location" + session_property_name: "most_recent_page_location" + value_type: "string_value" + - event_parameter: "another_event_param" + session_property_name: "most_recent_param" + value_type: "string_value" + - user_property: "first_open_time" + session_property_name: "first_open_time" + value_type: "int_value" +``` + +### GA4 Recommended Events + +See the README file at /dbt_packages/models/staging/recommended_events for instructions on enabling [Google's recommended 
events](https://support.google.com/analytics/answer/9267735?hl=en). + +### Conversion Events + +Specific event names can be specified as conversions by setting the `conversion_events` variable in your `dbt_project.yml` file. These events will be counted against each session and included in the `fct_ga4__sessions` fact model. Ex: + +``` +vars: + ga4: + conversion_events: ['purchase','download'] +``` + +### Session Attribution Lookback Window + +The `stg_ga4__sessions_traffic_sources_last_non_direct_daily` model provides last non-direct session attribution within a configurable lookback window. The default is 30 days, but this can be overridden with the `session_attribution_lookback_window_days` variable. + +``` +vars: + ga4: + session_attribution_lookback_window_days: 90 +``` + +# Custom Events + +Custom events can be generated in your project using the `create_custom_event` macro. Simply create a new model in your project and enter the following: + +``` +{{ ga4.create_custom_event('my_custom_event') }} +``` + +Note, however, that any event-specific custom parameters or default custom parameters must be defined in the global variable space as shown below: + +``` +vars: + default_custom_parameters: + - name: "some_parameter" + value_type: "string_value" + my_custom_event_custom_parameters: + - name: "some_other_parameter" + value_type: "string_value" +``` +# Connecting to BigQuery + +This package assumes that BigQuery is the source of your GA4 data. Full instructions for connecting dbt to BigQuery are here: https://docs.getdbt.com/reference/warehouse-profiles/bigquery-profile + +The easiest option is using OAuth with your Google Account. Summarized instructions are as follows: + +1. Download and initialize gcloud SDK with your Google Account (https://cloud.google.com/sdk/docs/install) +2. 
Run the following command to provide default application OAuth access to BigQuery: + +``` +gcloud auth application-default login --scopes=https://www.googleapis.com/auth/bigquery,https://www.googleapis.com/auth/iam.test +``` +# Unit Testing + +This package uses `pytest` as a method of unit testing individual models. More details can be found in the [unit_tests/README.md](unit_tests) folder. + +# Overriding Default Channel Groupings + +By default, this package maps traffic sources to channel groupings using the `macros/default_channel_grouping.sql` macro. This macro closely adheres to Google's recommended channel groupings documented here: https://support.google.com/analytics/answer/9756891?hl=en . + +Package users can override this macro and implement their own channel groupings by following these steps: +- Create a macro in your project named `default__default_channel_grouping` that accepts the same 4 arguments: source, medium, source_category, campaign +- Implement your custom logic within that macro. It may be easiest to first copy the code from the package macro and modify from there. + +Overriding the package's default channel mapping makes use of dbt's dispatch override capability documented here: https://docs.getdbt.com/reference/dbt-jinja-functions/dispatch#overriding-package-macros + +# Multi-Property Support + +Multiple GA4 properties are supported by listing out the property IDs in the `property_ids` variable. In this scenario, the `static_incremental_days` variable is required and the `combined_dataset` variable will define the dataset (in your profile's target project) where source data will be copied. + +``` +vars: + ga4: + property_ids: [11111111, 22222222, 33333333] + static_incremental_days: 3 + combined_dataset: "my_combined_dataset" +``` + +With these variables set, the `combine_property_data` macro will run as a pre-hook to `base_ga4__events` and clone shards to the target dataset. 
The number of days' worth of data to clone during incremental runs will be based on the `static_incremental_days` variable. + +Jobs that run a large number of clone operations are prone to timing out. As a result, it is recommended that you increase the query timeout if you need to backfill or full-refresh the table, when first setting up or when the base model gets modified. Otherwise, it is best to prevent the base model from rebuilding on full refreshes unless needed to minimize timeouts. + +``` +models: + ga4: + staging: + base: + base_ga4__events: + +full_refresh: false +``` +# dbt Style Guide + +This package attempts to adhere to the Brooklyn Data style guide found [here](https://github.com/brooklyn-data/co/blob/main/sql_style_guide.md). This work is in progress. diff --git a/TODO.md b/TODO.md new file mode 100644 index 00000000..84ca8488 --- /dev/null +++ b/TODO.md @@ -0,0 +1,35 @@ + +# TODO + +- It may be overly expensive to scan ALL events looking for first/last occurrences of user's event parameters. We can move data from 1st & last session into a new table and scan that table instead. +- mechanism to take in an array variable listing custom events and output 1 model per event (is this possible?) +- Add event timing (avg time to next page) metrics +- Anything else to do with `privacy_info` field? Right now removing 'null' client ids from user dim tables. +- Create staging tables for the following events: + - view_promotion + - add_to_cart + - Audience trigger events. See https://support.google.com/analytics/answer/9934109?hl=en + - Special treatment for conversion events? + - Full event reference: + - https://developers.google.com/analytics/devguides/collection/ga4/reference/events + - https://support.google.com/analytics/answer/9216061?hl=en&ref_topic=9756175 +- Review these issues for ideas for our repo: https://github.com/coding-is-for-losers/ga4-bigquery-starter/issues +- Any special considerations for handling >1 data stream? 
+- Implement dev profile considerations to limit processing: https://docs.getdbt.com/docs/guides/best-practices#limit-the-data-processed-when-in-development +- Example of a funnel model https://github.com/teej/sf-funnels +- Review LookML examples for inspiration: https://github.com/llooker/ga_four_block_dev/tree/master/views/event_data_dimensions + - Add landing page / exit page, session start/end time, session duration, is bounce, campaign source to `dim_sessions` model +- Configuration and dynamic templates to create custom event tables and dimensions +- Configuration to create custom dimensions (session, user, event_*) from event parameters +- Use Fivetran's `union_data` method (or something similar) to handle multiple, unioned GA4 exports. https://github.com/fivetran/dbt_xero_source/blob/main/models/tmp/stg_xero__account_tmp.sql + +## Misc + +- DBT guide to package creation: https://docs.getdbt.com/docs/guides/building-packages +- DBT project structure notes: https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355 + +## Discussion: Configuration to create custom dimensions + +Product-scope (or item-scope in GA4) custom dimensions are a much missed feature. + +We can implement them, with some difficulty, mapping event properties to the custom dimension. However, it is possible that Google, presuming they add item-scoped CDs, will just add the dimension to the items array which could result in stg_ga4__items.sql automatically picking up item-scoped CDs the way that it is currently written. 
diff --git a/analyses/.gitkeep b/analyses/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/dbt_project.yml b/dbt_project.yml new file mode 100644 index 00000000..adab0c5f --- /dev/null +++ b/dbt_project.yml @@ -0,0 +1,22 @@ +name: 'ga4' +version: '6.0.1' +config-version: 2 +model-paths: ["models"] +analysis-paths: ["analyses"] +test-paths: ["tests"] +seed-paths: ["seeds"] +macro-paths: ["macros"] +snapshot-paths: ["snapshots"] + +profile: "sado_analytics_services" + +target-path: "target" # directory which will store compiled SQL files +clean-targets: # directories to be removed by `dbt clean` + - "target" + - "dbt_packages" + +models: + ga4: + +materialized: view + marts: + +materialized: table \ No newline at end of file diff --git a/macros/.gitkeep b/macros/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/macros/base_select.sql b/macros/base_select.sql new file mode 100644 index 00000000..1f374539 --- /dev/null +++ b/macros/base_select.sql @@ -0,0 +1,162 @@ +{% macro base_select_source() %} + {{ return(adapter.dispatch('base_select_source', 'ga4')()) }} +{% endmacro %} + +{% macro default__base_select_source() %} + parse_date('%Y%m%d',event_date) as event_date_dt + , event_timestamp + , event_name + , event_params + , event_previous_timestamp + , event_value_in_usd + , event_bundle_sequence_id + , event_server_timestamp_offset + , user_id + , user_pseudo_id + , privacy_info + , user_properties + , user_first_touch_timestamp + , user_ltv + , device + , geo + , app_info + , traffic_source + , stream_id + , platform + , ecommerce.total_item_quantity + , ecommerce.purchase_revenue_in_usd + , ecommerce.purchase_revenue + , ecommerce.refund_value_in_usd + , ecommerce.refund_value + , ecommerce.shipping_value_in_usd + , ecommerce.shipping_value + , ecommerce.tax_value_in_usd + , ecommerce.tax_value + , ecommerce.unique_items + , ecommerce.transaction_id + , items +{% endmacro %} + +{% macro base_select_renamed() %} + {{ 
return(adapter.dispatch('base_select_renamed', 'ga4')()) }} +{% endmacro %} + +{% macro default__base_select_renamed() %} + event_date_dt + , event_timestamp + , lower(replace(trim(event_name), " ", "_")) as event_name -- Clean up all event names to be snake cased + , event_params + , event_previous_timestamp + , event_value_in_usd + , event_bundle_sequence_id + , event_server_timestamp_offset + , user_id + , user_pseudo_id + , privacy_info.analytics_storage as privacy_info_analytics_storage + , privacy_info.ads_storage as privacy_info_ads_storage + , privacy_info.uses_transient_token as privacy_info_uses_transient_token + , user_properties + , user_first_touch_timestamp + , user_ltv.revenue as user_ltv_revenue + , user_ltv.currency as user_ltv_currency + , device.category as device_category + , device.mobile_brand_name as device_mobile_brand_name + , device.mobile_model_name as device_mobile_model_name + , device.mobile_marketing_name as device_mobile_marketing_name + , device.mobile_os_hardware_model as device_mobile_os_hardware_model + , device.operating_system as device_operating_system + , device.operating_system_version as device_operating_system_version + , device.vendor_id as device_vendor_id + , device.advertising_id as device_advertising_id + , device.language as device_language + , device.is_limited_ad_tracking as device_is_limited_ad_tracking + , device.time_zone_offset_seconds as device_time_zone_offset_seconds + , device.browser as device_browser + , device.browser_version as device_browser_version + , device.web_info.browser as device_web_info_browser + , device.web_info.browser_version as device_web_info_browser_version + , device.web_info.hostname as device_web_info_hostname + , geo.continent as geo_continent + , geo.country as geo_country + , geo.region as geo_region + , geo.city as geo_city + , geo.sub_continent as geo_sub_continent + , geo.metro as geo_metro + , app_info.id as app_info_id + , app_info.version as app_info_version + , 
app_info.install_store as app_info_install_store + , app_info.firebase_app_id as app_info_firebase_app_id + , app_info.install_source as app_info_install_source + , traffic_source.name as user_campaign + , traffic_source.medium as user_medium + , traffic_source.source as user_source + , stream_id + , platform + , struct( + total_item_quantity + , purchase_revenue_in_usd + , purchase_revenue + , refund_value_in_usd + , refund_value + , shipping_value_in_usd + , shipping_value + , tax_value_in_usd + , tax_value + , unique_items + , transaction_id + ) as ecommerce + , (select + array_agg(struct( + unnested_items.item_id + , unnested_items.item_name + , unnested_items.item_brand + , unnested_items.item_variant + , unnested_items.item_category + , unnested_items.item_category2 + , unnested_items.item_category3 + , unnested_items.item_category4 + , unnested_items.item_category5 + , unnested_items.price_in_usd + , unnested_items.price + , unnested_items.quantity + , unnested_items.item_revenue_in_usd + , unnested_items.item_revenue + , unnested_items.item_refund_in_usd + , unnested_items.item_refund + , unnested_items.coupon + , unnested_items.affiliation + , unnested_items.location_id + , unnested_items.item_list_id + , unnested_items.item_list_name + , unnested_items.item_list_index + , unnested_items.promotion_id + , unnested_items.promotion_name + , unnested_items.creative_name + , unnested_items.creative_slot + , unnested_items.item_params + )) from unnest(items) as unnested_items + ) items + , {{ ga4.unnest_key('event_params', 'ga_session_id', 'int_value', 'session_id') }} + , {{ ga4.unnest_key('event_params', 'page_location') }} + , {{ ga4.unnest_key('event_params', 'ga_session_number', 'int_value', 'session_number') }} + , COALESCE( + (SELECT value.int_value FROM unnest(event_params) WHERE key = "session_engaged"), + (CASE WHEN (SELECT value.string_value FROM unnest(event_params) WHERE key = "session_engaged") = "1" THEN 1 END) + ) as session_engaged + , {{ 
ga4.unnest_key('event_params', 'engagement_time_msec', 'int_value') }} + , {{ ga4.unnest_key('event_params', 'page_title') }} + , {{ ga4.unnest_key('event_params', 'page_referrer') }} + , {{ ga4.unnest_key('event_params', 'source', 'lower_string_value', 'event_source') }} + , {{ ga4.unnest_key('event_params', 'medium', 'lower_string_value', 'event_medium') }} + , {{ ga4.unnest_key('event_params', 'campaign', 'lower_string_value', 'event_campaign') }} + , {{ ga4.unnest_key('event_params', 'content', 'lower_string_value', 'event_content') }} + , {{ ga4.unnest_key('event_params', 'term', 'lower_string_value', 'event_term') }} + , CASE + WHEN event_name = 'page_view' THEN 1 + ELSE 0 + END AS is_page_view + , CASE + WHEN event_name = 'purchase' THEN 1 + ELSE 0 + END AS is_purchase +{% endmacro %} \ No newline at end of file diff --git a/macros/combine_property_data.sql b/macros/combine_property_data.sql new file mode 100644 index 00000000..67ef31cc --- /dev/null +++ b/macros/combine_property_data.sql @@ -0,0 +1,38 @@ +{%- macro combine_property_data() -%} + {{ return(adapter.dispatch('combine_property_data', 'ga4')()) }} +{%- endmacro -%} + +{% macro default__combine_property_data() %} + + create schema if not exists `{{target.project}}.{{var('combined_dataset')}}`; + + {# If incremental, then use static_incremental_days variable to find earliest shard to copy #} + {% if not should_full_refresh() %} + {% set earliest_shard_to_retrieve = (modules.datetime.date.today() - modules.datetime.timedelta(days=var('static_incremental_days')))|string|replace("-", "")|int %} + {% else %} + {# Otherwise use 'start_date' variable #} + + {% set earliest_shard_to_retrieve = var('start_date')|int %} + {% endif %} + + {% for property_id in var('property_ids') %} + {%- set schema_name = "analytics_" + property_id|string -%} + {# Copy intraday tables #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_intraday_%', 
database=var('source_project')) -%} + {% for relation in relations %} + {%- set relation_suffix = relation.identifier|replace('events_intraday_', '') -%} + {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} + CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_intraday_{{relation_suffix}}`; + {%- endif -%} + {% endfor %} + {# Copy daily tables and drop old intraday table #} + {%- set relations = dbt_utils.get_relations_by_pattern(schema_pattern=schema_name, table_pattern='events_%', exclude='events_intraday_%', database=var('source_project')) -%} + {% for relation in relations %} + {%- set relation_suffix = relation.identifier|replace('events_', '') -%} + {%- if relation_suffix|int >= earliest_shard_to_retrieve|int -%} + CREATE OR REPLACE TABLE `{{target.project}}.{{var('combined_dataset')}}.events_{{relation_suffix}}{{property_id}}` CLONE `{{var('source_project')}}.analytics_{{property_id}}.events_{{relation_suffix}}`; + DROP TABLE IF EXISTS `{{target.project}}.{{var('combined_dataset')}}.events_intraday_{{relation_suffix}}{{property_id}}`; + {%- endif -%} + {% endfor %} + {% endfor %} +{% endmacro %} \ No newline at end of file diff --git a/macros/create_custom_event.sql b/macros/create_custom_event.sql new file mode 100644 index 00000000..48d09b42 --- /dev/null +++ b/macros/create_custom_event.sql @@ -0,0 +1,15 @@ +{%- macro create_custom_event(event_name) -%} + {{ return(adapter.dispatch('create_custom_event', 'ga4')(event_name)) }} +{%- endmacro -%} + +{%- macro default__create_custom_event(event_name) -%} + select * + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters", "none") )}} + {% endif %} + {% if var(event_name+"_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var(event_name+"_custom_parameters") )}} + 
{% endif %} + from {{ref('stg_ga4__events')}} + where event_name = '{{event_name}}' +{%- endmacro -%} \ No newline at end of file diff --git a/macros/default_channel_grouping.sql b/macros/default_channel_grouping.sql new file mode 100644 index 00000000..3906e389 --- /dev/null +++ b/macros/default_channel_grouping.sql @@ -0,0 +1,148 @@ +-- Google's documentation is here: https://support.google.com/analytics/answer/9756891?hl=en +-- source_category Excel file can be downloaded from the above link and may change over time +{% macro default_channel_grouping(source, medium, source_category, campaign) %} + {{ return(adapter.dispatch('default_channel_grouping', 'ga4')(source, medium, source_category, campaign)) }} +{% endmacro %} + +{% macro default__default_channel_grouping(source, medium, source_category, campaign) %} +case + -- Direct: Source exactly matches "(direct)" AND Medium is one of ("(not set)", "(none)") + when ( + {{source}} is null + and {{medium}} is null + ) + or ( + {{source}} = '(direct)' + and ({{medium}} = '(none)' or {{medium}} = '(not set)') + ) + then 'Direct' + + -- Cross-network: Campaign Name contains "cross-network" + when REGEXP_CONTAINS({{campaign}}, r"cross-network") + then 'Cross-network' + + -- Paid Shopping: + -- (Source matches a list of shopping sites + -- OR + -- Campaign Name matches regex ^(.*(([^a-df-z]|^)shop|shopping).*)$) + -- AND + -- Medium matches regex ^(.*cp.*|ppc|retargeting|paid.*)$ + when ( + {{source_category}} = 'SOURCE_CATEGORY_SHOPPING' + or REGEXP_CONTAINS({{campaign}}, r"^(.*(([^a-df-z]|^)shop|shopping).*)$") + ) + and REGEXP_CONTAINS({{medium}},r"^(.*cp.*|ppc|retargeting|paid.*)$") + then 'Paid Shopping' + + -- Paid Search: + -- Source matches a list of search sites + -- AND + -- Medium matches regex ^(.*cp.*|ppc|retargeting|paid.*)$ + when {{source_category}} = 'SOURCE_CATEGORY_SEARCH' + and REGEXP_CONTAINS({{medium}}, r"^(.*cp.*|ppc|retargeting|paid.*)$") + then 'Paid Search' + + -- Paid Social: + -- Source 
matches a regex list of social sites + -- AND + -- Medium matches regex ^(.*cp.*|ppc|retargeting|paid.*)$ + when {{source_category}} = 'SOURCE_CATEGORY_SOCIAL' + and REGEXP_CONTAINS({{medium}}, r"^(.*cp.*|ppc|retargeting|paid.*)$") + then 'Paid Social' + + -- Paid Video: + -- Source matches a list of video sites + -- AND + -- Medium matches regex ^(.*cp.*|ppc|retargeting|paid.*)$ + when {{source_category}} = 'SOURCE_CATEGORY_VIDEO' + and REGEXP_CONTAINS({{medium}},r"^(.*cp.*|ppc|retargeting|paid.*)$") + then 'Paid Video' + + -- Display: + -- Medium is one of ("display", "banner", "expandable", "interstitial", "cpm") + when {{medium}} in ('display', 'banner', 'expandable', 'interstitial', 'cpm') + then 'Display' + + -- Paid Other: + -- Medium matches regex ^(.*cp.*|ppc|retargeting|paid.*)$ + when REGEXP_CONTAINS({{medium}}, r"^(.*cp.*|ppc|retargeting|paid.*)$") + then 'Paid Other' + + -- Organic Shopping: + -- Source matches a list of shopping sites + -- OR + -- Campaign name matches regex ^(.*(([^a-df-z]|^)shop|shopping).*)$ + when {{source_category}} = 'SOURCE_CATEGORY_SHOPPING' + or REGEXP_CONTAINS({{campaign}}, r"^(.*(([^a-df-z]|^)shop|shopping).*)$") + then 'Organic Shopping' + + -- Organic Social: + -- Source matches a regex list of social sites + -- OR + -- Medium is one of ("social", "social-network", "social-media", "sm", "social network", "social media") + when {{source_category}} = 'SOURCE_CATEGORY_SOCIAL' + or {{medium}} in ("social","social-network","social-media","sm","social network","social media") + then 'Organic Social' + + -- Organic Video: + -- Source matches a list of video sites + -- OR + -- Medium matches regex ^(.*video.*)$ + when {{source_category}} = 'SOURCE_CATEGORY_VIDEO' + or REGEXP_CONTAINS({{medium}}, r"^(.*video.*)$") + then 'Organic Video' + + -- Organic Search: + -- Source matches a list of search sites + -- OR + -- Medium exactly matches organic + when {{source_category}} = 'SOURCE_CATEGORY_SEARCH' or {{medium}} = 'organic' + then 
'Organic Search' + + -- Referral: + -- Medium is one of ("referral", "app", or "link") + when {{medium}} in ("referral", "app", "link") + then 'Referral' + + -- Email: + -- Source = email|e-mail|e_mail|e mail + -- OR + -- Medium = email|e-mail|e_mail|e mail + when REGEXP_CONTAINS({{source}}, r"email|e-mail|e_mail|e mail") + or REGEXP_CONTAINS({{medium}}, r"email|e-mail|e_mail|e mail") + then 'Email' + + -- Affiliates: + -- Medium = affiliate + when {{medium}} = 'affiliate' + then 'Affiliates' + + -- Audio: + -- Medium exactly matches audio + when {{medium}} = 'audio' + then 'Audio' + + -- SMS: + -- Source exactly matches sms + -- OR + -- Medium exactly matches sms + when {{source}} = 'sms' + or {{medium}} = 'sms' + then 'SMS' + + -- Mobile Push Notifications: + -- Medium ends with "push" + -- OR + -- Medium contains "mobile" or "notification" + -- OR + -- Source exactly matches "firebase" + when REGEXP_CONTAINS({{medium}}, r"push$") + or REGEXP_CONTAINS({{medium}}, r"mobile|notification") + or {{source}} = 'firebase' + then 'Mobile Push Notifications' + + -- Unassigned is the value Analytics uses when there are no other channel rules that match the event data. + else 'Unassigned' +end + +{% endmacro %} \ No newline at end of file diff --git a/macros/stage_custom_parameters.sql b/macros/stage_custom_parameters.sql new file mode 100644 index 00000000..b856e962 --- /dev/null +++ b/macros/stage_custom_parameters.sql @@ -0,0 +1,6 @@ +{% macro stage_custom_parameters(custom_parameters ) %} + {% for cp in custom_parameters %} + ,{{ ga4.unnest_key('event_params', cp.name , cp.value_type, cp.rename_to or "default" ) }} + {% endfor %} +{% endmacro %} + diff --git a/macros/unnest_key.sql b/macros/unnest_key.sql new file mode 100644 index 00000000..08943f88 --- /dev/null +++ b/macros/unnest_key.sql @@ -0,0 +1,20 @@ +-- Unnests a single key's value from an array. 
Use value_type = 'lower_string_value' to produce a lowercase version of the string value + +{%- macro unnest_key(column_to_unnest, key_to_extract, value_type = "string_value", rename_column = "default") -%} + {{ return(adapter.dispatch('unnest_key', 'ga4')(column_to_unnest, key_to_extract, value_type, rename_column)) }} +{%- endmacro -%} + +{%- macro default__unnest_key(column_to_unnest, key_to_extract, value_type = "string_value", rename_column = "default") -%} + (select + {% if value_type == "lower_string_value" %} + lower(value.string_value) + {% else %} + value.{{value_type}} + {% endif %} + from unnest({{column_to_unnest}}) where key = '{{key_to_extract}}') as + {% if rename_column == "default" %} + {{ key_to_extract }} + {% else %} + {{rename_column}} + {% endif %} +{%- endmacro -%} \ No newline at end of file diff --git a/macros/unpack_struct.sql b/macros/unpack_struct.sql new file mode 100644 index 00000000..b64d67d6 --- /dev/null +++ b/macros/unpack_struct.sql @@ -0,0 +1,5 @@ +{%- macro unpack_struct(column_to_unpack, fields) -%} +{% for field in fields %} +{{column_to_unpack}}.{{field}} as {{column_to_unpack}}_{{field}} {% if not loop.last %},{% endif %} +{% endfor %} +{%- endmacro -%} \ No newline at end of file diff --git a/macros/url_parsing.sql b/macros/url_parsing.sql new file mode 100644 index 00000000..b91fe611 --- /dev/null +++ b/macros/url_parsing.sql @@ -0,0 +1,23 @@ +{% macro extract_hostname_from_url(url) %} + REGEXP_EXTRACT({{ url }}, '(?:http[s]?://)?(?:www\\.)?(.*?)(?:(?:/|:)(?:.)*|$)') +{% endmacro %} + +{% macro extract_query_string_from_url(url) %} + REGEXP_EXTRACT({{ url }}, '\\?(.+)') +{% endmacro %} + +{% macro remove_query_parameters(url, parameters)%} +REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE({{url}}, '(\\?|&)({{ parameters|join("|") }})=[^&]*', '\\1'), '\\?&+', '?'), '&+', '&'), '\\?$|&$', '') +{% endmacro %} + +{% macro extract_page_path(url) %} + {{ return(adapter.dispatch('extract_page_path', 'ga4')(url)) }} 
+{% endmacro %} + +{% macro default__extract_page_path(url) %} + REGEXP_EXTRACT({{url}}, '(?:\\w+:)?\\/\\/[^\\/]+([^?#]+)') +{% endmacro %} + +{% macro extract_query_parameter_value(url, param) %} + REGEXP_EXTRACT( {{url}}, r'{{param}}=([^&|\?|#]*)' ) +{% endmacro %} \ No newline at end of file diff --git a/models/marts/core/core.yml b/models/marts/core/core.yml new file mode 100644 index 00000000..32095107 --- /dev/null +++ b/models/marts/core/core.yml @@ -0,0 +1,23 @@ +version: 2 + +models: + - name: dim_ga4__sessions + description: Dimension table for sessions containing context useful for filtering such as acquisition source, medium, and campaign. Each row represents a session. Unique on session_key + columns: + - name: session_key + tests: + - unique + - name: dim_ga4__client_keys + description: Dimension table for user devices (client_key) which includes data from the first and last event produced. Unique on client_key + columns: + - name: client_key + description: Hashed combination of user_pseudo_id and stream_id + tests: + - unique + - name: fct_ga4__client_keys + description: Fact table with aggregate metrics at the level of the user's device (as indicated by the client_key). Metrics are aggregated from fct_ga4__sessions. 
+ columns: + - name: client_key + description: Hashed combination of user_pseudo_id and stream_id + tests: + - unique diff --git a/models/marts/core/dim_ga4__client_keys.sql b/models/marts/core/dim_ga4__client_keys.sql new file mode 100644 index 00000000..6a6cf4a8 --- /dev/null +++ b/models/marts/core/dim_ga4__client_keys.sql @@ -0,0 +1,85 @@ +{{ + config( + materialized = 'incremental', + incremental_strategy = 'merge', + unique_key = ['client_key'], + tags = ["incremental"], + partition_by={ + "field": "last_seen_at", + "data_type": "timestamp", + "granularity": "day" + }, + on_schema_change = 'sync_all_columns', + merge_update_columns = [ + 'last_geo_continent', + 'last_geo_country', + 'last_geo_region', + 'last_geo_city', + 'last_geo_sub_continent', + 'last_geo_metro', + 'last_device_category', + 'last_device_mobile_brand_name', + 'last_device_mobile_model_name', + 'last_device_mobile_marketing_name', + 'last_device_mobile_os_hardware_model', + 'last_device_operating_system', + 'last_device_operating_system_version', + 'last_device_vendor_id', + 'last_device_advertising_id', + 'last_device_language', + 'last_device_is_limited_ad_tracking', + 'last_device_time_zone_offset_seconds', + 'last_device_browser', + 'last_device_browser_version', + 'last_device_web_info_browser', + 'last_device_web_info_browser_version', + 'last_device_web_info_hostname', + 'last_user_campaign', + 'last_user_medium', + 'last_user_source', + 'last_seen_at', + 'last_page_location', + 'last_page_hostname', + 'last_page_referrer', + ], + ) +}} + +with include_first_last_events as ( + select + * + from {{ref('stg_ga4__client_key_first_last_events')}} + {% if is_incremental() %} + where date(last_seen_at) >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + {% endif %} +), +include_first_last_page_views as ( + select + include_first_last_events.*, + first_last_page_views.first_page_location, + first_last_page_views.first_page_hostname, + 
first_last_page_views.first_page_referrer, + first_last_page_views.last_page_location, + first_last_page_views.last_page_hostname, + first_last_page_views.last_page_referrer, + from include_first_last_events + left join {{ref('stg_ga4__client_key_first_last_pageviews')}} as first_last_page_views using (client_key) + {% if is_incremental() %} + where date(first_last_page_views.last_seen_at) >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + {% endif %} +), +include_user_properties as ( + +select * from include_first_last_page_views +{% if var('derived_user_properties', false) %} +-- If derived user properties have been assigned as variables, join them on the client_key +left join {{ref('stg_ga4__derived_user_properties')}} using (client_key) +{% endif %} +{% if var('user_properties', false) %} +-- If user properties have been assigned as variables, join them on the client_key +left join {{ref('stg_ga4__user_properties')}} using (client_key) +{% endif %} + +) + +select * from include_user_properties \ No newline at end of file diff --git a/models/marts/core/dim_ga4__sessions.sql b/models/marts/core/dim_ga4__sessions.sql new file mode 100644 index 00000000..b9c6447d --- /dev/null +++ b/models/marts/core/dim_ga4__sessions.sql @@ -0,0 +1,95 @@ +{{ + config( + materialized = 'incremental', + incremental_strategy = 'merge', + tags = ["incremental"], + on_schema_change = 'sync_all_columns', + unnest_keys = ['session_key'], + partition_by={ + "field": "session_partition_date", + "data_type": "date", + "granularity": "day" + }, + ) +}} + + +-- Dimension table for sessions based on the first event that isn't session_start or first_visit. 
+with session_first_event as +( + select * + from {{ref('stg_ga4__events')}} + where event_name != 'first_visit' + and event_name != 'session_start' + {% if is_incremental() %} + and event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + {% endif %} + qualify row_number() over(partition by session_key order by event_timestamp) = 1 +), + session_start_dims as ( + select + session_key, + event_date_dt as session_partition_date, + event_timestamp as session_start_timestamp, + page_path as landing_page_path, + page_location as landing_page, + page_hostname as landing_page_hostname, + page_referrer as landing_page_referrer, + geo_continent, + geo_country, + geo_region, + geo_city, + geo_sub_continent, + geo_metro, + stream_id, + platform, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_mobile_marketing_name, + device_mobile_os_hardware_model, + device_operating_system, + device_operating_system_version, + device_vendor_id, + device_advertising_id, + device_language, + device_is_limited_ad_tracking, + device_time_zone_offset_seconds, + device_browser, + device_web_info_browser, + device_web_info_browser_version, + device_web_info_hostname, + session_number, + session_number = 1 as is_first_session, + user_campaign, + user_medium, + user_source, + from session_first_event +), +join_traffic_source as ( + select + session_start_dims.*, + sessions_traffic_sources.session_source, + sessions_traffic_sources.session_medium, + sessions_traffic_sources.session_campaign, + sessions_traffic_sources.session_content, + sessions_traffic_sources.session_term, + sessions_traffic_sources.session_default_channel_grouping, + sessions_traffic_sources.session_source_category + from session_start_dims + left join {{ref('stg_ga4__sessions_traffic_sources')}} sessions_traffic_sources using (session_key) + {% if is_incremental() %} + where sessions_traffic_sources.session_partition_date >= date_sub(current_date, 
interval {{var('static_incremental_days',3) | int}} day) + {% endif %} +), +include_session_properties as ( + select + * + from join_traffic_source + {% if var('derived_session_properties', false) %} + -- If derived session properties have been assigned as variables, join them on the session_key + left join {{ref('stg_ga4__derived_session_properties')}} using (session_key) + {% endif %} +) + +select * from include_session_properties \ No newline at end of file diff --git a/models/marts/core/dim_ga4__sessions_daily.sql b/models/marts/core/dim_ga4__sessions_daily.sql new file mode 100644 index 00000000..8273dd74 --- /dev/null +++ b/models/marts/core/dim_ga4__sessions_daily.sql @@ -0,0 +1,175 @@ +{% set partitions_to_replace = ['current_date'] %} +{% for i in range(var('static_incremental_days', 3) | int) %}{# falls back to 3 days when the var is unset; cast guards against a string value #} + {% set partitions_to_replace = partitions_to_replace.append('date_sub(current_date, interval ' + (i+1)|string + ' day)') %} +{% endfor %} +{{ + config( + materialized = 'incremental', + incremental_strategy = 'insert_overwrite', + tags = ["incremental"], + partition_by={ + "field": "session_partition_date", + "data_type": "date", + "granularity": "day" + }, + partitions = partitions_to_replace + ) +}} + + +with event_dimensions as +( + select + client_key, + session_key, + session_partition_key, + event_date_dt as session_partition_date, + event_timestamp, + page_path, + page_location, + page_hostname, + page_referrer, + geo_continent, + geo_country, + geo_region, + geo_city, + geo_sub_continent, + geo_metro, + stream_id, + platform, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_mobile_marketing_name, + device_mobile_os_hardware_model, + device_operating_system, + device_operating_system_version, + device_vendor_id, + device_advertising_id, + device_language, + device_is_limited_ad_tracking, + device_time_zone_offset_seconds, + device_browser, + device_web_info_browser, + device_web_info_browser_version, + device_web_info_hostname,
user_campaign, + user_medium, + user_source, + from {{ref('stg_ga4__events')}} + where event_name != 'first_visit' + and event_name != 'session_start' + {% if is_incremental() %} + and event_date_dt in ({{ partitions_to_replace | join(',') }}) + {% endif %} +) +,traffic_sources as ( + select + session_partition_key, + session_source, + session_medium, + session_campaign, + session_content, + session_term, + session_default_channel_grouping, + session_source_category, + -- last non-direct traffic sources + last_non_direct_source, + last_non_direct_medium, + last_non_direct_campaign, + last_non_direct_content, + last_non_direct_term, + last_non_direct_default_channel_grouping, + last_non_direct_source_category + from {{ref('stg_ga4__sessions_traffic_sources_last_non_direct_daily')}} + where 1=1 + {% if is_incremental() %} + and session_partition_date in ({{ partitions_to_replace | join(',') }}) + {% endif %} +) +{% if var('derived_session_properties', false) %} +,session_properties as ( + select + * except (session_partition_date) + from {{ref('stg_ga4__derived_session_properties_daily')}} + where 1=1 + {% if is_incremental() %} + and session_partition_date in ({{ partitions_to_replace | join(',') }}) + {% endif %} +) +{% endif %} +,session_dimensions as +( + select + distinct -- Distinct call will, in effect, group by session_partition_key + stream_id + ,session_key + ,session_partition_key + ,session_partition_date + ,FIRST_VALUE(event_timestamp IGNORE NULLS) OVER (session_partition_window) AS session_partition_start_timestamp + ,FIRST_VALUE(page_path IGNORE NULLS) OVER (session_partition_window) AS landing_page_path + ,FIRST_VALUE(page_location IGNORE NULLS) OVER (session_partition_window) AS landing_page_location + ,FIRST_VALUE(page_hostname IGNORE NULLS) OVER (session_partition_window) AS landing_page_hostname + ,FIRST_VALUE(page_referrer IGNORE NULLS) OVER (session_partition_window) AS referrer + ,FIRST_VALUE(geo_continent IGNORE NULLS) OVER 
(session_partition_window) AS geo_continent + ,FIRST_VALUE(geo_country IGNORE NULLS) OVER (session_partition_window) AS geo_country + ,FIRST_VALUE(geo_region IGNORE NULLS) OVER (session_partition_window) AS geo_region + ,FIRST_VALUE(geo_city IGNORE NULLS) OVER (session_partition_window) AS geo_city + ,FIRST_VALUE(geo_sub_continent IGNORE NULLS) OVER (session_partition_window) AS geo_sub_continent + ,FIRST_VALUE(geo_metro IGNORE NULLS) OVER (session_partition_window) AS geo_metro + ,FIRST_VALUE(platform IGNORE NULLS) OVER (session_partition_window) AS platform + ,FIRST_VALUE(device_category IGNORE NULLS) OVER (session_partition_window) AS device_category + ,FIRST_VALUE(device_mobile_brand_name IGNORE NULLS) OVER (session_partition_window) AS device_mobile_brand_name + ,FIRST_VALUE(device_mobile_model_name IGNORE NULLS) OVER (session_partition_window) AS device_mobile_model_name + ,FIRST_VALUE(device_mobile_marketing_name IGNORE NULLS) OVER (session_partition_window) AS device_mobile_marketing_name + ,FIRST_VALUE(device_mobile_os_hardware_model IGNORE NULLS) OVER (session_partition_window) AS device_mobile_os_hardware_model + ,FIRST_VALUE(device_operating_system IGNORE NULLS) OVER (session_partition_window) AS device_operating_system + ,FIRST_VALUE(device_operating_system_version IGNORE NULLS) OVER (session_partition_window) AS device_operating_system_version + ,FIRST_VALUE(device_vendor_id IGNORE NULLS) OVER (session_partition_window) AS device_vendor_id + ,FIRST_VALUE(device_advertising_id IGNORE NULLS) OVER (session_partition_window) AS device_advertising_id + ,FIRST_VALUE(device_language IGNORE NULLS) OVER (session_partition_window) AS device_language + ,FIRST_VALUE(device_is_limited_ad_tracking IGNORE NULLS) OVER (session_partition_window) AS device_is_limited_ad_tracking + ,FIRST_VALUE(device_time_zone_offset_seconds IGNORE NULLS) OVER (session_partition_window) AS device_time_zone_offset_seconds + ,FIRST_VALUE(device_browser IGNORE NULLS) OVER 
(session_partition_window) AS device_browser + ,FIRST_VALUE(device_web_info_browser IGNORE NULLS) OVER (session_partition_window) AS device_web_info_browser + ,FIRST_VALUE(device_web_info_browser_version IGNORE NULLS) OVER (session_partition_window) AS device_web_info_browser_version + ,FIRST_VALUE(device_web_info_hostname IGNORE NULLS) OVER (session_partition_window) AS device_web_info_hostname + ,FIRST_VALUE(user_campaign IGNORE NULLS) OVER (session_partition_window) AS user_campaign + ,FIRST_VALUE(user_medium IGNORE NULLS) OVER (session_partition_window) AS user_medium + ,FIRST_VALUE(user_source IGNORE NULLS) OVER (session_partition_window) AS user_source + from event_dimensions + WINDOW session_partition_window AS (PARTITION BY session_partition_key ORDER BY event_timestamp ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +) +,join_traffic_source as ( + select + session_dimensions.*, + session_source, + session_medium, + session_campaign, + session_content, + session_term, + session_default_channel_grouping, + session_source_category, + -- last non-direct traffic sources + last_non_direct_source, + last_non_direct_medium, + last_non_direct_campaign, + last_non_direct_content, + last_non_direct_term, + last_non_direct_default_channel_grouping, + last_non_direct_source_category + from session_dimensions + left join traffic_sources sessions_traffic_sources using (session_partition_key) +) +,join_session_properties as ( + select + * + from join_traffic_source + {% if var('derived_session_properties', false) %} + -- If derived session properties have been assigned as variables, join them on the session_partition_key + left join session_properties using (session_partition_key) + {% endif %} +) + +-- Collapse +select distinct * from join_session_properties \ No newline at end of file diff --git a/models/marts/core/dim_ga4__sessions_daily.yml b/models/marts/core/dim_ga4__sessions_daily.yml new file mode 100644 index 00000000..e424fd3c --- /dev/null +++ 
b/models/marts/core/dim_ga4__sessions_daily.yml @@ -0,0 +1,26 @@ +version: 2 + +models: + - name: dim_ga4__sessions_daily + description: > + Incremental, partitioned dimension table for session partitions. Partitioned on session_partition_date for improved query optimization when filtering on date. + Contains context useful for filtering sessions such as acquisition source, medium, and campaign. + Each row represents a daily session partition (as opposed to a session). + Unique on session_partition_key + columns: + - name: session_partition_key + description: > + Unique key assigned to session partitions which are daily partitions of a session. In GA4, sessions can span multiple days. + To improve query performance, it's easier to work with 'session partitions' which can be filtered/partitioned by date. + tests: + - unique + - name: session_key + description: > + Unique key assigned to sessions. Sessions can span multiple dates/partitions. + - name: session_partition_date + description: > + Date associated with the session_partition_key. Used to partition the table. Filter on this column to optimize query cost and performance. + - name: session_source + description: The source of the session based on the events contained within this particular session. For last non-direct source, see last_non_direct_source + - name: last_non_direct_source + description: The last non-direct source attributed to this session based on a 30-day lookback window.
diff --git a/models/marts/core/fct_ga4__client_keys.sql b/models/marts/core/fct_ga4__client_keys.sql new file mode 100644 index 00000000..3eb1270a --- /dev/null +++ b/models/marts/core/fct_ga4__client_keys.sql @@ -0,0 +1,20 @@ +select + client_key, + stream_id, + min(session_start_timestamp) as first_seen_timestamp, + max(session_start_timestamp) as last_seen_session_timestamp, + min(session_start_date) as first_seen_date, + max(session_start_date) as last_seen_date, + sum(count_pageviews) as count_pageviews, + sum(is_session_engaged) as count_engaged_sessions, + sum(sum_event_value_in_usd) as sum_event_value_in_usd, + sum(sum_engaged_time_msec) as sum_engaged_time_msec, + count(distinct session_key) as count_sessions + {% if var('conversion_events', false) %} + {% for ce in var('conversion_events',[]) %} + , sum(count_{{ce}}) as count_{{ce}} + {% endfor %} + {% endif %} +from {{ref('fct_ga4__sessions')}} +group by 1, 2 + diff --git a/models/marts/core/fct_ga4__pages.sql b/models/marts/core/fct_ga4__pages.sql new file mode 100644 index 00000000..583737ed --- /dev/null +++ b/models/marts/core/fct_ga4__pages.sql @@ -0,0 +1,77 @@ +{{ + config( + materialized = 'incremental', + incremental_strategy = 'merge', + unique_key = ['event_date_dt', 'stream_id' , 'page_location'], + tags = ["incremental"], + partition_by={ + "field": "event_date_dt", + "data_type": "date", + "granularity": "day" + }, + on_schema_change = 'sync_all_columns', + ) +}} + +with page_view as ( + select + event_date_dt, + stream_id, + page_location, -- includes query string parameters not listed in query_parameter_exclusions variable + page_key, + page_engagement_key, + count(event_name) as page_views, + count(distinct client_key ) as distinct_client_keys, + sum( if(session_number = 1,1,0)) as new_client_keys, + sum(entrances) as entrances, +from {{ref('stg_ga4__event_page_view')}} +{% if is_incremental() %} + where event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3)}} 
day) +{% endif %} + group by 1,2,3,4,5 +), page_engagement as ( + select + page_view.event_date_dt, + page_view.stream_id, + page_view.page_location, + page_view.page_key, + sum(page_view.page_views) as page_views, -- page_engagement_key references the page_referrer; need to re-aggregate metrics + sum(page_view.distinct_client_keys) as distinct_client_keys, + sum(page_view.new_client_keys) as new_client_keys, + sum(page_view.entrances) as entrances, + sum(page_engagement_time_msec) as total_engagement_time_msec, + sum( page_engagement_denominator) as avg_engagement_time_denominator + from {{ ref('stg_ga4__page_engaged_time') }} + right join page_view using (page_engagement_key) + group by 1,2,3,4 +), scroll as ( + select + event_date_dt, + page_location, + count(event_name) as scroll_events + from {{ref('stg_ga4__event_scroll')}} + {% if is_incremental() %} + where event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3)}} day) + {% endif %} + group by 1,2 +) +{% if var('conversion_events',false) %} +, +join_conversions as ( + select + * + from page_engagement + left join {{ ref('stg_ga4__page_conversions') }} using (page_key) +) +select + join_conversions.* except (page_key), + ifnull(scroll.scroll_events, 0) as scroll_events +from join_conversions +left join scroll using (event_date_dt, page_location) +{% else %} +select + page_engagement.* except (page_key), + ifnull(scroll.scroll_events, 0) as scroll_events +from page_engagement +left join scroll using (event_date_dt, page_location) +{% endif %} \ No newline at end of file diff --git a/models/marts/core/fct_ga4__pages.yml b/models/marts/core/fct_ga4__pages.yml new file mode 100644 index 00000000..79be94ce --- /dev/null +++ b/models/marts/core/fct_ga4__pages.yml @@ -0,0 +1,13 @@ +version: 2 + +models: + - name: fct_ga4__pages + description: Incremental model with page metrics such as visits, users, new_users, entrances and exits as well as configurable conversion counts grouped by 
stream_id and page_location. + tests: + - unique: + column_name: "(event_date_dt || stream_id || page_location)" + columns: + - name: total_engagement_time_msec + description: The total engagement time for that page_location. + - name: avg_engagement_time_denominator + description: Use avg_engagement_time_denominator to calculate the average engagement time, which is derived by dividing the sum of total engagement time by the product of the sum of the denominator and 1000 to get the average engagement time in seconds (average_engagement_time = sum(total_engagement_time_msec)/(sum(avg_engagement_time_denominator) *1000 )). The denominator excludes page_view events where no engagement time is recorded for the page_location within a session. However, it includes subsequent page_view events to a page_location that has previously recorded a page_view event in the same session, even if the subsequent event has no recorded engagement time. diff --git a/models/marts/core/fct_ga4__sessions.sql b/models/marts/core/fct_ga4__sessions.sql new file mode 100644 index 00000000..b910c52b --- /dev/null +++ b/models/marts/core/fct_ga4__sessions.sql @@ -0,0 +1,39 @@ +-- Stay mindful of performance/cost when using this model. Making this model partitioned on date is not possible because there's no way to create a single record per session AND partition on date. 
+{{ + config( + materialized = 'incremental', + incremental_strategy = 'merge', + unique_key = ['session_key','client_key'], + tags = ["incremental"], + partition_by={ + "field": "session_start_date", + "data_type": "date", + "granularity": "day" + }, + on_schema_change = 'sync_all_columns', + ) +}} + +select + client_key, + session_key, + stream_id, + max(user_id) as user_id, + min(session_partition_min_timestamp) as session_start_timestamp, + min(session_partition_date) as session_start_date, + sum(session_partition_count_page_views) as count_pageviews, + sum(session_partition_sum_event_value_in_usd) as sum_event_value_in_usd, + max(session_partition_max_session_engaged) as is_session_engaged, + sum(session_partition_sum_engagement_time_msec) as sum_engaged_time_msec, + min(session_number) as session_number + {% if var('conversion_events', false) %} + {% for ce in var('conversion_events',[]) %} + , sum({{ce}}_count) as count_{{ce}} + {% endfor %} + {% endif %} +from {{ref('fct_ga4__sessions_daily')}} +{% if is_incremental() %} + where session_partition_date >= date_sub(current_date, interval {{var('static_incremental_days',3)}} day) +{% endif %} +group by 1,2,3 + diff --git a/models/marts/core/fct_ga4__sessions.yml b/models/marts/core/fct_ga4__sessions.yml new file mode 100644 index 00000000..a2016b9d --- /dev/null +++ b/models/marts/core/fct_ga4__sessions.yml @@ -0,0 +1,18 @@ +version: 2 + +models: + - name: fct_ga4__sessions + description: > + Fact table containing metrics related to sessions. This model uses the daily partition metrics generated in fct_ga4__sessions_daily to calculate metrics for the session as a whole. Unique on session_key. + columns: + - name: session_key + description: > + Unique key assigned to sessions. Sessions can span multiple dates/partitions. + tests: + - unique + - name: user_id + description: > + User ID associated with the client. Will be set to null unless explicitly assigned in the GA4 implementation. 
This value can change mid-session so we take the 'max' value. + - name: session_start_date + description: > + Date associated with the first session partition. diff --git a/models/marts/core/fct_ga4__sessions_daily.sql b/models/marts/core/fct_ga4__sessions_daily.sql new file mode 100644 index 00000000..8e6adcf9 --- /dev/null +++ b/models/marts/core/fct_ga4__sessions_daily.sql @@ -0,0 +1,66 @@ +{{ + config( + materialized = 'incremental', + incremental_strategy = 'merge', + tags = ["incremental"], + partition_by={ + "field": "session_partition_date", + "data_type": "date", + "granularity": "day" + }, + unique_key = ['session_key','session_partition_key'], + on_schema_change = 'sync_all_columns' + ) +}} + + +with session_metrics as ( + select + session_key, + session_partition_key, + client_key, + stream_id, + max(user_id) as user_id, -- user_id can be null at the start and end of a session and still be set in the middle + min(event_date_dt) as session_partition_date, -- Date of the session partition, does not represent the true session start date which, in GA4, can span multiple days + min(event_timestamp) as session_partition_min_timestamp, + countif(event_name = 'page_view') as session_partition_count_page_views, + sum(event_value_in_usd) as session_partition_sum_event_value_in_usd, + ifnull(max(session_engaged), 0) as session_partition_max_session_engaged, + sum(engagement_time_msec) as session_partition_sum_engagement_time_msec, + min(session_number) as session_number + from {{ref('stg_ga4__events')}} + where session_key is not null + {% if is_incremental() %} + and event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + {% endif %} + group by 1,2,3,4 +) +{% if var('conversion_events', false) == false %} + select * from session_metrics +{% else %} + , + session_conversions as ( + select * from {{ref('stg_ga4__session_conversions_daily')}} + where 1=1 + {% if is_incremental() %} + and session_partition_date >= 
date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + {% endif %} + ), + join_metrics_and_conversions as ( + select + session_metrics.client_key, + session_metrics.stream_id, + session_metrics.user_id, + session_metrics.session_partition_min_timestamp, + session_metrics.session_partition_count_page_views, + session_metrics.session_partition_sum_event_value_in_usd, + session_metrics.session_partition_max_session_engaged, + session_metrics.session_partition_sum_engagement_time_msec, + session_metrics.session_number, + session_conversions.* + from session_metrics left join session_conversions using (session_partition_key) + ) + + select * from join_metrics_and_conversions +{% endif %} + diff --git a/models/marts/core/fct_ga4__sessions_daily.yml b/models/marts/core/fct_ga4__sessions_daily.yml new file mode 100644 index 00000000..97bcae63 --- /dev/null +++ b/models/marts/core/fct_ga4__sessions_daily.yml @@ -0,0 +1,19 @@ +version: 2 + +models: + - name: fct_ga4__sessions_daily + description: > + Incremental fact table with metrics related to daily session partitions. + columns: + - name: session_partition_key + description: > + Unique key assigned to session partitions which are daily partitions of a session. In GA4, sessions can span multiple days. + To improve query performance, it's easier to work with 'session partitions' which can be filtered/partitioned by date. + tests: + - unique + - name: session_key + description: > + Unique key assigned to sessions. Sessions can span multiple dates/partitions. + - name: session_partition_date + description: > + Date associated with the session_partition_key. Used to partition the table. Filter on this column to optimize query cost and performance. 
diff --git a/models/marts/core/fct_ga4__user_ids.sql b/models/marts/core/fct_ga4__user_ids.sql new file mode 100644 index 00000000..018019f0 --- /dev/null +++ b/models/marts/core/fct_ga4__user_ids.sql @@ -0,0 +1,34 @@ +with user_id_mapped as ( + select + client_keys.*, + -- Use a user_id if it exists, otherwise fall back to the client_key + coalesce(user_id_mapping.last_seen_user_id, client_keys.client_key) as user_id_or_client_key, + -- Indicate whether the user_id_or_client_key value is a user_id + CASE + WHEN user_id_mapping.last_seen_user_id is null THEN 0 ELSE 1 + END as is_user_id + from {{ref('fct_ga4__client_keys')}} client_keys + left join {{ref('stg_ga4__user_id_mapping')}} user_id_mapping using (client_key) +) + +select + user_id_or_client_key, + stream_id, + max(is_user_id) as is_user_id, + min(first_seen_timestamp) as first_seen_timestamp, + max(last_seen_session_timestamp) as last_seen_session_timestamp, + min(first_seen_date) as first_seen_date, + max(last_seen_date) as last_seen_date, + sum(count_pageviews) as count_pageviews, + sum(count_engaged_sessions) as count_engaged_sessions, + sum(sum_event_value_in_usd) as sum_event_value_in_usd, + sum(sum_engaged_time_msec) as sum_engaged_time_msec, + sum(count_sessions) as count_sessions + {% if var('conversion_events', false) %} + {% for ce in var('conversion_events',[]) %} + , sum(count_{{ce}}) as count_{{ce}} + {% endfor %} + {% endif %} +from user_id_mapped +group by 1, 2 + diff --git a/models/marts/core/fct_ga4__user_ids.yml b/models/marts/core/fct_ga4__user_ids.yml new file mode 100644 index 00000000..72551eb0 --- /dev/null +++ b/models/marts/core/fct_ga4__user_ids.yml @@ -0,0 +1,11 @@ +version: 2 + +models: + - name: fct_ga4__user_ids + description: Fact table with aggregate metrics at the level of the user_id when one is present, otherwise at the device level (as indicated by the client_key). Metrics are aggregated from fct_ga4__client_keys. 
+ columns: + - name: user_id_or_client_key + tests: + - unique: + column_name: "md5(user_id_or_client_key || stream_id)" + \ No newline at end of file diff --git a/models/staging/base/base_ga4__events.sql b/models/staging/base/base_ga4__events.sql new file mode 100644 index 00000000..533dbc0f --- /dev/null +++ b/models/staging/base/base_ga4__events.sql @@ -0,0 +1,36 @@ +{% set partitions_to_replace = ['current_date'] %} +{% for i in range(var('static_incremental_days', 3) | int) %}{# falls back to 3 days when the var is unset; cast guards against a string value #} + {% set partitions_to_replace = partitions_to_replace.append('date_sub(current_date, interval ' + (i+1)|string + ' day)') %} +{% endfor %} + +{{ + config( + pre_hook="{{ ga4.combine_property_data() }}" if var('combined_dataset', false) else "", + materialized = 'incremental', + incremental_strategy = 'insert_overwrite', + partition_by={ + "field": "event_date_dt", + "data_type": "date", + }, + partitions = partitions_to_replace, + cluster_by=['event_name'] + ) +}} + +with source as ( + select + {{ ga4.base_select_source() }} + from {{ source('ga4', 'events') }} + where cast(left(replace(_table_suffix, 'intraday_', ''), 8) as int64) >= {{var('start_date')}} + {% if is_incremental() %} + and parse_date('%Y%m%d', left(replace(_table_suffix, 'intraday_', ''), 8)) in ({{ partitions_to_replace | join(',') }}) + {% endif %} +), +renamed as ( + select + {{ ga4.base_select_renamed() }} + from source +) + +select * from renamed +qualify row_number() over(partition by event_date_dt, stream_id, user_pseudo_id, session_id, event_name, event_timestamp, to_json_string(ARRAY(SELECT params FROM UNNEST(event_params) AS params ORDER BY key))) = 1 diff --git a/models/staging/base/base_ga4__events.yml b/models/staging/base/base_ga4__events.yml new file mode 100644 index 00000000..f46a9355 --- /dev/null +++ b/models/staging/base/base_ga4__events.yml @@ -0,0 +1,47 @@ +version: 2 + +models: + - name: base_ga4__events + description: > + Base events model that pulls all fields from raw data.
Resulting table is partitioned on event_date_dt which + is useful in that BQ queries can be cached against this table, but not against wildcard searches from the original sharded tables. + This model handles some light transformation and renaming beyond the base events model. + Events are deduped by looking for duplicate event and event parameter payloads. + More documentation here: https://support.google.com/analytics/answer/7029846 + columns: + - name: event_date_dt + description: Date of the event converted to Date type. Time zone is the time zone configured in the GA4 property. + - name: event_timestamp + description: > + Timestamp (in microseconds) indicating when the event's batch was received (as opposed to when the event actually occurred). + Time zone is technically UTC, but in a practical sense it is set to the time zone configured in the GA4 property. + - name: user_campaign + description: > + Contains the initial campaign name that brought the user to the property. This has been renamed from traffic_source.name. + From Google's documentation: Name of the marketing campaign that first acquired the user. This field is not populated in intraday tables. + Note: traffic_source attribution is based on cross-channel last click. The traffic_source values do not change if the user interacts with subsequent campaigns after installation. + https://support.google.com/analytics/answer/7029846?hl=en#zippy=%2Ctraffic-source + - name: user_medium + description: Contains the initial medium that brought the user to the property. This has been renamed from traffic_source.medium. + - name: user_source + description: Contains the initial source that brought the user to the property. This has been renamed from traffic_source.source. + - name: session_id + description: Unique identifier for the user's session. This is not guaranteed unique across users. 
Renamed from ga_session_id + - name: event_campaign + description: Contains the campaign associated with the event (typically derived from utm_campaign) + - name: event_source + description: Contains the source associated with the event (typically derived from utm_source) + - name: event_medium + description: Contains the medium associated with the event (typically derived from utm_medium) + - name: event_content + description: Contains the content associated with the event (typically derived from utm_content) + - name: event_term + description: Contains the term associated with the event (typically derived from utm_term) + - name: stream_id + description: The numeric ID of the data stream from which the event originated. + - name: platform + description: The data stream platform (Web, iOS or Android) from which the event originated. + - name: privacy_info_analytics_storage + description: Whether Analytics storage is enabled for the user. Possible values are Yes, No, Unset + - name: privacy_info_ads_storage + description: Whether ad targeting is enabled for a user.
Possible values are Yes, No, Unset \ No newline at end of file diff --git a/models/staging/events/stg_ga4__event_click.sql b/models/staging/events/stg_ga4__event_click.sql new file mode 100644 index 00000000..d7f99a0f --- /dev/null +++ b/models/staging/events/stg_ga4__event_click.sql @@ -0,0 +1,26 @@ +-- reference here: https://support.google.com/analytics/answer/9216061?hl=en + + with click_with_params as ( + select *, + {{ ga4.unnest_key('event_params', 'entrances', 'int_value') }}, + {{ ga4.unnest_key('event_params', 'outbound') }}, + {{ ga4.unnest_key('event_params', 'link_classes') }}, + {{ ga4.unnest_key('event_params', 'link_domain') }}, + {{ ga4.unnest_key('event_params', 'link_url') }}, + {{ ga4.unnest_key('event_params', 'click_element') }}, + {{ ga4.unnest_key('event_params', 'link_id') }}, + {{ ga4.unnest_key('event_params', 'click_region') }}, + {{ ga4.unnest_key('event_params', 'click_tag_name') }}, + {{ ga4.unnest_key('event_params', 'click_url') }}, + {{ ga4.unnest_key('event_params', 'file_name') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("click_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("click_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'click' +) + +select * from click_with_params \ No newline at end of file diff --git a/models/staging/events/stg_ga4__event_file_download.sql b/models/staging/events/stg_ga4__event_file_download.sql new file mode 100644 index 00000000..791fc7b4 --- /dev/null +++ b/models/staging/events/stg_ga4__event_file_download.sql @@ -0,0 +1,24 @@ + -- reference here: https://support.google.com/analytics/answer/9216061?hl=en&ref_topic=9756175 + + with event_with_params as ( + select *, + {{ ga4.unnest_key('event_params', 'entrances', 'int_value') }}, + {{ ga4.unnest_key('event_params', 'value', 'float_value') }}, + {{ 
ga4.unnest_key('event_params', 'file_extension') }}, + {{ ga4.unnest_key('event_params', 'file_name') }}, + {{ ga4.unnest_key('event_params', 'link_classes') }}, + {{ ga4.unnest_key('event_params', 'link_domain') }}, + {{ ga4.unnest_key('event_params', 'link_id') }}, + {{ ga4.unnest_key('event_params', 'link_text') }}, + {{ ga4.unnest_key('event_params', 'link_url') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("file_download_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("file_download_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'file_download' +) + +select * from event_with_params \ No newline at end of file diff --git a/models/staging/events/stg_ga4__event_first_visit.sql b/models/staging/events/stg_ga4__event_first_visit.sql new file mode 100644 index 00000000..3c03747f --- /dev/null +++ b/models/staging/events/stg_ga4__event_first_visit.sql @@ -0,0 +1,17 @@ +-- TODO: Unclear why there are first_visit events firing when the ga_session_number is >1. This might cause confusion. 
+ +with first_visit_with_params as ( + select + *, + {{ ga4.unnest_key('event_params', 'page_location', 'string_value', 'landing_page') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("first_visit_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("first_visit_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'first_visit' +) + +select * from first_visit_with_params \ No newline at end of file diff --git a/models/staging/events/stg_ga4__event_page_view.sql b/models/staging/events/stg_ga4__event_page_view.sql new file mode 100644 index 00000000..2bd68a26 --- /dev/null +++ b/models/staging/events/stg_ga4__event_page_view.sql @@ -0,0 +1,20 @@ + with page_view_with_params as ( + select * except(page_engagement_key), + {{ ga4.unnest_key('event_params', 'entrances', 'int_value') }}, + {{ ga4.unnest_key('event_params', 'value', 'float_value') }}, + case when split(split(page_location,'/')[safe_ordinal(4)],'?')[safe_ordinal(1)] = '' then null else concat('/',split(split(page_location,'/')[safe_ordinal(4)],'?')[safe_ordinal(1)]) end as pagepath_level_1, + case when split(split(page_location,'/')[safe_ordinal(5)],'?')[safe_ordinal(1)] = '' then null else concat('/',split(split(page_location,'/')[safe_ordinal(5)],'?')[safe_ordinal(1)]) end as pagepath_level_2, + case when split(split(page_location,'/')[safe_ordinal(6)],'?')[safe_ordinal(1)] = '' then null else concat('/',split(split(page_location,'/')[safe_ordinal(6)],'?')[safe_ordinal(1)]) end as pagepath_level_3, + case when split(split(page_location,'/')[safe_ordinal(7)],'?')[safe_ordinal(1)] = '' then null else concat('/',split(split(page_location,'/')[safe_ordinal(7)],'?')[safe_ordinal(1)]) end as pagepath_level_4, + to_base64(md5(concat(session_key, page_location))) as page_engagement_key + {% if var("default_custom_parameters", "none") 
!= "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("page_view_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("page_view_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'page_view' +) +select * +from page_view_with_params \ No newline at end of file diff --git a/models/staging/events/stg_ga4__event_page_view.yml b/models/staging/events/stg_ga4__event_page_view.yml new file mode 100644 index 00000000..8f567fc6 --- /dev/null +++ b/models/staging/events/stg_ga4__event_page_view.yml @@ -0,0 +1,10 @@ +version: 2 + +models: + - name: stg_ga4__event_page_view + description: GA4 events filtered to only show 'page_view' events. Pivots common event parameters into separate columns. Identifies the first and last pageview in the 'is_entrance' and 'is_exit' columns. + columns: + - name: page_location + tests: + - not_null: + severity: warn \ No newline at end of file diff --git a/models/staging/events/stg_ga4__event_scroll.sql b/models/staging/events/stg_ga4__event_scroll.sql new file mode 100644 index 00000000..432d4a1c --- /dev/null +++ b/models/staging/events/stg_ga4__event_scroll.sql @@ -0,0 +1,14 @@ + with scroll_with_params as ( + select *, + {{ ga4.unnest_key('event_params', 'percent_scrolled', 'int_value') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("scroll_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("scroll_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'scroll' +) + +select * from scroll_with_params \ No newline at end of file diff --git a/models/staging/events/stg_ga4__event_scroll.yml b/models/staging/events/stg_ga4__event_scroll.yml new file mode 100644 index 00000000..4e61aedc --- /dev/null +++ 
b/models/staging/events/stg_ga4__event_scroll.yml @@ -0,0 +1,8 @@ +version: 2 + +models: + - name: stg_ga4__event_scroll + description: > + Staging model containing only 'scroll' events. + Includes the 'percent_scrolled' parameter. + GA4's default implementation will only track the 90% scroll threshold diff --git a/models/staging/events/stg_ga4__event_session_start.sql b/models/staging/events/stg_ga4__event_session_start.sql new file mode 100644 index 00000000..fad4d0ce --- /dev/null +++ b/models/staging/events/stg_ga4__event_session_start.sql @@ -0,0 +1,15 @@ + with session_start_with_params as ( + select *, + {{ ga4.unnest_key('event_params', 'entrances', 'int_value') }}, + {{ ga4.unnest_key('event_params', 'value', 'float_value') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("session_start_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("session_start_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'session_start' +) + +select * from session_start_with_params \ No newline at end of file diff --git a/models/staging/events/stg_ga4__event_session_start.yml b/models/staging/events/stg_ga4__event_session_start.yml new file mode 100644 index 00000000..dad4ee2b --- /dev/null +++ b/models/staging/events/stg_ga4__event_session_start.yml @@ -0,0 +1,5 @@ +version: 2 + +models: + - name: stg_ga4__event_session_start + description: Events filtered to only show 'session_start' event. Unnests common event parameters into separate columns. 
diff --git a/models/staging/events/stg_ga4__event_user_engagement.sql b/models/staging/events/stg_ga4__event_user_engagement.sql new file mode 100644 index 00000000..321e7587 --- /dev/null +++ b/models/staging/events/stg_ga4__event_user_engagement.sql @@ -0,0 +1,15 @@ +-- Event defined as "when the app is in the foreground or webpage is in focus for at least one second." + + with user_engagement_with_params as ( + select * + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("user_engagement_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("user_engagement_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'user_engagement' +) + +select * from user_engagement_with_params \ No newline at end of file diff --git a/models/staging/events/stg_ga4__event_video_complete.sql b/models/staging/events/stg_ga4__event_video_complete.sql new file mode 100644 index 00000000..543cc635 --- /dev/null +++ b/models/staging/events/stg_ga4__event_video_complete.sql @@ -0,0 +1,23 @@ +-- Defined as when the video ends. For embedded YouTube videos that have JS API support enabled. Collected by default via enhanced measurement. 
+-- More info: https://support.google.com/firebase/answer/9234069?hl=en + + with video_complete_with_params as ( + select *, + {{ ga4.unnest_key('event_params', 'video_current_time', 'int_value') }}, + {{ ga4.unnest_key('event_params', 'video_duration', 'int_value') }}, + {{ ga4.unnest_key('event_params', 'video_percent', 'int_value') }}, + {{ ga4.unnest_key('event_params', 'video_url') }}, + {{ ga4.unnest_key('event_params', 'video_provider') }}, + {# GA4's parameter key is video_title; the previous key vide_title was a typo. The misspelled column name is kept as an alias of the same key so existing consumers keep working (assumes the 4th unnest_key argument is the output column name, as used for landing_page in stg_ga4__event_first_visit). #} {{ ga4.unnest_key('event_params', 'video_title') }}, {{ ga4.unnest_key('event_params', 'video_title', 'string_value', 'vide_title') }}, + {{ ga4.unnest_key('event_params', 'visible') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("video_complete_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("video_complete_custom_parameters") )}} + {% endif %} + from {{ ref('stg_ga4__events') }} + where event_name = 'video_complete' +) + +select * from video_complete_with_params \ No newline at end of file diff --git a/models/staging/events/stg_ga4__event_video_start.sql b/models/staging/events/stg_ga4__event_video_start.sql new file mode 100644 index 00000000..7e1793d1 --- /dev/null +++ b/models/staging/events/stg_ga4__event_video_start.sql @@ -0,0 +1,23 @@ +-- Defined as when the video starts playing. For embedded YouTube videos that have JS API support enabled. Collected by default via enhanced measurement.
+-- More info: https://support.google.com/firebase/answer/9234069?hl=en + + with video_start_with_params as ( + select *, + {{ ga4.unnest_key('event_params', 'video_current_time', 'int_value') }}, + {{ ga4.unnest_key('event_params', 'video_duration', 'int_value') }}, + {{ ga4.unnest_key('event_params', 'video_percent', 'int_value') }}, + {{ ga4.unnest_key('event_params', 'video_url') }}, + {{ ga4.unnest_key('event_params', 'video_provider') }}, + {# GA4's parameter key is video_title; the previous key vide_title was a typo. The misspelled column name is kept as an alias of the same key so existing consumers keep working (assumes the 4th unnest_key argument is the output column name, as used for landing_page in stg_ga4__event_first_visit). #} {{ ga4.unnest_key('event_params', 'video_title') }}, {{ ga4.unnest_key('event_params', 'video_title', 'string_value', 'vide_title') }}, + {{ ga4.unnest_key('event_params', 'visible') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("video_start_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("video_start_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'video_start' +) + +select * from video_start_with_params \ No newline at end of file diff --git a/models/staging/events/stg_ga4__event_view_search_results.sql b/models/staging/events/stg_ga4__event_view_search_results.sql new file mode 100644 index 00000000..96852ff0 --- /dev/null +++ b/models/staging/events/stg_ga4__event_view_search_results.sql @@ -0,0 +1,18 @@ +-- reference here: https://support.google.com/analytics/answer/9216061?hl=en + + with event_with_params as ( + select *, + {{ ga4.unnest_key('event_params', 'entrances', 'int_value') }}, + {{ ga4.unnest_key('event_params', 'search_term') }}, + {{ ga4.unnest_key('event_params', 'unique_search_term') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("view_search_results_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("view_search_results_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'view_search_results' +) + +select * from
event_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/README.md b/models/staging/recommended_events/README.md new file mode 100644 index 00000000..9dd09197 --- /dev/null +++ b/models/staging/recommended_events/README.md @@ -0,0 +1,40 @@ +# Recommended Events + +The events in this folder are the [GA4 recommended events](https://support.google.com/analytics/answer/9267735?hl=en). + +These events are disabled by default so as not to slow down the building of your models unnecessarily. + +To enable these models, enter the event file name, without the file extension, in your `dbt_project.yml` and set the enabled configuration to true. + +This is how you would enable the purchase event: + +``` +models: + ga4: + staging: + recommended_events: + stg_ga4__event_purchase: + +enabled: true +``` + +This is how you would enable all recommended events: + +``` +models: + ga4: + staging: + recommended_events: + +enabled: true +``` + +Not all recommended events have been implemented. If you need a specific event, please consider creating a pull request with the model that you need in the [dbt-ga4 GitHub repository](https://github.com/Velir/dbt-ga4). + +## Purchase Event Transaction Deduplication + +The `stg_ga4__event_purchase_deduplicated` model builds on the `stg_ga4__event_purchase` model. It is disabled by default and thus needs to be enabled along with the `stg_ga4__event_purchase` model. + +The model only processes purchase events that fall within the window as defined by `static_incremental_days` and can only reliably be expected to deduplicate purchase events occurring on the same day. + +The model provides a highly performant, minimum-viable product for this feature, returning only data from the first purchase event with a matching `transaction_id` within the processing window. + +You are encouraged to copy this model to your project and customize it there should this MVP be insufficient for your needs.
\ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_add_payment_info.sql b/models/staging/recommended_events/stg_ga4__event_add_payment_info.sql new file mode 100644 index 00000000..94a1b0dc --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_add_payment_info.sql @@ -0,0 +1,23 @@ +{{ + config( + enabled = false, + ) +}} + + with add_payment_info_with_params as ( + select *, + {{ ga4.unnest_key('event_params', 'coupon') }}, + {{ ga4.unnest_key('event_params', 'currency') }}, + {{ ga4.unnest_key('event_params', 'payment_type') }}, + {{ ga4.unnest_key('event_params', 'value', 'float_value') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("add_payment_info_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("add_payment_info_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'add_payment_info' +) + +select * from add_payment_info_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_add_shipping_info.sql b/models/staging/recommended_events/stg_ga4__event_add_shipping_info.sql new file mode 100644 index 00000000..3b722161 --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_add_shipping_info.sql @@ -0,0 +1,22 @@ +{{ + config( + enabled = false, + ) +}} + with add_shipping_info_with_params as ( + select *, + {{ ga4.unnest_key('event_params', 'coupon') }}, + {{ ga4.unnest_key('event_params', 'currency') }}, + {{ ga4.unnest_key('event_params', 'shipping_tier') }}, + {{ ga4.unnest_key('event_params', 'value', 'float_value') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("add_shipping_info_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( 
var("add_shipping_info_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'add_shipping_info' +) + +select * from add_shipping_info_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_add_to_cart.sql b/models/staging/recommended_events/stg_ga4__event_add_to_cart.sql new file mode 100644 index 00000000..9c900cae --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_add_to_cart.sql @@ -0,0 +1,22 @@ +{{ + config( + enabled = false, + ) +}} + with add_to_cart_with_params as ( + select *except (items), + {{ ga4.unnest_key('event_params', 'currency') }}, + {{ ga4.unnest_key('event_params', 'value', 'double_value') }}, + (select items FROM UNNEST(items) items LIMIT 1) as items + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("add_to_cart_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("add_to_cart_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}}, + unnest(items) + where event_name = 'add_to_cart' +) + +select * from add_to_cart_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_add_to_wishlist.sql b/models/staging/recommended_events/stg_ga4__event_add_to_wishlist.sql new file mode 100644 index 00000000..c992acbf --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_add_to_wishlist.sql @@ -0,0 +1,22 @@ +{{ + config( + enabled = false, + ) +}} + with add_to_wishlist_with_params as ( + select * except (items), + (select items FROM unnest(items) items LIMIT 1) as items, + {{ ga4.unnest_key('event_params', 'currency') }}, + {{ ga4.unnest_key('event_params', 'value', 'float_value') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if 
var("add_to_wishlist_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("add_to_wishlist_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}}, + unnest(items) + where event_name = 'add_to_wishlist' +) + +select * from add_to_wishlist_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_begin_checkout.sql b/models/staging/recommended_events/stg_ga4__event_begin_checkout.sql new file mode 100644 index 00000000..d9c23f81 --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_begin_checkout.sql @@ -0,0 +1,21 @@ +{{ + config( + enabled = false, + ) +}} + with begin_checkout_with_params as ( + select * + , {{ ga4.unnest_key('event_params', 'currency') }} + , {{ ga4.unnest_key('event_params', 'value', 'double_value') }} + , {{ ga4.unnest_key('event_params', 'coupon') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("begin_checkout_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("begin_checkout_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'begin_checkout' +) + +select * from begin_checkout_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_generate_lead.sql b/models/staging/recommended_events/stg_ga4__event_generate_lead.sql new file mode 100644 index 00000000..a890242e --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_generate_lead.sql @@ -0,0 +1,21 @@ +{{ + config( + enabled = false, + ) +}} + +with generate_lead_with_params as ( + select *, + {{ ga4.unnest_key('event_params', 'currency') }}, + {{ ga4.unnest_key('event_params', 'value') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if 
var("generate_lead_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("generate_lead_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'generate_lead' +) + +select * from generate_lead_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_login.sql b/models/staging/recommended_events/stg_ga4__event_login.sql new file mode 100644 index 00000000..ee560fac --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_login.sql @@ -0,0 +1,19 @@ +{{ + config( + enabled = false, + ) +}} + + with login_with_params as ( + select * + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("login_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("login_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'login' +) + +select * from login_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_purchase.sql b/models/staging/recommended_events/stg_ga4__event_purchase.sql new file mode 100644 index 00000000..1ed89c98 --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_purchase.sql @@ -0,0 +1,33 @@ +{{ + config( + enabled = false, + ) +}} +with purchase_with_params as ( + select * except (ecommerce), + ecommerce.total_item_quantity, + ecommerce.purchase_revenue_in_usd, + ecommerce.purchase_revenue, + ecommerce.shipping_value_in_usd, + ecommerce.shipping_value, + ecommerce.tax_value_in_usd, + ecommerce.tax_value, + ecommerce.unique_items, + {{ ga4.unnest_key('event_params', 'coupon') }}, + {{ ga4.unnest_key('event_params', 'transaction_id') }}, + {{ ga4.unnest_key('event_params', 'currency') }}, + {{ ga4.unnest_key('event_params', 'value', 'double_value') }}, + {{ ga4.unnest_key('event_params', 'tax', 'double_value') }}, + {{ 
ga4.unnest_key('event_params', 'shipping', 'double_value') }}, + {{ ga4.unnest_key('event_params', 'affiliation') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("purchase_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("purchase_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'purchase' +) + +select * from purchase_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_purchase.yml b/models/staging/recommended_events/stg_ga4__event_purchase.yml new file mode 100644 index 00000000..7d8fddaa --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_purchase.yml @@ -0,0 +1,5 @@ +version: 2 + +models: + - name: stg_ga4__event_purchase + description: GA4 events filtered to only show 'purchase' events. Pivots common event parameters into separate columns. 
diff --git a/models/staging/recommended_events/stg_ga4__event_purchase_deduplicated.sql b/models/staging/recommended_events/stg_ga4__event_purchase_deduplicated.sql new file mode 100644 index 00000000..ce5aeae1 --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_purchase_deduplicated.sql @@ -0,0 +1,27 @@ +{# Keeps the earliest row (by event_timestamp) per transaction_id from stg_ga4__event_purchase. Unless the run uses the full-refresh flag, only today and the previous static_incremental_days days are scanned. #} {% if not flags.FULL_REFRESH %} + {% set partitions_to_query = ['current_date'] %} + {# Cast to int so a string-valued var still works with range(), matching how static_incremental_days is read in stg_ga4__client_key_first_last_events. #} {% for i in range(var('static_incremental_days', 1) | int) %} + {# Use do instead of set: list.append mutates in place and returns None, so assigning its result would rebind the name to None. #} {% do partitions_to_query.append('date_sub(current_date, interval ' + (i+1)|string + ' day)') %} + {% endfor %} +{% endif %} + +{{ + config( + enabled = false, + ) +}} +with purch as ( + select + * + from {{ref('stg_ga4__event_purchase')}} + {% if not flags.FULL_REFRESH %} + where event_date_dt in ({{ partitions_to_query | join(',') }}) + {% endif %} + {# NOTE(review): rows with a NULL transaction_id all land in one partition and collapse to a single row; confirm this is intended. #} qualify row_number() over( + partition by transaction_id + order by event_timestamp + ) = 1 +) +select + * +from purch diff --git a/models/staging/recommended_events/stg_ga4__event_refund.sql b/models/staging/recommended_events/stg_ga4__event_refund.sql new file mode 100644 index 00000000..be45600c --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_refund.sql @@ -0,0 +1,35 @@ +{{ + config( + enabled = false, + ) +}} +with refund_with_params as ( + select * except (ecommerce), + ecommerce.total_item_quantity, + ecommerce.purchase_revenue_in_usd, + ecommerce.purchase_revenue, + ecommerce.shipping_value_in_usd, + ecommerce.shipping_value, + ecommerce.refund_value_in_usd, + ecommerce.refund_value, + ecommerce.tax_value_in_usd, + ecommerce.tax_value, + ecommerce.unique_items, + {{ ga4.unnest_key('event_params', 'coupon') }}, + {{ ga4.unnest_key('event_params', 'transaction_id') }}, + {{ ga4.unnest_key('event_params', 'currency') }}, + {{ ga4.unnest_key('event_params', 'value', 'float_value') }}, + {{ ga4.unnest_key('event_params', 'tax', 'float_value') }}, + {{ ga4.unnest_key('event_params', 'shipping', 'float_value') }}, +
{{ ga4.unnest_key('event_params', 'affiliation') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("refund_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("refund_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'refund' +) + +select * from refund_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_remove_from_cart.sql b/models/staging/recommended_events/stg_ga4__event_remove_from_cart.sql new file mode 100644 index 00000000..a168bca2 --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_remove_from_cart.sql @@ -0,0 +1,22 @@ +{{ + config( + enabled = false, + ) +}} +with remove_from_cart_with_params as ( + select * except (items) + , {{ ga4.unnest_key('event_params', 'currency') }} + , {{ ga4.unnest_key('event_params', 'value', 'double_value') }} + , (select items from unnest(items) items limit 1) as items + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("remove_from_cart_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("remove_from_cart_custom_parameters") )}} + {% endif %} +from {{ref('stg_ga4__events')}}, +unnest(items) + where event_name = 'remove_from_cart' +) + +select * from remove_from_cart_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_search.sql b/models/staging/recommended_events/stg_ga4__event_search.sql new file mode 100644 index 00000000..e74ce90a --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_search.sql @@ -0,0 +1,20 @@ +{{ + config( + enabled = false, + ) +}} + + with search_with_params as ( + select *, + {{ ga4.unnest_key('event_params', 'search_term') }} + {% if 
var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("search_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("search_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'search' +) + +select * from search_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_select_item.sql b/models/staging/recommended_events/stg_ga4__event_select_item.sql new file mode 100644 index 00000000..7b858c80 --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_select_item.sql @@ -0,0 +1,20 @@ +{{ + config( + enabled = false, + ) +}} +with select_item_with_params as ( + select * except (items), + (select items from unnest(items) items limit 1) as items + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("select_item_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("select_item_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}}, + unnest(items) + where event_name = 'select_item' +) + +select * from select_item_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_select_promotion.sql b/models/staging/recommended_events/stg_ga4__event_select_promotion.sql new file mode 100644 index 00000000..69291911 --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_select_promotion.sql @@ -0,0 +1,20 @@ +{{ + config( + enabled = false, + ) +}} +with select_promotion_with_params as ( + select * except (items), + (select items from unnest(items) items limit 1) as items + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if 
var("select_promotion_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("select_promotion_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}}, + unnest(items) + where event_name = 'select_promotion' +) + +select * from select_promotion_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_share.sql b/models/staging/recommended_events/stg_ga4__event_share.sql new file mode 100644 index 00000000..73158612 --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_share.sql @@ -0,0 +1,23 @@ +{{ + config( + enabled = false, + ) +}} + +with share_with_params as ( + select *, + {{ ga4.unnest_key('event_params', 'method') }}, + {{ ga4.unnest_key('event_params', 'content_type') }}, + {{ ga4.unnest_key('event_params', 'item_id') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("share_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("share_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'share' + +) + +select * from share_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_sign_up.sql b/models/staging/recommended_events/stg_ga4__event_sign_up.sql new file mode 100644 index 00000000..24617eb5 --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_sign_up.sql @@ -0,0 +1,19 @@ +{{ + config( + enabled = false, + ) +}} + + with sign_up_with_params as ( + select * + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("sign_up_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("sign_up_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'sign_up' +) + +select 
* from sign_up_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_view_cart.sql b/models/staging/recommended_events/stg_ga4__event_view_cart.sql new file mode 100644 index 00000000..dd151984 --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_view_cart.sql @@ -0,0 +1,20 @@ +{{ + config( + enabled = false, + ) +}} +with view_cart_with_params as ( + select *, + {{ ga4.unnest_key('event_params', 'currency') }}, + {{ ga4.unnest_key('event_params', 'value', 'float_value') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("view_cart_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("view_cart_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'view_cart' +) + +select * from view_cart_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_view_item.sql b/models/staging/recommended_events/stg_ga4__event_view_item.sql new file mode 100644 index 00000000..d135c02d --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_view_item.sql @@ -0,0 +1,22 @@ +{{ + config( + enabled = false, + ) +}} +with view_item_with_params as ( + select * except (items) + , {{ ga4.unnest_key('event_params', 'currency') }} + , {{ ga4.unnest_key('event_params', 'value', 'double_value') }} + , (select items from unnest(items) items limit 1) as items + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("view_item_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("view_item_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}}, + unnest(items) + where event_name = 'view_item' +) + +select * from view_item_with_params \ No newline at end of file 
diff --git a/models/staging/recommended_events/stg_ga4__event_view_item_list.sql b/models/staging/recommended_events/stg_ga4__event_view_item_list.sql new file mode 100644 index 00000000..b87ffb37 --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_view_item_list.sql @@ -0,0 +1,20 @@ +{{ + config( + enabled = false, + ) +}} +with view_item_list_with_params as ( + select * + , {{ ga4.unnest_key('event_params', 'item_list_id') }} + , {{ ga4.unnest_key('event_params', 'item_list_name') }} + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("view_item_list_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("view_item_list_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}} + where event_name = 'view_item_list' +) + +select * from view_item_list_with_params \ No newline at end of file diff --git a/models/staging/recommended_events/stg_ga4__event_view_promotion.sql b/models/staging/recommended_events/stg_ga4__event_view_promotion.sql new file mode 100644 index 00000000..bfc0a1a0 --- /dev/null +++ b/models/staging/recommended_events/stg_ga4__event_view_promotion.sql @@ -0,0 +1,20 @@ +{{ + config( + enabled = false, + ) +}} +with view_promotion_with_params as ( + select * except (items), + (select items from unnest(items) items limit 1) as items + {% if var("default_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("default_custom_parameters") )}} + {% endif %} + {% if var("view_promotion_custom_parameters", "none") != "none" %} + {{ ga4.stage_custom_parameters( var("view_promotion_custom_parameters") )}} + {% endif %} + from {{ref('stg_ga4__events')}}, + unnest(items) + where event_name = 'view_promotion' +) + +select * from view_promotion_with_params \ No newline at end of file diff --git a/models/staging/src_ga4.yml b/models/staging/src_ga4.yml new file mode 100644 index 
00000000..29104767 --- /dev/null +++ b/models/staging/src_ga4.yml @@ -0,0 +1,16 @@ +version: 2 + +sources: + - name: ga4 + database: | # Source from target.project if multi-property, otherwise source from source_project + {%- if var('combined_dataset', false) != false -%} {{target.project}} + {%- else -%} {{var('source_project')}} + {%- endif -%} + schema: | # Source from combined property dataset if set, otherwise source from original GA4 property + {%- if var('combined_dataset', false) != false -%} {{var('combined_dataset')}} + {%- else -%} analytics_{{var('property_ids')[0]}} + {%- endif -%} + tables: + - name: events + identifier: events_* # Scan across all sharded event tables. Use the 'start_date' variable to limit this scan + description: Main events table exported by GA4. Sharded by date. \ No newline at end of file diff --git a/models/staging/stg_ga4__client_key_first_last_events.sql b/models/staging/stg_ga4__client_key_first_last_events.sql new file mode 100644 index 00000000..ccac80f2 --- /dev/null +++ b/models/staging/stg_ga4__client_key_first_last_events.sql @@ -0,0 +1,133 @@ +{{ + config( + materialized = 'incremental', + incremental_strategy = 'merge', + unique_key = ['client_key'], + tags = ["incremental"], + partition_by={ + "field": "last_seen_at", + "data_type": "timestamp", + "granularity": "day" + }, + merge_update_columns = [ + 'last_geo_continent', + 'last_geo_country', + 'last_geo_region', + 'last_geo_city', + 'last_geo_sub_continent', + 'last_geo_metro', + 'last_device_category', + 'last_device_mobile_brand_name', + 'last_device_mobile_model_name', + 'last_device_mobile_marketing_name', + 'last_device_mobile_os_hardware_model', + 'last_device_operating_system', + 'last_device_operating_system_version', + 'last_device_vendor_id', + 'last_device_advertising_id', + 'last_device_language', + 'last_device_is_limited_ad_tracking', + 'last_device_time_zone_offset_seconds', + 'last_device_browser', + 'last_device_browser_version', + 
'last_device_web_info_browser', + 'last_device_web_info_browser_version', + 'last_device_web_info_hostname', + 'last_user_campaign', + 'last_user_medium', + 'last_user_source', + 'last_seen_at', + ], + on_schema_change = 'sync_all_columns', + ) +}} + +with first_last_event as ( + select + client_key, + FIRST_VALUE(event_key) OVER (PARTITION BY client_key ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS first_event, + LAST_VALUE(event_key) OVER (PARTITION BY client_key ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last_event, + stream_id + from {{ref('stg_ga4__events')}} + where client_key is not null --remove users with privacy settings enabled + {% if is_incremental() %} + and event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + {% endif %} +), +events_by_client_key as ( + select distinct + client_key, + first_event, + last_event, + stream_id + from first_last_event +), +events_joined as ( + select + events_by_client_key.*, + timestamp_micros(events_first.event_timestamp) as first_visit, + events_first.geo_continent as first_geo_continent, + events_first.geo_country as first_geo_country, + events_first.geo_region as first_geo_region, + events_first.geo_city as first_geo_city, + events_first.geo_sub_continent as first_geo_sub_continent, + events_first.geo_metro as first_geo_metro, + events_first.device_category as first_device_category, + events_first.device_mobile_brand_name as first_device_mobile_brand_name, + events_first.device_mobile_model_name as first_device_mobile_model_name, + events_first.device_mobile_marketing_name as first_device_mobile_marketing_name, + events_first.device_mobile_os_hardware_model as first_device_mobile_os_hardware_model, + events_first.device_operating_system as first_device_operating_system, + events_first.device_operating_system_version as first_device_operating_system_version, + events_first.device_vendor_id 
as first_device_vendor_id, + events_first.device_advertising_id as first_device_advertising_id, + events_first.device_language as first_device_language, + events_first.device_is_limited_ad_tracking as first_device_is_limited_ad_tracking, + events_first.device_time_zone_offset_seconds as first_device_time_zone_offset_seconds, + events_first.device_browser as first_device_browser, + events_first.device_browser_version as first_device_browser_version, + events_first.device_web_info_browser as first_device_web_info_browser, + events_first.device_web_info_browser_version as first_device_web_info_browser_version, + events_first.device_web_info_hostname as first_device_web_info_hostname, + events_first.user_campaign as first_user_campaign, + events_first.user_medium as first_user_medium, + events_first.user_source as first_user_source, + events_last.geo_continent as last_geo_continent, + events_last.geo_country as last_geo_country, + events_last.geo_region as last_geo_region, + events_last.geo_city as last_geo_city, + events_last.geo_sub_continent as last_geo_sub_continent, + events_last.geo_metro as last_geo_metro, + events_last.device_category as last_device_category, + events_last.device_mobile_brand_name as last_device_mobile_brand_name, + events_last.device_mobile_model_name as last_device_mobile_model_name, + events_last.device_mobile_marketing_name as last_device_mobile_marketing_name, + events_last.device_mobile_os_hardware_model as last_device_mobile_os_hardware_model, + events_last.device_operating_system as last_device_operating_system, + events_last.device_operating_system_version as last_device_operating_system_version, + events_last.device_vendor_id as last_device_vendor_id, + events_last.device_advertising_id as last_device_advertising_id, + events_last.device_language as last_device_language, + events_last.device_is_limited_ad_tracking as last_device_is_limited_ad_tracking, + events_last.device_time_zone_offset_seconds as 
last_device_time_zone_offset_seconds, + events_last.device_browser as last_device_browser, + events_last.device_browser_version as last_device_browser_version, + events_last.device_web_info_browser as last_device_web_info_browser, + events_last.device_web_info_browser_version as last_device_web_info_browser_version, + events_last.device_web_info_hostname as last_device_web_info_hostname, + events_last.user_campaign as last_user_campaign, + events_last.user_medium as last_user_medium, + events_last.user_source as last_user_source, + timestamp_micros(events_last.event_timestamp) as last_seen_at, + from events_by_client_key + left join {{ref('stg_ga4__events')}} events_first + on events_by_client_key.first_event = events_first.event_key + left join {{ref('stg_ga4__events')}} events_last + on events_by_client_key.last_event = events_last.event_key + where 1=1 + {% if is_incremental() %} + and events_last.event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + and events_first.event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + {% endif %} +) +select * from events_joined \ No newline at end of file diff --git a/models/staging/stg_ga4__client_key_first_last_events.yml b/models/staging/stg_ga4__client_key_first_last_events.yml new file mode 100644 index 00000000..4e9cc7f4 --- /dev/null +++ b/models/staging/stg_ga4__client_key_first_last_events.yml @@ -0,0 +1,10 @@ +version: 2 + +models: + - name: stg_ga4__client_key_first_last_events + description: Captures the first and last event completed by the user's device in order to pull in the first and last geo, device, and traffic source seen from the user + columns: + - name: client_key + description: Hashed combination of user_pseudo_id and stream_id + tests: + - unique \ No newline at end of file diff --git a/models/staging/stg_ga4__client_key_first_last_pageviews.sql b/models/staging/stg_ga4__client_key_first_last_pageviews.sql new 
file mode 100644 index 00000000..9f17d368 --- /dev/null +++ b/models/staging/stg_ga4__client_key_first_last_pageviews.sql @@ -0,0 +1,63 @@ +{{ + config( + materialized = 'incremental', + incremental_strategy = 'merge', + unique_key = ['client_key'], + tags = ["incremental"], + partition_by={ + "field": "last_seen_at", + "data_type": "timestamp", + "granularity": "day" + }, + merge_update_columns = [ + 'last_page_location', + 'last_page_hostname', + 'last_page_referrer', + 'last_seen_at', + ], + on_schema_change='sync_all_columns' + ) +}} + +with page_views_first_last as ( + select + client_key, + FIRST_VALUE(event_key) OVER (PARTITION BY client_key ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS first_page_view_event_key, + LAST_VALUE(event_key) OVER (PARTITION BY client_key ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last_page_view_event_key + from {{ref('stg_ga4__event_page_view')}} + where client_key is not null -- Remove users with privacy settings enabled + {% if is_incremental() %} + and event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + {% endif %} +), +page_views_by_client_key as ( + select distinct + client_key, + first_page_view_event_key, + last_page_view_event_key + from page_views_first_last +), + +page_views_joined as ( + select + page_views_by_client_key.*, + first_page_view.page_location as first_page_location, + first_page_view.page_hostname as first_page_hostname, + first_page_view.page_referrer as first_page_referrer, + last_page_view.page_location as last_page_location, + last_page_view.page_hostname as last_page_hostname, + last_page_view.page_referrer as last_page_referrer, + timestamp_micros(last_page_view.event_timestamp) as last_seen_at, + from page_views_by_client_key + left join {{ref('stg_ga4__event_page_view')}} first_page_view + on page_views_by_client_key.first_page_view_event_key = first_page_view.event_key 
+ left join {{ref('stg_ga4__event_page_view')}} last_page_view + on page_views_by_client_key.last_page_view_event_key = last_page_view.event_key + where 1=1 + {% if is_incremental() %} + and first_page_view.event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + and last_page_view.event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + {% endif %} +) + +select * from page_views_joined \ No newline at end of file diff --git a/models/staging/stg_ga4__client_key_first_last_pageviews.yml b/models/staging/stg_ga4__client_key_first_last_pageviews.yml new file mode 100644 index 00000000..9623fd66 --- /dev/null +++ b/models/staging/stg_ga4__client_key_first_last_pageviews.yml @@ -0,0 +1,10 @@ +version: 2 + +models: + - name: stg_ga4__client_key_first_last_pageviews + description: Captures data related to the first and last page view that each user device has completed (by client_key). + columns: + - name: client_key + description: Hashed combination of user_pseudo_id and stream_id + tests: + - unique \ No newline at end of file diff --git a/models/staging/stg_ga4__derived_session_properties.sql b/models/staging/stg_ga4__derived_session_properties.sql new file mode 100644 index 00000000..65fbcfd6 --- /dev/null +++ b/models/staging/stg_ga4__derived_session_properties.sql @@ -0,0 +1,32 @@ +{{ config( + enabled = true if var('derived_session_properties', false) else false, + materialized = "table" +) }} + +-- Remove null session_keys (users with privacy enabled) +with events_from_valid_users as ( + select * from {{ref('stg_ga4__events')}} + where session_key is not null +), +unnest_event_params as +( + select + session_key, + event_timestamp + {% for sp in var('derived_session_properties', []) %} + {% if sp.user_property %} + , {{ ga4.unnest_key('user_properties', sp.user_property, sp.value_type) }} + {% else %} + , {{ ga4.unnest_key('event_params', sp.event_parameter, sp.value_type) }} + {% 
endif %} + {% endfor %} + from events_from_valid_users +) + +SELECT DISTINCT + session_key + {% for sp in var('derived_session_properties', []) %} + , LAST_VALUE({{ sp.user_property | default(sp.event_parameter) }} IGNORE NULLS) OVER (session_window) AS {{ sp.session_property_name }} + {% endfor %} +FROM unnest_event_params +WINDOW session_window AS (PARTITION BY session_key ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) diff --git a/models/staging/stg_ga4__derived_session_properties.yml b/models/staging/stg_ga4__derived_session_properties.yml new file mode 100644 index 00000000..0ecffcf9 --- /dev/null +++ b/models/staging/stg_ga4__derived_session_properties.yml @@ -0,0 +1,11 @@ +version: 2 + +models: + - name: stg_ga4__derived_session_properties + description: > + Optional model that will pull out the most recent instance of a particular event parameter for each session_key. + Later used in the fct_ga4__sessions fact table. + columns: + - name: session_key + tests: + - unique \ No newline at end of file diff --git a/models/staging/stg_ga4__derived_session_properties_daily.sql b/models/staging/stg_ga4__derived_session_properties_daily.sql new file mode 100644 index 00000000..f997d40b --- /dev/null +++ b/models/staging/stg_ga4__derived_session_properties_daily.sql @@ -0,0 +1,49 @@ +{% set partitions_to_replace = ['current_date'] %} +{% for i in range(var('static_incremental_days', 3) | int) %}{# fall back to 3 days and coerce to int, as the merge-based incremental models do #} + {% set partitions_to_replace = partitions_to_replace.append('date_sub(current_date, interval ' + (i+1)|string + ' day)') %} +{% endfor %} +{{ + config( + enabled = true if var('derived_session_properties', false) else false, + materialized = 'incremental', + incremental_strategy = 'insert_overwrite', + tags = ["incremental"], + partition_by={ + "field": "session_partition_date", + "data_type": "date", + "granularity": "day" + }, + partitions = partitions_to_replace + ) +}} + + +with unnest_event_params as +( + select + session_partition_key +
,event_date_dt as session_partition_date + ,event_timestamp + {% for sp in var('derived_session_properties', []) %} + {% if sp.user_property %} + , {{ ga4.unnest_key('user_properties', sp.user_property, sp.value_type) }} + {% else %} + , {{ ga4.unnest_key('event_params', sp.event_parameter, sp.value_type) }} + {% endif %} + {% endfor %} + from {{ref('stg_ga4__events')}} + where event_key is not null + {% if is_incremental() %} + and event_date_dt in ({{ partitions_to_replace | join(',') }}) + {% endif %} + +) + +SELECT DISTINCT + session_partition_key + ,session_partition_date + {% for sp in var('derived_session_properties', []) %} + , LAST_VALUE({{ sp.user_property | default(sp.event_parameter) }} IGNORE NULLS) OVER (session_window) AS {{ sp.session_property_name }} + {% endfor %} +FROM unnest_event_params +WINDOW session_window AS (PARTITION BY session_partition_key ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) diff --git a/models/staging/stg_ga4__derived_session_properties_daily.yml b/models/staging/stg_ga4__derived_session_properties_daily.yml new file mode 100644 index 00000000..c645fb83 --- /dev/null +++ b/models/staging/stg_ga4__derived_session_properties_daily.yml @@ -0,0 +1,11 @@ +version: 2 + +models: + - name: stg_ga4__derived_session_properties_daily + description: > + Optional model that will pull out the most recent instance of a particular event parameter for each session_partition_key. + Later used in the fct_ga4__sessions_daily fact table. 
+ columns: + - name: session_partition_key + tests: + - unique \ No newline at end of file diff --git a/models/staging/stg_ga4__derived_user_properties.sql b/models/staging/stg_ga4__derived_user_properties.sql new file mode 100644 index 00000000..10ae0861 --- /dev/null +++ b/models/staging/stg_ga4__derived_user_properties.sql @@ -0,0 +1,42 @@ +{{ + config( + enabled = true if var('derived_user_properties', false) else false, + materialized = "incremental", + incremental_strategy = 'merge', + unique_key = ['client_key'], + tags = ["incremental"], + partition_by={ + "field": "last_updated", + "data_type": "timestamp", + "granularity": "day" + }, + on_schema_change='sync_all_columns' +) }} + +-- Remove null client_key (users with privacy enabled) +with events_from_valid_users as ( + select * from {{ref('stg_ga4__events')}} + where client_key is not null + {% if is_incremental() %} + and event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + {% endif %} +), +unnest_user_properties as +( + select + client_key, + event_timestamp + {% for up in var('derived_user_properties', []) %} + ,{{ ga4.unnest_key('event_params', up.event_parameter , up.value_type ) }} + {% endfor %} + from events_from_valid_users +) + +SELECT DISTINCT + client_key + {% for up in var('derived_user_properties', []) %} + , LAST_VALUE({{ up.event_parameter }} IGNORE NULLS) OVER (user_window) AS {{ up.user_property_name }} + {% endfor %} + , last_value(timestamp_micros(event_timestamp)) over (user_window) as last_updated, +FROM unnest_user_properties +WINDOW user_window AS (PARTITION BY client_key ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) diff --git a/models/staging/stg_ga4__derived_user_properties.yml b/models/staging/stg_ga4__derived_user_properties.yml new file mode 100644 index 00000000..3aadf7f4 --- /dev/null +++ b/models/staging/stg_ga4__derived_user_properties.yml @@ -0,0 +1,10 @@ +version: 2 + +models: + - name: 
stg_ga4__derived_user_properties + description: Optional model that will pull out the most recent instance of a particular event parameter for each device (client_key). Later used in the dim_ga4__client_keys dimension table. + columns: + - name: client_key + description: Hashed combination of user_pseudo_id and stream_id + tests: + - unique \ No newline at end of file diff --git a/models/staging/stg_ga4__event_items.sql b/models/staging/stg_ga4__event_items.sql new file mode 100644 index 00000000..62680917 --- /dev/null +++ b/models/staging/stg_ga4__event_items.sql @@ -0,0 +1,38 @@ +with items_with_params as ( + select + event_key, + event_name, + event_date_dt, + stream_id, + i.item_id, + i.item_name, + i.item_brand, + i.item_variant, + i.item_category, + i.item_category2, + i.item_category3, + i.item_category4, + i.item_category5, + i.price_in_usd, + i.price, + i.quantity, + i.item_revenue_in_usd, + i.item_revenue, + i.item_refund_in_usd, + i.item_refund, + i.coupon, + i.affiliation, + i.location_id, + i.item_list_id, + i.item_list_name, + i.item_list_index, + i.promotion_id, + i.promotion_name, + i.creative_name, + i.creative_slot + from {{ref('stg_ga4__events')}}, + unnest(items) as i + where event_name in ('add_payment_info', 'add_shipping_info', 'add_to_cart','add_to_wishlist','begin_checkout' ,'purchase','refund', 'remove_from_cart','select_item', 'select_promotion','view_item_list','view_promotion', 'view_item') +) + +select * from items_with_params \ No newline at end of file diff --git a/models/staging/stg_ga4__event_items.yml b/models/staging/stg_ga4__event_items.yml new file mode 100644 index 00000000..6e75c609 --- /dev/null +++ b/models/staging/stg_ga4__event_items.yml @@ -0,0 +1,5 @@ +version: 2 + +models: + - name: stg_ga4__event_items + description: Flattens out the 'items' field for e-commerce events such as purchase and add_to_cart diff --git a/models/staging/stg_ga4__event_to_query_string_params.sql
b/models/staging/stg_ga4__event_to_query_string_params.sql new file mode 100644 index 00000000..72cd10c9 --- /dev/null +++ b/models/staging/stg_ga4__event_to_query_string_params.sql @@ -0,0 +1,24 @@ +with event_and_query_string as +( + select + event_key, + split(page_query_string, '&') as qs_split + from {{ref('stg_ga4__events')}} +), +flattened_qs as +( + select + event_key, + params + from event_and_query_string, unnest(qs_split) as params +), +split_param_value as +( + select + event_key, + split(params,'=')[SAFE_OFFSET(0)] as param, + NULLIF(split(params,'=')[SAFE_OFFSET(1)], '') as value + from flattened_qs +) + +select * from split_param_value \ No newline at end of file diff --git a/models/staging/stg_ga4__event_to_query_string_params.yml b/models/staging/stg_ga4__event_to_query_string_params.yml new file mode 100644 index 00000000..4b4310f4 --- /dev/null +++ b/models/staging/stg_ga4__event_to_query_string_params.yml @@ -0,0 +1,6 @@ +version: 2 + +models: + - name: stg_ga4__event_to_query_string_params + description: This model pivots the query string parameters contained within the event's page_location field to become rows. Each row is a single parameter/value combination contained in a single event's query string. + \ No newline at end of file diff --git a/models/staging/stg_ga4__events.sql b/models/staging/stg_ga4__events.sql new file mode 100644 index 00000000..e3914bef --- /dev/null +++ b/models/staging/stg_ga4__events.sql @@ -0,0 +1,100 @@ +-- This staging model contains key creation and window functions. Keeping window functions outside of the base incremental model ensures that the incremental updates don't artificially limit the window partition sizes (ex: if a session spans 2 days, but only 1 day is in the incremental update) +with base_events as ( + select * from {{ ref('base_ga4__events')}} +), +-- Add key that captures a combination of stream_id and user_pseudo_id to uniquely identify a 'client' (aka. 
a device) within a single stream +include_client_key as ( + select * + , to_base64(md5(concat(user_pseudo_id, stream_id))) as client_key + from base_events +), +-- Add key for sessions. session_key will be null if client_key is null due to consent being denied. ga_session_id may be null during audience trigger events. +include_session_key as ( + select + *, + to_base64(md5(CONCAT(client_key, CAST(session_id as STRING)))) as session_key + from include_client_key +), +-- Add a key that combines session key and date. Useful when working with session table within date-partitioned tables +include_session_partition_key as ( + select + *, + CONCAT(session_key, CAST(event_date_dt as STRING)) as session_partition_key + from include_session_key +), +-- Add unique key for events +include_event_key as ( + select + *, + to_base64(md5(CONCAT(session_key, event_name, CAST(event_timestamp as STRING), to_json_string(event_params)))) as event_key -- Surrogate key for unique events. + from include_session_partition_key +), +detect_gclid as ( + select + * except (event_source, event_medium, event_campaign), + case + when (page_location like '%gclid%' and event_source is null) then "google" + else event_source + end as event_source, + case + when (page_location like '%gclid%' and event_medium is null) then "cpc" + when (page_location like '%gclid%' and event_medium = 'organic') then "cpc" + else event_medium + end as event_medium, + case + when (page_location like '%gclid%' and event_campaign is null) then "(cpc)" + when (page_location like '%gclid%' and event_campaign = 'organic') then "(cpc)" + else event_campaign + end as event_campaign + from include_event_key +), +{% if var('query_parameter_extraction', none) != none %} +extract_query_params as ( + select + *, + {%- for param in var('query_parameter_extraction') -%} + {{ extract_query_parameter_value( 'page_location' , param ) }} as {{"query_param_"+param}} + {% if not loop.last %},{% endif %} + {%- endfor -%} + from detect_gclid 
+), +{% endif %} +remove_query_params as ( + select + * EXCEPT (page_location, page_referrer), + page_location as original_page_location, + page_referrer as original_page_referrer, + {{ extract_page_path('page_location') }} as page_path, + -- If there are query parameters to exclude, exclude them using regex + {% if var('query_parameter_exclusions',none) is not none %} + {{remove_query_parameters('page_location',var('query_parameter_exclusions'))}} as page_location, + {{remove_query_parameters('page_referrer',var('query_parameter_exclusions'))}} as page_referrer + {% else %} + page_location, + page_referrer + {% endif %} + + {% if var('query_parameter_extraction', none) != none %} + from extract_query_params + {% else %} + from detect_gclid + {% endif %} +), +enrich_params as ( + select + *, + {{extract_hostname_from_url('page_location')}} as page_hostname, + {{extract_query_string_from_url('page_location')}} as page_query_string, + from remove_query_params +), +page_key as ( + select + *, + (concat( cast(event_date_dt as string), page_location )) as page_key, + case + when event_name = 'page_view' then to_base64(md5(concat(session_key, page_referrer))) + else to_base64(md5(concat(session_key, page_location))) + end as page_engagement_key + from enrich_params +) +select * from page_key \ No newline at end of file diff --git a/models/staging/stg_ga4__events.yml b/models/staging/stg_ga4__events.yml new file mode 100644 index 00000000..4d4a8b4d --- /dev/null +++ b/models/staging/stg_ga4__events.yml @@ -0,0 +1,20 @@ +version: 2 + +models: + - name: stg_ga4__events + description: Staging model that generates keys for users, sessions, and events. Also parses URLs to remove query string params as defined in project config. + columns: + - name: client_key + description: Surrogate key created from stream_id and user_pseudo_id. Provides a way to uniquely identify a user's device within a stream. Important when using the package to combine data across properties and streams. 
+ - name: event_key + tests: + - unique + - name: page_path + description: This field contains the page_location with the query string portion removed. Uses macro extract_page_path + - name: page_engagement_key + description: > + This field is used to assign engagement_time_msec to the correct page. + While the GA4 documentation claims that a user_engagement event that sets engagement_time_msec should fire when navigating to a new page, + the data shows that this is not always the case. When a user_engagement event does not fire, the engagement_time_msec parameter is assigned to the next page_view. + This engagement time should be credited to the previous page, + so for page_view events this field uses the session_key and page_referrer as the key while all other events use the session_key and page_location. diff --git a/models/staging/stg_ga4__page_conversions.sql b/models/staging/stg_ga4__page_conversions.sql new file mode 100644 index 00000000..e220cd3e --- /dev/null +++ b/models/staging/stg_ga4__page_conversions.sql @@ -0,0 +1,11 @@ +{{ config( + enabled= var('conversion_events', false) != false +) }} + +select + page_key + {% for ce in var('conversion_events',[]) %} + , countif(event_name = '{{ce}}') as {{ce}}_count + {% endfor %} +from {{ref('stg_ga4__events')}} +group by 1 \ No newline at end of file diff --git a/models/staging/stg_ga4__page_engaged_time.sql b/models/staging/stg_ga4__page_engaged_time.sql new file mode 100644 index 00000000..74d89162 --- /dev/null +++ b/models/staging/stg_ga4__page_engaged_time.sql @@ -0,0 +1,32 @@ +with pek_time as ( +select + page_engagement_key, + sum(engagement_time_msec) as page_engagement_time, +from {{ ref('stg_ga4__events') }} +group by 1 +), +matched_pv as ( -- need to replace the pek with one that uses page_location to match back to correct page_view + select + to_base64(md5(concat(session_key, page_location))) as page_engagement_key, + from {{ ref('stg_ga4__events') }} + where event_name = 'page_view' +),
+denominator as ( + select + page_engagement_key, + count(page_engagement_key) as page_engagement_denominator, --for sessions with multiple hits to the same page + from matched_pv + group by 1 +) +select + denominator.page_engagement_key, + case + when pek_time.page_engagement_time is null then null -- safe_divide in the numerator would return 0; we need null to prevent page views with no recorded engagement time from factoring in to later calculations + else safe_divide(pek_time.page_engagement_time , denominator.page_engagement_denominator) + end as page_engagement_time_msec, --technically the average engagement time for that page in that session + case + when pek_time.page_engagement_time is null then null -- remove page_views with no engagement time from the denominator + else denominator.page_engagement_denominator + end as page_engagement_denominator +from denominator +left join pek_time using(page_engagement_key) \ No newline at end of file diff --git a/models/staging/stg_ga4__page_engaged_time.yml b/models/staging/stg_ga4__page_engaged_time.yml new file mode 100644 index 00000000..8f895ebc --- /dev/null +++ b/models/staging/stg_ga4__page_engaged_time.yml @@ -0,0 +1,10 @@ +version: 2 + +models: + - name: stg_ga4__page_engaged_time + description: View that calculates the numerator and denominator for engagement time metrics. + columns: + - name: page_engagement_time_msec + description: Summed engagement time in a session for a page_location divided by the number of page_view events to that page_location (the per-page_view average). Null when no engagement time was recorded. + - name: page_engagement_denominator + description: The total number of page_views in a session to the same page_location except for pages with no page_engagement_time.
diff --git a/models/staging/stg_ga4__session_conversions_daily.sql b/models/staging/stg_ga4__session_conversions_daily.sql new file mode 100644 index 00000000..f33ea6d5 --- /dev/null +++ b/models/staging/stg_ga4__session_conversions_daily.sql @@ -0,0 +1,38 @@ +{% set partitions_to_replace = ['current_date'] %} +{% for i in range(var('static_incremental_days', 3) | int) %}{# fall back to 3 days and coerce to int, as the merge-based incremental models do #} + {% set partitions_to_replace = partitions_to_replace.append('date_sub(current_date, interval ' + (i+1)|string + ' day)') %} +{% endfor %} +{{ + config( + enabled= var('conversion_events', false) != false, + materialized = 'incremental', + incremental_strategy = 'insert_overwrite', + tags = ["incremental"], + partition_by={ + "field": "session_partition_date", + "data_type": "date", + "granularity": "day" + }, + partitions = partitions_to_replace + ) +}} + + + +with event_counts as ( + select + session_key, + session_partition_key, + min(event_date_dt) as session_partition_date -- The date of this session partition + {% for ce in var('conversion_events',[]) %} + , countif(event_name = '{{ce}}') as {{ce}}_count + {% endfor %} + from {{ref('stg_ga4__events')}} + where 1=1 + {% if is_incremental() %} + and event_date_dt in ({{ partitions_to_replace | join(',') }}) + {% endif %} + group by 1,2 +) + +select * from event_counts diff --git a/models/staging/stg_ga4__session_conversions_daily.yml b/models/staging/stg_ga4__session_conversions_daily.yml new file mode 100644 index 00000000..2f26a7c6 --- /dev/null +++ b/models/staging/stg_ga4__session_conversions_daily.yml @@ -0,0 +1,12 @@ +version: 2 + +models: + - name: stg_ga4__session_conversions_daily + description: > + Incremental model that counts the number of events per day listed in the 'conversion_events' variable. + Aggregated by session_partition_key and partitioned on session_partition_date. + Only enabled when the conversion_events variable is set.
+ columns: + - name: session_partition_key + tests: + - unique \ No newline at end of file diff --git a/models/staging/stg_ga4__sessions_first_last_pageviews.sql b/models/staging/stg_ga4__sessions_first_last_pageviews.sql new file mode 100644 index 00000000..e1d432c5 --- /dev/null +++ b/models/staging/stg_ga4__sessions_first_last_pageviews.sql @@ -0,0 +1,36 @@ +{{ + config( + materialized = 'incremental', + incremental_strategy = 'merge', + unique_key = ['session_key'], + tags = ["incremental"], + partition_by={ + "field": "first_page_view_event_time", + "data_type": "timestamp", + "granularity": "day" + }, + on_schema_change = 'sync_all_columns' + ) +}} +with page_views_first_last as ( + select + session_key, + FIRST_VALUE(event_key) OVER (PARTITION BY session_key ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS first_page_view_event_key, + LAST_VALUE(event_key) OVER (PARTITION BY session_key ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last_page_view_event_key, + FIRST_VALUE(timestamp_micros(event_timestamp)) OVER (PARTITION BY session_key ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS first_page_view_event_time, + from {{ref('stg_ga4__events')}} + where event_name = 'page_view' + {% if is_incremental() %} + and event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + {% endif %} +), +page_views_by_session_key as ( + select distinct + session_key, + first_page_view_event_key, + last_page_view_event_key, + first_page_view_event_time + from page_views_first_last +) + +select * from page_views_by_session_key \ No newline at end of file diff --git a/models/staging/stg_ga4__sessions_traffic_sources.sql b/models/staging/stg_ga4__sessions_traffic_sources.sql new file mode 100644 index 00000000..5dd34127 --- /dev/null +++ b/models/staging/stg_ga4__sessions_traffic_sources.sql @@ -0,0 +1,55 @@ +{{ + config( + 
materialized = 'incremental', + incremental_strategy = 'insert_overwrite', + tags = ["incremental"], + on_schema_change = 'sync_all_columns', + partition_by={ + "field": "session_partition_date", + "data_type": "date", + "granularity": "day" + }, + ) +}} + +with session_events as ( + select + session_key + ,event_timestamp + ,events.event_source + ,event_medium + ,event_campaign + ,event_content + ,event_term + ,source_category + ,event_date_dt + from {{ref('stg_ga4__events')}} events + left join {{ref('ga4_source_categories')}} source_categories on events.event_source = source_categories.source + where session_key is not null + and event_name != 'session_start' + and event_name != 'first_visit' + {% if is_incremental() %} + and event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + {% endif %} + ), +set_default_channel_grouping as ( + select + * + ,{{ga4.default_channel_grouping('event_source','event_medium','source_category', 'event_campaign')}} as default_channel_grouping + from session_events +), +session_source as ( + select + session_key + ,FIRST_VALUE( event_date_dt IGNORE NULLS) OVER (session_window) AS session_partition_date + ,COALESCE(FIRST_VALUE((CASE WHEN event_source <> '(direct)' THEN event_source END) IGNORE NULLS) OVER (session_window), '(direct)') AS session_source + ,COALESCE(FIRST_VALUE((CASE WHEN event_source <> '(direct)' THEN COALESCE(event_medium, '(none)') END) IGNORE NULLS) OVER (session_window), '(none)') AS session_medium + ,COALESCE(FIRST_VALUE((CASE WHEN event_source <> '(direct)' THEN COALESCE(source_category, '(none)') END) IGNORE NULLS) OVER (session_window), '(none)') AS session_source_category + ,COALESCE(FIRST_VALUE((CASE WHEN event_source <> '(direct)' THEN COALESCE(event_campaign, '(none)') END) IGNORE NULLS) OVER (session_window), '(none)') AS session_campaign + ,COALESCE(FIRST_VALUE((CASE WHEN event_source <> '(direct)' THEN COALESCE(event_content, '(none)') END) IGNORE NULLS) OVER 
(session_window), '(none)') AS session_content + ,COALESCE(FIRST_VALUE((CASE WHEN event_source <> '(direct)' THEN COALESCE(event_term, '(none)') END) IGNORE NULLS) OVER (session_window), '(none)') AS session_term + ,COALESCE(FIRST_VALUE((CASE WHEN event_source <> '(direct)' THEN COALESCE(default_channel_grouping, 'Direct') END) IGNORE NULLS) OVER (session_window), 'Direct') AS session_default_channel_grouping + from set_default_channel_grouping + WINDOW session_window AS (PARTITION BY session_key ORDER BY event_timestamp ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +) +select distinct * from session_source \ No newline at end of file diff --git a/models/staging/stg_ga4__sessions_traffic_sources.yml b/models/staging/stg_ga4__sessions_traffic_sources.yml new file mode 100644 index 00000000..fa5a54eb --- /dev/null +++ b/models/staging/stg_ga4__sessions_traffic_sources.yml @@ -0,0 +1,17 @@ +version: 2 + +models: + - name: stg_ga4__sessions_traffic_sources + description: > + Finds the first session source, medium and campaign and adds the default channel grouping information. + Uses the first non-null source value as the basis for selecting the event that will be used to assign source, medium, campaign, content, and term values. + The session_start and first_visit events are ignored for this purpose as they contain no acquisition data. + Aggregated by session_key. 
+ columns: + - name: session_key + tests: + - unique + - name: session_source + description: First non-null source value of the session + tests: + - not_null \ No newline at end of file diff --git a/models/staging/stg_ga4__sessions_traffic_sources_daily.sql b/models/staging/stg_ga4__sessions_traffic_sources_daily.sql new file mode 100644 index 00000000..1847d8d8 --- /dev/null +++ b/models/staging/stg_ga4__sessions_traffic_sources_daily.sql @@ -0,0 +1,86 @@
+{% set partitions_to_replace = ['current_date'] %}
+{% for i in range(var('static_incremental_days', 3) | int) %}
+    {% set partitions_to_replace = partitions_to_replace.append('date_sub(current_date, interval ' + (i+1)|string + ' day)') %}
+{% endfor %}
+{{
+    config(
+        materialized = 'incremental',
+        incremental_strategy = 'insert_overwrite',
+        tags = ["incremental"],
+        partition_by={
+            "field": "session_partition_date",
+            "data_type": "date",
+            "granularity": "day"
+        },
+        partitions = partitions_to_replace
+    )
+}}
+{# partitions_to_replace holds current_date plus the previous static_incremental_days days (default 3, cast to int so a string-valued var still works in range()); the same list drives the config above and the incremental filter below. #}
+-- Picks, for every session_partition_key, the first non-direct source/medium/campaign/content/term seen in that partition.
+with session_events as (
+    select
+        client_key
+        ,session_partition_key
+        ,event_date_dt as session_partition_date
+        ,event_timestamp
+        ,events.event_source
+        ,event_medium
+        ,event_campaign
+        ,event_content
+        ,event_term
+        ,source_category
+    from {{ref('stg_ga4__events')}} events
+    left join {{ref('ga4_source_categories')}} source_categories on events.event_source = source_categories.source
+    where session_partition_key is not null
+    and event_name != 'session_start' -- session_start and first_visit are skipped: per this model's yml description they carry no acquisition data
+    and event_name != 'first_visit'
+    {% if is_incremental() %}
+        and event_date_dt in ({{ partitions_to_replace | join(',') }})
+    {% endif %}
+
+    ),
+set_default_channel_grouping as (
+    select
+        *
+        ,{{ga4.default_channel_grouping('event_source','event_medium','source_category', 'event_campaign')}} as default_channel_grouping
+    from session_events
+),
+first_session_source as (
+    select
+        client_key
+        ,session_partition_key
+        ,session_partition_date
+        ,event_timestamp
+        ,COALESCE(FIRST_VALUE((CASE WHEN event_source <> '(direct)' THEN event_source END) IGNORE NULLS) OVER (session_window), '(direct)') AS session_source
+        ,COALESCE(FIRST_VALUE((CASE WHEN event_source <> '(direct)' THEN COALESCE(event_medium, '(none)') END) IGNORE NULLS) OVER (session_window), '(none)') AS session_medium
+        ,COALESCE(FIRST_VALUE((CASE WHEN event_source <> '(direct)' THEN COALESCE(source_category, '(none)') END) IGNORE NULLS) OVER (session_window), '(none)') AS session_source_category
+        ,COALESCE(FIRST_VALUE((CASE WHEN event_source <> '(direct)' THEN COALESCE(event_campaign, '(none)') END) IGNORE NULLS) OVER (session_window), '(none)') AS session_campaign
+        ,COALESCE(FIRST_VALUE((CASE WHEN event_source <> '(direct)' THEN COALESCE(event_content, '(none)') END) IGNORE NULLS) OVER (session_window), '(none)') AS session_content
+        ,COALESCE(FIRST_VALUE((CASE WHEN event_source <> '(direct)' THEN COALESCE(event_term, '(none)') END) IGNORE NULLS) OVER (session_window), '(none)') AS session_term
+        ,COALESCE(FIRST_VALUE((CASE WHEN event_source <> '(direct)' THEN COALESCE(default_channel_grouping, 'Direct') END) IGNORE NULLS) OVER (session_window), 'Direct') AS session_default_channel_grouping
+    from set_default_channel_grouping
+    WINDOW session_window AS (PARTITION BY session_partition_key ORDER BY event_timestamp ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
+),
+find_non_direct_session_partition_key as (
+
+    select
+        *
+        ,if(session_source <> '(direct)', session_partition_key, null) as non_direct_session_partition_key --provide the session_partition_key only if source is not direct. Useful for last non-direct attribution modeling
+    from first_session_source
+)
+-- The window values above are identical for every event of a session_partition_key, so grouping collapses the event rows and keeps the earliest event_timestamp.
+select
+    client_key
+    ,session_partition_key
+    ,session_partition_date
+    ,session_source
+    ,session_medium
+    ,session_source_category
+    ,session_campaign
+    ,session_content
+    ,session_term
+    ,session_default_channel_grouping
+    ,non_direct_session_partition_key
+    ,min(event_timestamp) as session_partition_timestamp
+from find_non_direct_session_partition_key
+group by 1,2,3,4,5,6,7,8,9,10,11
\ No newline at end of file
diff --git a/models/staging/stg_ga4__sessions_traffic_sources_daily.yml b/models/staging/stg_ga4__sessions_traffic_sources_daily.yml new file mode 100644 index 00000000..7c658c45 --- /dev/null +++ b/models/staging/stg_ga4__sessions_traffic_sources_daily.yml @@ -0,0 +1,17 @@ +version: 2 + +models: + - name: stg_ga4__sessions_traffic_sources_daily + description: > + Incremental model that finds the acquisition source of each session day partition. + Uses the first non-null source value as the basis for selecting the event that will be used to assign source, medium, campaign, content, and term values. + The session_start and first_visit events are ignored for this purpose as they contain no acquisition data. + Aggregated by session_partition_key.
+ columns: + - name: session_partition_key + tests: + - unique + - name: session_source + description: First non-null source value of the session + tests: + - not_null \ No newline at end of file diff --git a/models/staging/stg_ga4__sessions_traffic_sources_last_non_direct_daily.sql b/models/staging/stg_ga4__sessions_traffic_sources_last_non_direct_daily.sql new file mode 100644 index 00000000..5c7fc69f --- /dev/null +++ b/models/staging/stg_ga4__sessions_traffic_sources_last_non_direct_daily.sql @@ -0,0 +1,80 @@
+{% set partitions_to_replace = ['current_date'] %}
+{% for i in range(var('static_incremental_days', 3) | int) %}
+    {% set partitions_to_replace = partitions_to_replace.append('date_sub(current_date, interval ' + (i+1)|string + ' day)') %}
+{% endfor %}
+{{
+    config(
+        materialized = 'incremental',
+        incremental_strategy = 'insert_overwrite',
+        tags = ["incremental"],
+        partition_by={
+            "field": "session_partition_date",
+            "data_type": "date",
+            "granularity": "day"
+        },
+        partitions = partitions_to_replace
+    )
+}}
+-- For each session partition, finds the most recent non-direct session partition of the same client_key inside the lookback window and attaches its traffic source columns.
+with last_non_direct_session_partition_key as (
+    select
+        client_key
+        ,session_partition_key
+        ,session_partition_date
+        ,session_source
+        ,session_medium
+        ,session_source_category
+        ,session_campaign
+        ,session_content
+        ,session_term
+        ,session_default_channel_grouping
+        ,non_direct_session_partition_key
+        ,CASE
+            WHEN non_direct_session_partition_key is null
+            THEN
+                last_value(non_direct_session_partition_key ignore nulls) over(
+                    partition by client_key
+                    order by
+                        session_partition_timestamp range between {{ (var('session_attribution_lookback_window_days', 30) | int) * 24 * 60 * 60 * 1000000 }} preceding
+                        and current row -- lookback window in microseconds (days * 24h * 60m * 60s * 1e6)
+                )
+            ELSE non_direct_session_partition_key
+        END as session_partition_key_last_non_direct,
+    from
+        {{ref('stg_ga4__sessions_traffic_sources_daily')}}
+    {% if is_incremental() %}
+        -- Read static_incremental_days plus the attribution lookback window so refreshed partitions can still see older non-direct sessions
+        where session_partition_date >= date_sub(current_date, interval ({{ (var('static_incremental_days', 3) | int) + (var('session_attribution_lookback_window_days', 30) | int) }} ) day)
+    {% endif %}
+)
+,join_last_non_direct_session_source as (
+    select
+        last_non_direct_session_partition_key.client_key
+        ,last_non_direct_session_partition_key.session_partition_key
+        ,last_non_direct_session_partition_key.session_partition_date
+        ,last_non_direct_session_partition_key.session_source
+        ,last_non_direct_session_partition_key.session_medium
+        ,last_non_direct_session_partition_key.session_source_category
+        ,last_non_direct_session_partition_key.session_campaign
+        ,last_non_direct_session_partition_key.session_content
+        ,last_non_direct_session_partition_key.session_term
+        ,last_non_direct_session_partition_key.session_default_channel_grouping
+        ,last_non_direct_session_partition_key.session_partition_key_last_non_direct
+        ,coalesce(last_non_direct_source.session_source, '(direct)') as last_non_direct_source -- Value will be null if only direct sessions are within the lookback window
+        ,coalesce(last_non_direct_source.session_medium, '(none)') as last_non_direct_medium
+        ,coalesce(last_non_direct_source.session_source_category, '(none)') as last_non_direct_source_category
+        ,coalesce(last_non_direct_source.session_campaign, '(none)') as last_non_direct_campaign
+        ,coalesce(last_non_direct_source.session_content, '(none)') as last_non_direct_content
+        ,coalesce(last_non_direct_source.session_term, '(none)') as last_non_direct_term
+        ,coalesce(last_non_direct_source.session_default_channel_grouping, 'Direct') as last_non_direct_default_channel_grouping
+    from last_non_direct_session_partition_key
+    left join {{ref('stg_ga4__sessions_traffic_sources_daily')}} last_non_direct_source on
+        last_non_direct_session_partition_key.session_partition_key_last_non_direct = last_non_direct_source.session_partition_key
+    {% if is_incremental() %}
+        -- Only keep the records in the partitions we wish to replace (as opposed to the whole lookback window)
+        where last_non_direct_session_partition_key.session_partition_date in ({{ partitions_to_replace | join(',') }})
+    {% endif %}
+)
+
+select * from join_last_non_direct_session_source
+
diff --git a/models/staging/stg_ga4__sessions_traffic_sources_last_non_direct_daily.yml b/models/staging/stg_ga4__sessions_traffic_sources_last_non_direct_daily.yml new file mode 100644 index 00000000..0b34832b --- /dev/null +++ b/models/staging/stg_ga4__sessions_traffic_sources_last_non_direct_daily.yml @@ -0,0 +1,24 @@ +version: 2 + +models: + - name: stg_ga4__sessions_traffic_sources_last_non_direct_daily + description: > + This model associates the last non-direct traffic acquisition data with the current session. + It does this by scanning backwards 30 days from the session date (aka. the lookback window) and looks for the recent non-direct traffic source. + This model is incremental, partitioned on session_partition_date, and unique on session_partition_key. + columns: + - name: session_partition_key + tests: + - unique + - name: session_source + description: First non-null source value of the session + tests: + - not_null + - name: last_non_direct_source + description: The most recent non-direct traffic source within a 30-day lookback window. + tests: + - not_null + - name: last_non_direct_default_channel_grouping + description: The most recent non-direct channel grouping within a 30-day lookback window.
+ tests: + - not_null \ No newline at end of file diff --git a/models/staging/stg_ga4__user_id_mapping.sql b/models/staging/stg_ga4__user_id_mapping.sql new file mode 100644 index 00000000..75786898 --- /dev/null +++ b/models/staging/stg_ga4__user_id_mapping.sql @@ -0,0 +1,29 @@
+-- Latest user_id observed for each client_key.
+with identified_events as (
+    select
+        client_key,
+        user_id,
+        event_timestamp
+    from {{ ref('stg_ga4__events') }}
+    where client_key is not null
+        and user_id is not null
+),
+last_seen_per_pair as (
+    -- Most recent event timestamp for every (user_id, client_key) pair
+    select
+        user_id,
+        client_key,
+        max(event_timestamp) as last_seen_user_id_timestamp
+    from identified_events
+    group by user_id, client_key
+),
+latest_mapping as (
+    select
+        user_id as last_seen_user_id,
+        client_key,
+        last_seen_user_id_timestamp
+    from last_seen_per_pair
+    -- Keep only the newest pair per client_key
+    qualify row_number() over (partition by client_key order by last_seen_user_id_timestamp desc) = 1
+)
+select * from latest_mapping
diff --git a/models/staging/stg_ga4__user_id_mapping.yml b/models/staging/stg_ga4__user_id_mapping.yml new file mode 100644 index 00000000..bdca0579 --- /dev/null +++ b/models/staging/stg_ga4__user_id_mapping.yml @@ -0,0 +1,12 @@ +version: 2 + +models: + - name: stg_ga4__user_id_mapping + description: Mapping table that contains the latest association between client_key and user_id.
Unique on client_key + columns: + - name: client_key + description: Hashed combination of user_pseudo_id and stream_id + tests: + - not_null + - unique + diff --git a/models/staging/stg_ga4__user_properties.sql b/models/staging/stg_ga4__user_properties.sql new file mode 100644 index 00000000..e5b5059d --- /dev/null +++ b/models/staging/stg_ga4__user_properties.sql @@ -0,0 +1,89 @@
+{{
+    config(
+        enabled = true if var('user_properties', false) else false,
+        materialized = "incremental",
+        incremental_strategy = 'merge',
+        unique_key = ['client_key'],
+        tags = ["incremental"],
+        partition_by={
+            "field": "last_updated",
+            "data_type": "timestamp",
+            "granularity": "day"
+        },
+) }}
+
+-- Remove null client_key (users with privacy enabled)
+with events_from_valid_users as (
+    select * from {{ref('stg_ga4__events')}}
+    where client_key is not null
+    {% if is_incremental() %}
+        and event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day)
+    {% endif %}
+),
+unnest_user_properties as
+(
+    select
+        client_key,
+        event_timestamp
+        {% for up in var('user_properties', []) %}
+            ,{{ ga4.unnest_key('user_properties', up.user_property_name , up.value_type ) }}
+        {% endfor %}
+    from events_from_valid_users
+)
+-- create 1 CTE per user property
+{% for up in var('user_properties', []) %}
+,non_null_{{up.user_property_name}} as
+(
+    select
+        client_key,
+        event_timestamp,
+        {{up.user_property_name}}
+    from unnest_user_properties
+    where
+        {{up.user_property_name}} is not null
+),
+last_value_{{up.user_property_name}} as
+(
+    select
+        client_key,
+        LAST_VALUE({{ up.user_property_name }}) OVER (PARTITION BY client_key ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS {{up.user_property_name}},
+        LAST_VALUE(event_timestamp) OVER (PARTITION BY client_key ORDER BY event_timestamp ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last_updated
+    from non_null_{{up.user_property_name}}
+),
+last_value_{{up.user_property_name}}_grouped as
+(
+    select
+        client_key,
+        {{up.user_property_name}},
+        max(last_updated) as last_updated
+    from last_value_{{up.user_property_name}}
+    group by client_key, {{up.user_property_name}}
+)
+{% endfor %}
+,
+client_keys as
+(
+    select distinct
+        client_key
+    from events_from_valid_users
+),
+join_properties as
+(
+    select
+        client_key
+        {% for up in var('user_properties', []) %}
+        ,last_value_{{up.user_property_name}}_grouped.{{up.user_property_name}}
+        {% endfor %}
+        ,timestamp_micros(nullif(greatest( -- GREATEST is NULL when any argument is NULL, so a missing property counts as 0 and an all-missing result is turned back into NULL
+            {% for up in var('user_properties', []) %}
+                coalesce(last_value_{{up.user_property_name}}_grouped.last_updated, 0) {{"," if not loop.last}}
+            {% endfor %}
+        ), 0)) as last_updated
+    from client_keys
+    {% for up in var('user_properties', []) %}
+    left join last_value_{{up.user_property_name}}_grouped using (client_key)
+    {% endfor %}
+)
+-- One row per client_key: latest non-null value of each configured user property plus the newest timestamp among them.
+
+select distinct * from join_properties
diff --git a/models/staging/stg_ga4__user_properties.yml b/models/staging/stg_ga4__user_properties.yml new file mode 100644 index 00000000..fb652c86 --- /dev/null +++ b/models/staging/stg_ga4__user_properties.yml @@ -0,0 +1,10 @@ +version: 2 + +models: + - name: stg_ga4__user_properties + description: Contains unnested user_properties. Uses the user_properties dbt variable to determine which properties to unnest.
+ columns: + - name: client_key + description: Hashed combination of user_pseudo_id and stream_id + tests: + - unique \ No newline at end of file diff --git a/packages.yml b/packages.yml new file mode 100644 index 00000000..cd6b686d --- /dev/null +++ b/packages.yml @@ -0,0 +1,3 @@ +packages: + - package: dbt-labs/dbt_utils + version: [">=1.0.0", "<2.0.0"] diff --git a/seeds/.gitkeep b/seeds/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/seeds/ga4_source_categories.csv b/seeds/ga4_source_categories.csv new file mode 100644 index 00000000..bb4a7fee --- /dev/null +++ b/seeds/ga4_source_categories.csv @@ -0,0 +1,820 @@ +source,source_category +360.cn,SOURCE_CATEGORY_SEARCH +43things,SOURCE_CATEGORY_SOCIAL +43things.com,SOURCE_CATEGORY_SOCIAL +51.com,SOURCE_CATEGORY_SOCIAL +5ch.net,SOURCE_CATEGORY_SOCIAL +Google Shopping,SOURCE_CATEGORY_SHOPPING +Hatena,SOURCE_CATEGORY_SOCIAL +IGShopping,SOURCE_CATEGORY_SHOPPING +ImageShack,SOURCE_CATEGORY_SOCIAL +aax-us-east.amazon-adsystem.com,SOURCE_CATEGORY_SHOPPING +aax.amazon-adsystem.com,SOURCE_CATEGORY_SHOPPING +academia.edu,SOURCE_CATEGORY_SOCIAL +activerain,SOURCE_CATEGORY_SOCIAL +activerain.com,SOURCE_CATEGORY_SOCIAL +activeworlds,SOURCE_CATEGORY_SOCIAL +activeworlds.com,SOURCE_CATEGORY_SOCIAL +addthis,SOURCE_CATEGORY_SOCIAL +addthis.com,SOURCE_CATEGORY_SOCIAL +airg.ca,SOURCE_CATEGORY_SOCIAL +alibaba,SOURCE_CATEGORY_SHOPPING +alibaba.com,SOURCE_CATEGORY_SHOPPING +alice,SOURCE_CATEGORY_SEARCH +allnurses.com,SOURCE_CATEGORY_SOCIAL +allrecipes.com,SOURCE_CATEGORY_SOCIAL +alumniclass,SOURCE_CATEGORY_SOCIAL +alumniclass.com,SOURCE_CATEGORY_SOCIAL +amazon,SOURCE_CATEGORY_SHOPPING +amazon.co.uk,SOURCE_CATEGORY_SHOPPING +amazon.com,SOURCE_CATEGORY_SHOPPING +ameba.jp,SOURCE_CATEGORY_SOCIAL +ameblo.jp,SOURCE_CATEGORY_SOCIAL +americantowns,SOURCE_CATEGORY_SOCIAL +americantowns.com,SOURCE_CATEGORY_SOCIAL +amp.reddit.com,SOURCE_CATEGORY_SOCIAL +ancestry.com,SOURCE_CATEGORY_SOCIAL +anobii,SOURCE_CATEGORY_SOCIAL 
+anobii.com,SOURCE_CATEGORY_SOCIAL +answerbag,SOURCE_CATEGORY_SOCIAL +answerbag.com,SOURCE_CATEGORY_SOCIAL +answers.yahoo.com,SOURCE_CATEGORY_SOCIAL +aol,SOURCE_CATEGORY_SEARCH +aolanswers,SOURCE_CATEGORY_SOCIAL +aolanswers.com,SOURCE_CATEGORY_SOCIAL +apps.facebook.com,SOURCE_CATEGORY_SOCIAL +apps.shopify.com,SOURCE_CATEGORY_SHOPPING +ar.pinterest.com,SOURCE_CATEGORY_SOCIAL +ar.search.yahoo.com,SOURCE_CATEGORY_SEARCH +artstation.com,SOURCE_CATEGORY_SOCIAL +ask,SOURCE_CATEGORY_SEARCH +askubuntu,SOURCE_CATEGORY_SOCIAL +askubuntu.com,SOURCE_CATEGORY_SOCIAL +asmallworld.com,SOURCE_CATEGORY_SOCIAL +at.search.yahoo.com,SOURCE_CATEGORY_SEARCH +athlinks,SOURCE_CATEGORY_SOCIAL +athlinks.com,SOURCE_CATEGORY_SOCIAL +au.search.yahoo.com,SOURCE_CATEGORY_SEARCH +auone,SOURCE_CATEGORY_SEARCH +avg,SOURCE_CATEGORY_SEARCH +away.vk.com,SOURCE_CATEGORY_SOCIAL +awe.sm,SOURCE_CATEGORY_SOCIAL +b.hatena.ne.jp,SOURCE_CATEGORY_SOCIAL +baby-gaga,SOURCE_CATEGORY_SOCIAL +baby-gaga.com,SOURCE_CATEGORY_SOCIAL +babyblog.ru,SOURCE_CATEGORY_SOCIAL +babylon,SOURCE_CATEGORY_SEARCH +badoo,SOURCE_CATEGORY_SOCIAL +badoo.com,SOURCE_CATEGORY_SOCIAL +baidu,SOURCE_CATEGORY_SEARCH +bebo,SOURCE_CATEGORY_SOCIAL +bebo.com,SOURCE_CATEGORY_SOCIAL +beforeitsnews,SOURCE_CATEGORY_SOCIAL +beforeitsnews.com,SOURCE_CATEGORY_SOCIAL +bharatstudent,SOURCE_CATEGORY_SOCIAL +bharatstudent.com,SOURCE_CATEGORY_SOCIAL +biglobe,SOURCE_CATEGORY_SEARCH +biglobe.co.jp,SOURCE_CATEGORY_SEARCH +biglobe.ne.jp,SOURCE_CATEGORY_SEARCH +biip.no,SOURCE_CATEGORY_SOCIAL +bing,SOURCE_CATEGORY_SEARCH +biswap.org,SOURCE_CATEGORY_SOCIAL +bit.ly,SOURCE_CATEGORY_SOCIAL +blackcareernetwork.com,SOURCE_CATEGORY_SOCIAL +blackplanet,SOURCE_CATEGORY_SOCIAL +blackplanet.com,SOURCE_CATEGORY_SOCIAL +blip.fm,SOURCE_CATEGORY_SOCIAL +blog.com,SOURCE_CATEGORY_SOCIAL +blog.feedspot.com,SOURCE_CATEGORY_SOCIAL +blog.goo.ne.jp,SOURCE_CATEGORY_SOCIAL +blog.naver.com,SOURCE_CATEGORY_SOCIAL +blog.twitch.tv,SOURCE_CATEGORY_VIDEO +blog.yahoo.co.jp,SOURCE_CATEGORY_SOCIAL 
+blogg.no,SOURCE_CATEGORY_SOCIAL +bloggang.com,SOURCE_CATEGORY_SOCIAL +blogger,SOURCE_CATEGORY_SOCIAL +blogger.com,SOURCE_CATEGORY_SOCIAL +blogher,SOURCE_CATEGORY_SOCIAL +blogher.com,SOURCE_CATEGORY_SOCIAL +bloglines,SOURCE_CATEGORY_SOCIAL +bloglines.com,SOURCE_CATEGORY_SOCIAL +blogs.com,SOURCE_CATEGORY_SOCIAL +blogsome,SOURCE_CATEGORY_SOCIAL +blogsome.com,SOURCE_CATEGORY_SOCIAL +blogspot,SOURCE_CATEGORY_SOCIAL +blogspot.com,SOURCE_CATEGORY_SOCIAL +blogster,SOURCE_CATEGORY_SOCIAL +blogster.com,SOURCE_CATEGORY_SOCIAL +blurtit,SOURCE_CATEGORY_SOCIAL +blurtit.com,SOURCE_CATEGORY_SOCIAL +bookmarks.yahoo.co.jp,SOURCE_CATEGORY_SOCIAL +bookmarks.yahoo.com,SOURCE_CATEGORY_SOCIAL +br.pinterest.com,SOURCE_CATEGORY_SOCIAL +br.search.yahoo.com,SOURCE_CATEGORY_SEARCH +brightkite,SOURCE_CATEGORY_SOCIAL +brightkite.com,SOURCE_CATEGORY_SOCIAL +brizzly,SOURCE_CATEGORY_SOCIAL +brizzly.com,SOURCE_CATEGORY_SOCIAL +business.facebook.com,SOURCE_CATEGORY_SOCIAL +buzzfeed,SOURCE_CATEGORY_SOCIAL +buzzfeed.com,SOURCE_CATEGORY_SOCIAL +buzznet,SOURCE_CATEGORY_SOCIAL +buzznet.com,SOURCE_CATEGORY_SOCIAL +ca.search.yahoo.com,SOURCE_CATEGORY_SEARCH +cafe.naver.com,SOURCE_CATEGORY_SOCIAL +cafemom,SOURCE_CATEGORY_SOCIAL +cafemom.com,SOURCE_CATEGORY_SOCIAL +camospace,SOURCE_CATEGORY_SOCIAL +camospace.com,SOURCE_CATEGORY_SOCIAL +canalblog.com,SOURCE_CATEGORY_SOCIAL +care.com,SOURCE_CATEGORY_SOCIAL +care2,SOURCE_CATEGORY_SOCIAL +care2.com,SOURCE_CATEGORY_SOCIAL +caringbridge.org,SOURCE_CATEGORY_SOCIAL +catster,SOURCE_CATEGORY_SOCIAL +catster.com,SOURCE_CATEGORY_SOCIAL +cbnt.io,SOURCE_CATEGORY_SOCIAL +cellufun,SOURCE_CATEGORY_SOCIAL +cellufun.com,SOURCE_CATEGORY_SOCIAL +centerblog.net,SOURCE_CATEGORY_SOCIAL +centrum.cz,SOURCE_CATEGORY_SEARCH +ch.search.yahoo.com,SOURCE_CATEGORY_SEARCH +chat.zalo.me,SOURCE_CATEGORY_SOCIAL +checkout.shopify.com,SOURCE_CATEGORY_SHOPPING +checkout.stripe.com,SOURCE_CATEGORY_SHOPPING +chegg.com,SOURCE_CATEGORY_SOCIAL +chicagonow,SOURCE_CATEGORY_SOCIAL 
+chicagonow.com,SOURCE_CATEGORY_SOCIAL +chiebukuro.yahoo.co.jp,SOURCE_CATEGORY_SOCIAL +cl.search.yahoo.com,SOURCE_CATEGORY_SEARCH +classmates,SOURCE_CATEGORY_SOCIAL +classmates.com,SOURCE_CATEGORY_SOCIAL +classquest,SOURCE_CATEGORY_SOCIAL +classquest.com,SOURCE_CATEGORY_SOCIAL +cn.bing.com,SOURCE_CATEGORY_SEARCH +cnn,SOURCE_CATEGORY_SEARCH +co.pinterest.com,SOURCE_CATEGORY_SOCIAL +co.search.yahoo.com,SOURCE_CATEGORY_SEARCH +cocolog-nifty,SOURCE_CATEGORY_SOCIAL +cocolog-nifty.com,SOURCE_CATEGORY_SOCIAL +comcast,SOURCE_CATEGORY_SEARCH +conduit,SOURCE_CATEGORY_SEARCH +copainsdavant.linternaute.com,SOURCE_CATEGORY_SOCIAL +couchsurfing.org,SOURCE_CATEGORY_SOCIAL +cozycot,SOURCE_CATEGORY_SOCIAL +cozycot.com,SOURCE_CATEGORY_SOCIAL +cr.shopping.naver.com,SOURCE_CATEGORY_SHOPPING +cr2.shopping.naver.com,SOURCE_CATEGORY_SHOPPING +crackle,SOURCE_CATEGORY_VIDEO +crackle.com,SOURCE_CATEGORY_VIDEO +cross.tv,SOURCE_CATEGORY_SOCIAL +crunchyroll,SOURCE_CATEGORY_SOCIAL +crunchyroll.com,SOURCE_CATEGORY_SOCIAL +curiositystream,SOURCE_CATEGORY_VIDEO +curiositystream.com,SOURCE_CATEGORY_VIDEO +cyworld,SOURCE_CATEGORY_SOCIAL +cyworld.com,SOURCE_CATEGORY_SOCIAL +cz.pinterest.com,SOURCE_CATEGORY_SOCIAL +d.hatena.ne.jp,SOURCE_CATEGORY_SOCIAL +d.tube,SOURCE_CATEGORY_VIDEO +dailymotion,SOURCE_CATEGORY_VIDEO +dailymotion.com,SOURCE_CATEGORY_VIDEO +dailystrength.org,SOURCE_CATEGORY_SOCIAL +dashboard.twitch.tv,SOURCE_CATEGORY_VIDEO +daum,SOURCE_CATEGORY_SEARCH +daum.net,SOURCE_CATEGORY_SEARCH +de.search.yahoo.com,SOURCE_CATEGORY_SEARCH +deluxe.com,SOURCE_CATEGORY_SOCIAL +deviantart,SOURCE_CATEGORY_SOCIAL +deviantart.com,SOURCE_CATEGORY_SOCIAL +dianping,SOURCE_CATEGORY_SOCIAL +dianping.com,SOURCE_CATEGORY_SOCIAL +digg,SOURCE_CATEGORY_SOCIAL +digg.com,SOURCE_CATEGORY_SOCIAL +diigo,SOURCE_CATEGORY_SOCIAL +diigo.com,SOURCE_CATEGORY_SOCIAL +discover.hubpages.com,SOURCE_CATEGORY_SOCIAL +disneyplus,SOURCE_CATEGORY_VIDEO +disneyplus.com,SOURCE_CATEGORY_VIDEO +disqus,SOURCE_CATEGORY_SOCIAL 
+disqus.com,SOURCE_CATEGORY_SOCIAL +dk.search.yahoo.com,SOURCE_CATEGORY_SEARCH +dogpile,SOURCE_CATEGORY_SEARCH +dogpile.com,SOURCE_CATEGORY_SEARCH +dogster,SOURCE_CATEGORY_SOCIAL +dogster.com,SOURCE_CATEGORY_SOCIAL +dol2day,SOURCE_CATEGORY_SOCIAL +dol2day.com,SOURCE_CATEGORY_SOCIAL +doostang,SOURCE_CATEGORY_SOCIAL +doostang.com,SOURCE_CATEGORY_SOCIAL +dopplr,SOURCE_CATEGORY_SOCIAL +dopplr.com,SOURCE_CATEGORY_SOCIAL +douban,SOURCE_CATEGORY_SOCIAL +douban.com,SOURCE_CATEGORY_SOCIAL +draft.blogger.com,SOURCE_CATEGORY_SOCIAL +draugiem.lv,SOURCE_CATEGORY_SOCIAL +drugs-forum,SOURCE_CATEGORY_SOCIAL +drugs-forum.com,SOURCE_CATEGORY_SOCIAL +duckduckgo,SOURCE_CATEGORY_SEARCH +dzone,SOURCE_CATEGORY_SOCIAL +dzone.com,SOURCE_CATEGORY_SOCIAL +ebay,SOURCE_CATEGORY_SHOPPING +ebay.co.uk,SOURCE_CATEGORY_SHOPPING +ebay.com,SOURCE_CATEGORY_SHOPPING +ebay.com.au,SOURCE_CATEGORY_SHOPPING +ebay.de,SOURCE_CATEGORY_SHOPPING +ecosia.org,SOURCE_CATEGORY_SEARCH +edublogs.org,SOURCE_CATEGORY_SOCIAL +elftown,SOURCE_CATEGORY_SOCIAL +elftown.com,SOURCE_CATEGORY_SOCIAL +email.seznam.cz,SOURCE_CATEGORY_SEARCH +eniro,SOURCE_CATEGORY_SEARCH +epicurious.com,SOURCE_CATEGORY_SOCIAL +es.search.yahoo.com,SOURCE_CATEGORY_SEARCH +espanol.search.yahoo.com,SOURCE_CATEGORY_SEARCH +etsy,SOURCE_CATEGORY_SHOPPING +etsy.com,SOURCE_CATEGORY_SHOPPING +everforo.com,SOURCE_CATEGORY_SOCIAL +exalead.com,SOURCE_CATEGORY_SEARCH +exblog.jp,SOURCE_CATEGORY_SOCIAL +excite.com,SOURCE_CATEGORY_SEARCH +extole,SOURCE_CATEGORY_SOCIAL +extole.com,SOURCE_CATEGORY_SOCIAL +facebook,SOURCE_CATEGORY_SOCIAL +facebook.com,SOURCE_CATEGORY_SOCIAL +faceparty,SOURCE_CATEGORY_SOCIAL +faceparty.com,SOURCE_CATEGORY_SOCIAL +fandom.com,SOURCE_CATEGORY_SOCIAL +fanpop,SOURCE_CATEGORY_SOCIAL +fanpop.com,SOURCE_CATEGORY_SOCIAL +fark,SOURCE_CATEGORY_SOCIAL +fark.com,SOURCE_CATEGORY_SOCIAL +fast.wistia.net,SOURCE_CATEGORY_VIDEO +fb,SOURCE_CATEGORY_SOCIAL +fb.me,SOURCE_CATEGORY_SOCIAL +fc2,SOURCE_CATEGORY_SOCIAL +fc2.com,SOURCE_CATEGORY_SOCIAL 
+feedspot,SOURCE_CATEGORY_SOCIAL +feministing,SOURCE_CATEGORY_SOCIAL +feministing.com,SOURCE_CATEGORY_SOCIAL +fi.search.yahoo.com,SOURCE_CATEGORY_SEARCH +filmaffinity,SOURCE_CATEGORY_SOCIAL +filmaffinity.com,SOURCE_CATEGORY_SOCIAL +firmy.cz,SOURCE_CATEGORY_SEARCH +flickr,SOURCE_CATEGORY_SOCIAL +flickr.com,SOURCE_CATEGORY_SOCIAL +flipboard,SOURCE_CATEGORY_SOCIAL +flipboard.com,SOURCE_CATEGORY_SOCIAL +folkdirect,SOURCE_CATEGORY_SOCIAL +folkdirect.com,SOURCE_CATEGORY_SOCIAL +foodservice,SOURCE_CATEGORY_SOCIAL +foodservice.com,SOURCE_CATEGORY_SOCIAL +forums.androidcentral.com,SOURCE_CATEGORY_SOCIAL +forums.crackberry.com,SOURCE_CATEGORY_SOCIAL +forums.imore.com,SOURCE_CATEGORY_SOCIAL +forums.nexopia.com,SOURCE_CATEGORY_SOCIAL +forums.webosnation.com,SOURCE_CATEGORY_SOCIAL +forums.wpcentral.com,SOURCE_CATEGORY_SOCIAL +fotki,SOURCE_CATEGORY_SOCIAL +fotki.com,SOURCE_CATEGORY_SOCIAL +fotolog,SOURCE_CATEGORY_SOCIAL +fotolog.com,SOURCE_CATEGORY_SOCIAL +foursquare,SOURCE_CATEGORY_SOCIAL +foursquare.com,SOURCE_CATEGORY_SOCIAL +fr.search.yahoo.com,SOURCE_CATEGORY_SEARCH +free.facebook.com,SOURCE_CATEGORY_SOCIAL +friendfeed,SOURCE_CATEGORY_SOCIAL +friendfeed.com,SOURCE_CATEGORY_SOCIAL +fruehstueckstreff.org,SOURCE_CATEGORY_SOCIAL +fubar,SOURCE_CATEGORY_SOCIAL +fubar.com,SOURCE_CATEGORY_SOCIAL +gaiaonline,SOURCE_CATEGORY_SOCIAL +gaiaonline.com,SOURCE_CATEGORY_SOCIAL +gamerdna,SOURCE_CATEGORY_SOCIAL +gamerdna.com,SOURCE_CATEGORY_SOCIAL +gather.com,SOURCE_CATEGORY_SOCIAL +geni.com,SOURCE_CATEGORY_SOCIAL +getpocket.com,SOURCE_CATEGORY_SOCIAL +glassboard,SOURCE_CATEGORY_SOCIAL +glassboard.com,SOURCE_CATEGORY_SOCIAL +glassdoor,SOURCE_CATEGORY_SOCIAL +glassdoor.com,SOURCE_CATEGORY_SOCIAL +globo,SOURCE_CATEGORY_SEARCH +go.mail.ru,SOURCE_CATEGORY_SEARCH +godtube,SOURCE_CATEGORY_SOCIAL +godtube.com,SOURCE_CATEGORY_SOCIAL +goldenline.pl,SOURCE_CATEGORY_SOCIAL +goldstar,SOURCE_CATEGORY_SOCIAL +goldstar.com,SOURCE_CATEGORY_SOCIAL +goo.gl,SOURCE_CATEGORY_SOCIAL +gooblog,SOURCE_CATEGORY_SOCIAL 
+goodreads,SOURCE_CATEGORY_SOCIAL +goodreads.com,SOURCE_CATEGORY_SOCIAL +google,SOURCE_CATEGORY_SEARCH +google+,SOURCE_CATEGORY_SOCIAL +google-play,SOURCE_CATEGORY_SEARCH +googlegroups.com,SOURCE_CATEGORY_SOCIAL +googleplus,SOURCE_CATEGORY_SOCIAL +govloop,SOURCE_CATEGORY_SOCIAL +govloop.com,SOURCE_CATEGORY_SOCIAL +gowalla,SOURCE_CATEGORY_SOCIAL +gowalla.com,SOURCE_CATEGORY_SOCIAL +gree.jp,SOURCE_CATEGORY_SOCIAL +groups.google.com,SOURCE_CATEGORY_SOCIAL +gulli.com,SOURCE_CATEGORY_SOCIAL +gutefrage.net,SOURCE_CATEGORY_SOCIAL +habbo,SOURCE_CATEGORY_SOCIAL +habbo.com,SOURCE_CATEGORY_SOCIAL +help.hulu.com,SOURCE_CATEGORY_VIDEO +help.netflix.com,SOURCE_CATEGORY_VIDEO +hi5,SOURCE_CATEGORY_SOCIAL +hi5.com,SOURCE_CATEGORY_SOCIAL +hk.search.yahoo.com,SOURCE_CATEGORY_SEARCH +hootsuite,SOURCE_CATEGORY_SOCIAL +hootsuite.com,SOURCE_CATEGORY_SOCIAL +houzz,SOURCE_CATEGORY_SOCIAL +houzz.com,SOURCE_CATEGORY_SOCIAL +hoverspot,SOURCE_CATEGORY_SOCIAL +hoverspot.com,SOURCE_CATEGORY_SOCIAL +hr.com,SOURCE_CATEGORY_SOCIAL +hu.pinterest.com,SOURCE_CATEGORY_SOCIAL +hubculture,SOURCE_CATEGORY_SOCIAL +hubculture.com,SOURCE_CATEGORY_SOCIAL +hubpages.com,SOURCE_CATEGORY_SOCIAL +hulu,SOURCE_CATEGORY_VIDEO +hulu.com,SOURCE_CATEGORY_VIDEO +hyves.net,SOURCE_CATEGORY_SOCIAL +hyves.nl,SOURCE_CATEGORY_SOCIAL +ibibo,SOURCE_CATEGORY_SOCIAL +ibibo.com,SOURCE_CATEGORY_SOCIAL +id.pinterest.com,SOURCE_CATEGORY_SOCIAL +id.search.yahoo.com,SOURCE_CATEGORY_SEARCH +id.twitch.tv,SOURCE_CATEGORY_VIDEO +identi.ca,SOURCE_CATEGORY_SOCIAL +ig,SOURCE_CATEGORY_SOCIAL +imageshack.com,SOURCE_CATEGORY_SOCIAL +imageshack.us,SOURCE_CATEGORY_SOCIAL +imvu,SOURCE_CATEGORY_SOCIAL +imvu.com,SOURCE_CATEGORY_SOCIAL +in.pinterest.com,SOURCE_CATEGORY_SOCIAL +in.search.yahoo.com,SOURCE_CATEGORY_SEARCH +incredimail,SOURCE_CATEGORY_SEARCH +insanejournal,SOURCE_CATEGORY_SOCIAL +insanejournal.com,SOURCE_CATEGORY_SOCIAL +instagram,SOURCE_CATEGORY_SOCIAL +instagram.com,SOURCE_CATEGORY_SOCIAL +instapaper,SOURCE_CATEGORY_SOCIAL 
+instapaper.com,SOURCE_CATEGORY_SOCIAL +internations.org,SOURCE_CATEGORY_SOCIAL +interpals.net,SOURCE_CATEGORY_SOCIAL +intherooms,SOURCE_CATEGORY_SOCIAL +intherooms.com,SOURCE_CATEGORY_SOCIAL +iq.com,SOURCE_CATEGORY_VIDEO +iqiyi,SOURCE_CATEGORY_VIDEO +iqiyi.com,SOURCE_CATEGORY_VIDEO +irc-galleria.net,SOURCE_CATEGORY_SOCIAL +is.gd,SOURCE_CATEGORY_SOCIAL +it.search.yahoo.com,SOURCE_CATEGORY_SEARCH +italki,SOURCE_CATEGORY_SOCIAL +italki.com,SOURCE_CATEGORY_SOCIAL +jammerdirect,SOURCE_CATEGORY_SOCIAL +jammerdirect.com,SOURCE_CATEGORY_SOCIAL +jappy.com,SOURCE_CATEGORY_SOCIAL +jappy.de,SOURCE_CATEGORY_SOCIAL +jobs.netflix.com,SOURCE_CATEGORY_VIDEO +justin.tv,SOURCE_CATEGORY_VIDEO +kaboodle.com,SOURCE_CATEGORY_SOCIAL +kakao,SOURCE_CATEGORY_SOCIAL +kakao.com,SOURCE_CATEGORY_SOCIAL +kakaocorp.com,SOURCE_CATEGORY_SOCIAL +kaneva,SOURCE_CATEGORY_SOCIAL +kaneva.com,SOURCE_CATEGORY_SOCIAL +kin.naver.com,SOURCE_CATEGORY_SOCIAL +kvasir,SOURCE_CATEGORY_SEARCH +l.facebook.com,SOURCE_CATEGORY_SOCIAL +l.instagram.com,SOURCE_CATEGORY_SOCIAL +l.messenger.com,SOURCE_CATEGORY_SOCIAL +last.fm,SOURCE_CATEGORY_SOCIAL +lens.google.com,SOURCE_CATEGORY_SEARCH +librarything,SOURCE_CATEGORY_SOCIAL +librarything.com,SOURCE_CATEGORY_SOCIAL +lifestream.aol.com,SOURCE_CATEGORY_SOCIAL +line,SOURCE_CATEGORY_SOCIAL +line.me,SOURCE_CATEGORY_SOCIAL +linkedin,SOURCE_CATEGORY_SOCIAL +linkedin.com,SOURCE_CATEGORY_SOCIAL +listal,SOURCE_CATEGORY_SOCIAL +listal.com,SOURCE_CATEGORY_SOCIAL +listography,SOURCE_CATEGORY_SOCIAL +listography.com,SOURCE_CATEGORY_SOCIAL +lite.qwant.com,SOURCE_CATEGORY_SEARCH +livedoor.com,SOURCE_CATEGORY_SOCIAL +livedoorblog,SOURCE_CATEGORY_SOCIAL +livejournal,SOURCE_CATEGORY_SOCIAL +livejournal.com,SOURCE_CATEGORY_SOCIAL +lm.facebook.com,SOURCE_CATEGORY_SOCIAL +lnkd.in,SOURCE_CATEGORY_SOCIAL +lycos,SOURCE_CATEGORY_SEARCH +m.alibaba.com,SOURCE_CATEGORY_SHOPPING +m.baidu.com,SOURCE_CATEGORY_SEARCH +m.blog.naver.com,SOURCE_CATEGORY_SOCIAL +m.cafe.naver.com,SOURCE_CATEGORY_SOCIAL 
+m.facebook.com,SOURCE_CATEGORY_SOCIAL +m.kin.naver.com,SOURCE_CATEGORY_SOCIAL +m.naver.com,SOURCE_CATEGORY_SEARCH +m.search.naver.com,SOURCE_CATEGORY_SEARCH +m.shopping.naver.com,SOURCE_CATEGORY_SHOPPING +m.sogou.com,SOURCE_CATEGORY_SEARCH +m.twitch.tv,SOURCE_CATEGORY_VIDEO +m.vk.com,SOURCE_CATEGORY_SOCIAL +m.yelp.com,SOURCE_CATEGORY_SOCIAL +m.youtube.com,SOURCE_CATEGORY_VIDEO +mail.rambler.ru,SOURCE_CATEGORY_SEARCH +mail.yandex.ru,SOURCE_CATEGORY_SEARCH +malaysia.search.yahoo.com,SOURCE_CATEGORY_SEARCH +mbga.jp,SOURCE_CATEGORY_SOCIAL +medium.com,SOURCE_CATEGORY_SOCIAL +meetin.org,SOURCE_CATEGORY_SOCIAL +meetup,SOURCE_CATEGORY_SOCIAL +meetup.com,SOURCE_CATEGORY_SOCIAL +meinvz.net,SOURCE_CATEGORY_SOCIAL +meneame.net,SOURCE_CATEGORY_SOCIAL +menuism.com,SOURCE_CATEGORY_SOCIAL +mercadolibre,SOURCE_CATEGORY_SHOPPING +mercadolibre.com,SOURCE_CATEGORY_SHOPPING +mercadolibre.com.ar,SOURCE_CATEGORY_SHOPPING +mercadolibre.com.mx,SOURCE_CATEGORY_SHOPPING +message.alibaba.com,SOURCE_CATEGORY_SHOPPING +messages.google.com,SOURCE_CATEGORY_SOCIAL +messages.yahoo.co.jp,SOURCE_CATEGORY_SOCIAL +messenger,SOURCE_CATEGORY_SOCIAL +messenger.com,SOURCE_CATEGORY_SOCIAL +mix.com,SOURCE_CATEGORY_SOCIAL +mixi.jp,SOURCE_CATEGORY_SOCIAL +mobile.facebook.com,SOURCE_CATEGORY_SOCIAL +mocospace,SOURCE_CATEGORY_SOCIAL +mocospace.com,SOURCE_CATEGORY_SOCIAL +mouthshut,SOURCE_CATEGORY_SOCIAL +mouthshut.com,SOURCE_CATEGORY_SOCIAL +movabletype,SOURCE_CATEGORY_SOCIAL +movabletype.com,SOURCE_CATEGORY_SOCIAL +msearch.shopping.naver.com,SOURCE_CATEGORY_SHOPPING +msn,SOURCE_CATEGORY_SEARCH +msn.com,SOURCE_CATEGORY_SEARCH +mubi,SOURCE_CATEGORY_SOCIAL +mubi.com,SOURCE_CATEGORY_SOCIAL +music.youtube.com,SOURCE_CATEGORY_VIDEO +mx.search.yahoo.com,SOURCE_CATEGORY_SEARCH +my.opera.com,SOURCE_CATEGORY_SOCIAL +myanimelist.net,SOURCE_CATEGORY_SOCIAL +myheritage,SOURCE_CATEGORY_SOCIAL +myheritage.com,SOURCE_CATEGORY_SOCIAL +mylife,SOURCE_CATEGORY_SOCIAL +mylife.com,SOURCE_CATEGORY_SOCIAL 
+mymodernmet,SOURCE_CATEGORY_SOCIAL +mymodernmet.com,SOURCE_CATEGORY_SOCIAL +myspace,SOURCE_CATEGORY_SOCIAL +myspace.com,SOURCE_CATEGORY_SOCIAL +najdi,SOURCE_CATEGORY_SEARCH +naver,SOURCE_CATEGORY_SEARCH +naver.com,SOURCE_CATEGORY_SEARCH +netflix,SOURCE_CATEGORY_VIDEO +netflix.com,SOURCE_CATEGORY_VIDEO +netvibes,SOURCE_CATEGORY_SOCIAL +netvibes.com,SOURCE_CATEGORY_SOCIAL +news.google.com,SOURCE_CATEGORY_SEARCH +news.ycombinator.com,SOURCE_CATEGORY_SOCIAL +newsshowcase,SOURCE_CATEGORY_SOCIAL +nexopia,SOURCE_CATEGORY_SOCIAL +ngopost.org,SOURCE_CATEGORY_SOCIAL +niconico,SOURCE_CATEGORY_SOCIAL +nicovideo.jp,SOURCE_CATEGORY_SOCIAL +nightlifelink,SOURCE_CATEGORY_SOCIAL +nightlifelink.com,SOURCE_CATEGORY_SOCIAL +ning,SOURCE_CATEGORY_SOCIAL +ning.com,SOURCE_CATEGORY_SOCIAL +nl.pinterest.com,SOURCE_CATEGORY_SOCIAL +nl.search.yahoo.com,SOURCE_CATEGORY_SEARCH +nl.shopping.net,SOURCE_CATEGORY_SHOPPING +no.search.yahoo.com,SOURCE_CATEGORY_SEARCH +no.shopping.net,SOURCE_CATEGORY_SHOPPING +ntp.msn.com,SOURCE_CATEGORY_SEARCH +nz.search.yahoo.com,SOURCE_CATEGORY_SEARCH +odnoklassniki.ru,SOURCE_CATEGORY_SOCIAL +odnoklassniki.ua,SOURCE_CATEGORY_SOCIAL +offer.alibaba.com,SOURCE_CATEGORY_SHOPPING +okwave.jp,SOURCE_CATEGORY_SOCIAL +old.reddit.com,SOURCE_CATEGORY_SOCIAL +one.walmart.com,SOURCE_CATEGORY_SHOPPING +onet,SOURCE_CATEGORY_SEARCH +onet.pl,SOURCE_CATEGORY_SEARCH +oneworldgroup.org,SOURCE_CATEGORY_SOCIAL +onstartups,SOURCE_CATEGORY_SOCIAL +onstartups.com,SOURCE_CATEGORY_SOCIAL +opendiary,SOURCE_CATEGORY_SOCIAL +opendiary.com,SOURCE_CATEGORY_SOCIAL +order.shopping.yahoo.co.jp,SOURCE_CATEGORY_SHOPPING +oshiete.goo.ne.jp,SOURCE_CATEGORY_SOCIAL +out.reddit.com,SOURCE_CATEGORY_SOCIAL +over-blog.com,SOURCE_CATEGORY_SOCIAL +overblog.com,SOURCE_CATEGORY_SOCIAL +paper.li,SOURCE_CATEGORY_SOCIAL +partners.shopify.com,SOURCE_CATEGORY_SHOPPING +partyflock.nl,SOURCE_CATEGORY_SOCIAL +pe.search.yahoo.com,SOURCE_CATEGORY_SEARCH +ph.search.yahoo.com,SOURCE_CATEGORY_SEARCH 
+photobucket,SOURCE_CATEGORY_SOCIAL +photobucket.com,SOURCE_CATEGORY_SOCIAL +pinboard,SOURCE_CATEGORY_SOCIAL +pinboard.in,SOURCE_CATEGORY_SOCIAL +pingsta,SOURCE_CATEGORY_SOCIAL +pingsta.com,SOURCE_CATEGORY_SOCIAL +pinterest,SOURCE_CATEGORY_SOCIAL +pinterest.at,SOURCE_CATEGORY_SOCIAL +pinterest.ca,SOURCE_CATEGORY_SOCIAL +pinterest.ch,SOURCE_CATEGORY_SOCIAL +pinterest.cl,SOURCE_CATEGORY_SOCIAL +pinterest.co.kr,SOURCE_CATEGORY_SOCIAL +pinterest.co.uk,SOURCE_CATEGORY_SOCIAL +pinterest.com,SOURCE_CATEGORY_SOCIAL +pinterest.com.au,SOURCE_CATEGORY_SOCIAL +pinterest.com.mx,SOURCE_CATEGORY_SOCIAL +pinterest.de,SOURCE_CATEGORY_SOCIAL +pinterest.es,SOURCE_CATEGORY_SOCIAL +pinterest.fr,SOURCE_CATEGORY_SOCIAL +pinterest.it,SOURCE_CATEGORY_SOCIAL +pinterest.jp,SOURCE_CATEGORY_SOCIAL +pinterest.nz,SOURCE_CATEGORY_SOCIAL +pinterest.ph,SOURCE_CATEGORY_SOCIAL +pinterest.pt,SOURCE_CATEGORY_SOCIAL +pinterest.ru,SOURCE_CATEGORY_SOCIAL +pinterest.se,SOURCE_CATEGORY_SOCIAL +pixiv.net,SOURCE_CATEGORY_SOCIAL +pl.pinterest.com,SOURCE_CATEGORY_SOCIAL +pl.search.yahoo.com,SOURCE_CATEGORY_SEARCH +play.google.com,SOURCE_CATEGORY_SEARCH +playahead.se,SOURCE_CATEGORY_SOCIAL +player.twitch.tv,SOURCE_CATEGORY_VIDEO +player.vimeo.com,SOURCE_CATEGORY_VIDEO +plurk,SOURCE_CATEGORY_SOCIAL +plurk.com,SOURCE_CATEGORY_SOCIAL +plus.google.com,SOURCE_CATEGORY_SOCIAL +plus.url.google.com,SOURCE_CATEGORY_SOCIAL +pocket.co,SOURCE_CATEGORY_SOCIAL +posterous,SOURCE_CATEGORY_SOCIAL +posterous.com,SOURCE_CATEGORY_SOCIAL +pro.homeadvisor.com,SOURCE_CATEGORY_SOCIAL +pulse.yahoo.com,SOURCE_CATEGORY_SOCIAL +qapacity,SOURCE_CATEGORY_SOCIAL +qapacity.com,SOURCE_CATEGORY_SOCIAL +quechup,SOURCE_CATEGORY_SOCIAL +quechup.com,SOURCE_CATEGORY_SOCIAL +quora,SOURCE_CATEGORY_SOCIAL +quora.com,SOURCE_CATEGORY_SOCIAL +qwant,SOURCE_CATEGORY_SEARCH +qwant.com,SOURCE_CATEGORY_SEARCH +qzone.qq.com,SOURCE_CATEGORY_SOCIAL +rakuten,SOURCE_CATEGORY_SEARCH +rakuten.co.jp,SOURCE_CATEGORY_SEARCH +rambler,SOURCE_CATEGORY_SEARCH 
+rambler.ru,SOURCE_CATEGORY_SEARCH +ravelry,SOURCE_CATEGORY_SOCIAL +ravelry.com,SOURCE_CATEGORY_SOCIAL +reddit,SOURCE_CATEGORY_SOCIAL +reddit.com,SOURCE_CATEGORY_SOCIAL +redux,SOURCE_CATEGORY_SOCIAL +redux.com,SOURCE_CATEGORY_SOCIAL +renren,SOURCE_CATEGORY_SOCIAL +renren.com,SOURCE_CATEGORY_SOCIAL +researchgate.net,SOURCE_CATEGORY_SOCIAL +reunion,SOURCE_CATEGORY_SOCIAL +reunion.com,SOURCE_CATEGORY_SOCIAL +reverbnation,SOURCE_CATEGORY_SOCIAL +reverbnation.com,SOURCE_CATEGORY_SOCIAL +rtl.de,SOURCE_CATEGORY_SOCIAL +ryze,SOURCE_CATEGORY_SOCIAL +ryze.com,SOURCE_CATEGORY_SOCIAL +s3.amazonaws.com,SOURCE_CATEGORY_SHOPPING +salespider,SOURCE_CATEGORY_SOCIAL +salespider.com,SOURCE_CATEGORY_SOCIAL +scoop.it,SOURCE_CATEGORY_SOCIAL +screenrant,SOURCE_CATEGORY_SOCIAL +screenrant.com,SOURCE_CATEGORY_SOCIAL +scribd,SOURCE_CATEGORY_SOCIAL +scribd.com,SOURCE_CATEGORY_SOCIAL +scvngr,SOURCE_CATEGORY_SOCIAL +scvngr.com,SOURCE_CATEGORY_SOCIAL +se.search.yahoo.com,SOURCE_CATEGORY_SEARCH +se.shopping.net,SOURCE_CATEGORY_SHOPPING +search-results,SOURCE_CATEGORY_SEARCH +search.aol.co.uk,SOURCE_CATEGORY_SEARCH +search.aol.com,SOURCE_CATEGORY_SEARCH +search.google.com,SOURCE_CATEGORY_SEARCH +search.smt.docomo.ne.jp,SOURCE_CATEGORY_SEARCH +search.ukr.net,SOURCE_CATEGORY_SEARCH +secondlife,SOURCE_CATEGORY_SOCIAL +secondlife.com,SOURCE_CATEGORY_SOCIAL +secureurl.ukr.net,SOURCE_CATEGORY_SEARCH +serverfault,SOURCE_CATEGORY_SOCIAL +serverfault.com,SOURCE_CATEGORY_SOCIAL +seznam,SOURCE_CATEGORY_SEARCH +seznam.cz,SOURCE_CATEGORY_SEARCH +sg.search.yahoo.com,SOURCE_CATEGORY_SEARCH +shareit,SOURCE_CATEGORY_SOCIAL +sharethis,SOURCE_CATEGORY_SOCIAL +sharethis.com,SOURCE_CATEGORY_SOCIAL +shop.app,SOURCE_CATEGORY_SHOPPING +shopify,SOURCE_CATEGORY_SHOPPING +shopify.com,SOURCE_CATEGORY_SHOPPING +shopping.naver.com,SOURCE_CATEGORY_SHOPPING +shopping.yahoo.co.jp,SOURCE_CATEGORY_SHOPPING +shopping.yahoo.com,SOURCE_CATEGORY_SHOPPING +shopzilla,SOURCE_CATEGORY_SHOPPING +shopzilla.com,SOURCE_CATEGORY_SHOPPING 
+shvoong.com,SOURCE_CATEGORY_SOCIAL +simplycodes.com,SOURCE_CATEGORY_SHOPPING +sites.google.com,SOURCE_CATEGORY_SOCIAL +skype,SOURCE_CATEGORY_SOCIAL +skyrock,SOURCE_CATEGORY_SOCIAL +skyrock.com,SOURCE_CATEGORY_SOCIAL +slashdot.org,SOURCE_CATEGORY_SOCIAL +slideshare.net,SOURCE_CATEGORY_SOCIAL +smartnews.com,SOURCE_CATEGORY_SOCIAL +snapchat,SOURCE_CATEGORY_SOCIAL +snapchat.com,SOURCE_CATEGORY_SOCIAL +so.com,SOURCE_CATEGORY_SEARCH +social,SOURCE_CATEGORY_SOCIAL +sociallife.com.br,SOURCE_CATEGORY_SOCIAL +socialvibe,SOURCE_CATEGORY_SOCIAL +socialvibe.com,SOURCE_CATEGORY_SOCIAL +sogou,SOURCE_CATEGORY_SEARCH +sogou.com,SOURCE_CATEGORY_SEARCH +sp-web.search.auone.jp,SOURCE_CATEGORY_SEARCH +spaces.live.com,SOURCE_CATEGORY_SOCIAL +spoke,SOURCE_CATEGORY_SOCIAL +spoke.com,SOURCE_CATEGORY_SOCIAL +spruz,SOURCE_CATEGORY_SOCIAL +spruz.com,SOURCE_CATEGORY_SOCIAL +ssense.com,SOURCE_CATEGORY_SOCIAL +stackapps,SOURCE_CATEGORY_SOCIAL +stackapps.com,SOURCE_CATEGORY_SOCIAL +stackexchange,SOURCE_CATEGORY_SOCIAL +stackexchange.com,SOURCE_CATEGORY_SOCIAL +stackoverflow,SOURCE_CATEGORY_SOCIAL +stackoverflow.com,SOURCE_CATEGORY_SOCIAL +stardoll.com,SOURCE_CATEGORY_SOCIAL +startsiden,SOURCE_CATEGORY_SEARCH +startsiden.no,SOURCE_CATEGORY_SEARCH +stickam,SOURCE_CATEGORY_SOCIAL +stickam.com,SOURCE_CATEGORY_SOCIAL +store.shopping.yahoo.co.jp,SOURCE_CATEGORY_SHOPPING +stripe,SOURCE_CATEGORY_SHOPPING +stripe.com,SOURCE_CATEGORY_SHOPPING +studivz.net,SOURCE_CATEGORY_SOCIAL +suche.aol.de,SOURCE_CATEGORY_SEARCH +suomi24.fi,SOURCE_CATEGORY_SOCIAL +superuser,SOURCE_CATEGORY_SOCIAL +superuser.com,SOURCE_CATEGORY_SOCIAL +sweeva,SOURCE_CATEGORY_SOCIAL +sweeva.com,SOURCE_CATEGORY_SOCIAL +t.co,SOURCE_CATEGORY_SOCIAL +t.me,SOURCE_CATEGORY_SOCIAL +tagged,SOURCE_CATEGORY_SOCIAL +tagged.com,SOURCE_CATEGORY_SOCIAL +taggedmail,SOURCE_CATEGORY_SOCIAL +taggedmail.com,SOURCE_CATEGORY_SOCIAL +talkbiznow,SOURCE_CATEGORY_SOCIAL +talkbiznow.com,SOURCE_CATEGORY_SOCIAL +taringa.net,SOURCE_CATEGORY_SOCIAL 
+techmeme,SOURCE_CATEGORY_SOCIAL +techmeme.com,SOURCE_CATEGORY_SOCIAL +ted,SOURCE_CATEGORY_VIDEO +ted.com,SOURCE_CATEGORY_VIDEO +tencent,SOURCE_CATEGORY_SOCIAL +tencent.com,SOURCE_CATEGORY_SOCIAL +terra,SOURCE_CATEGORY_SEARCH +th.search.yahoo.com,SOURCE_CATEGORY_SEARCH +tiktok,SOURCE_CATEGORY_SOCIAL +tiktok.com,SOURCE_CATEGORY_SOCIAL +tinyurl,SOURCE_CATEGORY_SOCIAL +tinyurl.com,SOURCE_CATEGORY_SOCIAL +toolbox,SOURCE_CATEGORY_SOCIAL +toolbox.com,SOURCE_CATEGORY_SOCIAL +touch.facebook.com,SOURCE_CATEGORY_SOCIAL +tr.pinterest.com,SOURCE_CATEGORY_SOCIAL +tr.search.yahoo.com,SOURCE_CATEGORY_SEARCH +travellerspoint,SOURCE_CATEGORY_SOCIAL +travellerspoint.com,SOURCE_CATEGORY_SOCIAL +tripadvisor,SOURCE_CATEGORY_SOCIAL +tripadvisor.com,SOURCE_CATEGORY_SOCIAL +trombi,SOURCE_CATEGORY_SOCIAL +trombi.com,SOURCE_CATEGORY_SOCIAL +trustpilot,SOURCE_CATEGORY_SOCIAL +tudou,SOURCE_CATEGORY_SOCIAL +tudou.com,SOURCE_CATEGORY_SOCIAL +tuenti,SOURCE_CATEGORY_SOCIAL +tuenti.com,SOURCE_CATEGORY_SOCIAL +tumblr,SOURCE_CATEGORY_SOCIAL +tumblr.com,SOURCE_CATEGORY_SOCIAL +tut.by,SOURCE_CATEGORY_SEARCH +tw.search.yahoo.com,SOURCE_CATEGORY_SEARCH +tweetdeck,SOURCE_CATEGORY_SOCIAL +tweetdeck.com,SOURCE_CATEGORY_SOCIAL +twitch,SOURCE_CATEGORY_VIDEO +twitch.tv,SOURCE_CATEGORY_VIDEO +twitter,SOURCE_CATEGORY_SOCIAL +twitter.com,SOURCE_CATEGORY_SOCIAL +twoo.com,SOURCE_CATEGORY_SOCIAL +typepad,SOURCE_CATEGORY_SOCIAL +typepad.com,SOURCE_CATEGORY_SOCIAL +uk.search.yahoo.com,SOURCE_CATEGORY_SEARCH +uk.shopping.net,SOURCE_CATEGORY_SHOPPING +ukr,SOURCE_CATEGORY_SEARCH +unblog.fr,SOURCE_CATEGORY_SOCIAL +urbanspoon.com,SOURCE_CATEGORY_SOCIAL +us.search.yahoo.com,SOURCE_CATEGORY_SEARCH +ushareit.com,SOURCE_CATEGORY_SOCIAL +ushi.cn,SOURCE_CATEGORY_SOCIAL +utreon,SOURCE_CATEGORY_VIDEO +utreon.com,SOURCE_CATEGORY_VIDEO +vampirefreaks,SOURCE_CATEGORY_SOCIAL +vampirefreaks.com,SOURCE_CATEGORY_SOCIAL +vampirerave,SOURCE_CATEGORY_SOCIAL +vampirerave.com,SOURCE_CATEGORY_SOCIAL +veoh,SOURCE_CATEGORY_VIDEO 
+veoh.com,SOURCE_CATEGORY_VIDEO +vg.no,SOURCE_CATEGORY_SOCIAL +viadeo.journaldunet.com,SOURCE_CATEGORY_VIDEO +video.ibm.com,SOURCE_CATEGORY_SOCIAL +vimeo,SOURCE_CATEGORY_VIDEO +vimeo.com,SOURCE_CATEGORY_VIDEO +virgilio,SOURCE_CATEGORY_SEARCH +vk.com,SOURCE_CATEGORY_SOCIAL +vkontakte.ru,SOURCE_CATEGORY_SOCIAL +vn.search.yahoo.com,SOURCE_CATEGORY_SEARCH +wakoopa,SOURCE_CATEGORY_SOCIAL +wakoopa.com,SOURCE_CATEGORY_SOCIAL +walmart,SOURCE_CATEGORY_SHOPPING +walmart.com,SOURCE_CATEGORY_SHOPPING +wap.sogou.com,SOURCE_CATEGORY_SEARCH +wattpad,SOURCE_CATEGORY_SOCIAL +wattpad.com,SOURCE_CATEGORY_SOCIAL +web.facebook.com,SOURCE_CATEGORY_SOCIAL +web.skype.com,SOURCE_CATEGORY_SOCIAL +webmaster.yandex.ru,SOURCE_CATEGORY_SEARCH +websearch.rakuten.co.jp,SOURCE_CATEGORY_SEARCH +webshots,SOURCE_CATEGORY_SOCIAL +webshots.com,SOURCE_CATEGORY_SOCIAL +wechat,SOURCE_CATEGORY_SOCIAL +wechat.com,SOURCE_CATEGORY_SOCIAL +weebly,SOURCE_CATEGORY_SOCIAL +weebly.com,SOURCE_CATEGORY_SOCIAL +weibo,SOURCE_CATEGORY_SOCIAL +weibo.com,SOURCE_CATEGORY_SOCIAL +wer-weiss-was.de,SOURCE_CATEGORY_SOCIAL +weread,SOURCE_CATEGORY_SOCIAL +weread.com,SOURCE_CATEGORY_SOCIAL +whatsapp,SOURCE_CATEGORY_SOCIAL +whatsapp.com,SOURCE_CATEGORY_SOCIAL +wiki.answers.com,SOURCE_CATEGORY_SOCIAL +wikihow.com,SOURCE_CATEGORY_SOCIAL +wikitravel.org,SOURCE_CATEGORY_SOCIAL +wistia,SOURCE_CATEGORY_VIDEO +wistia.com,SOURCE_CATEGORY_VIDEO +woot.com,SOURCE_CATEGORY_SOCIAL +wordpress,SOURCE_CATEGORY_SOCIAL +wordpress.com,SOURCE_CATEGORY_SOCIAL +wordpress.org,SOURCE_CATEGORY_SOCIAL +xanga,SOURCE_CATEGORY_SOCIAL +xanga.com,SOURCE_CATEGORY_SOCIAL +xing,SOURCE_CATEGORY_SOCIAL +xing.com,SOURCE_CATEGORY_SOCIAL +yahoo,SOURCE_CATEGORY_SEARCH +yahoo-mbga.jp,SOURCE_CATEGORY_SOCIAL +yahoo.co.jp,SOURCE_CATEGORY_SEARCH +yahoo.com,SOURCE_CATEGORY_SEARCH +yammer,SOURCE_CATEGORY_SOCIAL +yammer.com,SOURCE_CATEGORY_SOCIAL +yandex,SOURCE_CATEGORY_SEARCH +yandex.by,SOURCE_CATEGORY_SEARCH +yandex.com,SOURCE_CATEGORY_SEARCH 
+yandex.com.tr,SOURCE_CATEGORY_SEARCH +yandex.fr,SOURCE_CATEGORY_SEARCH +yandex.kz,SOURCE_CATEGORY_SEARCH +yandex.ru,SOURCE_CATEGORY_SEARCH +yandex.ua,SOURCE_CATEGORY_SEARCH +yandex.uz,SOURCE_CATEGORY_SEARCH +yelp,SOURCE_CATEGORY_SOCIAL +yelp.co.uk,SOURCE_CATEGORY_SOCIAL +yelp.com,SOURCE_CATEGORY_SOCIAL +youku,SOURCE_CATEGORY_VIDEO +youku.com,SOURCE_CATEGORY_VIDEO +youroom.in,SOURCE_CATEGORY_SOCIAL +youtube,SOURCE_CATEGORY_VIDEO +youtube.com,SOURCE_CATEGORY_VIDEO +za.pinterest.com,SOURCE_CATEGORY_SOCIAL +zalo,SOURCE_CATEGORY_SOCIAL +zen.yandex.ru,SOURCE_CATEGORY_SEARCH +zoo.gr,SOURCE_CATEGORY_SOCIAL +zooppa,SOURCE_CATEGORY_SOCIAL +zooppa.com,SOURCE_CATEGORY_SOCIAL diff --git a/snapshots/.gitkeep b/snapshots/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/.gitkeep b/tests/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/page_location_with_gclid_is_cpc.sql b/tests/page_location_with_gclid_is_cpc.sql new file mode 100644 index 00000000..8f38b7fb --- /dev/null +++ b/tests/page_location_with_gclid_is_cpc.sql @@ -0,0 +1,15 @@ +-- Google has changed the combination of parameters that are used to identify a CPC source in the past. +-- In order to detect new changes, this test checks that a page_location with a gclid is classified as cpc. 
+ +{{config( + severity = 'warn' +)}} +select + count(event_source) as sources + , count(event_medium) as mediums +from {{ref('stg_ga4__events')}} +where original_page_location like '%gclid%' + and event_source != 'google' + and event_medium != 'cpc' +having sources > 0 + or mediums > 0 \ No newline at end of file diff --git a/unit_tests/.env.example b/unit_tests/.env.example new file mode 100644 index 00000000..52aa8fd2 --- /dev/null +++ b/unit_tests/.env.example @@ -0,0 +1 @@ +BIGQUERY_PROJECT= \ No newline at end of file diff --git a/unit_tests/README.md b/unit_tests/README.md new file mode 100644 index 00000000..fdd75013 --- /dev/null +++ b/unit_tests/README.md @@ -0,0 +1,26 @@ +# Unit Testing + +The dbt-ga4 package treats each model and macro as a 'unit' of code. If we fix the input to each unit, we can test that we received the expected output. To do this, we use the `pytest` framework as described here: + +- https://docs.getdbt.com/docs/contributing/testing-a-new-adapter +- https://github.com/dbt-labs/dbt-core/discussions/4455#discussioncomment-2766503 + +You'll need to install pytest, pytest-dotenv and create a `.env` file with a `BIGQUERY_PROJECT` key containing the name of your BigQuery project. An 'oauth' connection method is assumed for local development. + +Installing pytest & pytest-dotenv can be done using the requirements.txt file. Navigate to the `unit_tests` folder and run + +``` +pip install -r requirements.txt +``` + +To run the folder's suite of tests, navigate to the `unit_tests` folder in the command line and run: + +``` +python -m pytest . 
+``` + +To run a specific test: + +``` +python -m pytest path/to/test.py +``` diff --git a/unit_tests/conftest.py b/unit_tests/conftest.py new file mode 100644 index 00000000..03f3f509 --- /dev/null +++ b/unit_tests/conftest.py @@ -0,0 +1,32 @@ +import pytest +import os + +# Import the standard functional fixtures as a plugin +pytest_plugins = ["dbt.tests.fixtures.project"] + +# The profile dictionary, used to write out profiles.yml +@pytest.fixture(scope="class") +def dbt_profile_target(): + # Set project and keyfile for github automated tests + if os.environ.get('GITHUB_ACTIONS') is not None: + return { + 'type': 'bigquery', + 'method': 'service-account', + 'keyfile': os.environ.get("GITHUB_WORKSPACE") + "/unit_tests/dbt-service-account.json", + 'threads': 4, + 'timeout_seconds': 300, + 'project': os.environ.get("BIGQUERY_PROJECT") + } + return { + 'type': 'bigquery', + 'method': 'oauth', + 'threads': 4, + 'project': os.environ.get("BIGQUERY_PROJECT") + } + +@pytest.fixture(scope="class") +def project_config_update(): + return { + 'name': 'ga4' + , 'vars':{'static_incremental_days':3} + } \ No newline at end of file diff --git a/unit_tests/requirements.txt b/unit_tests/requirements.txt new file mode 100644 index 00000000..fb47c46d --- /dev/null +++ b/unit_tests/requirements.txt @@ -0,0 +1,2 @@ +pytest +pytest-dotenv \ No newline at end of file diff --git a/unit_tests/test_macro_default_channel_grouping.py b/unit_tests/test_macro_default_channel_grouping.py new file mode 100644 index 00000000..ac300c95 --- /dev/null +++ b/unit_tests/test_macro_default_channel_grouping.py @@ -0,0 +1,302 @@ +import pytest +from dbt.tests.util import read_file,check_relations_equal,run_dbt + +traffic_data_with_expected_channels = [ + # Direct: Source exactly matches "(direct)" AND Medium is one of ("(not set)", "(none)") + { + "source": "(direct)", + "medium": "(none)", + "campaign": "", + "expected_channel": "Direct" + }, + { + "source": "(direct)", + "medium": "(not set)", + 
"campaign": "", + "expected_channel": "Direct" + }, + # Cross-network: Campaign Name contains "cross-network" + { + "source": "some-source", + "medium": "some-medium", + "campaign": "some-cross-network-campaign", + "expected_channel": "Cross-network" + }, + { + "source": "some-source", + "medium": "some-medium", + "campaign": "cross-network", + "expected_channel": "Cross-network" + }, + # Paid Shopping: + # (Source matches a list of shopping sites + # OR + # Campaign Name matches regex ^(.*(([^a-df-z]|^)shop|shopping).*)$) + # AND + # Medium matches regex ^(.*cp.*|ppc|retargeting|paid.*)$ + { + "source": "alibaba", + "medium": "", + "campaign": "", + "expected_channel": "Paid Shopping" + }, + { + "source": "some-source", + "medium": "retargeting", + "campaign": "shopping", + "expected_channel": "Paid Shopping" + }, + # Paid Search: + # Source matches a list of search sites + # AND + # Medium matches regex ^(.*cp.*|ppc|retargeting|paid.*)$ + { + "source": "google", + "medium": "ppc", + "campaign": "", + "expected_channel": "Paid Search" + }, + # Paid Social: + # Source matches a regex list of social sites + # AND + # Medium matches regex ^(.*cp.*|ppc|retargeting|paid.*)$ + { + "source": "facebook", + "medium": "retargeting", + "campaign": "", + "expected_channel": "Paid Social" + }, + # Paid Video: + # Source matches a list of video sites + # AND + # Medium matches regex ^(.*cp.*|ppc|retargeting|paid.*)$ + { + "source": "youtube.com", + "medium": "paid-something", + "campaign": "", + "expected_channel": "Paid Video" + }, + # Display: + # Medium is one of (“display”, “banner”, “expandable”, “interstitial”, “cpm”) + { + "source": "youtube.com", + "medium": "display", + "campaign": "", + "expected_channel": "Display" + }, + # Paid Other: + # Medium matches regex ^(.*cp.*|ppc|retargeting|paid.*)$ + { + "source": "some-source", + "medium": "cpc", + "campaign": "", + "expected_channel": "Paid Other" + }, + # Organic Shopping: + # Source matches a list of shopping sites + 
# OR + # Campaign name matches regex ^(.*(([^a-df-z]|^)shop|shopping).*)$ + { + "source": "Google Shopping", + "medium": "", + "campaign": "", + "expected_channel": "Organic Shopping" + }, + { + "source": "some-source", + "medium": "", + "campaign": "some-shopping-campaign", + "expected_channel": "Organic Shopping" + }, + # Organic Social: + # Source matches a regex list of social sites + # OR + # Medium is one of (“social”, “social-network”, “social-media”, “sm”, “social network”, “social media”) + { + "source": "facebook", + "medium": "", + "campaign": "", + "expected_channel": "Organic Social" + }, + { + "source": "some-source", + "medium": "social", + "campaign": "", + "expected_channel": "Organic Social" + }, + # Organic Video: + # Source matches a list of video sites + # OR + # Medium matches regex ^(.*video.*)$ + { + "source": "youtube.com", + "medium": "", + "campaign": "", + "expected_channel": "Organic Video" + }, + { + "source": "some-source", + "medium": "video", + "campaign": "", + "expected_channel": "Organic Video" + }, + # Organic Search: + # Source matches a list of search sites + # OR + # Medium exactly matches organic + { + "source": "bing", + "medium": "", + "campaign": "", + "expected_channel": "Organic Search" + }, + { + "source": "some-source", + "medium": "organic", + "campaign": "", + "expected_channel": "Organic Search" + }, + # Referral: + # Medium is one of ("referral", "app", or "link") + { + "source": "some-source", + "medium": "referral", + "campaign": "", + "expected_channel": "Referral" + }, + # Email: + # Source = email|e-mail|e_mail|e mail + # OR + # Medium = email|e-mail|e_mail|e mail + { + "source": "email", + "medium": "", + "campaign": "", + "expected_channel": "Email" + }, + { + "source": "", + "medium": "e mail", + "campaign": "", + "expected_channel": "Email" + }, + # Affiliates: + # Medium = affiliate + { + "source": "some-source", + "medium": "affiliate", + "campaign": "", + "expected_channel": "Affiliates" + }, + # 
Audio: + # Medium exactly matches audio + { + "source": "some-source", + "medium": "audio", + "campaign": "", + "expected_channel": "Audio" + }, + # SMS: + # Source exactly matches sms + # OR + # Medium exactly matches sms + { + "source": "sms", + "medium": "", + "campaign": "", + "expected_channel": "SMS" + }, + { + "source": "", + "medium": "sms", + "campaign": "", + "expected_channel": "SMS" + }, + # Mobile Push Notifications: + # Medium ends with "push" + # OR + # Medium contains "mobile" or "notification" + # OR + # Source exactly matches "firebase" + { + "source": "some-source", + "medium": "something-push", + "campaign": "", + "expected_channel": "Mobile Push Notifications" + }, + { + "source": "some-source", + "medium": "mobile-notification", + "campaign": "", + "expected_channel": "Mobile Push Notifications" + }, + { + "source": "firebase", + "medium": "", + "campaign": "", + "expected_channel": "Mobile Push Notifications" + }, + # Unassigned is the value Analytics uses when there are no other channel rules that match the event data. 
+ { + "source": "some-source", + "medium": "some-medium", + "campaign": "some-campaign", + "expected_channel": "Unassigned" + }, +] + +# Generate the input CSV content and the expected CSV content +csv_header = "source,medium,campaign" +expected_header = "default_channel_grouping" + +traffic_input_lines = [csv_header] + [ + f"{row['source']},{row['medium']},{row['campaign']}" for row in traffic_data_with_expected_channels +] + +expected_csv_lines = [expected_header] + [ + row['expected_channel'] for row in traffic_data_with_expected_channels +] + +# Join the lines into a single string for input and expected CSV +traffic_input = "\n".join(traffic_input_lines) +expected_csv = "\n".join(expected_csv_lines) + + +actual = """ +with input as ( + select * from {{ref('traffic_input')}} + left join {{ref('source_category_mapping')}} using (source) +) +select +{{default_channel_grouping('source', 'medium', 'source_category','campaign')}} as default_channel_grouping +from input +""" + +class TestDefaultChannelGrouping(): + # everything that goes in the "seeds" directory (= CSV format) + @pytest.fixture(scope="class") + def seeds(self): + return { + "source_category_mapping.csv": read_file('../seeds/ga4_source_categories.csv'), + "traffic_input.csv": traffic_input, + "expected.csv": expected_csv, + } + + # everything that goes in the "models" directory (= SQL) + @pytest.fixture(scope="class") + def models(self): + return { + "actual.sql": actual, + } + + # everything that goes in the "macros" + @pytest.fixture(scope="class") + def macros(self): + return { + "macro_to_test.sql": read_file('../macros/default_channel_grouping.sql'), + } + + def test_mock_run_and_check(self, project): + #breakpoint() + run_dbt(["build"]) + check_relations_equal(project.adapter, ["actual", "expected"]) \ No newline at end of file diff --git a/unit_tests/test_macro_exclude_query_parameters.py b/unit_tests/test_macro_exclude_query_parameters.py new file mode 100644 index 00000000..fb30f7ce --- 
/dev/null +++ b/unit_tests/test_macro_exclude_query_parameters.py @@ -0,0 +1,52 @@ +import pytest +from dbt.tests.util import read_file,check_relations_equal,run_dbt + +# Define mocks via CSV (seeds) or SQL (models) +urls_to_test_csv = """url +www.website.com/?param_to_exclude=1234 +www.website.com/?param_to_exclude= +www.website.com/?foo=bar&param_to_exclude=1234 +www.website.com/?foo=bar&param_to_exclude=1234&another=parameter +www.website.com/?foo=bar&param_to_exclude=1234&another=parameter&exclude=nope +""".lstrip() + +expected_csv = """url +www.website.com/ +www.website.com/ +www.website.com/?foo=bar +www.website.com/?foo=bar&another=parameter +www.website.com/?foo=bar&another=parameter&exclude=nope +""".lstrip() + +actual = """ +select +{{remove_query_parameters('url', ['param_to_exclude'])}} as url +from {{ref('urls_to_test')}} +""" + +class TestUsersFirstLastEvents(): + # everything that goes in the "seeds" directory (= CSV format) + @pytest.fixture(scope="class") + def seeds(self): + return { + "urls_to_test.csv": urls_to_test_csv, + "expected.csv": expected_csv, + } + + # everything that goes in the "models" directory (= SQL) + @pytest.fixture(scope="class") + def models(self): + return { + "actual.sql": actual, + } + + # everything that goes in the "macros" + @pytest.fixture(scope="class") + def macros(self): + return { + "macro_to_test.sql": read_file('../macros/url_parsing.sql'), + } + + def test_mock_run_and_check(self, project): + run_dbt(["build"]) + check_relations_equal(project.adapter, ["actual", "expected"]) diff --git a/unit_tests/test_macro_extract_query_parameter_value.py b/unit_tests/test_macro_extract_query_parameter_value.py new file mode 100644 index 00000000..473b5367 --- /dev/null +++ b/unit_tests/test_macro_extract_query_parameter_value.py @@ -0,0 +1,54 @@ +import pytest +from dbt.tests.util import read_file,check_relations_equal,run_dbt + +# Define mocks via CSV (seeds) or SQL (models) +urls_to_test_csv = """url +www.website.com/?param1=A
+www.website.com/?param1=A&param2=B +www.website.com/?param1=A&param2=B&param3=C +www.website.com/ +www.website.com/? +""".lstrip() + +expected_csv = """param1,param2,param3 +A,, +A,B, +A,B,C +,, +,, +""".lstrip() + +actual = """ + select + {{ extract_query_parameter_value( 'url' , 'param1' ) }} as param1, + {{ extract_query_parameter_value( 'url' , 'param2' ) }} as param2, + {{ extract_query_parameter_value( 'url' , 'param3' ) }} as param3 + from {{ref('urls_to_test')}} +""" + +class TestUsersFirstLastEvents(): + # everything that goes in the "seeds" directory (= CSV format) + @pytest.fixture(scope="class") + def seeds(self): + return { + "urls_to_test.csv": urls_to_test_csv, + "expected.csv": expected_csv, + } + + # everything that goes in the "models" directory (= SQL) + @pytest.fixture(scope="class") + def models(self): + return { + "actual.sql": actual, + } + + # everything that goes in the "macros" + @pytest.fixture(scope="class") + def macros(self): + return { + "macro_to_test.sql": read_file('../macros/url_parsing.sql'), + } + + def test_mock_run_and_check(self, project): + run_dbt(["build"]) + check_relations_equal(project.adapter, ["actual", "expected"]) diff --git a/unit_tests/test_stg_Ga4__user_id_mapping.py b/unit_tests/test_stg_Ga4__user_id_mapping.py new file mode 100644 index 00000000..bcb291f1 --- /dev/null +++ b/unit_tests/test_stg_Ga4__user_id_mapping.py @@ -0,0 +1,43 @@ +import pytest +from dbt.tests.util import read_file,check_relations_equal,run_dbt + +# Define mocks via CSV (seeds) or SQL (models) +mock_stg_ga4__events_csv = """client_key,user_id,event_timestamp +a1,,100 +a1,A,101 +b1,B,102 +c1,C,103 +c2,C,104 +c2,,105 +d1,,100 +""".lstrip() + +expected_csv = """last_seen_user_id,client_key,last_seen_user_id_timestamp +A,a1,101 +B,b1,102 +C,c1,103 +C,c2,104 +""".lstrip() + +actual = read_file('../models/staging/stg_ga4__user_id_mapping.sql') + +class TestUserIdMapping(): + # everything that goes in the "seeds" directory (= CSV format) +
@pytest.fixture(scope="class") + def seeds(self): + return { + "stg_ga4__events.csv": mock_stg_ga4__events_csv, + "expected.csv": expected_csv, + } + + # everything that goes in the "models" directory (= SQL) + @pytest.fixture(scope="class") + def models(self): + return { + "actual.sql": actual, + } + + def test_mock_run_and_check(self, project): + run_dbt(["build"]) + #breakpoint() + check_relations_equal(project.adapter, ["actual", "expected"]) diff --git a/unit_tests/test_stg_ga4__derived_session_properties.py b/unit_tests/test_stg_ga4__derived_session_properties.py new file mode 100644 index 00000000..16c960eb --- /dev/null +++ b/unit_tests/test_stg_ga4__derived_session_properties.py @@ -0,0 +1,74 @@ +import pytest +from dbt.tests.util import read_file,check_relations_equal,run_dbt + +mock_stg_ga4__events_json = """ +{ "session_key": "AAA", "event_timestamp": "1617691790431476", "event_name": "first_visit", "event_params": [{ "key": "my_param", "value": { "string_value": null, "int_value": 1, "float_value": null, "double_value": null }}], "user_properties": [{ "key": "my_property", "value": { "string_value": "value1", "int_value": null, "float_value": null, "double_value": null }}]} +{ "session_key": "AAA", "event_timestamp": "1617691790431477", "event_name": "first_visit", "event_params": [{ "key": "my_param", "value": { "string_value": null, "int_value": 2, "float_value": null, "double_value": null }}]} +{ "session_key": "BBB", "event_timestamp": "1617691790431477", "event_name": "first_visit", "event_params": [{ "key": "my_param", "value": { "string_value": null, "int_value": 1, "float_value": null, "double_value": null }}], "user_properties": [{ "key": "my_property", "value": { "string_value": "value2", "int_value": null, "float_value": null, "double_value": null }}]} +""".lstrip() + +expected_csv = """session_key,my_derived_property,my_derived_property2 +AAA,2,value1 +BBB,1,value2 +""".lstrip() + +models__config_yml = """ +version: 2 +sources: + - name: 
fixture + schema: "{{ target.schema }}" + tables: + - name: mock_stg_ga4__events_json +""" + +class TestDerivedSessionProperties(): + # Update project name to ga4 so we can call macros with ga4.macro_name + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "name": "ga4" + } + + # everything that goes in the "seeds" directory (= CSV format) + @pytest.fixture(scope="class") + def seeds(self): + return { + "expected.csv": expected_csv, + } + + # everything that goes in the "models" directory (= SQL) + @pytest.fixture(scope="class") + def models(self): + return { + "config.yml": models__config_yml, + "stg_ga4__events.sql": "select * from {{source('fixture','mock_stg_ga4__events_json')}}", + "actual.sql": read_file('../models/staging/stg_ga4__derived_session_properties.sql') + } + + # everything that goes in the "macros" + @pytest.fixture(scope="class") + def macros(self): + return { + "unnest_key.sql": read_file('../macros/unnest_key.sql'), + } + + def upload_json_fixture(self, project, file_name, json, table_name): + local_file_path = file_name + with open(local_file_path, "w") as outfile: + outfile.write(json) + project.adapter.upload_file( + local_file_path = local_file_path, + database = project.database, + table_schema = project.test_schema, + table_name = table_name, + kwargs = { + "source_format": "NEWLINE_DELIMITED_JSON", + "autodetect":"true" + } + ) + + def test_mock_run_and_check(self, project): + self.upload_json_fixture(project, "source.json", mock_stg_ga4__events_json, "mock_stg_ga4__events_json" ) + run_dbt(["build", "--vars", "derived_session_properties: [{'event_parameter':'my_param','session_property_name':'my_derived_property','value_type':'int_value'},{'user_property':'my_property','session_property_name':'my_derived_property2','value_type':'string_value'}]"]) + #breakpoint() + check_relations_equal(project.adapter, ["actual", "expected"]) diff --git a/unit_tests/test_stg_ga4__derived_user_properties.py 
b/unit_tests/test_stg_ga4__derived_user_properties.py new file mode 100644 index 00000000..2c04c34b --- /dev/null +++ b/unit_tests/test_stg_ga4__derived_user_properties.py @@ -0,0 +1,74 @@ +import pytest +from dbt.tests.util import read_file,check_relations_equal,run_dbt + +mock_stg_ga4__events_json = """ +{ "client_key": "AAA", "event_timestamp": "1617691790431476", "event_name": "first_visit", "event_params": [{ "key": "my_param", "value": { "string_value": null, "int_value": 1, "float_value": null, "double_value": null }}]} +{ "client_key": "AAA", "event_timestamp": "1617691790431477", "event_name": "first_visit", "event_params": [{ "key": "my_param", "value": { "string_value": null, "int_value": 2, "float_value": null, "double_value": null }}]} +{ "client_key": "BBB", "event_timestamp": "1617691790431477", "event_name": "first_visit", "event_params": [{ "key": "my_param", "value": { "string_value": null, "int_value": 1, "float_value": null, "double_value": null }}]} +""".lstrip() + +expected_csv = """client_key,my_derived_property +AAA,2 +BBB,1 +""".lstrip() + +models__config_yml = """ +version: 2 +sources: + - name: fixture + schema: "{{ target.schema }}" + tables: + - name: mock_stg_ga4__events_json +""" + +class TestDerivedUserProperties(): + # Update project name to ga4 so we can call macros with ga4.macro_name + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "name": "ga4" + } + + # everything that goes in the "seeds" directory (= CSV format) + @pytest.fixture(scope="class") + def seeds(self): + return { + "expected.csv": expected_csv, + } + + # everything that goes in the "models" directory (= SQL) + @pytest.fixture(scope="class") + def models(self): + return { + "config.yml": models__config_yml, + "stg_ga4__events.sql": "select * from {{source('fixture','mock_stg_ga4__events_json')}}", + "actual.sql": read_file('../models/staging/stg_ga4__derived_user_properties.sql') + } + + # everything that goes in the "macros" + 
@pytest.fixture(scope="class") + def macros(self): + return { + "unnest_key.sql": read_file('../macros/unnest_key.sql'), + } + + def upload_json_fixture(self, project, file_name, json, table_name): + local_file_path = file_name + with open(local_file_path, "w") as outfile: + outfile.write(json) + project.adapter.upload_file( + local_file_path = local_file_path, + database = project.database, + table_schema = project.test_schema, + table_name = table_name, + kwargs = { + "source_format": "NEWLINE_DELIMITED_JSON", + "autodetect":"true" + } + ) + + def test_mock_run_and_check(self, project): + self.upload_json_fixture(project, "source.json", mock_stg_ga4__events_json, "mock_stg_ga4__events_json" ) + run_dbt(["build", "--vars", "derived_user_properties: [{'event_parameter':'my_param','user_property_name':'my_derived_property','value_type':'int_value'}]"]) + #breakpoint() + check_relations_equal(project.adapter, ["actual", "expected"]) diff --git a/unit_tests/test_stg_ga4__event_to_query_string_params.py b/unit_tests/test_stg_ga4__event_to_query_string_params.py new file mode 100644 index 00000000..ab10eeeb --- /dev/null +++ b/unit_tests/test_stg_ga4__event_to_query_string_params.py @@ -0,0 +1,45 @@ +import pytest +from dbt.tests.util import read_file,check_relations_equal,run_dbt + + +PARAMS_CSV = """event_key,page_query_string +aaa,param1=value1&param2=value2 +bbb,param1 +ccc,param1= +""".lstrip() + +EXPECTED_CSV = """event_key,param,value +aaa,param1,value1 +aaa,param2,value2 +bbb,param1, +ccc,param1, +""".lstrip() + +actual = read_file('../models/staging/stg_ga4__event_to_query_string_params.sql').replace( + "ref('stg_ga4__events')", + "ref('params')" +) + + + +class TestEventToQueryStringParams(): + # everything that goes in the "seeds" directory (= CSV format) + @pytest.fixture(scope="class") + def seeds(self): + return { + "params.csv": PARAMS_CSV, + "expected.csv": EXPECTED_CSV, + + } + + # everything that goes in the "models" directory (= SQL) + 
@pytest.fixture(scope="class") + def models(self): + return { + "actual.sql": actual + } + + def test_mock_run_and_check(self, project): + #self.upload_json_fixture(project, "source.json", SOURCE_JSON, "SOURCE_JSON" ) + run_dbt(["build"]) + check_relations_equal(project.adapter, ["actual", "expected"]) diff --git a/unit_tests/test_stg_ga4__events.example b/unit_tests/test_stg_ga4__events.example new file mode 100644 index 00000000..d0733e26 --- /dev/null +++ b/unit_tests/test_stg_ga4__events.example @@ -0,0 +1,45 @@ +# This test doesn't quite work because the key columns are of type BYTE, but the JSON uploads the data as STRING. +# Keeping this file for now as an example of using JSON for both the input and expected output + +import pytest +from base_unit_test import BaseUnitTestModel +from dbt.tests.util import read_file,check_relations_equal,run_dbt + +SOURCE_JSON = """ +{ "event_date_dt": "2021-04-06", "event_timestamp": "1617691790431476", "event_name": "first_visit", "event_params": [{ "key": "ga_session_number", "value": { "string_value": null, "int_value": "1", "float_value": null, "double_value": null } }, { "key": "engaged_session_event", "value": { "string_value": null, "int_value": "1", "float_value": null, "double_value": null } }, { "key": "ga_session_id", "value": { "string_value": null, "int_value": "1617691775", "float_value": null, "double_value": null } }, { "key": "page_title", "value": { "string_value": "Velir | Behavior-Driven Testing in Drupal 8", "int_value": null, "float_value": null, "double_value": null } }, { "key": "page_location", "value": { "string_value": "https://www.velir.com/blog/2016/08/25/behavior-driven-testing-drupal-8", "int_value": null, "float_value": null, "double_value": null } }, { "key": "session_engaged", "value": { "string_value": null, "int_value": "1", "float_value": null, "double_value": null } }], "event_previous_timestamp": null, "event_value_in_usd": null, "event_bundle_sequence_id": "948327668", 
"event_server_timestamp_offset": null, "user_id": null, "client_id": "1166526666.1617691776", "privacy_info": null, "user_properties": [], "user_first_touch_timestamp": "1617691790431476", "user_ltv": { "revenue": "0.0", "currency": "USD" }, "device": { "category": "desktop", "mobile_brand_name": null, "mobile_model_name": null, "mobile_marketing_name": null, "mobile_os_hardware_model": null, "operating_system": "Windows", "operating_system_version": "Windows 10", "vendor_id": null, "advertising_id": null, "language": "en-us", "is_limited_ad_tracking": "No", "time_zone_offset_seconds": null, "browser": null, "browser_version": null, "web_info": { "browser": "Chrome", "browser_version": "89.0.4389.114", "hostname": "www.velir.com" } }, "geo": { "continent": "Asia", "country": "Vietnam", "region": "Ho Chi Minh City", "city": "Ho Chi Minh City", "sub_continent": "Southeast Asia", "metro": "(not set)" }, "app_info": null, "traffic_source": { "name": "(direct)", "medium": "(none)", "source": "(direct)" }, "stream_id": "1966637064", "platform": "WEB", "ecommerce": null, "items": [], "ga_session_id": "1617691775", "page_location": "https://www.velir.com/blog/2016/08/25/behavior-driven-testing-drupal-8", "ga_session_number": "1", "session_engaged": "1", "page_title": "Velir | Behavior-Driven Testing in Drupal 8", "page_referrer": null, "is_page_view": "0", "is_purchase": "0"} +""".lstrip() +EXPECTED_JSON = """ +{ "event_date_dt": "2021-04-06", "event_timestamp": "1617691790431476", "event_name": "first_visit", "event_params": [{ "key": "ga_session_number", "value": { "string_value": null, "int_value": "1", "float_value": null, "double_value": null } }, { "key": "engaged_session_event", "value": { "string_value": null, "int_value": "1", "float_value": null, "double_value": null } }, { "key": "ga_session_id", "value": { "string_value": null, "int_value": "1617691775", "float_value": null, "double_value": null } }, { "key": "page_title", "value": { "string_value": "Velir | 
Behavior-Driven Testing in Drupal 8", "int_value": null, "float_value": null, "double_value": null } }, { "key": "page_location", "value": { "string_value": "https://www.velir.com/blog/2016/08/25/behavior-driven-testing-drupal-8", "int_value": null, "float_value": null, "double_value": null } }, { "key": "session_engaged", "value": { "string_value": null, "int_value": "1", "float_value": null, "double_value": null } }], "event_previous_timestamp": null, "event_value_in_usd": null, "event_bundle_sequence_id": "948327668", "event_server_timestamp_offset": null, "user_id": null, "client_id": "1166526666.1617691776", "privacy_info": null, "user_properties": [], "user_first_touch_timestamp": "1617691790431476", "user_ltv": { "revenue": "0.0", "currency": "USD" }, "device": { "category": "desktop", "mobile_brand_name": null, "mobile_model_name": null, "mobile_marketing_name": null, "mobile_os_hardware_model": null, "operating_system": "Windows", "operating_system_version": "Windows 10", "vendor_id": null, "advertising_id": null, "language": "en-us", "is_limited_ad_tracking": "No", "time_zone_offset_seconds": null, "browser": null, "browser_version": null, "web_info": { "browser": "Chrome", "browser_version": "89.0.4389.114", "hostname": "www.velir.com" } }, "geo": { "continent": "Asia", "country": "Vietnam", "region": "Ho Chi Minh City", "city": "Ho Chi Minh City", "sub_continent": "Southeast Asia", "metro": "(not set)" }, "app_info": null, "traffic_source": { "name": "(direct)", "medium": "(none)", "source": "(direct)" }, "stream_id": "1966637064", "platform": "WEB", "ecommerce": null, "items": [], "ga_session_id": "1617691775", "page_location": "https://www.velir.com/blog/2016/08/25/behavior-driven-testing-drupal-8", "ga_session_number": "1", "session_engaged": "1", "page_title": "Velir | Behavior-Driven Testing in Drupal 8", "page_referrer": null, "is_page_view": "0", "is_purchase": "0", "session_key": "TAp7hHaymXXA/Way5byPBw\u003d\u003d", "session_event_number": "1", 
"event_key": "DGb378zSx/aIZs76gM4aTQ\u003d\u003d", "page_hostname": "velir.com", "page_query_string": null} +""".lstrip() + +models__config_yml = """ +version: 2 +sources: + - name: fixture + schema: "{{ target.schema }}" + tables: + - name: SOURCE_JSON + - name: EXPECTED_OUTPUT +""" + +actual = read_file('../models/staging/stg_ga4__events.sql').replace( + "ref('base_ga4__events')", + "source('fixture', 'SOURCE_JSON')" +) + +class TestStgGa4Events(BaseUnitTestModel): + # everything that goes in the "models" directory (= SQL) + @pytest.fixture(scope="class") + def models(self): + return { + "config.yml": models__config_yml, + "actual.sql": actual, + "expected.sql": "select * from {{ source('fixture', 'EXPECTED_OUTPUT') }}" + } + + def test_mock_run_and_check(self, project): + self.upload_json_fixture(project, "source.json", SOURCE_JSON, "SOURCE_JSON" ) + self.upload_json_fixture(project, "expected.json", EXPECTED_JSON, "EXPECTED_OUTPUT" ) + run_dbt(["run"]) + breakpoint() + check_relations_equal(project.adapter, ["actual", "expected"]) diff --git a/unit_tests/test_stg_ga4__events.todo b/unit_tests/test_stg_ga4__events.todo new file mode 100644 index 00000000..a1d76e9c --- /dev/null +++ b/unit_tests/test_stg_ga4__events.todo @@ -0,0 +1,43 @@ +# This test currently fails because the event_key depends on the event_params nested field. Cannot mock that using CSV. 
+ +import pytest +from dbt.tests.util import read_file,check_relations_equal,run_dbt + +# Define mocks via CSV (seeds) or SQL (models) +mock_base_ga4__events_csv = """user_id,event_name,event_timestamp,client_key,ga_session_id,stream_id,page_location,page_referrer,source,medium,campaign +user_id_1,pageview,12345,client_key_1,ga_session_id_1,stream_id_1,http://www.website.com/?foo=bar,http://www.cnn.com/,google,organic,(organic) +""".lstrip() + +expected_csv = """user_id,event_name,event_timestamp,client_key,ga_session_id,stream_id,source,user_key,session_key,event_key,medium,campaign,original_page_location,original_page_referrer,page_location,page_referrer,page_hostname,page_query_string +user_id_1,pageview,12345,client_key_1,ga_session_id_1,stream_id_1,google,c/nWU/GWhlWiLU0S6R/rwg==,9fDgaCrbd4ieAj1QpcWDjw==,70B/o+ww2nOTa32ASF/ulw==,organic,(organic),http://www.website.com/?foo=bar,http://www.cnn.com/,http://www.website.com/?foo=bar,http://www.cnn.com/,website.com,foo=bar +""" + +actual = read_file('../models/staging/stg_ga4__events.sql') + +class TestStgEvents(): + # everything that goes in the "seeds" directory (= CSV format) + @pytest.fixture(scope="class") + def seeds(self): + return { + "base_ga4__events.csv": mock_base_ga4__events_csv, + "expected.csv": expected_csv + } + + # everything that goes in the "macros" + @pytest.fixture(scope="class") + def macros(self): + return { + "macros.sql": read_file('../macros/url_parsing.sql'), + } + + # everything that goes in the "models" directory (= SQL) + @pytest.fixture(scope="class") + def models(self): + return { + "actual.sql": actual + } + + def test_mock_run_and_check(self, project): + run_dbt(["build"]) + breakpoint() + check_relations_equal(project.adapter, ["actual", "expected"]) diff --git a/unit_tests/test_stg_ga4__page_conversions.py b/unit_tests/test_stg_ga4__page_conversions.py new file mode 100644 index 00000000..a88c2fa2 --- /dev/null +++ b/unit_tests/test_stg_ga4__page_conversions.py @@ -0,0 +1,37 @@ 
+import pytest +from dbt.tests.util import read_file,check_relations_equal,run_dbt + +# Define mocks via CSV (seeds) or SQL (models) +mock_stg_ga4__events_csv = """event_name,page_key +page_view,A +page_view,A +page_view,B +""".lstrip() + +expected_csv = """page_key,page_view_count +A,2 +B,1 +""".lstrip() + +actual = read_file('../models/staging/stg_ga4__page_conversions.sql') + +class TestPageConversions(): + # everything that goes in the "seeds" directory (= CSV format) + @pytest.fixture(scope="class") + def seeds(self): + return { + "stg_ga4__events.csv": mock_stg_ga4__events_csv, + "expected.csv": expected_csv, + } + + # everything that goes in the "models" directory (= SQL) + @pytest.fixture(scope="class") + def models(self): + return { + "actual.sql": actual, + } + + def test_mock_run_and_check(self, project): + run_dbt(["build", "--vars", "conversion_events: ['page_view']"]) + #breakpoint() + check_relations_equal(project.adapter, ["actual", "expected"]) diff --git a/unit_tests/test_stg_ga4__session_conversions_daily.py b/unit_tests/test_stg_ga4__session_conversions_daily.py new file mode 100644 index 00000000..bc1cbc65 --- /dev/null +++ b/unit_tests/test_stg_ga4__session_conversions_daily.py @@ -0,0 +1,42 @@ +import pytest +from dbt.tests.util import read_file,check_relations_equal,run_dbt + +# Define mocks via CSV (seeds) or SQL (models) +mock_stg_ga4__events_csv = """session_key,session_partition_key,event_name,event_date_dt +A,A2022-01-01,page_view,2022-01-01 +A,A2022-01-01,my_conversion,2022-01-01 +A,A2022-01-01,my_conversion,2022-01-01 +B,B2022-01-01,my_conversion,2022-01-01 +C,C2022-01-01,some_other_event,2022-01-01 +A,A2022-01-02,my_conversion,2022-01-02 +""".lstrip() + +expected_csv = """session_key,session_partition_key,session_partition_date,my_conversion_count +A,A2022-01-01,2022-01-01,2 +B,B2022-01-01,2022-01-01,1 +C,C2022-01-01,2022-01-01,0 +A,A2022-01-02,2022-01-02,1 +""".lstrip() + +actual = 
read_file('../models/staging/stg_ga4__session_conversions_daily.sql') + +class TestUsersFirstLastEvents(): + # everything that goes in the "seeds" directory (= CSV format) + @pytest.fixture(scope="class") + def seeds(self): + return { + "stg_ga4__events.csv": mock_stg_ga4__events_csv, + "expected.csv": expected_csv, + } + + # everything that goes in the "models" directory (= SQL) + @pytest.fixture(scope="class") + def models(self): + return { + "actual.sql": actual, + } + + def test_mock_run_and_check(self, project): + run_dbt(["build", "--vars", "conversion_events: ['my_conversion']"]) + #breakpoint() + check_relations_equal(project.adapter, ["actual", "expected"]) diff --git a/unit_tests/test_stg_ga4__sessions_traffic_sources_last_non_direct_daily.py b/unit_tests/test_stg_ga4__sessions_traffic_sources_last_non_direct_daily.py new file mode 100644 index 00000000..d32ef800 --- /dev/null +++ b/unit_tests/test_stg_ga4__sessions_traffic_sources_last_non_direct_daily.py @@ -0,0 +1,40 @@ +import pytest +from dbt.tests.util import read_file,check_relations_equal,run_dbt + +# Define mocks via CSV (seeds) or SQL (models) +mock_stg_ga4__sessions_traffic_sources_daily_csv = """client_key,session_partition_key,session_partition_date,session_partition_timestamp,session_source,session_medium,session_source_category,session_campaign,session_content,session_term,session_default_channel_grouping,non_direct_session_partition_key +A,A,20230505,1683321359,source_a,medium_a,source_category_a,campaign_a,content_a,term_a,default_channel_grouping_a,A +A,B,20230506,1683407759,(direct),,,,,,, +A,C,20230507,1683494159,source_a,medium_a,source_category_a,campaign_a,content_a,term_a,default_channel_grouping_a,C +A,D,20230508,1683580559,(direct),,,,,,, +""".lstrip() + +expected_csv = 
"""client_key,session_partition_key,session_partition_date,session_source,session_medium,session_source_category,session_campaign,session_content,session_term,session_default_channel_grouping,session_partition_key_last_non_direct,last_non_direct_source,last_non_direct_medium,last_non_direct_source_category,last_non_direct_campaign,last_non_direct_content,last_non_direct_term,last_non_direct_default_channel_grouping +A,A,20230505,source_a,medium_a,source_category_a,campaign_a,content_a,term_a,default_channel_grouping_a,A,source_a,medium_a,source_category_a,campaign_a,content_a,term_a,default_channel_grouping_a +A,B,20230506,(direct),,,,,,,A,source_a,medium_a,source_category_a,campaign_a,content_a,term_a,default_channel_grouping_a +A,C,20230507,source_a,medium_a,source_category_a,campaign_a,content_a,term_a,default_channel_grouping_a,C,source_a,medium_a,source_category_a,campaign_a,content_a,term_a,default_channel_grouping_a +A,D,20230508,(direct),,,,,,,C,source_a,medium_a,source_category_a,campaign_a,content_a,term_a,default_channel_grouping_a +""".lstrip() + +actual = read_file('../models/staging/stg_ga4__sessions_traffic_sources_last_non_direct_daily.sql') + +class TestSessionsTrafficSourcesLastNonDirectDaily(): + # everything that goes in the "seeds" directory (= CSV format) + @pytest.fixture(scope="class") + def seeds(self): + return { + "stg_ga4__sessions_traffic_sources_daily.csv": mock_stg_ga4__sessions_traffic_sources_daily_csv, + "expected.csv": expected_csv, + } + + # everything that goes in the "models" directory (= SQL) + @pytest.fixture(scope="class") + def models(self): + return { + # Hack-y solution to ensure the model is not partitioned. Loading mock data (date columns) from a seed file + partitioning don't work well together. 
+ "actual.sql": actual.replace("materialized = 'incremental',","materialized = 'view',"), + } + + def test_mock_run_and_check(self, project): + run_dbt(["build"]) + check_relations_equal(project.adapter, ["actual", "expected"]) diff --git a/unit_tests/test_stg_ga4__users_first_last_events.py b/unit_tests/test_stg_ga4__users_first_last_events.py new file mode 100644 index 00000000..7880aaf3 --- /dev/null +++ b/unit_tests/test_stg_ga4__users_first_last_events.py @@ -0,0 +1,35 @@ +import pytest +from dbt.tests.util import read_file,check_relations_equal,run_dbt + +# Define mocks via CSV (seeds) or SQL (models) +mock_stg_ga4__events_csv = """stream_id,client_key,event_key,event_timestamp,geo_continent,geo_country,geo_region,geo_city,geo_sub_continent,geo_metro,device_category,device_mobile_brand_name,device_mobile_model_name,device_mobile_marketing_name,device_mobile_os_hardware_model,device_operating_system,device_operating_system_version,device_vendor_id,device_advertising_id,device_language,device_is_limited_ad_tracking,device_time_zone_offset_seconds,device_browser,device_browser_version,device_web_info_browser,device_web_info_browser_version,device_web_info_hostname,user_campaign,user_medium,user_source +1,IX+OyYJBgjwqML19GB/XIQ==,H06dLW6OhNJJ6SoEPFsSyg==,1661339279816517,Asia,India,Maharashtra,Mumbai,Southern Asia,(not set),desktop,Google,Chrome,,,Windows,Windows 10,,,en-us,No,,,,Chrome,104.0.0.0,www.velir.com,,, +1,IX+OyYJBgjwqML19GB/XIQ==,gt1SoAtrxDv33uDGwVeMVA==,1661339279816518,USA,Massachusetts,Maharashtra,Mumbai,Southern Asia,(not set),mobile,Google,Chrome,,,Windows,Windows 10,,,en-us,No,,,,Chrome,104.0.0.0,www.velir.com,,, +""".lstrip() + +expected_csv = 
"""client_key,first_event,last_event,stream_id,first_geo_continent,first_geo_country,first_geo_region,first_geo_city,first_geo_sub_continent,first_geo_metro,first_device_category,first_device_mobile_brand_name,first_device_mobile_model_name,first_device_mobile_marketing_name,first_device_mobile_os_hardware_model,first_device_operating_system,first_device_operating_system_version,first_device_vendor_id,first_device_advertising_id,first_device_language,first_device_is_limited_ad_tracking,first_device_time_zone_offset_seconds,first_device_browser,first_device_browser_version,first_device_web_info_browser,first_device_web_info_browser_version,first_device_web_info_hostname,first_user_campaign,first_user_medium,first_user_source,last_geo_continent,last_geo_country,last_geo_region,last_geo_city,last_geo_sub_continent,last_geo_metro,last_device_category,last_device_mobile_brand_name,last_device_mobile_model_name,last_device_mobile_marketing_name,last_device_mobile_os_hardware_model,last_device_operating_system,last_device_operating_system_version,last_device_vendor_id,last_device_advertising_id,last_device_language,last_device_is_limited_ad_tracking,last_device_time_zone_offset_seconds,last_device_browser,last_device_browser_version,last_device_web_info_browser,last_device_web_info_browser_version,last_device_web_info_hostname,last_user_campaign,last_user_medium,last_user_source +IX+OyYJBgjwqML19GB/XIQ==,H06dLW6OhNJJ6SoEPFsSyg==,gt1SoAtrxDv33uDGwVeMVA==,1,Asia,India,Maharashtra,Mumbai,Southern Asia,(not set),desktop,Google,Chrome,,,Windows,Windows 10,,,en-us,No,,,,Chrome,104.0.0.0,www.velir.com,,,,USA,Massachusetts,Maharashtra,Mumbai,Southern Asia,(not set),mobile,Google,Chrome,,,Windows,Windows 10,,,en-us,No,,,,Chrome,104.0.0.0,www.velir.com,,, +""".lstrip() + +actual = read_file('../models/staging/stg_ga4__client_key_first_last_events.sql') + +class TestUsersFirstLastEvents(): + # everything that goes in the "seeds" directory (= CSV format) + 
@pytest.fixture(scope="class") + def seeds(self): + return { + "stg_ga4__events.csv": mock_stg_ga4__events_csv, + "expected.csv": expected_csv, + } + + # everything that goes in the "models" directory (= SQL) + @pytest.fixture(scope="class") + def models(self): + return { + "actual.sql": actual, + } + + def test_mock_run_and_check(self, project): + run_dbt(["build"]) + #breakpoint() + check_relations_equal(project.adapter, ["actual", "expected"]) From 5459116846d6fddf955db627a2f796521d02c003 Mon Sep 17 00:00:00 2001 From: Chien Le Date: Wed, 27 Mar 2024 17:42:01 +0700 Subject: [PATCH 3/9] Revert "fix cpc classification and add test (#306)" This reverts commit fd6fe8bfefd0d0a9649f9a3774d3cfe6ffdceb7a. --- models/staging/stg_ga4__events.sql | 2 -- tests/page_location_with_gclid_is_cpc.sql | 15 --------------- 2 files changed, 17 deletions(-) delete mode 100644 tests/page_location_with_gclid_is_cpc.sql diff --git a/models/staging/stg_ga4__events.sql b/models/staging/stg_ga4__events.sql index e3914bef..c8b2f655 100644 --- a/models/staging/stg_ga4__events.sql +++ b/models/staging/stg_ga4__events.sql @@ -38,12 +38,10 @@ detect_gclid as ( end as event_source, case when (page_location like '%gclid%' and event_medium is null) then "cpc" - when (page_location like '%gclid%' and event_medium = 'organic') then "cpc" else event_medium end as event_medium, case when (page_location like '%gclid%' and event_campaign is null) then "(cpc)" - when (page_location like '%gclid%' and event_campaign = 'organic') then "(cpc)" else event_campaign end as event_campaign from include_event_key diff --git a/tests/page_location_with_gclid_is_cpc.sql b/tests/page_location_with_gclid_is_cpc.sql deleted file mode 100644 index 8f38b7fb..00000000 --- a/tests/page_location_with_gclid_is_cpc.sql +++ /dev/null @@ -1,15 +0,0 @@ --- Google has changed the combination of parameters that are used to identify a CPC source in the past. 
--- In order to detect new changes, this test checks that a page_location with a gclid is classified as cpc. - -{{config( - severity = 'warn' -)}} -select - count(event_source) as sources - , count(event_medium) as mediums -from {{ref('stg_ga4__events')}} -where original_page_location like '%gclid%' - and event_source != 'google' - and event_medium != 'cpc' -having sources > 0 - or mediums > 0 \ No newline at end of file From 7e429ac1480f43fca83af47c95e757131aa1b50d Mon Sep 17 00:00:00 2001 From: Chien Le Date: Thu, 28 Mar 2024 09:58:48 +0700 Subject: [PATCH 4/9] fix: models incremental strategy avoid duplicate --- dbt_project.yml | 2 +- models/marts/core/dim_ga4__sessions.sql | 4 ++-- .../stg_ga4__sessions_first_last_pageviews.sql | 2 +- .../stg_ga4__sessions_first_last_pageviews.yml | 18 ++++++++++++++++++ .../stg_ga4__sessions_traffic_sources.sql | 1 + 5 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 models/staging/stg_ga4__sessions_first_last_pageviews.yml diff --git a/dbt_project.yml b/dbt_project.yml index adab0c5f..812b80b1 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -19,4 +19,4 @@ models: ga4: +materialized: view marts: - +materialized: table \ No newline at end of file + +materialized: table diff --git a/models/marts/core/dim_ga4__sessions.sql b/models/marts/core/dim_ga4__sessions.sql index b9c6447d..e766442b 100644 --- a/models/marts/core/dim_ga4__sessions.sql +++ b/models/marts/core/dim_ga4__sessions.sql @@ -1,10 +1,10 @@ {{ config( materialized = 'incremental', - incremental_strategy = 'merge', + incremental_strategy = 'insert_overwrite', tags = ["incremental"], on_schema_change = 'sync_all_columns', - unnest_keys = ['session_key'], + unique_key = ['session_key'], partition_by={ "field": "session_partition_date", "data_type": "date", diff --git a/models/staging/stg_ga4__sessions_first_last_pageviews.sql b/models/staging/stg_ga4__sessions_first_last_pageviews.sql index e1d432c5..e51d0530 100644 --- 
a/models/staging/stg_ga4__sessions_first_last_pageviews.sql +++ b/models/staging/stg_ga4__sessions_first_last_pageviews.sql @@ -1,7 +1,7 @@ {{ config( materialized = 'incremental', - incremental_strategy = 'merge', + incremental_strategy = 'insert_overwrite', unique_key = ['session_key'], tags = ["incremental"], partition_by={ diff --git a/models/staging/stg_ga4__sessions_first_last_pageviews.yml b/models/staging/stg_ga4__sessions_first_last_pageviews.yml new file mode 100644 index 00000000..3b6777a3 --- /dev/null +++ b/models/staging/stg_ga4__sessions_first_last_pageviews.yml @@ -0,0 +1,18 @@ +models: + - name: stg_ga4__sessions_first_last_pageviews + description: "" + columns: + - name: session_key + description: "" + data_type: STRING + tests: + - unique + - name: first_page_view_event_key + description: "" + data_type: STRING + - name: last_page_view_event_key + description: "" + data_type: STRING + - name: first_page_view_event_time + description: "" + data_type: TIMESTAMP diff --git a/models/staging/stg_ga4__sessions_traffic_sources.sql b/models/staging/stg_ga4__sessions_traffic_sources.sql index 5dd34127..0183bf68 100644 --- a/models/staging/stg_ga4__sessions_traffic_sources.sql +++ b/models/staging/stg_ga4__sessions_traffic_sources.sql @@ -4,6 +4,7 @@ incremental_strategy = 'insert_overwrite', tags = ["incremental"], on_schema_change = 'sync_all_columns', + unique_key = ['session_key'], partition_by={ "field": "session_partition_date", "data_type": "date", From 3183b14e59ea6aea77bb50a2863a6b0af0a3b2f5 Mon Sep 17 00:00:00 2001 From: Chien Le Date: Thu, 28 Mar 2024 11:01:21 +0700 Subject: [PATCH 5/9] fix: sessions extended overnight cause duplicates --- dbt_project.yml | 28 +++++++++++++++++++++++++ models/marts/core/dim_ga4__sessions.sql | 17 +++++++++------ 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/dbt_project.yml b/dbt_project.yml index 812b80b1..03b2e26d 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -20,3 +20,31 @@ models: 
+materialized: view marts: +materialized: table +vars: + ga4: + source_project: "agile-scheme-394814" + property_ids: [336884118] + start_date: "20231001" + static_incremental_days: 2 + query_parameter_exclusions: ["gclid", "fbclid", "_ga"] + query_parameter_extraction: ["gclid", "fbclid", "keyword"] + conversion_events: ["purchase", "message", "call", "generate_lead", "store_visit"] + user_properties: + - user_property_name: "phone" + value_type: "string_value" + - user_property_name: "address" + value_type: "string_value" + - user_property_name: "email" + value_type: "string_value" + - user_property_name: "name" + value_type: "string_value" + derived_user_properties: + - event_parameter: "page_location" + user_property_name: "most_recent_page_location" + value_type: "string_value" + - event_parameter: "store" + user_property_name: "visited_branch" + value_type: "string_value" + - event_parameter: "location" + user_property_name: "searched_branch" + value_type: "string_value" \ No newline at end of file diff --git a/models/marts/core/dim_ga4__sessions.sql b/models/marts/core/dim_ga4__sessions.sql index e766442b..c645169b 100644 --- a/models/marts/core/dim_ga4__sessions.sql +++ b/models/marts/core/dim_ga4__sessions.sql @@ -10,6 +10,9 @@ "data_type": "date", "granularity": "day" }, + merge_exclude_columns= [ + 'session_partition_date' + ] ) }} @@ -17,14 +20,16 @@ -- Dimension table for sessions based on the first event that isn't session_start or first_visit. 
with session_first_event as ( - select * - from {{ref('stg_ga4__events')}} - where event_name != 'first_visit' - and event_name != 'session_start' + select e.* + from {{ref('stg_ga4__events')}} e + inner join {{ref("stg_ga4__sessions_first_last_pageviews")}} pv + on e.session_key = pv.session_key and e.event_date_dt = date(pv.first_page_view_event_time) + where e.event_name != 'first_visit' + and e.event_name != 'session_start' {% if is_incremental() %} - and event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) + and e.event_date_dt >= date_sub(current_date, interval {{var('static_incremental_days',3) | int}} day) {% endif %} - qualify row_number() over(partition by session_key order by event_timestamp) = 1 + qualify row_number() over(partition by e.session_key order by event_timestamp) = 1 ), session_start_dims as ( select From 1a4ae23af0d843f461c9018fe7e55425963d39ba Mon Sep 17 00:00:00 2001 From: Chien Le Date: Thu, 28 Mar 2024 11:23:33 +0700 Subject: [PATCH 6/9] fix: client_keys dimension model --- models/marts/core/dim_ga4__client_keys.sql | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/models/marts/core/dim_ga4__client_keys.sql b/models/marts/core/dim_ga4__client_keys.sql index 6a6cf4a8..53bb3dd8 100644 --- a/models/marts/core/dim_ga4__client_keys.sql +++ b/models/marts/core/dim_ga4__client_keys.sql @@ -70,14 +70,23 @@ include_first_last_page_views as ( ), include_user_properties as ( -select * from include_first_last_page_views +select p.*, + +{% if var('derived_user_properties', false) %} + dup.* except(last_updated,client_key), +{% endif %} +{% if var('user_properties', false) %} + up.* except(last_updated,client_key), +{% endif %} + +from include_first_last_page_views p {% if var('derived_user_properties', false) %} -- If derived user properties have been assigned as variables, join them on the client_key -left join {{ref('stg_ga4__derived_user_properties')}} using (client_key) 
+inner join {{ref('stg_ga4__derived_user_properties')}} as dup using (client_key) {% endif %} {% if var('user_properties', false) %} -- If user properties have been assigned as variables, join them on the client_key -left join {{ref('stg_ga4__user_properties')}} using (client_key) +inner join {{ref('stg_ga4__user_properties')}} as up using (client_key) {% endif %} ) From 5c42dd9abc59b4c0fc072a4f96ec2853f80d7f7e Mon Sep 17 00:00:00 2001 From: Chien Le Date: Thu, 28 Mar 2024 11:41:39 +0700 Subject: [PATCH 7/9] feat: add is_user_id_implemented var --- README.md | 8 ++++++ dbt_project.yml | 30 +-------------------- models/marts/core/fct_ga4__user_ids.sql | 8 ++++++ models/staging/stg_ga4__user_id_mapping.sql | 6 +++++ 4 files changed, 23 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 5cf39293..91ef1237 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,14 @@ vars: ## Optional Variables +### Exclude User Id models +Many websites do not implement a log-in feature, which leads to an undefined (or null) `user_id` in GA4 data. Although this is not recommended, you can drop the `stg_ga4__user_id_mapping` and `fct_ga4__user_ids` models by setting `is_user_id_implemented` to false (it defaults to true). +``` +vars: + ga4: + is_user_id_implemented: false +``` + ### Query Parameter Exclusions Setting `query_parameter_exclusions` will remove query string parameters from the `page_location` and `page_referrer` fields for all downstream processing. Original parameters are captured in the `original_page_location` and `original_page_referrer` fields. 
Ex: diff --git a/dbt_project.yml b/dbt_project.yml index 03b2e26d..adab0c5f 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -19,32 +19,4 @@ models: ga4: +materialized: view marts: - +materialized: table -vars: - ga4: - source_project: "agile-scheme-394814" - property_ids: [336884118] - start_date: "20231001" - static_incremental_days: 2 - query_parameter_exclusions: ["gclid", "fbclid", "_ga"] - query_parameter_extraction: ["gclid", "fbclid", "keyword"] - conversion_events: ["purchase", "message", "call", "generate_lead", "store_visit"] - user_properties: - - user_property_name: "phone" - value_type: "string_value" - - user_property_name: "address" - value_type: "string_value" - - user_property_name: "email" - value_type: "string_value" - - user_property_name: "name" - value_type: "string_value" - derived_user_properties: - - event_parameter: "page_location" - user_property_name: "most_recent_page_location" - value_type: "string_value" - - event_parameter: "store" - user_property_name: "visited_branch" - value_type: "string_value" - - event_parameter: "location" - user_property_name: "searched_branch" - value_type: "string_value" \ No newline at end of file + +materialized: table \ No newline at end of file diff --git a/models/marts/core/fct_ga4__user_ids.sql b/models/marts/core/fct_ga4__user_ids.sql index 018019f0..31372bb7 100644 --- a/models/marts/core/fct_ga4__user_ids.sql +++ b/models/marts/core/fct_ga4__user_ids.sql @@ -1,3 +1,11 @@ + + {{ + config( + enabled = var('is_user_id_implemented', true), + ) + }} + + with user_id_mapped as ( select client_keys.*, diff --git a/models/staging/stg_ga4__user_id_mapping.sql b/models/staging/stg_ga4__user_id_mapping.sql index 75786898..44a64cac 100644 --- a/models/staging/stg_ga4__user_id_mapping.sql +++ b/models/staging/stg_ga4__user_id_mapping.sql @@ -1,3 +1,9 @@ +{{ + config( + enabled = var('is_user_id_implemented', true), + ) + }} + with events_with_user_id as ( select user_id, From 
4796a92dbc42bf30f1bb056baadd3b94d6728735 Mon Sep 17 00:00:00 2001 From: Chien Le Date: Thu, 28 Mar 2024 13:58:51 +0700 Subject: [PATCH 8/9] -f --- .github/workflows/run_unit_tests_on_pr.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run_unit_tests_on_pr.yml b/.github/workflows/run_unit_tests_on_pr.yml index 6a2b6dd4..ee1d785f 100644 --- a/.github/workflows/run_unit_tests_on_pr.yml +++ b/.github/workflows/run_unit_tests_on_pr.yml @@ -32,6 +32,7 @@ jobs: pip install dbt-core pip install dbt-bigquery pip install pytest + pip install pyarrow - name: Run tests run: python -m pytest . From e3ee23adc3088c1d5d1a774646cf97480adab3d5 Mon Sep 17 00:00:00 2001 From: Chien Le Date: Thu, 28 Mar 2024 14:15:22 +0700 Subject: [PATCH 9/9] -f --- dbt_project.yml | 30 ++++++++++++++++++- .../test_stg_ga4__users_first_last_events.py | 4 +-- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/dbt_project.yml b/dbt_project.yml index adab0c5f..03b2e26d 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -19,4 +19,32 @@ models: ga4: +materialized: view marts: - +materialized: table \ No newline at end of file + +materialized: table +vars: + ga4: + source_project: "agile-scheme-394814" + property_ids: [336884118] + start_date: "20231001" + static_incremental_days: 2 + query_parameter_exclusions: ["gclid", "fbclid", "_ga"] + query_parameter_extraction: ["gclid", "fbclid", "keyword"] + conversion_events: ["purchase", "message", "call", "generate_lead", "store_visit"] + user_properties: + - user_property_name: "phone" + value_type: "string_value" + - user_property_name: "address" + value_type: "string_value" + - user_property_name: "email" + value_type: "string_value" + - user_property_name: "name" + value_type: "string_value" + derived_user_properties: + - event_parameter: "page_location" + user_property_name: "most_recent_page_location" + value_type: "string_value" + - event_parameter: "store" + user_property_name: "visited_branch" + value_type: 
"string_value" + - event_parameter: "location" + user_property_name: "searched_branch" + value_type: "string_value" \ No newline at end of file diff --git a/unit_tests/test_stg_ga4__users_first_last_events.py b/unit_tests/test_stg_ga4__users_first_last_events.py index 7880aaf3..575cfc15 100644 --- a/unit_tests/test_stg_ga4__users_first_last_events.py +++ b/unit_tests/test_stg_ga4__users_first_last_events.py @@ -7,8 +7,8 @@ 1,IX+OyYJBgjwqML19GB/XIQ==,gt1SoAtrxDv33uDGwVeMVA==,1661339279816518,USA,Massachusetts,Maharashtra,Mumbai,Southern Asia,(not set),mobile,Google,Chrome,,,Windows,Windows 10,,,en-us,No,,,,Chrome,104.0.0.0,www.velir.com,,, """.lstrip() -expected_csv = """client_key,first_event,last_event,stream_id,first_geo_continent,first_geo_country,first_geo_region,first_geo_city,first_geo_sub_continent,first_geo_metro,first_device_category,first_device_mobile_brand_name,first_device_mobile_model_name,first_device_mobile_marketing_name,first_device_mobile_os_hardware_model,first_device_operating_system,first_device_operating_system_version,first_device_vendor_id,first_device_advertising_id,first_device_language,first_device_is_limited_ad_tracking,first_device_time_zone_offset_seconds,first_device_browser,first_device_browser_version,first_device_web_info_browser,first_device_web_info_browser_version,first_device_web_info_hostname,first_user_campaign,first_user_medium,first_user_source,last_geo_continent,last_geo_country,last_geo_region,last_geo_city,last_geo_sub_continent,last_geo_metro,last_device_category,last_device_mobile_brand_name,last_device_mobile_model_name,last_device_mobile_marketing_name,last_device_mobile_os_hardware_model,last_device_operating_system,last_device_operating_system_version,last_device_vendor_id,last_device_advertising_id,last_device_language,last_device_is_limited_ad_tracking,last_device_time_zone_offset_seconds,last_device_browser,last_device_browser_version,last_device_web_info_browser,last_device_web_info_browser_version,last_device_
web_info_hostname,last_user_campaign,last_user_medium,last_user_source -IX+OyYJBgjwqML19GB/XIQ==,H06dLW6OhNJJ6SoEPFsSyg==,gt1SoAtrxDv33uDGwVeMVA==,1,Asia,India,Maharashtra,Mumbai,Southern Asia,(not set),desktop,Google,Chrome,,,Windows,Windows 10,,,en-us,No,,,,Chrome,104.0.0.0,www.velir.com,,,,USA,Massachusetts,Maharashtra,Mumbai,Southern Asia,(not set),mobile,Google,Chrome,,,Windows,Windows 10,,,en-us,No,,,,Chrome,104.0.0.0,www.velir.com,,, +expected_csv = """client_key,first_visit,last_seen_at,first_event,last_event,stream_id,first_geo_continent,first_geo_country,first_geo_region,first_geo_city,first_geo_sub_continent,first_geo_metro,first_device_category,first_device_mobile_brand_name,first_device_mobile_model_name,first_device_mobile_marketing_name,first_device_mobile_os_hardware_model,first_device_operating_system,first_device_operating_system_version,first_device_vendor_id,first_device_advertising_id,first_device_language,first_device_is_limited_ad_tracking,first_device_time_zone_offset_seconds,first_device_browser,first_device_browser_version,first_device_web_info_browser,first_device_web_info_browser_version,first_device_web_info_hostname,first_user_campaign,first_user_medium,first_user_source,last_geo_continent,last_geo_country,last_geo_region,last_geo_city,last_geo_sub_continent,last_geo_metro,last_device_category,last_device_mobile_brand_name,last_device_mobile_model_name,last_device_mobile_marketing_name,last_device_mobile_os_hardware_model,last_device_operating_system,last_device_operating_system_version,last_device_vendor_id,last_device_advertising_id,last_device_language,last_device_is_limited_ad_tracking,last_device_time_zone_offset_seconds,last_device_browser,last_device_browser_version,last_device_web_info_browser,last_device_web_info_browser_version,last_device_web_info_hostname,last_user_campaign,last_user_medium,last_user_source 
+IX+OyYJBgjwqML19GB/XIQ==,1661339279816517,1661339279816518,H06dLW6OhNJJ6SoEPFsSyg==,gt1SoAtrxDv33uDGwVeMVA==,1,Asia,India,Maharashtra,Mumbai,Southern Asia,(not set),desktop,Google,Chrome,,,Windows,Windows 10,,,en-us,No,,,,Chrome,104.0.0.0,www.velir.com,,,,USA,Massachusetts,Maharashtra,Mumbai,Southern Asia,(not set),mobile,Google,Chrome,,,Windows,Windows 10,,,en-us,No,,,,Chrome,104.0.0.0,www.velir.com,,, """.lstrip() actual = read_file('../models/staging/stg_ga4__client_key_first_last_events.sql')