Skip to content

Commit

Permalink
Merge pull request #438 from cagov/imputation_daily_summary
Browse files Browse the repository at this point in the history
developed imputation QC model that will monitor daily sample balance for different imputation methods
  • Loading branch information
mmmiah authored Nov 21, 2024
2 parents b8518c9 + b467f46 commit 296acd9
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 0 deletions.
76 changes: 76 additions & 0 deletions transform/models/marts/quality/_quality.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,79 @@ models:
Counts the total number of detectors with a status other than Good on a daily basis.
- name: ALL_DETECTOR_STATUS_TOTAL_COUNT
description: Counts the total number of rows/detectors on a daily basis
- name: quality_imputation_daily_sample_count
description: |
This model counts the daily number of imputed, observed and observed-unimputed samples as
well as their percentages. The count of imputed samples is further broken down by imputation method.
columns:
- name: SAMPLE_DATE
description: |
The date associated with the aggregate daily counts of sample.
- name: SAMPLE_CT
description: Daily sample count that includes observed and null.
- name: VOL_LOCAL_AVG_IMPUTATION_SAMPLE
description: Number of daily volume sample that was imputed by local average method.
- name: VOL_LOCAL_IMPUTATION_SAMPLE
description: Number of daily volume sample that was imputed by local regression method.
- name: VOL_OBSERVED_SAMPLE
description: Number of daily volume sample that do not need imputation.
- name: VOL_IMPUTED_SAMPLE
description: Number of daily volume samples that were imputed.
- name: VOL_REGIONAL_AVG_IMPUTATION_SAMPLE
description: Number of daily volume sample that was imputed by regional average method.
- name: VOL_REGIONAL_IMPUTATION_SAMPLE
description: Number of daily volume sample that was imputed by regional regression method.
- name: VOL_UNOBSERVED_UNIMPUTED
description: Number of daily volume sample that was not imputed but is observed and null.
- name: VOL_GLOBAL_IMPUTATION_SAMPLE
description: Number of daily volume sample that was imputed by global regression method.
- name: SPEED_LOCAL_AVG_IMPUTATION_SAMPLE
description: Number of daily speed sample that was imputed by local average method.
- name: SPEED_LOCAL_IMPUTATION_SAMPLE
description: Number of daily speed sample that was imputed by local regression method.
- name: SPEED_OBSERVED_SAMPLE
description: Number of daily speed sample that do not need imputation.
- name: SPEED_IMPUTED_SAMPLE
description: Number of daily speed samples that were imputed.
- name: SPEED_REGIONAL_AVG_IMPUTATION_SAMPLE
description: Number of daily speed sample that was imputed by regional average method.
- name: SPEED_REGIONAL_IMPUTATION_SAMPLE
description: Number of daily speed sample that was imputed by regional regression method.
- name: SPEED_UNOBSERVED_UNIMPUTED
description: Number of daily speed sample that was not imputed but is observed and null.
- name: SPEED_GLOBAL_IMPUTATION_SAMPLE
description: Number of daily speed sample that was imputed by global regression method.
- name: OCC_LOCAL_AVG_IMPUTATION_SAMPLE
description: Number of daily occupancy sample that was imputed by local average method.
- name: OCC_LOCAL_IMPUTATION_SAMPLE
description: Number of daily occupancy sample that was imputed by local regression method.
- name: OCC_OBSERVED_SAMPLE
description: Number of daily occupancy sample that do not need imputation.
- name: OCC_IMPUTED_SAMPLE
description: Number of daily occupancy samples that were imputed.
- name: OCC_REGIONAL_AVG_IMPUTATION_SAMPLE
description: Number of daily occupancy sample that was imputed by regional average method.
- name: OCC_REGIONAL_IMPUTATION_SAMPLE
description: Number of daily occupancy sample that was imputed by regional regression method.
- name: OCC_UNOBSERVED_UNIMPUTED
description: Number of daily occupancy sample that was not imputed but is observed and null.
- name: OCC_GLOBAL_IMPUTATION_SAMPLE
description: Number of daily occupancy sample that was imputed by global regression method.
- name: PCT_OCC_IMPUTED
description: Percentage of daily occupancy sample that was imputed.
- name: PCT_SPEED_IMPUTED
description: Percentage of daily speed sample that was imputed.
- name: PCT_VOL_IMPUTED
description: Percentage of daily volume sample that was imputed.
- name: PCT_SPEED_OBSERVED
description: Percentage of daily speed sample that was observed.
- name: PCT_OCC_OBSERVED
description: Percentage of daily occupancy sample that was observed.
- name: PCT_VOL_OBSERVED
description: Percentage of daily volume sample that was observed.
- name: PCT_OCC_OBSERVED_UNIMPUTED
description: Percentage of daily occupancy sample that was observed but not imputed.
- name: PCT_SPEED_OBSERVED_UNIMPUTED
description: Percentage of daily speed sample that was observed but not imputed.
- name: PCT_VOL_OBSERVED_UNIMPUTED
description: Percentage of daily volume sample that was observed but not imputed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
{{ config(
    materialized="incremental",
    unique_key=['sample_date'],
    snowflake_warehouse=get_snowflake_refresh_warehouse(big="XL"),
    on_schema_change="sync_all_columns"
) }}

-- Daily QC summary of imputation outcomes. For each sample_date this model
-- counts the five-minute rows by imputation method for occupancy, volume and
-- speed, then expresses the imputed / observed / observed-unimputed counts as
-- a percentage of all rows for that day.

-- Observed and imputed five-minute rows, restricted to station types HV and ML.
-- Only the columns read by the aggregation below are projected (rather than
-- `select *`) so the dependency on the upstream model stays explicit and
-- unused columns are never scanned.
with obs_imputed_five_minutes_agg as (
    select
        sample_date,
        occupancy_imputation_method,
        volume_imputation_method,
        speed_imputation_method
    from {{ ref('int_imputation__detector_imputed_agg_five_minutes') }}
    where station_type in ('HV', 'ML') and {{ make_model_incremental('sample_date') }}
),

-- One row per day with the number of rows per imputation method.
-- The *_unobserved_unimputed columns count the method value 'observed_unimputed';
-- the column names are kept as-is because they are part of the published schema.
-- NOTE(review): count_if skips rows where the predicate is NULL, so a row whose
-- *_imputation_method is NULL is included in sample_ct but in none of the
-- per-method counts -- confirm the upstream model never emits NULL methods.
imputation_status_count as (
    select
        sample_date,
        count(*) as sample_ct,

        -- occupancy
        count_if(occupancy_imputation_method = 'local') as occ_local_imputation_sample,
        count_if(occupancy_imputation_method = 'regional') as occ_regional_imputation_sample,
        count_if(occupancy_imputation_method = 'global') as occ_global_imputation_sample,
        count_if(occupancy_imputation_method = 'local_avg') as occ_local_avg_imputation_sample,
        count_if(occupancy_imputation_method = 'regional_avg') as occ_regional_avg_imputation_sample,
        count_if(occupancy_imputation_method = 'observed') as occ_observed_sample,
        count_if(occupancy_imputation_method = 'observed_unimputed') as occ_unobserved_unimputed,

        -- volume
        count_if(volume_imputation_method = 'local') as vol_local_imputation_sample,
        count_if(volume_imputation_method = 'regional') as vol_regional_imputation_sample,
        count_if(volume_imputation_method = 'global') as vol_global_imputation_sample,
        count_if(volume_imputation_method = 'local_avg') as vol_local_avg_imputation_sample,
        count_if(volume_imputation_method = 'regional_avg') as vol_regional_avg_imputation_sample,
        count_if(volume_imputation_method = 'observed') as vol_observed_sample,
        count_if(volume_imputation_method = 'observed_unimputed') as vol_unobserved_unimputed,

        -- speed
        count_if(speed_imputation_method = 'local') as speed_local_imputation_sample,
        count_if(speed_imputation_method = 'regional') as speed_regional_imputation_sample,
        count_if(speed_imputation_method = 'global') as speed_global_imputation_sample,
        count_if(speed_imputation_method = 'local_avg') as speed_local_avg_imputation_sample,
        count_if(speed_imputation_method = 'regional_avg') as speed_regional_avg_imputation_sample,
        count_if(speed_imputation_method = 'observed') as speed_observed_sample,
        count_if(speed_imputation_method = 'observed_unimputed') as speed_unobserved_unimputed,

        -- "imputed" = any non-NULL method other than the two observed ones
        count_if(occupancy_imputation_method not in ('observed', 'observed_unimputed'))
            as occ_imputed_sample,
        count_if(volume_imputation_method not in ('observed', 'observed_unimputed'))
            as vol_imputed_sample,
        count_if(speed_imputation_method not in ('observed', 'observed_unimputed'))
            as speed_imputed_sample
    from obs_imputed_five_minutes_agg
    group by sample_date
),

-- Adds percentage-of-day columns. nullif(sample_ct, 0) turns a zero denominator
-- into NULL so the division yields NULL instead of raising an error.
sample_count as (
    select
        *,
        (vol_imputed_sample / nullif(sample_ct, 0)) * 100 as pct_vol_imputed,
        (vol_observed_sample / nullif(sample_ct, 0)) * 100 as pct_vol_observed,
        (vol_unobserved_unimputed / nullif(sample_ct, 0)) * 100 as pct_vol_observed_unimputed,
        (speed_imputed_sample / nullif(sample_ct, 0)) * 100 as pct_speed_imputed,
        (speed_observed_sample / nullif(sample_ct, 0)) * 100 as pct_speed_observed,
        (speed_unobserved_unimputed / nullif(sample_ct, 0)) * 100 as pct_speed_observed_unimputed,
        (occ_imputed_sample / nullif(sample_ct, 0)) * 100 as pct_occ_imputed,
        (occ_observed_sample / nullif(sample_ct, 0)) * 100 as pct_occ_observed,
        (occ_unobserved_unimputed / nullif(sample_ct, 0)) * 100 as pct_occ_observed_unimputed
    from imputation_status_count
)

select *
from sample_count

0 comments on commit 296acd9

Please sign in to comment.