From 6ffb0d0d17aacfc8d72ec3ee55840b51568fdb14 Mon Sep 17 00:00:00 2001 From: Adler Santos Date: Thu, 24 Mar 2022 12:36:16 -0400 Subject: [PATCH] feat: Revise Google Political Ads due to new dataset version (#317) * regenerate TF files, less blank lines * feat: refactor google_political_ads dataset pipeline --- .../advertiser_declared_stats_pipeline.tf | 39 - .../infra/advertiser_geo_spend_pipeline.tf | 39 - .../infra/advertiser_stats_pipeline.tf | 39 - .../infra/advertiser_weekly_spend_pipeline.tf | 39 - .../infra/campaign_targeting_pipeline.tf | 39 - .../infra/creative_stats_pipeline.tf | 39 - .../infra/geo_spend_pipeline.tf | 39 - .../infra/google_political_ads_dataset.tf | 2 +- .../infra/last_updated_pipeline.tf | 39 - .../process_csvs_and_load_to_bq_pipeline.tf | 178 +++ .../infra/top_keywords_history_pipeline.tf | 39 - .../run_csv_transform_kub/csv_transform.py | 137 +- .../advertiser_declared_stats_dag.py | 114 -- .../advertiser_declared_stats/pipeline.yaml | 135 -- .../advertiser_geo_spend_dag.py | 180 --- .../advertiser_geo_spend/pipeline.yaml | 179 --- .../advertiser_stats/advertiser_stats_dag.py | 190 --- .../pipelines/advertiser_stats/pipeline.yaml | 187 --- .../advertiser_weekly_spend_dag.py | 180 --- .../advertiser_weekly_spend/pipeline.yaml | 180 --- .../campaign_targeting_dag.py | 136 -- .../campaign_targeting/pipeline.yaml | 152 -- .../creative_stats/creative_stats_dag.py | 340 ----- .../pipelines/creative_stats/pipeline.yaml | 287 ---- .../pipelines/dataset.yaml | 71 +- .../pipelines/geo_spend/geo_spend_dag.py | 172 --- .../pipelines/geo_spend/pipeline.yaml | 176 --- .../last_updated/last_updated_dag.py | 82 -- .../pipelines/last_updated/pipeline.yaml | 116 -- .../process_csvs_and_load_to_bq/pipeline.yaml | 1030 +++++++++++++ .../process_csvs_and_load_to_bq_dag.py | 1287 +++++++++++++++++ .../top_keywords_history/pipeline.yaml | 176 --- .../top_keywords_history_dag.py | 174 --- 33 files changed, 2582 insertions(+), 3630 deletions(-) delete mode 
100644 datasets/google_political_ads/infra/advertiser_declared_stats_pipeline.tf delete mode 100644 datasets/google_political_ads/infra/advertiser_geo_spend_pipeline.tf delete mode 100644 datasets/google_political_ads/infra/advertiser_stats_pipeline.tf delete mode 100644 datasets/google_political_ads/infra/advertiser_weekly_spend_pipeline.tf delete mode 100644 datasets/google_political_ads/infra/campaign_targeting_pipeline.tf delete mode 100644 datasets/google_political_ads/infra/creative_stats_pipeline.tf delete mode 100644 datasets/google_political_ads/infra/geo_spend_pipeline.tf delete mode 100644 datasets/google_political_ads/infra/last_updated_pipeline.tf create mode 100644 datasets/google_political_ads/infra/process_csvs_and_load_to_bq_pipeline.tf delete mode 100644 datasets/google_political_ads/infra/top_keywords_history_pipeline.tf delete mode 100644 datasets/google_political_ads/pipelines/advertiser_declared_stats/advertiser_declared_stats_dag.py delete mode 100644 datasets/google_political_ads/pipelines/advertiser_declared_stats/pipeline.yaml delete mode 100644 datasets/google_political_ads/pipelines/advertiser_geo_spend/advertiser_geo_spend_dag.py delete mode 100644 datasets/google_political_ads/pipelines/advertiser_geo_spend/pipeline.yaml delete mode 100644 datasets/google_political_ads/pipelines/advertiser_stats/advertiser_stats_dag.py delete mode 100644 datasets/google_political_ads/pipelines/advertiser_stats/pipeline.yaml delete mode 100644 datasets/google_political_ads/pipelines/advertiser_weekly_spend/advertiser_weekly_spend_dag.py delete mode 100644 datasets/google_political_ads/pipelines/advertiser_weekly_spend/pipeline.yaml delete mode 100644 datasets/google_political_ads/pipelines/campaign_targeting/campaign_targeting_dag.py delete mode 100644 datasets/google_political_ads/pipelines/campaign_targeting/pipeline.yaml delete mode 100644 datasets/google_political_ads/pipelines/creative_stats/creative_stats_dag.py delete mode 100644 
datasets/google_political_ads/pipelines/creative_stats/pipeline.yaml delete mode 100644 datasets/google_political_ads/pipelines/geo_spend/geo_spend_dag.py delete mode 100644 datasets/google_political_ads/pipelines/geo_spend/pipeline.yaml delete mode 100644 datasets/google_political_ads/pipelines/last_updated/last_updated_dag.py delete mode 100644 datasets/google_political_ads/pipelines/last_updated/pipeline.yaml create mode 100644 datasets/google_political_ads/pipelines/process_csvs_and_load_to_bq/pipeline.yaml create mode 100644 datasets/google_political_ads/pipelines/process_csvs_and_load_to_bq/process_csvs_and_load_to_bq_dag.py delete mode 100644 datasets/google_political_ads/pipelines/top_keywords_history/pipeline.yaml delete mode 100644 datasets/google_political_ads/pipelines/top_keywords_history/top_keywords_history_dag.py diff --git a/datasets/google_political_ads/infra/advertiser_declared_stats_pipeline.tf b/datasets/google_political_ads/infra/advertiser_declared_stats_pipeline.tf deleted file mode 100644 index 118d9c389..000000000 --- a/datasets/google_political_ads/infra/advertiser_declared_stats_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "google_political_ads_advertiser_declared_stats" { - project = var.project_id - dataset_id = "google_political_ads" - table_id = "advertiser_declared_stats" - - description = "Certain California and New Zealand advertisers are required to submit additional data about themselves. The advertiser is responsible for the accuracy of this information, which Google has not confirmed. For California, this information is provided from our express notification process required for certain California advertisers, which is separate from our verification process. For New Zealand, this information is provided during our verification process." - - - - - depends_on = [ - google_bigquery_dataset.google_political_ads - ] -} - -output "bigquery_table-google_political_ads_advertiser_declared_stats-table_id" { - value = google_bigquery_table.google_political_ads_advertiser_declared_stats.table_id -} - -output "bigquery_table-google_political_ads_advertiser_declared_stats-id" { - value = google_bigquery_table.google_political_ads_advertiser_declared_stats.id -} diff --git a/datasets/google_political_ads/infra/advertiser_geo_spend_pipeline.tf b/datasets/google_political_ads/infra/advertiser_geo_spend_pipeline.tf deleted file mode 100644 index 46b557ea6..000000000 --- a/datasets/google_political_ads/infra/advertiser_geo_spend_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -resource "google_bigquery_table" "google_political_ads_advertiser_geo_spend" { - project = var.project_id - dataset_id = "google_political_ads" - table_id = "advertiser_geo_spend" - - description = "This file contains total US advertiser spend on political ads, per US state and the District of Columbia." - - - - - depends_on = [ - google_bigquery_dataset.google_political_ads - ] -} - -output "bigquery_table-google_political_ads_advertiser_geo_spend-table_id" { - value = google_bigquery_table.google_political_ads_advertiser_geo_spend.table_id -} - -output "bigquery_table-google_political_ads_advertiser_geo_spend-id" { - value = google_bigquery_table.google_political_ads_advertiser_geo_spend.id -} diff --git a/datasets/google_political_ads/infra/advertiser_stats_pipeline.tf b/datasets/google_political_ads/infra/advertiser_stats_pipeline.tf deleted file mode 100644 index 4fdd80d62..000000000 --- a/datasets/google_political_ads/infra/advertiser_stats_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "google_political_ads_advertiser_stats" { - project = var.project_id - dataset_id = "google_political_ads" - table_id = "advertiser_stats" - - description = "This table contains the information about advertisers who have run an election ad on Google Ads Services with at least one impression. The table\u0027s primary key is advertiser_id. This table relates to the others in this dataset, with the following connections between columns: advertiser_id is referenced from: advertiser_weekly_spend.advertiser_id campaign_targeting.advertiser_id creative_stats.advertiser_id advertiser_name is referenced from: advertiser_weekly_spend.advertiser_name campaign_targeting.advertiser_name advertiser_id.advertiser_name" - - - - - depends_on = [ - google_bigquery_dataset.google_political_ads - ] -} - -output "bigquery_table-google_political_ads_advertiser_stats-table_id" { - value = google_bigquery_table.google_political_ads_advertiser_stats.table_id -} - -output "bigquery_table-google_political_ads_advertiser_stats-id" { - value = google_bigquery_table.google_political_ads_advertiser_stats.id -} diff --git a/datasets/google_political_ads/infra/advertiser_weekly_spend_pipeline.tf b/datasets/google_political_ads/infra/advertiser_weekly_spend_pipeline.tf deleted file mode 100644 index 754fe926c..000000000 --- a/datasets/google_political_ads/infra/advertiser_weekly_spend_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -resource "google_bigquery_table" "google_political_ads_advertiser_weekly_spend" { - project = var.project_id - dataset_id = "google_political_ads" - table_id = "advertiser_weekly_spend" - - description = "This table contains the information for how much an advertiser spent on political ads during a given week. The table\u0027s primary key is advertiser_id, election_cycle, week_start_date" - - - - - depends_on = [ - google_bigquery_dataset.google_political_ads - ] -} - -output "bigquery_table-google_political_ads_advertiser_weekly_spend-table_id" { - value = google_bigquery_table.google_political_ads_advertiser_weekly_spend.table_id -} - -output "bigquery_table-google_political_ads_advertiser_weekly_spend-id" { - value = google_bigquery_table.google_political_ads_advertiser_weekly_spend.id -} diff --git a/datasets/google_political_ads/infra/campaign_targeting_pipeline.tf b/datasets/google_political_ads/infra/campaign_targeting_pipeline.tf deleted file mode 100644 index b0f3608b2..000000000 --- a/datasets/google_political_ads/infra/campaign_targeting_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "google_political_ads_campaign_targeting" { - project = var.project_id - dataset_id = "google_political_ads" - table_id = "campaign_targeting" - - description = "This table was deprecated and ad-level targeting information was made available in the `google_political_ads.creative_stats` BigQuery table, effective April 2020. This table contains the information related to ad campaigns run by advertisers." - - - - - depends_on = [ - google_bigquery_dataset.google_political_ads - ] -} - -output "bigquery_table-google_political_ads_campaign_targeting-table_id" { - value = google_bigquery_table.google_political_ads_campaign_targeting.table_id -} - -output "bigquery_table-google_political_ads_campaign_targeting-id" { - value = google_bigquery_table.google_political_ads_campaign_targeting.id -} diff --git a/datasets/google_political_ads/infra/creative_stats_pipeline.tf b/datasets/google_political_ads/infra/creative_stats_pipeline.tf deleted file mode 100644 index e16f16aae..000000000 --- a/datasets/google_political_ads/infra/creative_stats_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "google_political_ads_creative_stats" { - project = var.project_id - dataset_id = "google_political_ads" - table_id = "creative_stats" - - description = "This table contains the information for election ads that have appeared on Google Ads Services. Ad-level targeting data was added to this file in April 2020. ad_id is referenced from: campaign_targeting.ads_list Data that was previously available in the `google_political_ads.campaign_targeting` table has been deprecated and removed in favor of this table." - - - - - depends_on = [ - google_bigquery_dataset.google_political_ads - ] -} - -output "bigquery_table-google_political_ads_creative_stats-table_id" { - value = google_bigquery_table.google_political_ads_creative_stats.table_id -} - -output "bigquery_table-google_political_ads_creative_stats-id" { - value = google_bigquery_table.google_political_ads_creative_stats.id -} diff --git a/datasets/google_political_ads/infra/geo_spend_pipeline.tf b/datasets/google_political_ads/infra/geo_spend_pipeline.tf deleted file mode 100644 index 9a9729f21..000000000 --- a/datasets/google_political_ads/infra/geo_spend_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -resource "google_bigquery_table" "google_political_ads_geo_spend" { - project = var.project_id - dataset_id = "google_political_ads" - table_id = "geo_spend" - - description = "This table contains the information for how much is spent buying election ads on Google Ads Services. The data is aggregated by Congressional district. The primary key is state, congressional_district." - - - - - depends_on = [ - google_bigquery_dataset.google_political_ads - ] -} - -output "bigquery_table-google_political_ads_geo_spend-table_id" { - value = google_bigquery_table.google_political_ads_geo_spend.table_id -} - -output "bigquery_table-google_political_ads_geo_spend-id" { - value = google_bigquery_table.google_political_ads_geo_spend.id -} diff --git a/datasets/google_political_ads/infra/google_political_ads_dataset.tf b/datasets/google_political_ads/infra/google_political_ads_dataset.tf index 8727b6317..b8c5381d7 100644 --- a/datasets/google_political_ads/infra/google_political_ads_dataset.tf +++ b/datasets/google_political_ads/infra/google_political_ads_dataset.tf @@ -18,7 +18,7 @@ resource "google_bigquery_dataset" "google_political_ads" { dataset_id = "google_political_ads" project = var.project_id - description = "Overview: This dataset contains information on how much money is spent by verified advertisers on political advertising across Google Ad Services. In addition, insights on demographic targeting used in political ad campaigns by these advertisers are also provided. Finally, links to the actual political ad in the Google Transparency Report (https://transparencyreport.google.com/) are provided. Data for an election expires 7 years after the election. 
After this point, the data are removed from the dataset and are no longer available.\n\nUpdate frequency: Weekly\n\nDataset source: Transparency Report: Political Advertising on Google\n\nTerms of use:\n\nSee the GCP Marketplace listing for more details and sample queries: https://console.cloud.google.com/marketplace/details/transparency-report/google-political-ads\n\nFor more information see:\nThe Political Advertising on Google Transparency Report at\nhttps://transparencyreport.google.com/political-ads/home\n\nThe supporting Frequently Asked Questions at\nhttps://support.google.com/transparencyreport/answer/9575640?hl=en\u0026ref_topic=7295796" + description = "Overview: This dataset contains information on how much money is spent by verified advertisers on political advertising across Google Ad Services. In addition, insights on demographic targeting used in political ad campaigns by these advertisers are also provided. Finally, links to the actual political ad in the Google Transparency Report (https://adstransparency.google.com) are provided. Data for an election expires 7 years after the election. 
After this point, the data are removed from the dataset and are no longer available.\n\nUpdate frequency: Daily\n\nDataset source: Transparency Report: Political Advertising on Google\n\nTerms of use:\n\nSee the GCP Marketplace listing for more details and sample queries: https://console.cloud.google.com/marketplace/details/transparency-report/google-political-ads\n\nFor more information see:\nThe Political Advertising on Google Transparency Report at\nhttps://adstransparency.google.com\n\nThe supporting Frequently Asked Questions at\nhttps://support.google.com/transparencyreport/answer/9575640?hl=en\u0026ref_topic=7295796" } output "bigquery_dataset-google_political_ads-dataset_id" { diff --git a/datasets/google_political_ads/infra/last_updated_pipeline.tf b/datasets/google_political_ads/infra/last_updated_pipeline.tf deleted file mode 100644 index f8ee6b800..000000000 --- a/datasets/google_political_ads/infra/last_updated_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -resource "google_bigquery_table" "google_political_ads_last_updated" { - project = var.project_id - dataset_id = "google_political_ads" - table_id = "last_updated" - - description = "This table contains the information of the latest updated date for the Political Ads report. All dates provided are per UTC time zone." 
- - - - - depends_on = [ - google_bigquery_dataset.google_political_ads - ] -} - -output "bigquery_table-google_political_ads_last_updated-table_id" { - value = google_bigquery_table.google_political_ads_last_updated.table_id -} - -output "bigquery_table-google_political_ads_last_updated-id" { - value = google_bigquery_table.google_political_ads_last_updated.id -} diff --git a/datasets/google_political_ads/infra/process_csvs_and_load_to_bq_pipeline.tf b/datasets/google_political_ads/infra/process_csvs_and_load_to_bq_pipeline.tf new file mode 100644 index 000000000..9833c1852 --- /dev/null +++ b/datasets/google_political_ads/infra/process_csvs_and_load_to_bq_pipeline.tf @@ -0,0 +1,178 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +resource "google_bigquery_table" "google_political_ads_advertiser_declared_stats" { + project = var.project_id + dataset_id = "google_political_ads" + table_id = "advertiser_declared_stats" + description = "Certain California and New Zealand advertisers are required to submit additional data about themselves. The advertiser is responsible for the accuracy of this information, which Google has not confirmed. For California, this information is provided from our express notification process required for certain California advertisers, which is separate from our verification process. For New Zealand, this information is provided during our verification process." 
+ depends_on = [ + google_bigquery_dataset.google_political_ads + ] +} + +output "bigquery_table-google_political_ads_advertiser_declared_stats-table_id" { + value = google_bigquery_table.google_political_ads_advertiser_declared_stats.table_id +} + +output "bigquery_table-google_political_ads_advertiser_declared_stats-id" { + value = google_bigquery_table.google_political_ads_advertiser_declared_stats.id +} + +resource "google_bigquery_table" "google_political_ads_advertiser_geo_spend" { + project = var.project_id + dataset_id = "google_political_ads" + table_id = "advertiser_geo_spend" + description = "This file contains total US advertiser spend on political ads, per US state and the District of Columbia." + depends_on = [ + google_bigquery_dataset.google_political_ads + ] +} + +output "bigquery_table-google_political_ads_advertiser_geo_spend-table_id" { + value = google_bigquery_table.google_political_ads_advertiser_geo_spend.table_id +} + +output "bigquery_table-google_political_ads_advertiser_geo_spend-id" { + value = google_bigquery_table.google_political_ads_advertiser_geo_spend.id +} + +resource "google_bigquery_table" "google_political_ads_advertiser_stats" { + project = var.project_id + dataset_id = "google_political_ads" + table_id = "advertiser_stats" + description = "This table contains the information about advertisers who have run an election ad on Google Ads Services with at least one impression. The table\u0027s primary key is advertiser_id. 
This table relates to the others in this dataset, with the following connections between columns: advertiser_id is referenced from: advertiser_weekly_spend.advertiser_id campaign_targeting.advertiser_id creative_stats.advertiser_id advertiser_name is referenced from: advertiser_weekly_spend.advertiser_name campaign_targeting.advertiser_name advertiser_id.advertiser_name" + depends_on = [ + google_bigquery_dataset.google_political_ads + ] +} + +output "bigquery_table-google_political_ads_advertiser_stats-table_id" { + value = google_bigquery_table.google_political_ads_advertiser_stats.table_id +} + +output "bigquery_table-google_political_ads_advertiser_stats-id" { + value = google_bigquery_table.google_political_ads_advertiser_stats.id +} + +resource "google_bigquery_table" "google_political_ads_advertiser_weekly_spend" { + project = var.project_id + dataset_id = "google_political_ads" + table_id = "advertiser_weekly_spend" + description = "This table contains the information for how much an advertiser spent on political ads during a given week. The table\u0027s primary key is advertiser_id, election_cycle, week_start_date" + depends_on = [ + google_bigquery_dataset.google_political_ads + ] +} + +output "bigquery_table-google_political_ads_advertiser_weekly_spend-table_id" { + value = google_bigquery_table.google_political_ads_advertiser_weekly_spend.table_id +} + +output "bigquery_table-google_political_ads_advertiser_weekly_spend-id" { + value = google_bigquery_table.google_political_ads_advertiser_weekly_spend.id +} + +resource "google_bigquery_table" "google_political_ads_campaign_targeting" { + project = var.project_id + dataset_id = "google_political_ads" + table_id = "campaign_targeting" + description = "This table was deprecated and ad-level targeting information was made available in the `google_political_ads.creative_stats` BigQuery table, effective April 2020. This table contains the information related to ad campaigns run by advertisers." 
+ depends_on = [ + google_bigquery_dataset.google_political_ads + ] +} + +output "bigquery_table-google_political_ads_campaign_targeting-table_id" { + value = google_bigquery_table.google_political_ads_campaign_targeting.table_id +} + +output "bigquery_table-google_political_ads_campaign_targeting-id" { + value = google_bigquery_table.google_political_ads_campaign_targeting.id +} + +resource "google_bigquery_table" "google_political_ads_creative_stats" { + project = var.project_id + dataset_id = "google_political_ads" + table_id = "creative_stats" + description = "This table contains the information for election ads that have appeared on Google Ads Services. Ad-level targeting data was added to this file in April 2020. ad_id is referenced from: campaign_targeting.ads_list Data that was previously available in the `google_political_ads.campaign_targeting` table has been deprecated and removed in favor of this table." + depends_on = [ + google_bigquery_dataset.google_political_ads + ] +} + +output "bigquery_table-google_political_ads_creative_stats-table_id" { + value = google_bigquery_table.google_political_ads_creative_stats.table_id +} + +output "bigquery_table-google_political_ads_creative_stats-id" { + value = google_bigquery_table.google_political_ads_creative_stats.id +} + +resource "google_bigquery_table" "google_political_ads_geo_spend" { + project = var.project_id + dataset_id = "google_political_ads" + table_id = "geo_spend" + description = "This table contains the information for how much is spent buying election ads on Google Ads Services. The data is aggregated by Congressional district. The primary key is state, congressional_district." 
+ depends_on = [ + google_bigquery_dataset.google_political_ads + ] +} + +output "bigquery_table-google_political_ads_geo_spend-table_id" { + value = google_bigquery_table.google_political_ads_geo_spend.table_id +} + +output "bigquery_table-google_political_ads_geo_spend-id" { + value = google_bigquery_table.google_political_ads_geo_spend.id +} + +resource "google_bigquery_table" "google_political_ads_last_updated" { + project = var.project_id + dataset_id = "google_political_ads" + table_id = "last_updated" + description = "This table contains the information of the latest updated date for the Political Ads report. All dates provided are per UTC time zone." + depends_on = [ + google_bigquery_dataset.google_political_ads + ] +} + +output "bigquery_table-google_political_ads_last_updated-table_id" { + value = google_bigquery_table.google_political_ads_last_updated.table_id +} + +output "bigquery_table-google_political_ads_last_updated-id" { + value = google_bigquery_table.google_political_ads_last_updated.id +} + +resource "google_bigquery_table" "google_political_ads_top_keywords_history" { + project = var.project_id + dataset_id = "google_political_ads" + table_id = "top_keywords_history" + description = "The \"Top Keywords\" section of the US report was removed and updates to this table were terminated in December 2019. The table reflects historical data. This table contains the information for the top six keywords on which political advertisers have spent money during an election cycle. This data is only provided for US elections. The primary key is election_cycle, report_date." 
+ depends_on = [ + google_bigquery_dataset.google_political_ads + ] +} + +output "bigquery_table-google_political_ads_top_keywords_history-table_id" { + value = google_bigquery_table.google_political_ads_top_keywords_history.table_id +} + +output "bigquery_table-google_political_ads_top_keywords_history-id" { + value = google_bigquery_table.google_political_ads_top_keywords_history.id +} diff --git a/datasets/google_political_ads/infra/top_keywords_history_pipeline.tf b/datasets/google_political_ads/infra/top_keywords_history_pipeline.tf deleted file mode 100644 index 7b0752a48..000000000 --- a/datasets/google_political_ads/infra/top_keywords_history_pipeline.tf +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright 2021 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -resource "google_bigquery_table" "google_political_ads_top_keywords_history" { - project = var.project_id - dataset_id = "google_political_ads" - table_id = "top_keywords_history" - - description = "The \u201cTop Keywords\u201d section of the US report was removed and updates to this table were terminated in December 2019. The table reflects historical data. This table contains the information for the top six keywords on which political advertisers have spent money during an election cycle. This data is only provided for US elections. The primary key is election_cycle, report_date." 
- - - - - depends_on = [ - google_bigquery_dataset.google_political_ads - ] -} - -output "bigquery_table-google_political_ads_top_keywords_history-table_id" { - value = google_bigquery_table.google_political_ads_top_keywords_history.table_id -} - -output "bigquery_table-google_political_ads_top_keywords_history-id" { - value = google_bigquery_table.google_political_ads_top_keywords_history.id -} diff --git a/datasets/google_political_ads/pipelines/_images/run_csv_transform_kub/csv_transform.py b/datasets/google_political_ads/pipelines/_images/run_csv_transform_kub/csv_transform.py index d66a13317..ab36b8b2c 100644 --- a/datasets/google_political_ads/pipelines/_images/run_csv_transform_kub/csv_transform.py +++ b/datasets/google_political_ads/pipelines/_images/run_csv_transform_kub/csv_transform.py @@ -14,75 +14,92 @@ import datetime -import fnmatch import json import logging -import math import os import pathlib import typing from zipfile import ZipFile import pandas as pd -import requests from google.cloud import storage +SPEND_RANGE_COLUMNS = [ + "spend_range_max_usd", + "spend_range_max_eur", + "spend_range_max_inr", + "spend_range_max_bgn", + "spend_range_max_hrk", + "spend_range_max_czk", + "spend_range_max_dkk", + "spend_range_max_huf", + "spend_range_max_pln", + "spend_range_max_ron", + "spend_range_max_gbp", + "spend_range_max_sek", + "spend_range_max_nzd", +] + +NUMERIC_COLUMNS = [ + "spend_usd", + "spend_eur", + "spend_inr", + "spend_bgn", + "spend_hrk", + "spend_czk", + "spend_dkk", + "spend_huf", + "spend_pln", + "spend_ron", + "spend_gbp", + "spend_sek", + "spend_nzd", +] + def main( - source_url: str, - source_file: pathlib.Path, - source_csv_name: str, + source_bucket: str, + source_object: str, + zip_file: pathlib.Path, + csv_file: str, target_file: pathlib.Path, target_gcs_bucket: str, target_gcs_path: str, headers: typing.List[str], rename_mappings: dict, - pipeline_name: str, + table_name: str, ) -> None: logging.info( - f"google political ads 
{pipeline_name} process started at " + f"google political ads {table_name} process started at " + str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) ) logging.info("creating 'files' folder") pathlib.Path("./files").mkdir(parents=True, exist_ok=True) - logging.info(f"Downloading file {source_url}") - download_file(source_url, source_file) + logging.info(f"Downloading file gs://{source_bucket}/{source_object}") + download_blob(source_bucket, source_object, zip_file) - logging.info(f"Opening file {source_file}") - df = read_csv_file(source_file, source_csv_name) + logging.info(f"Opening file {zip_file}") + df = read_csv_file(zip_file, csv_file) - logging.info(f"Transforming.. {source_file}") + logging.info(f"Transforming.. {csv_file}") - logging.info(f"Transform: Rename columns for {pipeline_name}..") + logging.info(f"Transform: Rename columns for {table_name}..") rename_headers(df, rename_mappings) - if pipeline_name == "creative_stats": - logging.info(f"Transform: converting to integer for {pipeline_name}..") - df["spend_range_max_usd"] = df["spend_range_max_usd"].apply(convert_to_int) - df["spend_range_max_eur"] = df["spend_range_max_eur"].apply(convert_to_int) - df["spend_range_max_inr"] = df["spend_range_max_inr"].apply(convert_to_int) - df["spend_range_max_bgn"] = df["spend_range_max_bgn"].apply(convert_to_int) - df["spend_range_max_hrk"] = df["spend_range_max_hrk"].apply(convert_to_int) - df["spend_range_max_czk"] = df["spend_range_max_czk"].apply(convert_to_int) - df["spend_range_max_dkk"] = df["spend_range_max_dkk"].apply(convert_to_int) - df["spend_range_max_huf"] = df["spend_range_max_huf"].apply(convert_to_int) - df["spend_range_max_pln"] = df["spend_range_max_pln"].apply(convert_to_int) - df["spend_range_max_ron"] = df["spend_range_max_ron"].apply(convert_to_int) - df["spend_range_max_gbp"] = df["spend_range_max_gbp"].apply(convert_to_int) - df["spend_range_max_sek"] = df["spend_range_max_sek"].apply(convert_to_int) - df["spend_range_max_nzd"] = 
df["spend_range_max_nzd"].apply(convert_to_int) - else: - df = df - - logging.info(f"Transform: Reordering headers for {pipeline_name}.. ") + if table_name == "creative_stats": + logging.info(f"Transform: converting to integer for {table_name}..") + for col in SPEND_RANGE_COLUMNS: + df = convert_to_int(df, col) + + logging.info(f"Transform: Reordering headers for {table_name}.. ") df = df[headers] logging.info(f"Saving to output file.. {target_file}") try: - save_to_new_file(df, file_path=str(target_file)) + save_to_new_file(df, str(target_file), table_name) except Exception as e: logging.error(f"Error saving output file: {e}.") @@ -92,66 +109,60 @@ def main( upload_file_to_gcs(target_file, target_gcs_bucket, target_gcs_path) logging.info( - f"Google Political Ads {pipeline_name} process completed at " + f"Google Political Ads {table_name} process completed at " + str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) ) -def save_to_new_file(df: pd.DataFrame, file_path: str) -> None: - df.to_csv(file_path, index=False) +def save_to_new_file(df: pd.DataFrame, file_path: str, table_name: str) -> None: + if table_name != "creative_stats" and "spend_usd" in df: + for column in NUMERIC_COLUMNS: + df[column] = pd.to_numeric(df[column]).astype(int) + df.to_csv(file_path, index=False, chunksize=10000) def upload_file_to_gcs(file_path: pathlib.Path, gcs_bucket: str, gcs_path: str) -> None: storage_client = storage.Client() bucket = storage_client.bucket(gcs_bucket) - blob = bucket.blob(gcs_path) + blob = bucket.blob(gcs_path, chunk_size=1000 * (2 ** 18)) blob.upload_from_filename(file_path) -def download_file(source_url: str, source_file: pathlib.Path) -> None: - logging.info(f"Downloading {source_url} into {source_file}") - r = requests.get(source_url, stream=True) - if r.status_code == 200: - with open(source_file, "wb") as f: - for chunk in r: - f.write(chunk) - else: - logging.error(f"Couldn't download {source_url}: {r.text}") +def download_blob(bucket, object, 
target_file): + """Downloads a blob from the bucket.""" + storage_client = storage.Client() + bucket = storage_client.bucket(bucket) + blob = bucket.blob(object) + blob.download_to_filename(target_file) def read_csv_file(source_file: pathlib.Path, source_csv_name: str) -> pd.DataFrame: with ZipFile(source_file) as zipfiles: - file_list = zipfiles.namelist() - csv_files = fnmatch.filter(file_list, source_csv_name) - data = [pd.read_csv(zipfiles.open(file_name)) for file_name in csv_files] - df = pd.concat(data) - return df + return pd.read_csv(zipfiles.open(source_csv_name), dtype=object) def rename_headers(df: pd.DataFrame, rename_mappings: dict) -> None: df.rename(columns=rename_mappings, inplace=True) -def convert_to_int(input: str) -> str: - str_val = "" - if input == "" or (math.isnan(input)): - str_val = "" - else: - str_val = str(int(round(input, 0))) - return str_val +def convert_to_int(df: pd.DataFrame, column_name: str) -> pd.DataFrame: + df[column_name] = df[column_name].fillna(0) + df[column_name] = df[column_name].astype(int) + return df if __name__ == "__main__": logging.getLogger().setLevel(logging.INFO) main( - source_url=os.environ["SOURCE_URL"], - source_file=pathlib.Path(os.environ["SOURCE_FILE"]).expanduser(), - source_csv_name=os.environ["FILE_NAME"], + source_bucket=os.environ["SOURCE_GCS_BUCKET"], + source_object=os.environ["SOURCE_GCS_OBJECT"], + zip_file=pathlib.Path(os.environ["ZIP_FILE"]).expanduser(), + csv_file=os.environ["CSV_FILE"], target_file=pathlib.Path(os.environ["TARGET_FILE"]).expanduser(), target_gcs_bucket=os.environ["TARGET_GCS_BUCKET"], target_gcs_path=os.environ["TARGET_GCS_PATH"], headers=json.loads(os.environ["CSV_HEADERS"]), rename_mappings=json.loads(os.environ["RENAME_MAPPINGS"]), - pipeline_name=os.environ["PIPELINE_NAME"], + table_name=os.environ["TABLE_NAME"], ) diff --git a/datasets/google_political_ads/pipelines/advertiser_declared_stats/advertiser_declared_stats_dag.py 
b/datasets/google_political_ads/pipelines/advertiser_declared_stats/advertiser_declared_stats_dag.py deleted file mode 100644 index 6490f5c0b..000000000 --- a/datasets/google_political_ads/pipelines/advertiser_declared_stats/advertiser_declared_stats_dag.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="google_political_ads.advertiser_declared_stats", - default_args=default_args, - max_active_runs=1, - schedule_interval="@daily", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - advertiser_declared_stats_transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="advertiser_declared_stats_transform_csv", - startup_timeout_seconds=600, - name="advertiser_declared_stats", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip", - "SOURCE_FILE": "files/data.zip", - "FILE_NAME": 
"google-political-ads-transparency-bundle/*advertiser-declared-stats*", - "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/google_political_ads/advertiser_declared_stats/data_output.csv", - "PIPELINE_NAME": "advertiser_declared_stats", - "CSV_HEADERS": '["advertiser_id","advertiser_declared_name","advertiser_declared_regulatory_id","advertiser_declared_scope","advertiser_declared_promoter_name","advertiser_declared_promoter_address"]', - "RENAME_MAPPINGS": '{"Advertiser_ID" : "advertiser_id","Advertiser_Declared_Name" : "advertiser_declared_name","Advertiser_Declared_Regulatory_ID" : "advertiser_declared_regulatory_id","Advertiser_Declared_Scope" : "advertiser_declared_scope","Advertiser_Declared_Promoter_Name" : "advertiser_declared_promoter_name","Advertiser_Declared_Promoter_Address" : "advertiser_declared_promoter_address"}', - }, - resources={ - "request_memory": "2G", - "request_cpu": "1", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_advertiser_declared_stats_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_advertiser_declared_stats_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/google_political_ads/advertiser_declared_stats/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="google_political_ads.advertiser_declared_stats", - skip_leading_rows=1, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "advertiser_id", - "type": "string", - "description": "ID of the advertiser who purchased the ad.", - "mode": "nullable", - }, - { - "name": "advertiser_declared_name", - "type": "string", - "description": "The advertiser’s committee declared name.", - "mode": "nullable", - }, - { - "name": "advertiser_declared_regulatory_id", - "type": "string", - "description": "Committee declared identification number.", - "mode": "nullable", - }, - { - 
"name": "advertiser_declared_scope", - "type": "string", - "description": "Committee-provided information about the candidate and office or ballot proposition and jurisdiction to which the advertisement refers which is separate from our verification process.", - "mode": "nullable", - }, - { - "name": "advertiser_declared_promoter_name", - "type": "string", - "description": "The New Zealand advertiser’s declared Promoter Statement name.", - "mode": "nullable", - }, - { - "name": "advertiser_declared_promoter_address", - "type": "string", - "description": "The New Zealand advertiser’s declared Promoter Statement address.", - "mode": "nullable", - }, - ], - ) - - advertiser_declared_stats_transform_csv >> load_advertiser_declared_stats_to_bq diff --git a/datasets/google_political_ads/pipelines/advertiser_declared_stats/pipeline.yaml b/datasets/google_political_ads/pipelines/advertiser_declared_stats/pipeline.yaml deleted file mode 100644 index 0ba5c548a..000000000 --- a/datasets/google_political_ads/pipelines/advertiser_declared_stats/pipeline.yaml +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -resources: - - - type: bigquery_table - # Required Properties: - table_id: advertiser_declared_stats - - # Description of the table - description: "Certain California and New Zealand advertisers are required to submit additional data about themselves. 
The advertiser is responsible for the accuracy of this information, which Google has not confirmed. For California, this information is provided from our express notification process required for certain California advertisers, which is separate from our verification process. For New Zealand, this information is provided during our verification process." - -dag: - airflow_version: 2 - initialize: - dag_id: advertiser_declared_stats - default_args: - owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded - depends_on_past: False - start_date: "2021-03-01" - max_active_runs: 1 - schedule_interval: "@daily" - catchup: False - default_view: graph - - tasks: - - operator: "KubernetesPodOperator" - - # Task description - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "advertiser_declared_stats_transform_csv" - - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id - name: "advertiser_declared_stats" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment"s resources without starving other pipelines. - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. - image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. 
- env_vars: - SOURCE_URL: "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip" - SOURCE_FILE: "files/data.zip" - FILE_NAME: "google-political-ads-transparency-bundle/*advertiser-declared-stats*" - TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/google_political_ads/advertiser_declared_stats/data_output.csv" - PIPELINE_NAME: "advertiser_declared_stats" - CSV_HEADERS: >- - ["advertiser_id","advertiser_declared_name","advertiser_declared_regulatory_id","advertiser_declared_scope","advertiser_declared_promoter_name","advertiser_declared_promoter_address"] - RENAME_MAPPINGS: >- - {"Advertiser_ID" : "advertiser_id","Advertiser_Declared_Name" : "advertiser_declared_name","Advertiser_Declared_Regulatory_ID" : "advertiser_declared_regulatory_id","Advertiser_Declared_Scope" : "advertiser_declared_scope","Advertiser_Declared_Promoter_Name" : "advertiser_declared_promoter_name","Advertiser_Declared_Promoter_Address" : "advertiser_declared_promoter_address"} - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_advertiser_declared_stats_to_bq" - - # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file - source_objects: ["data/google_political_ads/advertiser_declared_stats/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "google_political_ads.advertiser_declared_stats" - - # Use this if your CSV file contains a header row - skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition - write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - - schema_fields: - - name: "advertiser_id" - type: "string" - description: "ID of the advertiser who purchased the ad." - mode: "nullable" - - name: "advertiser_declared_name" - type: "string" - description: "The advertiser’s committee declared name." - mode: "nullable" - - name: "advertiser_declared_regulatory_id" - type: "string" - description: "Committee declared identification number." - mode: "nullable" - - name: "advertiser_declared_scope" - type: "string" - description: "Committee-provided information about the candidate and office or ballot proposition and jurisdiction to which the advertisement refers which is separate from our verification process." - mode: "nullable" - - name: "advertiser_declared_promoter_name" - type: "string" - description: "The New Zealand advertiser’s declared Promoter Statement name." - mode: "nullable" - - name: "advertiser_declared_promoter_address" - type: "string" - description: "The New Zealand advertiser’s declared Promoter Statement address." 
- mode: "nullable" - graph_paths: - - "advertiser_declared_stats_transform_csv >> load_advertiser_declared_stats_to_bq" diff --git a/datasets/google_political_ads/pipelines/advertiser_geo_spend/advertiser_geo_spend_dag.py b/datasets/google_political_ads/pipelines/advertiser_geo_spend/advertiser_geo_spend_dag.py deleted file mode 100644 index 8c50a38d1..000000000 --- a/datasets/google_political_ads/pipelines/advertiser_geo_spend/advertiser_geo_spend_dag.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="google_political_ads.advertiser_geo_spend", - default_args=default_args, - max_active_runs=1, - schedule_interval="@daily", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - advertiser_geo_spend_transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="advertiser_geo_spend_transform_csv", - startup_timeout_seconds=600, - name="advertiser_geo_spend", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip", - "SOURCE_FILE": "files/data.zip", - "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-advertiser-geo-spend.csv", - "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/google_political_ads/advertiser_geo_spend/data_output.csv", - "PIPELINE_NAME": "advertiser_geo_spend", - "CSV_HEADERS": '["advertiser_id","advertiser_name","country","country_subdivision_primary","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"]', - "RENAME_MAPPINGS": '{"Advertiser_ID" : "advertiser_id" ,"Advertiser_Name" : "advertiser_name" ,"Country" : "country" ,"Country_Subdivision_Primary" : "country_subdivision_primary" ,"Spend_USD" : "spend_usd" ,"Spend_EUR" : "spend_eur" ,"Spend_INR" : "spend_inr" ,"Spend_BGN" : "spend_bgn" ,"Spend_HRK" : "spend_hrk" ,"Spend_CZK" : "spend_czk" ,"Spend_DKK" : 
"spend_dkk" ,"Spend_HUF" : "spend_huf" ,"Spend_PLN" : "spend_pln" ,"Spend_RON" : "spend_ron" ,"Spend_SEK" : "spend_sek" ,"Spend_GBP" : "spend_gbp" ,"Spend_NZD" : "spend_nzd"}', - }, - resources={ - "request_memory": "2G", - "request_cpu": "1", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_advertiser_geo_spend_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_advertiser_geo_spend_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/google_political_ads/advertiser_geo_spend/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="google_political_ads.advertiser_geo_spend", - skip_leading_rows=1, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "advertiser_id", - "type": "string", - "description": "Unique ID for an advertiser verified to run election ads on Google Ads Services.", - "mode": "nullable", - }, - { - "name": "advertiser_name", - "type": "string", - "description": "Name of the advertiser.", - "mode": "nullable", - }, - { - "name": "country", - "type": "string", - "description": 'The country where election ads were served specified in the ISO 3166-1 alpha-2 standard code. For example: "US" for United States.', - "mode": "nullable", - }, - { - "name": "country_subdivision_primary", - "type": "string", - "description": 'The primary subdivision of the country where election ads were served specified by the ISO 3166-2 standard code. 
For example: "US-CA" for California state in United States', - "mode": "nullable", - }, - { - "name": "spend_usd", - "type": "integer", - "description": "Total amount in USD spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_eur", - "type": "integer", - "description": "Total amount in EUR spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_inr", - "type": "integer", - "description": "Total amount in INR spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_bgn", - "type": "integer", - "description": "Total amount in BGN spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_hrk", - "type": "integer", - "description": "Total amount in HRK spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_czk", - "type": "integer", - "description": "Total amount in CZK spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_dkk", - "type": "integer", - "description": "Total amount in DKK spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_huf", - "type": "integer", - "description": "Total amount in HUF spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_pln", - "type": "integer", - "description": "Total amount in PLN spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_ron", - "type": "integer", - "description": "Total amount in RON spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_sek", - "type": "integer", - "description": "Total amount in SEK spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_gbp", - "type": "integer", - "description": "Total amount in GBP spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_nzd", - "type": "integer", - "description": "Total 
amount in NZD spent on election ads in this region.", - "mode": "nullable", - }, - ], - ) - - advertiser_geo_spend_transform_csv >> load_advertiser_geo_spend_to_bq diff --git a/datasets/google_political_ads/pipelines/advertiser_geo_spend/pipeline.yaml b/datasets/google_political_ads/pipelines/advertiser_geo_spend/pipeline.yaml deleted file mode 100644 index f280301d9..000000000 --- a/datasets/google_political_ads/pipelines/advertiser_geo_spend/pipeline.yaml +++ /dev/null @@ -1,179 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -resources: - - - type: bigquery_table - # Required Properties: - table_id: advertiser_geo_spend - - # Description of the table - description: "This file contains total US advertiser spend on political ads, per US state and the District of Columbia." - -dag: - airflow_version: 2 - initialize: - dag_id: advertiser_geo_spend - default_args: - owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded - depends_on_past: False - start_date: "2021-03-01" - max_active_runs: 1 - schedule_interval: "@daily" - catchup: False - default_view: graph - - tasks: - - operator: "KubernetesPodOperator" - - # Task description - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "advertiser_geo_spend_transform_csv" - - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. 
This will be used (plus a random suffix) to generate a pod id - name: "advertiser_geo_spend" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment"s resources without starving other pipelines. - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. - image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. - env_vars: - SOURCE_URL: "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip" - SOURCE_FILE: "files/data.zip" - FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-advertiser-geo-spend.csv" - TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/google_political_ads/advertiser_geo_spend/data_output.csv" - PIPELINE_NAME: "advertiser_geo_spend" - CSV_HEADERS: >- - ["advertiser_id","advertiser_name","country","country_subdivision_primary","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"] - RENAME_MAPPINGS: >- - {"Advertiser_ID" : "advertiser_id" ,"Advertiser_Name" : "advertiser_name" ,"Country" : "country" ,"Country_Subdivision_Primary" : "country_subdivision_primary" ,"Spend_USD" : "spend_usd" ,"Spend_EUR" : "spend_eur" ,"Spend_INR" : "spend_inr" ,"Spend_BGN" : "spend_bgn" ,"Spend_HRK" : "spend_hrk" ,"Spend_CZK" : "spend_czk" ,"Spend_DKK" : "spend_dkk" ,"Spend_HUF" : 
"spend_huf" ,"Spend_PLN" : "spend_pln" ,"Spend_RON" : "spend_ron" ,"Spend_SEK" : "spend_sek" ,"Spend_GBP" : "spend_gbp" ,"Spend_NZD" : "spend_nzd"} - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_advertiser_geo_spend_to_bq" - - # The GCS bucket where the CSV file is located in. - bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file - source_objects: ["data/google_political_ads/advertiser_geo_spend/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "google_political_ads.advertiser_geo_spend" - - # Use this if your CSV file contains a header row - skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition - write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - - schema_fields: - - name: "advertiser_id" - type: "string" - description: "Unique ID for an advertiser verified to run election ads on Google Ads Services." - mode: "nullable" - - name: "advertiser_name" - type: "string" - description: "Name of the advertiser." - mode: "nullable" - - name: "country" - type: "string" - description: "The country where election ads were served specified in the ISO 3166-1 alpha-2 standard code. For example: \"US\" for United States." 
- mode: "nullable" - - name: "country_subdivision_primary" - type: "string" - description: "The primary subdivision of the country where election ads were served specified by the ISO 3166-2 standard code. For example: \"US-CA\" for California state in United States" - mode: "nullable" - - name: "spend_usd" - type: "integer" - description: "Total amount in USD spent on election ads in this region." - mode: "nullable" - - name: "spend_eur" - type: "integer" - description: "Total amount in EUR spent on election ads in this region." - mode: "nullable" - - name: "spend_inr" - type: "integer" - description: "Total amount in INR spent on election ads in this region." - mode: "nullable" - - name: "spend_bgn" - type: "integer" - description: "Total amount in BGN spent on election ads in this region." - mode: "nullable" - - name: "spend_hrk" - type: "integer" - description: "Total amount in HRK spent on election ads in this region." - mode: "nullable" - - name: "spend_czk" - type: "integer" - description: "Total amount in CZK spent on election ads in this region." - mode: "nullable" - - name: "spend_dkk" - type: "integer" - description: "Total amount in DKK spent on election ads in this region." - mode: "nullable" - - name: "spend_huf" - type: "integer" - description: "Total amount in HUF spent on election ads in this region." - mode: "nullable" - - name: "spend_pln" - type: "integer" - description: "Total amount in PLN spent on election ads in this region." - mode: "nullable" - - name: "spend_ron" - type: "integer" - description: "Total amount in RON spent on election ads in this region." - mode: "nullable" - - name: "spend_sek" - type: "integer" - description: "Total amount in SEK spent on election ads in this region." - mode: "nullable" - - name: "spend_gbp" - type: "integer" - description: "Total amount in GBP spent on election ads in this region." 
- mode: "nullable" - - name: "spend_nzd" - type: "integer" - description: "Total amount in NZD spent on election ads in this region." - mode: "nullable" - graph_paths: - - "advertiser_geo_spend_transform_csv >> load_advertiser_geo_spend_to_bq" diff --git a/datasets/google_political_ads/pipelines/advertiser_stats/advertiser_stats_dag.py b/datasets/google_political_ads/pipelines/advertiser_stats/advertiser_stats_dag.py deleted file mode 100644 index bfdac42a1..000000000 --- a/datasets/google_political_ads/pipelines/advertiser_stats/advertiser_stats_dag.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="google_political_ads.advertiser_stats", - default_args=default_args, - max_active_runs=1, - schedule_interval="@daily", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - advertiser_stats_transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="advertiser_stats_transform_csv", - startup_timeout_seconds=600, - name="advertiser_stats", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip", - "SOURCE_FILE": "files/data.zip", - "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-advertiser-stats.csv", - "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/google_political_ads/advertiser_stats/data_output.csv", - "PIPELINE_NAME": "advertiser_stats", - "CSV_HEADERS": '["advertiser_id","advertiser_name","public_ids_list","regions","elections","total_creatives","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"]', - "RENAME_MAPPINGS": '{"Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name","Public_IDs_List": "public_ids_list","Regions": "regions","Elections": "elections","Total_Creatives": "total_creatives","Spend_USD": "spend_usd","Spend_EUR": "spend_eur","Spend_INR": "spend_inr","Spend_BGN": "spend_bgn","Spend_HRK": "spend_hrk","Spend_CZK": "spend_czk","Spend_DKK": 
"spend_dkk","Spend_HUF": "spend_huf","Spend_PLN": "spend_pln","Spend_RON": "spend_ron","Spend_SEK": "spend_sek","Spend_GBP": "spend_gbp","Spend_NZD": "spend_nzd"}', - }, - resources={ - "request_memory": "2G", - "request_cpu": "1", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_advertiser_stats_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_advertiser_stats_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=["data/google_political_ads/advertiser_stats/data_output.csv"], - source_format="CSV", - destination_project_dataset_table="google_political_ads.advertiser_stats", - skip_leading_rows=1, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "advertiser_id", - "type": "string", - "description": "Unique ID for an advertiser verified to run election ads on Google Ads Services.", - "mode": "nullable", - }, - { - "name": "advertiser_name", - "type": "string", - "description": "Name of advertiser.", - "mode": "nullable", - }, - { - "name": "public_ids_list", - "type": "string", - "description": "List of public IDs used to identify the advertiser if available.", - "mode": "nullable", - }, - { - "name": "regions", - "type": "string", - "description": "The list of regions where the ads of this advertiser were served", - "mode": "nullable", - }, - { - "name": "elections", - "type": "string", - "description": "The list of elections that this advertiser participated in based on the regions.", - "mode": "nullable", - }, - { - "name": "total_creatives", - "type": "integer", - "description": "Total number of election ads the advertiser ran with at least one impression.", - "mode": "nullable", - }, - { - "name": "spend_usd", - "type": "integer", - "description": "Total amount in USD spent on election ads by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_eur", - "type": "integer", - "description": "Total amount in EUR spent on election ads by the 
advertiser.", - "mode": "nullable", - }, - { - "name": "spend_inr", - "type": "integer", - "description": "Total amount in INR spent on election ads by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_bgn", - "type": "integer", - "description": "Total amount in BGN spent on election ads by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_hrk", - "type": "integer", - "description": "Total amount in HRK spent on election ads by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_czk", - "type": "integer", - "description": "Total amount in CZK spent on election ads by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_dkk", - "type": "integer", - "description": "Total amount in DKK spent on election ads by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_huf", - "type": "integer", - "description": "Total amount in HUF spent on election ads by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_pln", - "type": "integer", - "description": "Total amount in PLN spent on election ads by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_ron", - "type": "integer", - "description": "Total amount in RON spent on election ads by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_sek", - "type": "integer", - "description": "Total amount in SEK spent on election ads by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_gbp", - "type": "integer", - "description": "Total amount in GBP spent on election ads by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_nzd", - "type": "integer", - "description": "Total amount in NZD spent on election ads by the advertiser.", - "mode": "nullable", - }, - ], - ) - - advertiser_stats_transform_csv >> load_advertiser_stats_to_bq diff --git a/datasets/google_political_ads/pipelines/advertiser_stats/pipeline.yaml b/datasets/google_political_ads/pipelines/advertiser_stats/pipeline.yaml deleted 
file mode 100644 index 65dc7b17c..000000000 --- a/datasets/google_political_ads/pipelines/advertiser_stats/pipeline.yaml +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -resources: - - - type: bigquery_table - # Required Properties: - table_id: advertiser_stats - - # Description of the table - description: "This table contains the information about advertisers who have run an election ad on Google Ads Services with at least one impression. The table's primary key is advertiser_id. 
This table relates to the others in this dataset, with the following connections between columns: advertiser_id is referenced from: advertiser_weekly_spend.advertiser_id campaign_targeting.advertiser_id creative_stats.advertiser_id advertiser_name is referenced from: advertiser_weekly_spend.advertiser_name campaign_targeting.advertiser_name advertiser_id.advertiser_name" - -dag: - airflow_version: 2 - initialize: - dag_id: advertiser_stats - default_args: - owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "@daily" - catchup: False - default_view: graph - - tasks: - - operator: "KubernetesPodOperator" - - # Task description - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "advertiser_stats_transform_csv" - - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id - name: "advertiser_stats" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. - image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. 
- env_vars: - SOURCE_URL: "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip" - SOURCE_FILE: "files/data.zip" - FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-advertiser-stats.csv" - TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/google_political_ads/advertiser_stats/data_output.csv" - PIPELINE_NAME: "advertiser_stats" - CSV_HEADERS: >- - ["advertiser_id","advertiser_name","public_ids_list","regions","elections","total_creatives","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"] - RENAME_MAPPINGS: >- - {"Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name","Public_IDs_List": "public_ids_list","Regions": "regions","Elections": "elections","Total_Creatives": "total_creatives","Spend_USD": "spend_usd","Spend_EUR": "spend_eur","Spend_INR": "spend_inr","Spend_BGN": "spend_bgn","Spend_HRK": "spend_hrk","Spend_CZK": "spend_czk","Spend_DKK": "spend_dkk","Spend_HUF": "spend_huf","Spend_PLN": "spend_pln","Spend_RON": "spend_ron","Spend_SEK": "spend_sek","Spend_GBP": "spend_gbp","Spend_NZD": "spend_nzd"} - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_advertiser_stats_to_bq" - - # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file - source_objects: ["data/google_political_ads/advertiser_stats/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "google_political_ads.advertiser_stats" - - # Use this if your CSV file contains a header row - skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition - write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - - schema_fields: - - name: "advertiser_id" - type: "string" - description: "Unique ID for an advertiser verified to run election ads on Google Ads Services." - mode: "nullable" - - name: "advertiser_name" - type: "string" - description: "Name of advertiser." - mode: "nullable" - - name: "public_ids_list" - type: "string" - description: "List of public IDs used to identify the advertiser if available." - mode: "nullable" - - name: "regions" - type: "string" - description: "The list of regions where the ads of this advertiser were served" - mode: "nullable" - - name: "elections" - type: "string" - description: "The list of elections that this advertiser participated in based on the regions." - mode: "nullable" - - name: "total_creatives" - type: "integer" - description: "Total number of election ads the advertiser ran with at least one impression." - mode: "nullable" - - name: "spend_usd" - type: "integer" - description: "Total amount in USD spent on election ads by the advertiser." - mode: "nullable" - - name: "spend_eur" - type: "integer" - description: "Total amount in EUR spent on election ads by the advertiser." 
- mode: "nullable" - - name: "spend_inr" - type: "integer" - description: "Total amount in INR spent on election ads by the advertiser." - mode: "nullable" - - name: "spend_bgn" - type: "integer" - description: "Total amount in BGN spent on election ads by the advertiser." - mode: "nullable" - - name: "spend_hrk" - type: "integer" - description: "Total amount in HRK spent on election ads by the advertiser." - mode: "nullable" - - name: "spend_czk" - type: "integer" - description: "Total amount in CZK spent on election ads by the advertiser." - mode: "nullable" - - name: "spend_dkk" - type: "integer" - description: "Total amount in DKK spent on election ads by the advertiser." - mode: "nullable" - - name: "spend_huf" - type: "integer" - description: "Total amount in HUF spent on election ads by the advertiser." - mode: "nullable" - - name: "spend_pln" - type: "integer" - description: "Total amount in PLN spent on election ads by the advertiser." - mode: "nullable" - - name: "spend_ron" - type: "integer" - description: "Total amount in RON spent on election ads by the advertiser." - mode: "nullable" - - name: "spend_sek" - type: "integer" - description: "Total amount in SEK spent on election ads by the advertiser." - mode: "nullable" - - name: "spend_gbp" - type: "integer" - description: "Total amount in GBP spent on election ads by the advertiser." - mode: "nullable" - - name: "spend_nzd" - type: "integer" - description: "Total amount in NZD spent on election ads by the advertiser." 
- mode: "nullable" - graph_paths: - - "advertiser_stats_transform_csv >> load_advertiser_stats_to_bq" diff --git a/datasets/google_political_ads/pipelines/advertiser_weekly_spend/advertiser_weekly_spend_dag.py b/datasets/google_political_ads/pipelines/advertiser_weekly_spend/advertiser_weekly_spend_dag.py deleted file mode 100644 index e47f27663..000000000 --- a/datasets/google_political_ads/pipelines/advertiser_weekly_spend/advertiser_weekly_spend_dag.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="google_political_ads.advertiser_weekly_spend", - default_args=default_args, - max_active_runs=1, - schedule_interval="@daily", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - advertiser_weekly_spend_transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="advertiser_weekly_spend_transform_csv", - startup_timeout_seconds=600, - name="advertiser_weekly_spend", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip", - "SOURCE_FILE": "files/data.zip", - "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-advertiser-weekly-spend.csv", - "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/google_political_ads/advertiser_weekly_spend/data_output.csv", - "PIPELINE_NAME": "advertiser_weekly_spend", - "CSV_HEADERS": '["advertiser_id","advertiser_name","election_cycle","week_start_date","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"]', - "RENAME_MAPPINGS": '{"Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name","Election_Cycle": "election_cycle","Week_Start_Date": "week_start_date","Spend_USD": "spend_usd","Spend_EUR": "spend_eur","Spend_INR": "spend_inr","Spend_BGN": "spend_bgn","Spend_HRK": "spend_hrk","Spend_CZK": "spend_czk","Spend_DKK": 
"spend_dkk","Spend_HUF": "spend_huf","Spend_PLN": "spend_pln","Spend_RON": "spend_ron","Spend_SEK": "spend_sek","Spend_GBP": "spend_gbp","Spend_NZD": "spend_nzd"}', - }, - resources={ - "request_memory": "2G", - "request_cpu": "1", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_advertiser_weekly_spend_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_advertiser_weekly_spend_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/google_political_ads/advertiser_weekly_spend/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="google_political_ads.advertiser_weekly_spend", - skip_leading_rows=1, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "advertiser_id", - "type": "string", - "description": "Unique ID for an advertiser verified to run election ads on Google Ads Services.", - "mode": "nullable", - }, - { - "name": "advertiser_name", - "type": "string", - "description": "Name of advertiser.", - "mode": "nullable", - }, - { - "name": "election_cycle", - "type": "string", - "description": "[DEPRECATED] This field is deprecated in favor of the Elections column in advertiser_stats table. 
It will be deleted some time after July 2019.", - "mode": "nullable", - }, - { - "name": "week_start_date", - "type": "date", - "description": "The start date for the week where spending occurred.", - "mode": "nullable", - }, - { - "name": "spend_usd", - "type": "integer", - "description": "The amount in USD spent on election ads during the given week by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_eur", - "type": "integer", - "description": "The amount in EUR spent on election ads during the given week by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_inr", - "type": "integer", - "description": "The amount in INR spent on election ads during the given week by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_bgn", - "type": "integer", - "description": "The amount in BGN spent on election ads during the given week by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_hrk", - "type": "integer", - "description": "The amount in HRK spent on election ads during the given week by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_czk", - "type": "integer", - "description": "The amount in CZK spent on election ads during the given week by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_dkk", - "type": "integer", - "description": "The amount in DKK spent on election ads during the given week by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_huf", - "type": "integer", - "description": "The amount in HUF spent on election ads during the given week by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_pln", - "type": "integer", - "description": "The amount in PLN spent on election ads during the given week by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_ron", - "type": "integer", - "description": "The amount in RON spent on election ads during the given week by the advertiser.", - "mode": "nullable", - }, - { - "name": 
"spend_sek", - "type": "integer", - "description": "The amount in SEK spent on election ads during the given week by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_gbp", - "type": "integer", - "description": "The amount in GBP spent on election ads during the given week by the advertiser.", - "mode": "nullable", - }, - { - "name": "spend_nzd", - "type": "integer", - "description": "The amount in NZD spent on election ads during the given week by the advertiser.", - "mode": "nullable", - }, - ], - ) - - advertiser_weekly_spend_transform_csv >> load_advertiser_weekly_spend_to_bq diff --git a/datasets/google_political_ads/pipelines/advertiser_weekly_spend/pipeline.yaml b/datasets/google_political_ads/pipelines/advertiser_weekly_spend/pipeline.yaml deleted file mode 100644 index a4c01a513..000000000 --- a/datasets/google_political_ads/pipelines/advertiser_weekly_spend/pipeline.yaml +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -resources: - - - type: bigquery_table - # Required Properties: - table_id: advertiser_weekly_spend - - # Description of the table - description: "This table contains the information for how much an advertiser spent on political ads during a given week. 
The table's primary key is advertiser_id, election_cycle, week_start_date" - -dag: - airflow_version: 2 - initialize: - dag_id: advertiser_weekly_spend - default_args: - owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "@daily" - catchup: False - default_view: graph - - tasks: - - operator: "KubernetesPodOperator" - - # Task description - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "advertiser_weekly_spend_transform_csv" - - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id - name: "advertiser_weekly_spend" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. - image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. 
- env_vars: - SOURCE_URL: "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip" - SOURCE_FILE: "files/data.zip" - FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-advertiser-weekly-spend.csv" - TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/google_political_ads/advertiser_weekly_spend/data_output.csv" - PIPELINE_NAME: "advertiser_weekly_spend" - CSV_HEADERS: >- - ["advertiser_id","advertiser_name","election_cycle","week_start_date","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"] - RENAME_MAPPINGS: >- - {"Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name","Election_Cycle": "election_cycle","Week_Start_Date": "week_start_date","Spend_USD": "spend_usd","Spend_EUR": "spend_eur","Spend_INR": "spend_inr","Spend_BGN": "spend_bgn","Spend_HRK": "spend_hrk","Spend_CZK": "spend_czk","Spend_DKK": "spend_dkk","Spend_HUF": "spend_huf","Spend_PLN": "spend_pln","Spend_RON": "spend_ron","Spend_SEK": "spend_sek","Spend_GBP": "spend_gbp","Spend_NZD": "spend_nzd"} - - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_advertiser_weekly_spend_to_bq" - - # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file - source_objects: ["data/google_political_ads/advertiser_weekly_spend/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "google_political_ads.advertiser_weekly_spend" - - # Use this if your CSV file contains a header row - skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition - write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - - schema_fields: - - name: "advertiser_id" - type: "string" - description: "Unique ID for an advertiser verified to run election ads on Google Ads Services." - mode: "nullable" - - name: "advertiser_name" - type: "string" - description: "Name of advertiser." - mode: "nullable" - - name: "election_cycle" - type: "string" - description: "[DEPRECATED] This field is deprecated in favor of the Elections column in advertiser_stats table. It will be deleted some time after July 2019." - mode: "nullable" - - name: "week_start_date" - type: "date" - description: "The start date for the week where spending occurred." - mode: "nullable" - - name: "spend_usd" - type: "integer" - description: "The amount in USD spent on election ads during the given week by the advertiser." - mode: "nullable" - - name: "spend_eur" - type: "integer" - description: "The amount in EUR spent on election ads during the given week by the advertiser." - mode: "nullable" - - name: "spend_inr" - type: "integer" - description: "The amount in INR spent on election ads during the given week by the advertiser." 
- mode: "nullable" - - name: "spend_bgn" - type: "integer" - description: "The amount in BGN spent on election ads during the given week by the advertiser." - mode: "nullable" - - name: "spend_hrk" - type: "integer" - description: "The amount in HRK spent on election ads during the given week by the advertiser." - mode: "nullable" - - name: "spend_czk" - type: "integer" - description: "The amount in CZK spent on election ads during the given week by the advertiser." - mode: "nullable" - - name: "spend_dkk" - type: "integer" - description: "The amount in DKK spent on election ads during the given week by the advertiser." - mode: "nullable" - - name: "spend_huf" - type: "integer" - description: "The amount in HUF spent on election ads during the given week by the advertiser." - mode: "nullable" - - name: "spend_pln" - type: "integer" - description: "The amount in PLN spent on election ads during the given week by the advertiser." - mode: "nullable" - - name: "spend_ron" - type: "integer" - description: "The amount in RON spent on election ads during the given week by the advertiser." - mode: "nullable" - - name: "spend_sek" - type: "integer" - description: "The amount in SEK spent on election ads during the given week by the advertiser." - mode: "nullable" - - name: "spend_gbp" - type: "integer" - description: "The amount in GBP spent on election ads during the given week by the advertiser." - mode: "nullable" - - name: "spend_nzd" - type: "integer" - description: "The amount in NZD spent on election ads during the given week by the advertiser." 
- mode: "nullable" - graph_paths: - - "advertiser_weekly_spend_transform_csv >> load_advertiser_weekly_spend_to_bq" diff --git a/datasets/google_political_ads/pipelines/campaign_targeting/campaign_targeting_dag.py b/datasets/google_political_ads/pipelines/campaign_targeting/campaign_targeting_dag.py deleted file mode 100644 index 992e7b5a0..000000000 --- a/datasets/google_political_ads/pipelines/campaign_targeting/campaign_targeting_dag.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="google_political_ads.campaign_targeting", - default_args=default_args, - max_active_runs=1, - schedule_interval="@daily", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - campaign_targeting_transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="campaign_targeting_transform_csv", - startup_timeout_seconds=600, - name="campaign_targeting", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip", - "SOURCE_FILE": "files/data.zip", - "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-campaign-targeting.csv", - "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/google_political_ads/campaign_targeting/data_output.csv", - "PIPELINE_NAME": "campaign_targeting", - "CSV_HEADERS": '["campaign_id","age_targeting","gender_targeting","geo_targeting_included","geo_targeting_excluded","start_date","end_date","ads_list","advertiser_id","advertiser_name"]', - "RENAME_MAPPINGS": '{"Campaign_ID": "campaign_id","Age_Targeting": "age_targeting","Gender_Targeting": "gender_targeting","Geo_Targeting_Included": "geo_targeting_included","Geo_Targeting_Excluded": "geo_targeting_excluded","Start_Date": "start_date","End_Date": "end_date","Ads_List": "ads_list","Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name"}', - }, - resources={ - "request_memory": "2G", - "request_cpu": "1", - 
"request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_campaign_targeting_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_campaign_targeting_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=["data/google_political_ads/campaign_targeting/data_output.csv"], - source_format="CSV", - destination_project_dataset_table="google_political_ads.campaign_targeting", - skip_leading_rows=1, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "campaign_id", - "type": "string", - "description": "[DEPRECATED] Unique ID for a political ad campaign.", - "mode": "nullable", - }, - { - "name": "age_targeting", - "type": "string", - "description": "[DEPRECATED] Age ranges included in the campaign's targeting.", - "mode": "nullable", - }, - { - "name": "gender_targeting", - "type": "string", - "description": "[DEPRECATED] Genders included in the campaign's targeting", - "mode": "nullable", - }, - { - "name": "geo_targeting_included", - "type": "string", - "description": "[DEPRECATED] Geographic locations included in the campaign's targeting.", - "mode": "nullable", - }, - { - "name": "geo_targeting_excluded", - "type": "string", - "description": "[DEPRECATED] Geographic locations excluded from the campaign's targeting.", - "mode": "nullable", - }, - { - "name": "start_date", - "type": "date", - "description": "[DEPRECATED] Start date for the campaign.", - "mode": "nullable", - }, - { - "name": "end_date", - "type": "date", - "description": "[DEPRECATED] End date for the campaign.", - "mode": "nullable", - }, - { - "name": "ads_list", - "type": "string", - "description": "[DEPRECATED] List of Ad_IDs for the campaign.", - "mode": "nullable", - }, - { - "name": "advertiser_id", - "type": "string", - "description": "[DEPRECATED] ID of the advertiser who purchased the ad.", - "mode": "nullable", - }, - { - "name": "advertiser_name", - "type": "string", - "description": "[DEPRECATED] Name of 
advertiser.", - "mode": "nullable", - }, - ], - ) - - campaign_targeting_transform_csv >> load_campaign_targeting_to_bq diff --git a/datasets/google_political_ads/pipelines/campaign_targeting/pipeline.yaml b/datasets/google_political_ads/pipelines/campaign_targeting/pipeline.yaml deleted file mode 100644 index 3dac0a737..000000000 --- a/datasets/google_political_ads/pipelines/campaign_targeting/pipeline.yaml +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -resources: - - - type: bigquery_table - # Required Properties: - table_id: campaign_targeting - - # Description of the table - description: "This table was deprecated and ad-level targeting information was made available in the `google_political_ads.creative_stats` BigQuery table, effective April 2020. This table contains the information related to ad campaigns run by advertisers." 
- -dag: - airflow_version: 2 - initialize: - dag_id: campaign_targeting - default_args: - owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "@daily" - catchup: False - default_view: graph - - tasks: - - operator: "KubernetesPodOperator" - - # Task description - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "campaign_targeting_transform_csv" - - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id - name: "campaign_targeting" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. - image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. 
- env_vars: - SOURCE_URL: "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip" - SOURCE_FILE: "files/data.zip" - FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-campaign-targeting.csv" - TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/google_political_ads/campaign_targeting/data_output.csv" - PIPELINE_NAME: "campaign_targeting" - CSV_HEADERS: >- - ["campaign_id","age_targeting","gender_targeting","geo_targeting_included","geo_targeting_excluded","start_date","end_date","ads_list","advertiser_id","advertiser_name"] - RENAME_MAPPINGS: >- - {"Campaign_ID": "campaign_id","Age_Targeting": "age_targeting","Gender_Targeting": "gender_targeting","Geo_Targeting_Included": "geo_targeting_included","Geo_Targeting_Excluded": "geo_targeting_excluded","Start_Date": "start_date","End_Date": "end_date","Ads_List": "ads_list","Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name"} - - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_campaign_targeting_to_bq" - - # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file - source_objects: ["data/google_political_ads/campaign_targeting/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "google_political_ads.campaign_targeting" - - # Use this if your CSV file contains a header row - skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition - write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - - schema_fields: - - name: "campaign_id" - type: "string" - description: "[DEPRECATED] Unique ID for a political ad campaign." - mode: "nullable" - - name: "age_targeting" - type: "string" - description: "[DEPRECATED] Age ranges included in the campaign's targeting." - mode: "nullable" - - name: "gender_targeting" - type: "string" - description: "[DEPRECATED] Genders included in the campaign's targeting" - mode: "nullable" - - name: "geo_targeting_included" - type: "string" - description: "[DEPRECATED] Geographic locations included in the campaign's targeting." - mode: "nullable" - - name: "geo_targeting_excluded" - type: "string" - description: "[DEPRECATED] Geographic locations excluded from the campaign's targeting." - mode: "nullable" - - name: "start_date" - type: "date" - description: "[DEPRECATED] Start date for the campaign." - mode: "nullable" - - name: "end_date" - type: "date" - description: "[DEPRECATED] End date for the campaign." - mode: "nullable" - - name: "ads_list" - type: "string" - description: "[DEPRECATED] List of Ad_IDs for the campaign." 
- mode: "nullable" - - name: "advertiser_id" - type: "string" - description: "[DEPRECATED] ID of the advertiser who purchased the ad." - mode: "nullable" - - name: "advertiser_name" - type: "string" - description: "[DEPRECATED] Name of advertiser." - mode: "nullable" - graph_paths: - - "campaign_targeting_transform_csv >> load_campaign_targeting_to_bq" diff --git a/datasets/google_political_ads/pipelines/creative_stats/creative_stats_dag.py b/datasets/google_political_ads/pipelines/creative_stats/creative_stats_dag.py deleted file mode 100644 index 72e436671..000000000 --- a/datasets/google_political_ads/pipelines/creative_stats/creative_stats_dag.py +++ /dev/null @@ -1,340 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="google_political_ads.creative_stats", - default_args=default_args, - max_active_runs=1, - schedule_interval="@daily", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - creative_stats_transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="creative_stats_transform_csv", - startup_timeout_seconds=600, - name="creative_stats", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip", - "SOURCE_FILE": "files/data.zip", - "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-creative-stats.csv", - "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/google_political_ads/creative_stats/data_output.csv", - "PIPELINE_NAME": "creative_stats", - "CSV_HEADERS": 
'["ad_id","ad_url","ad_type","regions","advertiser_id","advertiser_name","ad_campaigns_list","date_range_start","date_range_end","num_of_days","impressions","spend_usd","first_served_timestamp","last_served_timestamp","age_targeting","gender_targeting","geo_targeting_included","geo_targeting_excluded","spend_range_min_usd","spend_range_max_usd","spend_range_min_eur","spend_range_max_eur","spend_range_min_inr","spend_range_max_inr","spend_range_min_bgn","spend_range_max_bgn","spend_range_min_hrk","spend_range_max_hrk","spend_range_min_czk","spend_range_max_czk","spend_range_min_dkk","spend_range_max_dkk","spend_range_min_huf","spend_range_max_huf","spend_range_min_pln","spend_range_max_pln","spend_range_min_ron","spend_range_max_ron","spend_range_min_sek","spend_range_max_sek","spend_range_min_gbp","spend_range_max_gbp","spend_range_min_nzd","spend_range_max_nzd"]', - "RENAME_MAPPINGS": '{"Ad_ID": "ad_id","Ad_URL": "ad_url","Ad_Type": "ad_type","Regions": "regions","Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name","Ad_Campaigns_List": "ad_campaigns_list","Date_Range_Start": "date_range_start","Date_Range_End": "date_range_end","Num_of_Days": "num_of_days","Impressions": "impressions","Spend_USD": "spend_usd","Spend_Range_Min_USD": "spend_range_min_usd","Spend_Range_Max_USD": "spend_range_max_usd","Spend_Range_Min_EUR": "spend_range_min_eur","Spend_Range_Max_EUR": "spend_range_max_eur","Spend_Range_Min_INR": "spend_range_min_inr","Spend_Range_Max_INR": "spend_range_max_inr","Spend_Range_Min_BGN": "spend_range_min_bgn","Spend_Range_Max_BGN": "spend_range_max_bgn","Spend_Range_Min_HRK": "spend_range_min_hrk","Spend_Range_Max_HRK": "spend_range_max_hrk","Spend_Range_Min_CZK": "spend_range_min_czk","Spend_Range_Max_CZK": "spend_range_max_czk","Spend_Range_Min_DKK": "spend_range_min_dkk","Spend_Range_Max_DKK": "spend_range_max_dkk","Spend_Range_Min_HUF": "spend_range_min_huf","Spend_Range_Max_HUF": "spend_range_max_huf","Spend_Range_Min_PLN": 
"spend_range_min_pln","Spend_Range_Max_PLN": "spend_range_max_pln","Spend_Range_Min_RON": "spend_range_min_ron","Spend_Range_Max_RON": "spend_range_max_ron","Spend_Range_Min_SEK": "spend_range_min_sek","Spend_Range_Max_SEK": "spend_range_max_sek","Spend_Range_Min_GBP": "spend_range_min_gbp","Spend_Range_Max_GBP": "spend_range_max_gbp","Spend_Range_Min_NZD": "spend_range_min_nzd","Spend_Range_Max_NZD": "spend_range_max_nzd","Age_Targeting": "age_targeting","Gender_Targeting": "gender_targeting","Geo_Targeting_Included": "geo_targeting_included","Geo_Targeting_Excluded": "geo_targeting_excluded","First_Served_Timestamp": "first_served_timestamp","Last_Served_Timestamp": "last_served_timestamp"}', - }, - resources={ - "request_memory": "8G", - "request_cpu": "2", - "request_ephemeral_storage": "10G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_creative_stats_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_creative_stats_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=["data/google_political_ads/creative_stats/data_output.csv"], - source_format="CSV", - destination_project_dataset_table="google_political_ads.creative_stats", - skip_leading_rows=1, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "ad_id", - "type": "string", - "description": "Unique id for a specific election ad.", - "mode": "nullable", - }, - { - "name": "ad_url", - "type": "string", - "description": "URL to view the election ad in the election Advertising on Google report.", - "mode": "nullable", - }, - { - "name": "ad_type", - "type": "string", - "description": "The type of the ad. 
Can be TEXT VIDEO or IMAGE.", - "mode": "nullable", - }, - { - "name": "regions", - "type": "string", - "description": "The regions that this ad is verified for or were served in.", - "mode": "nullable", - }, - { - "name": "advertiser_id", - "type": "string", - "description": "ID of the advertiser who purchased the ad.", - "mode": "nullable", - }, - { - "name": "advertiser_name", - "type": "string", - "description": "Name of advertiser.", - "mode": "nullable", - }, - { - "name": "ad_campaigns_list", - "type": "string", - "description": "IDs of all election ad campaigns that included the ad.", - "mode": "nullable", - }, - { - "name": "date_range_start", - "type": "date", - "description": "First day a election ad ran and had an impression.", - "mode": "nullable", - }, - { - "name": "date_range_end", - "type": "date", - "description": "Most recent day a election ad ran and had an impression.", - "mode": "nullable", - }, - { - "name": "num_of_days", - "type": "integer", - "description": "Total number of days a election ad ran and had an impression.", - "mode": "nullable", - }, - { - "name": "impressions", - "type": "string", - "description": "Number of impressions for the election ad. 
Impressions are grouped into several buckets ≤ 10k 10k-100k 100k-1M 1M-10M > 10M.", - "mode": "nullable", - }, - { - "name": "spend_usd", - "type": "string", - "description": "[DEPRECATED] This field is deprecated in favor of specifying the lower and higher spend bucket bounds in separate Spend_Range_Min and Spend_Range_Max columns.", - "mode": "nullable", - }, - { - "name": "first_served_timestamp", - "type": "timestamp", - "description": "The timestamp of the earliest impression for this ad.", - "mode": "nullable", - }, - { - "name": "last_served_timestamp", - "type": "timestamp", - "description": "The timestamp of the most recent impression for this ad.", - "mode": "nullable", - }, - { - "name": "age_targeting", - "type": "string", - "description": "Age ranges included in the ad's targeting", - "mode": "nullable", - }, - { - "name": "gender_targeting", - "type": "string", - "description": "Genders included in the ad's targeting.", - "mode": "nullable", - }, - { - "name": "geo_targeting_included", - "type": "string", - "description": "Geographic locations included in the ad's targeting.", - "mode": "nullable", - }, - { - "name": "geo_targeting_excluded", - "type": "string", - "description": "Geographic locations excluded in the ad's targeting.", - "mode": "nullable", - }, - { - "name": "spend_range_min_usd", - "type": "integer", - "description": "Lower bound of the amount in USD spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_max_usd", - "type": "integer", - "description": "Upper bound of the amount in USD spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_min_eur", - "type": "integer", - "description": "Lower bound of the amount in EUR spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_max_eur", - "type": "integer", - "description": "Upper bound of the amount in EUR spent by the advertiser on the election ad.", - 
"mode": "nullable", - }, - { - "name": "spend_range_min_inr", - "type": "integer", - "description": "Lower bound of the amount in INR spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_max_inr", - "type": "integer", - "description": "Upper bound of the amount in INR spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_min_bgn", - "type": "integer", - "description": "Lower bound of the amount in BGN spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_max_bgn", - "type": "integer", - "description": "Upper bound of the amount in BGN spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_min_hrk", - "type": "integer", - "description": "Lower bound of the amount in HRK spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_max_hrk", - "type": "integer", - "description": "Upper bound of the amount in HRK spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_min_czk", - "type": "integer", - "description": "Lower bound of the amount in CZK spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_max_czk", - "type": "integer", - "description": "Upper bound of the amount in CZK spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_min_dkk", - "type": "integer", - "description": "Lower bound of the amount in DKK spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_max_dkk", - "type": "integer", - "description": "Upper bound of the amount in DKK spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_min_huf", - "type": "integer", - "description": "Lower bound of the amount in HUF spent by the advertiser on the election ad.", 
- "mode": "nullable", - }, - { - "name": "spend_range_max_huf", - "type": "integer", - "description": "Upper bound of the amount in HUF spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_min_pln", - "type": "integer", - "description": "Lower bound of the amount in PLN spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_max_pln", - "type": "integer", - "description": "Upper bound of the amount in PLN spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_min_ron", - "type": "integer", - "description": "Lower bound of the amount in RON spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_max_ron", - "type": "integer", - "description": "Upper bound of the amount in RON spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_min_sek", - "type": "integer", - "description": "Lower bound of the amount in SEK spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_max_sek", - "type": "integer", - "description": "Upper bound of the amount in SEK spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_min_gbp", - "type": "integer", - "description": "Lower bound of the amount in GBP spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_max_gbp", - "type": "integer", - "description": "Upper bound of the amount in GBP spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_min_nzd", - "type": "integer", - "description": "Lower bound of the amount in NZD spent by the advertiser on the election ad.", - "mode": "nullable", - }, - { - "name": "spend_range_max_nzd", - "type": "integer", - "description": "Upper bound of the amount in NZD spent by the advertiser on the election 
ad.", - "mode": "nullable", - }, - ], - ) - - creative_stats_transform_csv >> load_creative_stats_to_bq diff --git a/datasets/google_political_ads/pipelines/creative_stats/pipeline.yaml b/datasets/google_political_ads/pipelines/creative_stats/pipeline.yaml deleted file mode 100644 index 53e38a4bd..000000000 --- a/datasets/google_political_ads/pipelines/creative_stats/pipeline.yaml +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -resources: - - - type: bigquery_table - # Required Properties: - table_id: creative_stats - - # Description of the table - description: "This table contains the information for election ads that have appeared on Google Ads Services. Ad-level targeting data was added to this file in April 2020. ad_id is referenced from: campaign_targeting.ads_list Data that was previously available in the `google_political_ads.campaign_targeting` table has been deprecated and removed in favor of this table." 
- -dag: - airflow_version: 2 - initialize: - dag_id: creative_stats - default_args: - owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "@daily" - catchup: False - default_view: graph - - tasks: - - operator: "KubernetesPodOperator" - - # Task description - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "creative_stats_transform_csv" - - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id - name: "creative_stats" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. - image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. 
- env_vars: - SOURCE_URL: "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip" - SOURCE_FILE: "files/data.zip" - FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-creative-stats.csv" - TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/google_political_ads/creative_stats/data_output.csv" - PIPELINE_NAME: "creative_stats" - CSV_HEADERS: >- - ["ad_id","ad_url","ad_type","regions","advertiser_id","advertiser_name","ad_campaigns_list","date_range_start","date_range_end","num_of_days","impressions","spend_usd","first_served_timestamp","last_served_timestamp","age_targeting","gender_targeting","geo_targeting_included","geo_targeting_excluded","spend_range_min_usd","spend_range_max_usd","spend_range_min_eur","spend_range_max_eur","spend_range_min_inr","spend_range_max_inr","spend_range_min_bgn","spend_range_max_bgn","spend_range_min_hrk","spend_range_max_hrk","spend_range_min_czk","spend_range_max_czk","spend_range_min_dkk","spend_range_max_dkk","spend_range_min_huf","spend_range_max_huf","spend_range_min_pln","spend_range_max_pln","spend_range_min_ron","spend_range_max_ron","spend_range_min_sek","spend_range_max_sek","spend_range_min_gbp","spend_range_max_gbp","spend_range_min_nzd","spend_range_max_nzd"] - RENAME_MAPPINGS: >- - {"Ad_ID": "ad_id","Ad_URL": "ad_url","Ad_Type": "ad_type","Regions": "regions","Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name","Ad_Campaigns_List": "ad_campaigns_list","Date_Range_Start": "date_range_start","Date_Range_End": "date_range_end","Num_of_Days": "num_of_days","Impressions": "impressions","Spend_USD": "spend_usd","Spend_Range_Min_USD": "spend_range_min_usd","Spend_Range_Max_USD": "spend_range_max_usd","Spend_Range_Min_EUR": "spend_range_min_eur","Spend_Range_Max_EUR": "spend_range_max_eur","Spend_Range_Min_INR": "spend_range_min_inr","Spend_Range_Max_INR": 
"spend_range_max_inr","Spend_Range_Min_BGN": "spend_range_min_bgn","Spend_Range_Max_BGN": "spend_range_max_bgn","Spend_Range_Min_HRK": "spend_range_min_hrk","Spend_Range_Max_HRK": "spend_range_max_hrk","Spend_Range_Min_CZK": "spend_range_min_czk","Spend_Range_Max_CZK": "spend_range_max_czk","Spend_Range_Min_DKK": "spend_range_min_dkk","Spend_Range_Max_DKK": "spend_range_max_dkk","Spend_Range_Min_HUF": "spend_range_min_huf","Spend_Range_Max_HUF": "spend_range_max_huf","Spend_Range_Min_PLN": "spend_range_min_pln","Spend_Range_Max_PLN": "spend_range_max_pln","Spend_Range_Min_RON": "spend_range_min_ron","Spend_Range_Max_RON": "spend_range_max_ron","Spend_Range_Min_SEK": "spend_range_min_sek","Spend_Range_Max_SEK": "spend_range_max_sek","Spend_Range_Min_GBP": "spend_range_min_gbp","Spend_Range_Max_GBP": "spend_range_max_gbp","Spend_Range_Min_NZD": "spend_range_min_nzd","Spend_Range_Max_NZD": "spend_range_max_nzd","Age_Targeting": "age_targeting","Gender_Targeting": "gender_targeting","Geo_Targeting_Included": "geo_targeting_included","Geo_Targeting_Excluded": "geo_targeting_excluded","First_Served_Timestamp": "first_served_timestamp","Last_Served_Timestamp": "last_served_timestamp"} - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "8G" - request_cpu: "2" - request_ephemeral_storage: "10G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_creative_stats_to_bq" - - # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file - source_objects: ["data/google_political_ads/creative_stats/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "google_political_ads.creative_stats" - - # Use this if your CSV file contains a header row - skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition - write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - - schema_fields: - - name: "ad_id" - type: "string" - description: "Unique id for a specific election ad." - mode: "nullable" - - name: "ad_url" - type: "string" - description: "URL to view the election ad in the election Advertising on Google report." - mode: "nullable" - - name: "ad_type" - type: "string" - description: "The type of the ad. Can be TEXT VIDEO or IMAGE." - mode: "nullable" - - name: "regions" - type: "string" - description: "The regions that this ad is verified for or were served in." - mode: "nullable" - - name: "advertiser_id" - type: "string" - description: "ID of the advertiser who purchased the ad." - mode: "nullable" - - name: "advertiser_name" - type: "string" - description: "Name of advertiser." - mode: "nullable" - - name: "ad_campaigns_list" - type: "string" - description: "IDs of all election ad campaigns that included the ad." - mode: "nullable" - - name: "date_range_start" - type: "date" - description: "First day a election ad ran and had an impression." - mode: "nullable" - - name: "date_range_end" - type: "date" - description: "Most recent day a election ad ran and had an impression." 
- mode: "nullable" - - name: "num_of_days" - type: "integer" - description: "Total number of days a election ad ran and had an impression." - mode: "nullable" - - name: "impressions" - type: "string" - description: "Number of impressions for the election ad. Impressions are grouped into several buckets ≤ 10k 10k-100k 100k-1M 1M-10M > 10M." - mode: "nullable" - - name: "spend_usd" - type: "string" - description: "[DEPRECATED] This field is deprecated in favor of specifying the lower and higher spend bucket bounds in separate Spend_Range_Min and Spend_Range_Max columns." - mode: "nullable" - - name: "first_served_timestamp" - type: "timestamp" - description: "The timestamp of the earliest impression for this ad." - mode: "nullable" - - name: "last_served_timestamp" - type: "timestamp" - description: "The timestamp of the most recent impression for this ad." - mode: "nullable" - - name: "age_targeting" - type: "string" - description: "Age ranges included in the ad's targeting" - mode: "nullable" - - name: "gender_targeting" - type: "string" - description: "Genders included in the ad's targeting." - mode: "nullable" - - name: "geo_targeting_included" - type: "string" - description: "Geographic locations included in the ad's targeting." - mode: "nullable" - - name: "geo_targeting_excluded" - type: "string" - description: "Geographic locations excluded in the ad's targeting." - mode: "nullable" - - name: "spend_range_min_usd" - type: "integer" - description: "Lower bound of the amount in USD spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_max_usd" - type: "integer" - description: "Upper bound of the amount in USD spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_min_eur" - type: "integer" - description: "Lower bound of the amount in EUR spent by the advertiser on the election ad." 
- mode: "nullable" - - name: "spend_range_max_eur" - type: "integer" - description: "Upper bound of the amount in EUR spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_min_inr" - type: "integer" - description: "Lower bound of the amount in INR spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_max_inr" - type: "integer" - description: "Upper bound of the amount in INR spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_min_bgn" - type: "integer" - description: "Lower bound of the amount in BGN spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_max_bgn" - type: "integer" - description: "Upper bound of the amount in BGN spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_min_hrk" - type: "integer" - description: "Lower bound of the amount in HRK spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_max_hrk" - type: "integer" - description: "Upper bound of the amount in HRK spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_min_czk" - type: "integer" - description: "Lower bound of the amount in CZK spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_max_czk" - type: "integer" - description: "Upper bound of the amount in CZK spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_min_dkk" - type: "integer" - description: "Lower bound of the amount in DKK spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_max_dkk" - type: "integer" - description: "Upper bound of the amount in DKK spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_min_huf" - type: "integer" - description: "Lower bound of the amount in HUF spent by the advertiser on the election ad." 
- mode: "nullable" - - name: "spend_range_max_huf" - type: "integer" - description: "Upper bound of the amount in HUF spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_min_pln" - type: "integer" - description: "Lower bound of the amount in PLN spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_max_pln" - type: "integer" - description: "Upper bound of the amount in PLN spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_min_ron" - type: "integer" - description: "Lower bound of the amount in RON spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_max_ron" - type: "integer" - description: "Upper bound of the amount in RON spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_min_sek" - type: "integer" - description: "Lower bound of the amount in SEK spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_max_sek" - type: "integer" - description: "Upper bound of the amount in SEK spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_min_gbp" - type: "integer" - description: "Lower bound of the amount in GBP spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_max_gbp" - type: "integer" - description: "Upper bound of the amount in GBP spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_min_nzd" - type: "integer" - description: "Lower bound of the amount in NZD spent by the advertiser on the election ad." - mode: "nullable" - - name: "spend_range_max_nzd" - type: "integer" - description: "Upper bound of the amount in NZD spent by the advertiser on the election ad." 
- mode: "nullable" - graph_paths: - - "creative_stats_transform_csv >> load_creative_stats_to_bq" diff --git a/datasets/google_political_ads/pipelines/dataset.yaml b/datasets/google_political_ads/pipelines/dataset.yaml index 351017529..688891819 100644 --- a/datasets/google_political_ads/pipelines/dataset.yaml +++ b/datasets/google_political_ads/pipelines/dataset.yaml @@ -13,76 +13,29 @@ # limitations under the License. dataset: - # The `dataset` block includes properties for your dataset that will be shown - # to users of your data on the Google Cloud website. - - # Must be exactly the same name as the folder name your dataset.yaml is in. name: google_political_ads - - # A friendly, human-readable name of the dataset friendly_name: google_political_ads - - # A short, descriptive summary of the dataset. - description: |- - Overview: This dataset contains information on how much money is spent by verified advertisers on political advertising across Google Ad Services. In addition, insights on demographic targeting used in political ad campaigns by these advertisers are also provided. Finally, links to the actual political ad in the Google Transparency Report (https://transparencyreport.google.com/) are provided. Data for an election expires 7 years after the election. After this point, the data are removed from the dataset and are no longer available. 
- - Update frequency: Weekly - - Dataset source: Transparency Report: Political Advertising on Google - - Terms of use: - - See the GCP Marketplace listing for more details and sample queries: https://console.cloud.google.com/marketplace/details/transparency-report/google-political-ads - - For more information see: - The Political Advertising on Google Transparency Report at - https://transparencyreport.google.com/political-ads/home - - The supporting Frequently Asked Questions at - https://support.google.com/transparencyreport/answer/9575640?hl=en&ref_topic=7295796 - - # A list of sources the dataset is derived from, using the YAML list syntax. + description: ~ dataset_sources: ~ - - # A list of terms and conditions that users of the dataset should agree on, - # using the YAML list syntax. terms_of_use: ~ -resources: - # A list of Google Cloud resources needed by your dataset. In principle, all - # pipelines under a dataset should be able to share these resources. - # - # The currently supported resources are shown below. Use only the resources - # you need, and delete the rest as needed by your pipeline. - # - # We will keep adding to the list below to support more Google Cloud resources - # over time. If a resource you need isn't supported, please file an issue on - # the repository. +resources: - type: bigquery_dataset - # Google BigQuery dataset to namespace all tables managed by this folder - # - # Required Properties: - # dataset_id - # - # Optional Properties: - # friendly_name (A user-friendly name of the dataset) - # description (A user-friendly description of the dataset) - # location (The geographic location where the dataset should reside) dataset_id: google_political_ads description: |- - Overview: This dataset contains information on how much money is spent by verified advertisers on political advertising across Google Ad Services. In addition, insights on demographic targeting used in political ad campaigns by these advertisers are also provided. 
Finally, links to the actual political ad in the Google Transparency Report (https://transparencyreport.google.com/) are provided. Data for an election expires 7 years after the election. After this point, the data are removed from the dataset and are no longer available. + Overview: This dataset contains information on how much money is spent by verified advertisers on political advertising across Google Ad Services. In addition, insights on demographic targeting used in political ad campaigns by these advertisers are also provided. Finally, links to the actual political ad in the Google Transparency Report (https://adstransparency.google.com) are provided. Data for an election expires 7 years after the election. After this point, the data are removed from the dataset and are no longer available. - Update frequency: Weekly + Update frequency: Daily - Dataset source: Transparency Report: Political Advertising on Google + Dataset source: Transparency Report: Political Advertising on Google - Terms of use: + Terms of use: - See the GCP Marketplace listing for more details and sample queries: https://console.cloud.google.com/marketplace/details/transparency-report/google-political-ads + See the GCP Marketplace listing for more details and sample queries: https://console.cloud.google.com/marketplace/details/transparency-report/google-political-ads - For more information see: - The Political Advertising on Google Transparency Report at - https://transparencyreport.google.com/political-ads/home + For more information see: + The Political Advertising on Google Transparency Report at + https://adstransparency.google.com - The supporting Frequently Asked Questions at - https://support.google.com/transparencyreport/answer/9575640?hl=en&ref_topic=7295796 + The supporting Frequently Asked Questions at + https://support.google.com/transparencyreport/answer/9575640?hl=en&ref_topic=7295796 diff --git a/datasets/google_political_ads/pipelines/geo_spend/geo_spend_dag.py 
b/datasets/google_political_ads/pipelines/geo_spend/geo_spend_dag.py deleted file mode 100644 index 3ce7c506a..000000000 --- a/datasets/google_political_ads/pipelines/geo_spend/geo_spend_dag.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="google_political_ads.geo_spend", - default_args=default_args, - max_active_runs=1, - schedule_interval="@daily", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - geo_spend_transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="geo_spend_transform_csv", - startup_timeout_seconds=600, - name="geo_spend", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip", - "SOURCE_FILE": "files/data.zip", - "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-geo-spend.csv", - "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket 
}}", - "TARGET_GCS_PATH": "data/google_political_ads/geo_spend/data_output.csv", - "PIPELINE_NAME": "geo_spend", - "CSV_HEADERS": '["country","country_subdivision_primary","country_subdivision_secondary","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"]', - "RENAME_MAPPINGS": '{"Country": "country","Country_Subdivision_Primary": "country_subdivision_primary","Country_Subdivision_Secondary": "country_subdivision_secondary","Spend_USD": "spend_usd","Spend_EUR": "spend_eur","Spend_INR": "spend_inr","Spend_BGN": "spend_bgn","Spend_HRK": "spend_hrk","Spend_CZK": "spend_czk","Spend_DKK": "spend_dkk","Spend_HUF": "spend_huf","Spend_PLN": "spend_pln","Spend_RON": "spend_ron","Spend_SEK": "spend_sek","Spend_GBP": "spend_gbp","Spend_NZD": "spend_nzd"}', - }, - resources={ - "request_memory": "2G", - "request_cpu": "1", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_geo_spend_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_geo_spend_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=["data/google_political_ads/geo_spend/data_output.csv"], - source_format="CSV", - destination_project_dataset_table="google_political_ads.geo_spend", - skip_leading_rows=1, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "country", - "type": "string", - "description": 'The country where election ads were served specified in the ISO 3166-1 alpha-2 standard code. For example "US" for United States.', - "mode": "nullable", - }, - { - "name": "country_subdivision_primary", - "type": "string", - "description": 'The primary subdivision of the country where election ads were served specified by the ISO 3166-2 standard code. 
For example "US-CA" for California state in United States', - "mode": "nullable", - }, - { - "name": "country_subdivision_secondary", - "type": "string", - "description": "The name of the secondary subdivision. For example The name of a US congressional district.", - "mode": "nullable", - }, - { - "name": "spend_usd", - "type": "integer", - "description": "Total amount in USD spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_eur", - "type": "integer", - "description": "Total amount in EUR spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_inr", - "type": "integer", - "description": "Total amount in INR spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_bgn", - "type": "integer", - "description": "Total amount in BGN spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_hrk", - "type": "integer", - "description": "Total amount in HRK spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_czk", - "type": "integer", - "description": "Total amount in CZK spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_dkk", - "type": "integer", - "description": "Total amount in DKK spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_huf", - "type": "integer", - "description": "Total amount in HUF spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_pln", - "type": "integer", - "description": "Total amount in PLN spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_ron", - "type": "integer", - "description": "Total amount in RON spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_sek", - "type": "integer", - "description": "Total amount in SEK spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": 
"spend_gbp", - "type": "integer", - "description": "Total amount in GBP spent on election ads in this region.", - "mode": "nullable", - }, - { - "name": "spend_nzd", - "type": "integer", - "description": "Total amount in NZD spent on election ads in this region.", - "mode": "nullable", - }, - ], - ) - - geo_spend_transform_csv >> load_geo_spend_to_bq diff --git a/datasets/google_political_ads/pipelines/geo_spend/pipeline.yaml b/datasets/google_political_ads/pipelines/geo_spend/pipeline.yaml deleted file mode 100644 index ffc4b2b3d..000000000 --- a/datasets/google_political_ads/pipelines/geo_spend/pipeline.yaml +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -resources: - - - type: bigquery_table - # Required Properties: - table_id: geo_spend - - # Description of the table - description: "This table contains the information for how much is spent buying election ads on Google Ads Services. The data is aggregated by Congressional district. The primary key is state, congressional_district." 
- -dag: - airflow_version: 2 - initialize: - dag_id: geo_spend - default_args: - owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "@daily" - catchup: False - default_view: graph - - tasks: - - operator: "KubernetesPodOperator" - - # Task description - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "geo_spend_transform_csv" - - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id - name: "geo_spend" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. - image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. 
- env_vars: - SOURCE_URL: "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip" - SOURCE_FILE: "files/data.zip" - FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-geo-spend.csv" - TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/google_political_ads/geo_spend/data_output.csv" - PIPELINE_NAME: "geo_spend" - CSV_HEADERS: >- - ["country","country_subdivision_primary","country_subdivision_secondary","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"] - RENAME_MAPPINGS: >- - {"Country": "country","Country_Subdivision_Primary": "country_subdivision_primary","Country_Subdivision_Secondary": "country_subdivision_secondary","Spend_USD": "spend_usd","Spend_EUR": "spend_eur","Spend_INR": "spend_inr","Spend_BGN": "spend_bgn","Spend_HRK": "spend_hrk","Spend_CZK": "spend_czk","Spend_DKK": "spend_dkk","Spend_HUF": "spend_huf","Spend_PLN": "spend_pln","Spend_RON": "spend_ron","Spend_SEK": "spend_sek","Spend_GBP": "spend_gbp","Spend_NZD": "spend_nzd"} - - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_geo_spend_to_bq" - - # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file - source_objects: ["data/google_political_ads/geo_spend/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "google_political_ads.geo_spend" - - # Use this if your CSV file contains a header row - skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition - write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - - schema_fields: - - name: "country" - type: "string" - description: "The country where election ads were served specified in the ISO 3166-1 alpha-2 standard code. For example \"US\" for United States." - mode: "nullable" - - name: "country_subdivision_primary" - type: "string" - description: "The primary subdivision of the country where election ads were served specified by the ISO 3166-2 standard code. For example \"US-CA\" for California state in United States" - mode: "nullable" - - name: "country_subdivision_secondary" - type: "string" - description: "The name of the secondary subdivision. For example The name of a US congressional district." - mode: "nullable" - - name: "spend_usd" - type: "integer" - description: "Total amount in USD spent on election ads in this region." - mode: "nullable" - - name: "spend_eur" - type: "integer" - description: "Total amount in EUR spent on election ads in this region." - mode: "nullable" - - name: "spend_inr" - type: "integer" - description: "Total amount in INR spent on election ads in this region." - mode: "nullable" - - name: "spend_bgn" - type: "integer" - description: "Total amount in BGN spent on election ads in this region." 
- mode: "nullable" - - name: "spend_hrk" - type: "integer" - description: "Total amount in HRK spent on election ads in this region." - mode: "nullable" - - name: "spend_czk" - type: "integer" - description: "Total amount in CZK spent on election ads in this region." - mode: "nullable" - - name: "spend_dkk" - type: "integer" - description: "Total amount in DKK spent on election ads in this region." - mode: "nullable" - - name: "spend_huf" - type: "integer" - description: "Total amount in HUF spent on election ads in this region." - mode: "nullable" - - name: "spend_pln" - type: "integer" - description: "Total amount in PLN spent on election ads in this region." - mode: "nullable" - - name: "spend_ron" - type: "integer" - description: "Total amount in RON spent on election ads in this region." - mode: "nullable" - - name: "spend_sek" - type: "integer" - description: "Total amount in SEK spent on election ads in this region." - mode: "nullable" - - name: "spend_gbp" - type: "integer" - description: "Total amount in GBP spent on election ads in this region." - mode: "nullable" - - name: "spend_nzd" - type: "integer" - description: "Total amount in NZD spent on election ads in this region." - mode: "nullable" - graph_paths: - - "geo_spend_transform_csv >> load_geo_spend_to_bq" diff --git a/datasets/google_political_ads/pipelines/last_updated/last_updated_dag.py b/datasets/google_political_ads/pipelines/last_updated/last_updated_dag.py deleted file mode 100644 index 16160e4cc..000000000 --- a/datasets/google_political_ads/pipelines/last_updated/last_updated_dag.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="google_political_ads.last_updated", - default_args=default_args, - max_active_runs=1, - schedule_interval="@daily", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - last_updated_transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="last_updated_transform_csv", - startup_timeout_seconds=600, - name="last_updated", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip", - "SOURCE_FILE": "files/data.zip", - "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-updated*", - "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/google_political_ads/last_updated/data_output.csv", - "PIPELINE_NAME": "last_updated", - "CSV_HEADERS": '["report_data_updated_date"]', - "RENAME_MAPPINGS": '{"Report_Data_Updated_Date": "report_data_updated_date"}', - }, - resources={ - "request_memory": "2G", - "request_cpu": "1", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV 
data to a BigQuery table - load_last_updated_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_last_updated_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=["data/google_political_ads/last_updated/data_output.csv"], - source_format="CSV", - destination_project_dataset_table="google_political_ads.last_updated", - skip_leading_rows=1, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "report_data_updated_date", - "type": "Date", - "description": "The date the report data was most reecntly updated", - "mode": "nullable", - } - ], - ) - - last_updated_transform_csv >> load_last_updated_to_bq diff --git a/datasets/google_political_ads/pipelines/last_updated/pipeline.yaml b/datasets/google_political_ads/pipelines/last_updated/pipeline.yaml deleted file mode 100644 index 51e88f057..000000000 --- a/datasets/google_political_ads/pipelines/last_updated/pipeline.yaml +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -resources: - - - type: bigquery_table - # Required Properties: - table_id: last_updated - - # Description of the table - description: "This table contains the information of the latest updated date for the Political Ads report. All dates provided are per UTC time zone." 
- -dag: - airflow_version: 2 - initialize: - dag_id: last_updated - default_args: - owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "@daily" - catchup: False - default_view: graph - - tasks: - - operator: "KubernetesPodOperator" - - # Task description - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "last_updated_transform_csv" - - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id - name: "last_updated" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. - image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. 
- env_vars: - SOURCE_URL: "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip" - SOURCE_FILE: "files/data.zip" - FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-updated*" - TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/google_political_ads/last_updated/data_output.csv" - PIPELINE_NAME: "last_updated" - CSV_HEADERS: >- - ["report_data_updated_date"] - RENAME_MAPPINGS: >- - {"Report_Data_Updated_Date": "report_data_updated_date"} - - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_last_updated_to_bq" - - # The GCS bucket where the CSV file is located in. - bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file - source_objects: ["data/google_political_ads/last_updated/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "google_political_ads.last_updated" - - # Use this if your CSV file contains a header row - skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition - write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. 
- - schema_fields: - - name: "report_data_updated_date" - type: "Date" - description: "The date the report data was most reecntly updated" - mode: "nullable" - graph_paths: - - "last_updated_transform_csv >> load_last_updated_to_bq" diff --git a/datasets/google_political_ads/pipelines/process_csvs_and_load_to_bq/pipeline.yaml b/datasets/google_political_ads/pipelines/process_csvs_and_load_to_bq/pipeline.yaml new file mode 100644 index 000000000..822cd7de4 --- /dev/null +++ b/datasets/google_political_ads/pipelines/process_csvs_and_load_to_bq/pipeline.yaml @@ -0,0 +1,1030 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +resources: + + - type: bigquery_table + table_id: advertiser_declared_stats + description: "Certain California and New Zealand advertisers are required to submit additional data about themselves. The advertiser is responsible for the accuracy of this information, which Google has not confirmed. For California, this information is provided from our express notification process required for certain California advertisers, which is separate from our verification process. For New Zealand, this information is provided during our verification process." + + - type: bigquery_table + table_id: advertiser_geo_spend + description: "This file contains total US advertiser spend on political ads, per US state and the District of Columbia." 
+ + - type: bigquery_table + table_id: advertiser_stats + description: "This table contains the information about advertisers who have run an election ad on Google Ads Services with at least one impression. The table's primary key is advertiser_id. This table relates to the others in this dataset, with the following connections between columns: advertiser_id is referenced from: advertiser_weekly_spend.advertiser_id campaign_targeting.advertiser_id creative_stats.advertiser_id advertiser_name is referenced from: advertiser_weekly_spend.advertiser_name campaign_targeting.advertiser_name creative_stats.advertiser_name" + + - type: bigquery_table + table_id: advertiser_weekly_spend + description: "This table contains the information for how much an advertiser spent on political ads during a given week. The table's primary key is advertiser_id, election_cycle, week_start_date" + + - type: bigquery_table + table_id: campaign_targeting + description: "This table was deprecated and ad-level targeting information was made available in the `google_political_ads.creative_stats` BigQuery table, effective April 2020. This table contains the information related to ad campaigns run by advertisers." + + - type: bigquery_table + table_id: creative_stats + description: "This table contains the information for election ads that have appeared on Google Ads Services. Ad-level targeting data was added to this file in April 2020. ad_id is referenced from: campaign_targeting.ads_list Data that was previously available in the `google_political_ads.campaign_targeting` table has been deprecated and removed in favor of this table." + + - type: bigquery_table + table_id: geo_spend + description: "This table contains the information for how much is spent buying election ads on Google Ads Services. The data is aggregated by Congressional district. The primary key is state, congressional_district."
+ + - type: bigquery_table + table_id: last_updated + description: "This table contains the information of the latest updated date for the Political Ads report. All dates provided are per UTC time zone." + + - type: bigquery_table + table_id: top_keywords_history + description: "The \"Top Keywords\" section of the US report was removed and updates to this table were terminated in December 2019. The table reflects historical data. This table contains the information for the top six keywords on which political advertisers have spent money during an election cycle. This data is only provided for US elections. The primary key is election_cycle, report_date." + + +dag: + airflow_version: 2 + initialize: + dag_id: process_csvs_and_load_to_bq + default_args: + owner: "Google" + depends_on_past: False + start_date: "2021-03-01" + max_active_runs: 1 + schedule_interval: "@daily" + catchup: False + default_view: graph + + tasks: + - operator: "GoogleCloudStorageToGoogleCloudStorageOperator" + description: "Task to copy the source ZIP bundle to the Composer bucket" + args: + task_id: "download_zip_file_to_composer_bucket" + source_bucket: "political-csv" + source_object: "google-political-ads-transparency-bundle.zip" + destination_bucket: "{{ var.value.composer_bucket }}" + destination_object: "data/google_political_ads/google-political-ads-transparency-bundle.zip" + impersonation_chain: "{{ var.json.google_political_ads.service_account }}" + move_object: False + + # advertiser_declared_stats + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "transform_advertiser_declared_stats_csv" + startup_timeout_seconds: 600 + name: "advertiser_declared_stats" + namespace: "composer" + service_account_name: "datasets" + image_pull_policy: "Always" + image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_GCS_BUCKET: "{{ var.value.composer_bucket }}" + SOURCE_GCS_OBJECT:
"data/google_political_ads/google-political-ads-transparency-bundle.zip" + ZIP_FILE: "files/google-political-ads-transparency-bundle.zip" + CSV_FILE: "google-political-ads-advertiser-declared-stats.csv" + TARGET_FILE: "files/data_output.csv" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/google_political_ads/advertiser_declared_stats/data_output.csv" + TABLE_NAME: "advertiser_declared_stats" + CSV_HEADERS: >- + ["advertiser_id","advertiser_declared_name","advertiser_declared_regulatory_id","advertiser_declared_scope","advertiser_declared_promoter_name","advertiser_declared_promoter_address"] + RENAME_MAPPINGS: >- + {"Advertiser_ID" : "advertiser_id","Advertiser_Declared_Name" : "advertiser_declared_name","Advertiser_Declared_Regulatory_ID" : "advertiser_declared_regulatory_id","Advertiser_Declared_Scope" : "advertiser_declared_scope","Advertiser_Declared_Promoter_Name" : "advertiser_declared_promoter_name","Advertiser_Declared_Promoter_Address" : "advertiser_declared_promoter_address"} + resources: + request_memory: "1G" + request_cpu: "200m" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + args: + task_id: "load_advertiser_declared_stats_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/google_political_ads/advertiser_declared_stats/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "google_political_ads.advertiser_declared_stats" + skip_leading_rows: 1 + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - name: "advertiser_id" + type: "string" + description: "ID of the advertiser who purchased the ad." + mode: "nullable" + - name: "advertiser_declared_name" + type: "string" + description: "The advertiser's committee declared name." + mode: "nullable" + - name: "advertiser_declared_regulatory_id" + type: "string" + description: "Committee declared identification number." 
+ mode: "nullable" + - name: "advertiser_declared_scope" + type: "string" + description: "Committee-provided information about the candidate and office or ballot proposition and jurisdiction to which the advertisement refers which is separate from our verification process." + mode: "nullable" + - name: "advertiser_declared_promoter_name" + type: "string" + description: "The New Zealand advertiser's declared Promoter Statement name." + mode: "nullable" + - name: "advertiser_declared_promoter_address" + type: "string" + description: "The New Zealand advertiser's declared Promoter Statement address." + mode: "nullable" + + # advertiser_geo_spend + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "transform_advertiser_geo_spend_csv" + startup_timeout_seconds: 600 + name: "advertiser_geo_spend" + namespace: "composer" + service_account_name: "datasets" + image_pull_policy: "Always" + image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_GCS_BUCKET: "{{ var.value.composer_bucket }}" + SOURCE_GCS_OBJECT: "data/google_political_ads/google-political-ads-transparency-bundle.zip" + ZIP_FILE: "files/google-political-ads-transparency-bundle.zip" + CSV_FILE: "google-political-ads-advertiser-geo-spend.csv" + TARGET_FILE: "files/data_output.csv" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/google_political_ads/advertiser_geo_spend/data_output.csv" + TABLE_NAME: "advertiser_geo_spend" + CSV_HEADERS: >- + ["advertiser_id","advertiser_name","country","country_subdivision_primary","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"] + RENAME_MAPPINGS: >- + {"Advertiser_ID" : "advertiser_id" ,"Advertiser_Name" : "advertiser_name" ,"Country" : "country" ,"Country_Subdivision_Primary" : "country_subdivision_primary" ,"Spend_USD" : 
"spend_usd" ,"Spend_EUR" : "spend_eur" ,"Spend_INR" : "spend_inr" ,"Spend_BGN" : "spend_bgn" ,"Spend_HRK" : "spend_hrk" ,"Spend_CZK" : "spend_czk" ,"Spend_DKK" : "spend_dkk" ,"Spend_HUF" : "spend_huf" ,"Spend_PLN" : "spend_pln" ,"Spend_RON" : "spend_ron" ,"Spend_SEK" : "spend_sek" ,"Spend_GBP" : "spend_gbp" ,"Spend_NZD" : "spend_nzd"} + resources: + request_memory: "1G" + request_cpu: "200m" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + args: + task_id: "load_advertiser_geo_spend_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/google_political_ads/advertiser_geo_spend/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "google_political_ads.advertiser_geo_spend" + skip_leading_rows: 1 + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - name: "advertiser_id" + type: "string" + description: "Unique ID for an advertiser verified to run election ads on Google Ads Services." + mode: "nullable" + - name: "advertiser_name" + type: "string" + description: "Name of the advertiser." + mode: "nullable" + - name: "country" + type: "string" + description: "The country where election ads were served specified in the ISO 3166-1 alpha-2 standard code. For example: \"US\" for United States." + mode: "nullable" + - name: "country_subdivision_primary" + type: "string" + description: "The primary subdivision of the country where election ads were served specified by the ISO 3166-2 standard code. For example: \"US-CA\" for California state in United States" + mode: "nullable" + - name: "spend_usd" + type: "integer" + description: "Total amount in USD spent on election ads in this region." + mode: "nullable" + - name: "spend_eur" + type: "integer" + description: "Total amount in EUR spent on election ads in this region." + mode: "nullable" + - name: "spend_inr" + type: "integer" + description: "Total amount in INR spent on election ads in this region." 
+ mode: "nullable" + - name: "spend_bgn" + type: "integer" + description: "Total amount in BGN spent on election ads in this region." + mode: "nullable" + - name: "spend_hrk" + type: "integer" + description: "Total amount in HRK spent on election ads in this region." + mode: "nullable" + - name: "spend_czk" + type: "integer" + description: "Total amount in CZK spent on election ads in this region." + mode: "nullable" + - name: "spend_dkk" + type: "integer" + description: "Total amount in DKK spent on election ads in this region." + mode: "nullable" + - name: "spend_huf" + type: "integer" + description: "Total amount in HUF spent on election ads in this region." + mode: "nullable" + - name: "spend_pln" + type: "integer" + description: "Total amount in PLN spent on election ads in this region." + mode: "nullable" + - name: "spend_ron" + type: "integer" + description: "Total amount in RON spent on election ads in this region." + mode: "nullable" + - name: "spend_sek" + type: "integer" + description: "Total amount in SEK spent on election ads in this region." + mode: "nullable" + - name: "spend_gbp" + type: "integer" + description: "Total amount in GBP spent on election ads in this region." + mode: "nullable" + - name: "spend_nzd" + type: "integer" + description: "Total amount in NZD spent on election ads in this region." 
+ mode: "nullable" + + # advertiser_stats + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "transform_advertiser_stats_csv" + startup_timeout_seconds: 600 + name: "advertiser_stats" + namespace: "composer" + service_account_name: "datasets" + image_pull_policy: "Always" + image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_GCS_BUCKET: "{{ var.value.composer_bucket }}" + SOURCE_GCS_OBJECT: "data/google_political_ads/google-political-ads-transparency-bundle.zip" + ZIP_FILE: "files/google-political-ads-transparency-bundle.zip" + CSV_FILE: "google-political-ads-advertiser-stats.csv" + TARGET_FILE: "files/data_output.csv" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/google_political_ads/advertiser_stats/data_output.csv" + TABLE_NAME: "advertiser_stats" + CSV_HEADERS: >- + ["advertiser_id","advertiser_name","public_ids_list","regions","elections","total_creatives","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"] + RENAME_MAPPINGS: >- + {"Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name","Public_IDs_List": "public_ids_list","Regions": "regions","Elections": "elections","Total_Creatives": "total_creatives","Spend_USD": "spend_usd","Spend_EUR": "spend_eur","Spend_INR": "spend_inr","Spend_BGN": "spend_bgn","Spend_HRK": "spend_hrk","Spend_CZK": "spend_czk","Spend_DKK": "spend_dkk","Spend_HUF": "spend_huf","Spend_PLN": "spend_pln","Spend_RON": "spend_ron","Spend_SEK": "spend_sek","Spend_GBP": "spend_gbp","Spend_NZD": "spend_nzd"} + resources: + request_memory: "1G" + request_cpu: "200m" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + args: + task_id: "load_advertiser_stats_to_bq" + bucket: "{{ var.value.composer_bucket }}" + 
source_objects: ["data/google_political_ads/advertiser_stats/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "google_political_ads.advertiser_stats" + skip_leading_rows: 1 + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - name: "advertiser_id" + type: "string" + description: "Unique ID for an advertiser verified to run election ads on Google Ads Services." + mode: "nullable" + - name: "advertiser_name" + type: "string" + description: "Name of advertiser." + mode: "nullable" + - name: "public_ids_list" + type: "string" + description: "List of public IDs used to identify the advertiser if available." + mode: "nullable" + - name: "regions" + type: "string" + description: "The list of regions where the ads of this advertiser were served" + mode: "nullable" + - name: "elections" + type: "string" + description: "The list of elections that this advertiser participated in based on the regions." + mode: "nullable" + - name: "total_creatives" + type: "integer" + description: "Total number of election ads the advertiser ran with at least one impression." + mode: "nullable" + - name: "spend_usd" + type: "integer" + description: "Total amount in USD spent on election ads by the advertiser." + mode: "nullable" + - name: "spend_eur" + type: "integer" + description: "Total amount in EUR spent on election ads by the advertiser." + mode: "nullable" + - name: "spend_inr" + type: "integer" + description: "Total amount in INR spent on election ads by the advertiser." + mode: "nullable" + - name: "spend_bgn" + type: "integer" + description: "Total amount in BGN spent on election ads by the advertiser." + mode: "nullable" + - name: "spend_hrk" + type: "integer" + description: "Total amount in HRK spent on election ads by the advertiser." + mode: "nullable" + - name: "spend_czk" + type: "integer" + description: "Total amount in CZK spent on election ads by the advertiser." 
+ mode: "nullable" + - name: "spend_dkk" + type: "integer" + description: "Total amount in DKK spent on election ads by the advertiser." + mode: "nullable" + - name: "spend_huf" + type: "integer" + description: "Total amount in HUF spent on election ads by the advertiser." + mode: "nullable" + - name: "spend_pln" + type: "integer" + description: "Total amount in PLN spent on election ads by the advertiser." + mode: "nullable" + - name: "spend_ron" + type: "integer" + description: "Total amount in RON spent on election ads by the advertiser." + mode: "nullable" + - name: "spend_sek" + type: "integer" + description: "Total amount in SEK spent on election ads by the advertiser." + mode: "nullable" + - name: "spend_gbp" + type: "integer" + description: "Total amount in GBP spent on election ads by the advertiser." + mode: "nullable" + - name: "spend_nzd" + type: "integer" + description: "Total amount in NZD spent on election ads by the advertiser." + mode: "nullable" + + # advertiser_weekly_spend + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "transform_advertiser_weekly_spend_csv" + startup_timeout_seconds: 600 + name: "advertiser_weekly_spend" + namespace: "composer" + service_account_name: "datasets" + image_pull_policy: "Always" + image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_GCS_BUCKET: "{{ var.value.composer_bucket }}" + SOURCE_GCS_OBJECT: "data/google_political_ads/google-political-ads-transparency-bundle.zip" + ZIP_FILE: "files/google-political-ads-transparency-bundle.zip" + CSV_FILE: "google-political-ads-advertiser-weekly-spend.csv" + TARGET_FILE: "files/data_output.csv" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/google_political_ads/advertiser_weekly_spend/data_output.csv" + TABLE_NAME: "advertiser_weekly_spend" + CSV_HEADERS: >- + 
["advertiser_id","advertiser_name","election_cycle","week_start_date","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"] + RENAME_MAPPINGS: >- + {"Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name","Election_Cycle": "election_cycle","Week_Start_Date": "week_start_date","Spend_USD": "spend_usd","Spend_EUR": "spend_eur","Spend_INR": "spend_inr","Spend_BGN": "spend_bgn","Spend_HRK": "spend_hrk","Spend_CZK": "spend_czk","Spend_DKK": "spend_dkk","Spend_HUF": "spend_huf","Spend_PLN": "spend_pln","Spend_RON": "spend_ron","Spend_SEK": "spend_sek","Spend_GBP": "spend_gbp","Spend_NZD": "spend_nzd"} + resources: + request_memory: "1G" + request_cpu: "200m" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + args: + task_id: "load_advertiser_weekly_spend_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/google_political_ads/advertiser_weekly_spend/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "google_political_ads.advertiser_weekly_spend" + skip_leading_rows: 1 + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - name: "advertiser_id" + type: "string" + description: "Unique ID for an advertiser verified to run election ads on Google Ads Services." + mode: "nullable" + - name: "advertiser_name" + type: "string" + description: "Name of advertiser." + mode: "nullable" + - name: "election_cycle" + type: "string" + description: "[DEPRECATED] This field is deprecated in favor of the Elections column in advertiser_stats table. It will be deleted some time after July 2019." + mode: "nullable" + - name: "week_start_date" + type: "date" + description: "The start date for the week where spending occurred." 
+ mode: "nullable" + - name: "spend_usd" + type: "integer" + description: "The amount in USD spent on election ads during the given week by the advertiser." + mode: "nullable" + - name: "spend_eur" + type: "integer" + description: "The amount in EUR spent on election ads during the given week by the advertiser." + mode: "nullable" + - name: "spend_inr" + type: "integer" + description: "The amount in INR spent on election ads during the given week by the advertiser." + mode: "nullable" + - name: "spend_bgn" + type: "integer" + description: "The amount in BGN spent on election ads during the given week by the advertiser." + mode: "nullable" + - name: "spend_hrk" + type: "integer" + description: "The amount in HRK spent on election ads during the given week by the advertiser." + mode: "nullable" + - name: "spend_czk" + type: "integer" + description: "The amount in CZK spent on election ads during the given week by the advertiser." + mode: "nullable" + - name: "spend_dkk" + type: "integer" + description: "The amount in DKK spent on election ads during the given week by the advertiser." + mode: "nullable" + - name: "spend_huf" + type: "integer" + description: "The amount in HUF spent on election ads during the given week by the advertiser." + mode: "nullable" + - name: "spend_pln" + type: "integer" + description: "The amount in PLN spent on election ads during the given week by the advertiser." + mode: "nullable" + - name: "spend_ron" + type: "integer" + description: "The amount in RON spent on election ads during the given week by the advertiser." + mode: "nullable" + - name: "spend_sek" + type: "integer" + description: "The amount in SEK spent on election ads during the given week by the advertiser." + mode: "nullable" + - name: "spend_gbp" + type: "integer" + description: "The amount in GBP spent on election ads during the given week by the advertiser." 
+ mode: "nullable" + - name: "spend_nzd" + type: "integer" + description: "The amount in NZD spent on election ads during the given week by the advertiser." + mode: "nullable" + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "transform_campaign_targeting_csv" + startup_timeout_seconds: 600 + name: "campaign_targeting" + namespace: "composer" + service_account_name: "datasets" + image_pull_policy: "Always" + image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_GCS_BUCKET: "{{ var.value.composer_bucket }}" + SOURCE_GCS_OBJECT: "data/google_political_ads/google-political-ads-transparency-bundle.zip" + ZIP_FILE: "files/google-political-ads-transparency-bundle.zip" + CSV_FILE: "google-political-ads-campaign-targeting.csv" + TARGET_FILE: "files/data_output.csv" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/google_political_ads/campaign_targeting/data_output.csv" + TABLE_NAME: "campaign_targeting" + CSV_HEADERS: >- + ["campaign_id","age_targeting","gender_targeting","geo_targeting_included","geo_targeting_excluded","start_date","end_date","ads_list","advertiser_id","advertiser_name"] + RENAME_MAPPINGS: >- + {"Campaign_ID": "campaign_id","Age_Targeting": "age_targeting","Gender_Targeting": "gender_targeting","Geo_Targeting_Included": "geo_targeting_included","Geo_Targeting_Excluded": "geo_targeting_excluded","Start_Date": "start_date","End_Date": "end_date","Ads_List": "ads_list","Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name"} + resources: + request_memory: "1G" + request_cpu: "200m" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + args: + task_id: "load_campaign_targeting_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/google_political_ads/campaign_targeting/data_output.csv"] + source_format: "CSV" + 
destination_project_dataset_table: "google_political_ads.campaign_targeting" + skip_leading_rows: 1 + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - name: "campaign_id" + type: "string" + description: "[DEPRECATED] Unique ID for a political ad campaign." + mode: "nullable" + - name: "age_targeting" + type: "string" + description: "[DEPRECATED] Age ranges included in the campaign's targeting." + mode: "nullable" + - name: "gender_targeting" + type: "string" + description: "[DEPRECATED] Genders included in the campaign's targeting" + mode: "nullable" + - name: "geo_targeting_included" + type: "string" + description: "[DEPRECATED] Geographic locations included in the campaign's targeting." + mode: "nullable" + - name: "geo_targeting_excluded" + type: "string" + description: "[DEPRECATED] Geographic locations excluded from the campaign's targeting." + mode: "nullable" + - name: "start_date" + type: "date" + description: "[DEPRECATED] Start date for the campaign." + mode: "nullable" + - name: "end_date" + type: "date" + description: "[DEPRECATED] End date for the campaign." + mode: "nullable" + - name: "ads_list" + type: "string" + description: "[DEPRECATED] List of Ad_IDs for the campaign." + mode: "nullable" + - name: "advertiser_id" + type: "string" + description: "[DEPRECATED] ID of the advertiser who purchased the ad." + mode: "nullable" + - name: "advertiser_name" + type: "string" + description: "[DEPRECATED] Name of advertiser." 
+ mode: "nullable" + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "transform_creative_stats_csv" + startup_timeout_seconds: 600 + name: "creative_stats" + namespace: "composer" + service_account_name: "datasets" + image_pull_policy: "Always" + image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_GCS_BUCKET: "{{ var.value.composer_bucket }}" + SOURCE_GCS_OBJECT: "data/google_political_ads/google-political-ads-transparency-bundle.zip" + ZIP_FILE: "files/google-political-ads-transparency-bundle.zip" + CSV_FILE: "google-political-ads-creative-stats.csv" + TARGET_FILE: "files/data_output.csv" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/google_political_ads/creative_stats/data_output.csv" + TABLE_NAME: "creative_stats" + CSV_HEADERS: >- + ["ad_id","ad_url","ad_type","regions","advertiser_id","advertiser_name","ad_campaigns_list","date_range_start","date_range_end","num_of_days","impressions","spend_usd","first_served_timestamp","last_served_timestamp","age_targeting","gender_targeting","geo_targeting_included","geo_targeting_excluded","spend_range_min_usd","spend_range_max_usd","spend_range_min_eur","spend_range_max_eur","spend_range_min_inr","spend_range_max_inr","spend_range_min_bgn","spend_range_max_bgn","spend_range_min_hrk","spend_range_max_hrk","spend_range_min_czk","spend_range_max_czk","spend_range_min_dkk","spend_range_max_dkk","spend_range_min_huf","spend_range_max_huf","spend_range_min_pln","spend_range_max_pln","spend_range_min_ron","spend_range_max_ron","spend_range_min_sek","spend_range_max_sek","spend_range_min_gbp","spend_range_max_gbp","spend_range_min_nzd","spend_range_max_nzd"] + RENAME_MAPPINGS: >- + {"Ad_ID": "ad_id","Ad_URL": "ad_url","Ad_Type": "ad_type","Regions": "regions","Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name","Ad_Campaigns_List": 
"ad_campaigns_list","Date_Range_Start": "date_range_start","Date_Range_End": "date_range_end","Num_of_Days": "num_of_days","Impressions": "impressions","Spend_USD": "spend_usd","Spend_Range_Min_USD": "spend_range_min_usd","Spend_Range_Max_USD": "spend_range_max_usd","Spend_Range_Min_EUR": "spend_range_min_eur","Spend_Range_Max_EUR": "spend_range_max_eur","Spend_Range_Min_INR": "spend_range_min_inr","Spend_Range_Max_INR": "spend_range_max_inr","Spend_Range_Min_BGN": "spend_range_min_bgn","Spend_Range_Max_BGN": "spend_range_max_bgn","Spend_Range_Min_HRK": "spend_range_min_hrk","Spend_Range_Max_HRK": "spend_range_max_hrk","Spend_Range_Min_CZK": "spend_range_min_czk","Spend_Range_Max_CZK": "spend_range_max_czk","Spend_Range_Min_DKK": "spend_range_min_dkk","Spend_Range_Max_DKK": "spend_range_max_dkk","Spend_Range_Min_HUF": "spend_range_min_huf","Spend_Range_Max_HUF": "spend_range_max_huf","Spend_Range_Min_PLN": "spend_range_min_pln","Spend_Range_Max_PLN": "spend_range_max_pln","Spend_Range_Min_RON": "spend_range_min_ron","Spend_Range_Max_RON": "spend_range_max_ron","Spend_Range_Min_SEK": "spend_range_min_sek","Spend_Range_Max_SEK": "spend_range_max_sek","Spend_Range_Min_GBP": "spend_range_min_gbp","Spend_Range_Max_GBP": "spend_range_max_gbp","Spend_Range_Min_NZD": "spend_range_min_nzd","Spend_Range_Max_NZD": "spend_range_max_nzd","Age_Targeting": "age_targeting","Gender_Targeting": "gender_targeting","Geo_Targeting_Included": "geo_targeting_included","Geo_Targeting_Excluded": "geo_targeting_excluded","First_Served_Timestamp": "first_served_timestamp","Last_Served_Timestamp": "last_served_timestamp"} + resources: + request_memory: "16G" + request_cpu: "2" + request_ephemeral_storage: "10G" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + args: + task_id: "load_creative_stats_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/google_political_ads/creative_stats/data_output.csv"] + 
source_format: "CSV" + destination_project_dataset_table: "google_political_ads.creative_stats" + skip_leading_rows: 1 + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - name: "ad_id" + type: "string" + description: "Unique id for a specific election ad." + mode: "nullable" + - name: "ad_url" + type: "string" + description: "URL to view the election ad in the election Advertising on Google report." + mode: "nullable" + - name: "ad_type" + type: "string" + description: "The type of the ad. Can be TEXT, VIDEO, or IMAGE." + mode: "nullable" + - name: "regions" + type: "string" + description: "The regions that this ad is verified for or was served in." + mode: "nullable" + - name: "advertiser_id" + type: "string" + description: "ID of the advertiser who purchased the ad." + mode: "nullable" + - name: "advertiser_name" + type: "string" + description: "Name of advertiser." + mode: "nullable" + - name: "ad_campaigns_list" + type: "string" + description: "IDs of all election ad campaigns that included the ad." + mode: "nullable" + - name: "date_range_start" + type: "date" + description: "First day an election ad ran and had an impression." + mode: "nullable" + - name: "date_range_end" + type: "date" + description: "Most recent day an election ad ran and had an impression." + mode: "nullable" + - name: "num_of_days" + type: "integer" + description: "Total number of days an election ad ran and had an impression." + mode: "nullable" + - name: "impressions" + type: "string" + description: "Number of impressions for the election ad. Impressions are grouped into several buckets: ≤ 10k, 10k-100k, 100k-1M, 1M-10M, > 10M." + mode: "nullable" + - name: "spend_usd" + type: "string" + description: "[DEPRECATED] This field is deprecated in favor of specifying the lower and higher spend bucket bounds in separate Spend_Range_Min and Spend_Range_Max columns."
+ mode: "nullable" + - name: "first_served_timestamp" + type: "timestamp" + description: "The timestamp of the earliest impression for this ad." + mode: "nullable" + - name: "last_served_timestamp" + type: "timestamp" + description: "The timestamp of the most recent impression for this ad." + mode: "nullable" + - name: "age_targeting" + type: "string" + description: "Age ranges included in the ad's targeting" + mode: "nullable" + - name: "gender_targeting" + type: "string" + description: "Genders included in the ad's targeting." + mode: "nullable" + - name: "geo_targeting_included" + type: "string" + description: "Geographic locations included in the ad's targeting." + mode: "nullable" + - name: "geo_targeting_excluded" + type: "string" + description: "Geographic locations excluded in the ad's targeting." + mode: "nullable" + - name: "spend_range_min_usd" + type: "integer" + description: "Lower bound of the amount in USD spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_max_usd" + type: "integer" + description: "Upper bound of the amount in USD spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_min_eur" + type: "integer" + description: "Lower bound of the amount in EUR spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_max_eur" + type: "integer" + description: "Upper bound of the amount in EUR spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_min_inr" + type: "integer" + description: "Lower bound of the amount in INR spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_max_inr" + type: "integer" + description: "Upper bound of the amount in INR spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_min_bgn" + type: "integer" + description: "Lower bound of the amount in BGN spent by the advertiser on the election ad." 
+ mode: "nullable" + - name: "spend_range_max_bgn" + type: "integer" + description: "Upper bound of the amount in BGN spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_min_hrk" + type: "integer" + description: "Lower bound of the amount in HRK spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_max_hrk" + type: "integer" + description: "Upper bound of the amount in HRK spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_min_czk" + type: "integer" + description: "Lower bound of the amount in CZK spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_max_czk" + type: "integer" + description: "Upper bound of the amount in CZK spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_min_dkk" + type: "integer" + description: "Lower bound of the amount in DKK spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_max_dkk" + type: "integer" + description: "Upper bound of the amount in DKK spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_min_huf" + type: "integer" + description: "Lower bound of the amount in HUF spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_max_huf" + type: "integer" + description: "Upper bound of the amount in HUF spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_min_pln" + type: "integer" + description: "Lower bound of the amount in PLN spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_max_pln" + type: "integer" + description: "Upper bound of the amount in PLN spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_min_ron" + type: "integer" + description: "Lower bound of the amount in RON spent by the advertiser on the election ad." 
+ mode: "nullable" + - name: "spend_range_max_ron" + type: "integer" + description: "Upper bound of the amount in RON spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_min_sek" + type: "integer" + description: "Lower bound of the amount in SEK spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_max_sek" + type: "integer" + description: "Upper bound of the amount in SEK spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_min_gbp" + type: "integer" + description: "Lower bound of the amount in GBP spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_max_gbp" + type: "integer" + description: "Upper bound of the amount in GBP spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_min_nzd" + type: "integer" + description: "Lower bound of the amount in NZD spent by the advertiser on the election ad." + mode: "nullable" + - name: "spend_range_max_nzd" + type: "integer" + description: "Upper bound of the amount in NZD spent by the advertiser on the election ad." 
+ mode: "nullable" + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "transform_geo_spend_csv" + startup_timeout_seconds: 600 + name: "geo_spend" + namespace: "composer" + service_account_name: "datasets" + image_pull_policy: "Always" + image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_GCS_BUCKET: "{{ var.value.composer_bucket }}" + SOURCE_GCS_OBJECT: "data/google_political_ads/google-political-ads-transparency-bundle.zip" + ZIP_FILE: "files/google-political-ads-transparency-bundle.zip" + CSV_FILE: "google-political-ads-geo-spend.csv" + TARGET_FILE: "files/data_output.csv" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/google_political_ads/geo_spend/data_output.csv" + TABLE_NAME: "geo_spend" + CSV_HEADERS: >- + ["country","country_subdivision_primary","country_subdivision_secondary","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"] + RENAME_MAPPINGS: >- + {"Country": "country","Country_Subdivision_Primary": "country_subdivision_primary","Country_Subdivision_Secondary": "country_subdivision_secondary","Spend_USD": "spend_usd","Spend_EUR": "spend_eur","Spend_INR": "spend_inr","Spend_BGN": "spend_bgn","Spend_HRK": "spend_hrk","Spend_CZK": "spend_czk","Spend_DKK": "spend_dkk","Spend_HUF": "spend_huf","Spend_PLN": "spend_pln","Spend_RON": "spend_ron","Spend_SEK": "spend_sek","Spend_GBP": "spend_gbp","Spend_NZD": "spend_nzd"} + resources: + request_memory: "1G" + request_cpu: "200m" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + args: + task_id: "load_geo_spend_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/google_political_ads/geo_spend/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: 
"google_political_ads.geo_spend" + skip_leading_rows: 1 + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - name: "country" + type: "string" + description: "The country where election ads were served specified in the ISO 3166-1 alpha-2 standard code. For example \"US\" for United States." + mode: "nullable" + - name: "country_subdivision_primary" + type: "string" + description: "The primary subdivision of the country where election ads were served specified by the ISO 3166-2 standard code. For example \"US-CA\" for California state in United States" + mode: "nullable" + - name: "country_subdivision_secondary" + type: "string" + description: "The name of the secondary subdivision. For example The name of a US congressional district." + mode: "nullable" + - name: "spend_usd" + type: "integer" + description: "Total amount in USD spent on election ads in this region." + mode: "nullable" + - name: "spend_eur" + type: "integer" + description: "Total amount in EUR spent on election ads in this region." + mode: "nullable" + - name: "spend_inr" + type: "integer" + description: "Total amount in INR spent on election ads in this region." + mode: "nullable" + - name: "spend_bgn" + type: "integer" + description: "Total amount in BGN spent on election ads in this region." + mode: "nullable" + - name: "spend_hrk" + type: "integer" + description: "Total amount in HRK spent on election ads in this region." + mode: "nullable" + - name: "spend_czk" + type: "integer" + description: "Total amount in CZK spent on election ads in this region." + mode: "nullable" + - name: "spend_dkk" + type: "integer" + description: "Total amount in DKK spent on election ads in this region." + mode: "nullable" + - name: "spend_huf" + type: "integer" + description: "Total amount in HUF spent on election ads in this region." + mode: "nullable" + - name: "spend_pln" + type: "integer" + description: "Total amount in PLN spent on election ads in this region." 
+ mode: "nullable" + - name: "spend_ron" + type: "integer" + description: "Total amount in RON spent on election ads in this region." + mode: "nullable" + - name: "spend_sek" + type: "integer" + description: "Total amount in SEK spent on election ads in this region." + mode: "nullable" + - name: "spend_gbp" + type: "integer" + description: "Total amount in GBP spent on election ads in this region." + mode: "nullable" + - name: "spend_nzd" + type: "integer" + description: "Total amount in NZD spent on election ads in this region." + mode: "nullable" + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "transform_last_updated_csv" + startup_timeout_seconds: 600 + name: "last_updated" + namespace: "composer" + service_account_name: "datasets" + image_pull_policy: "Always" + image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_GCS_BUCKET: "{{ var.value.composer_bucket }}" + SOURCE_GCS_OBJECT: "data/google_political_ads/google-political-ads-transparency-bundle.zip" + ZIP_FILE: "files/google-political-ads-transparency-bundle.zip" + CSV_FILE: "google-political-ads-updated.csv" + TARGET_FILE: "files/data_output.csv" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/google_political_ads/last_updated/data_output.csv" + TABLE_NAME: "last_updated" + CSV_HEADERS: >- + ["report_data_updated_time"] + RENAME_MAPPINGS: >- + {"Report_Data_Updated_Time (PT)": "report_data_updated_time"} + resources: + request_memory: "128M" + request_cpu: "200m" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + args: + task_id: "load_last_updated_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/google_political_ads/last_updated/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "google_political_ads.last_updated" + skip_leading_rows: 1 + 
write_disposition: "WRITE_TRUNCATE" + schema_fields: + - name: "report_data_updated_time" + type: "datetime" + description: "The time the report data was most recently updated" + mode: "nullable" + + - operator: "KubernetesPodOperator" + description: "Run CSV transform within kubernetes pod" + args: + task_id: "transform_top_keywords_history_csv" + startup_timeout_seconds: 600 + name: "top_keywords_history" + namespace: "composer" + service_account_name: "datasets" + image_pull_policy: "Always" + image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" + env_vars: + SOURCE_GCS_BUCKET: "{{ var.value.composer_bucket }}" + SOURCE_GCS_OBJECT: "data/google_political_ads/google-political-ads-transparency-bundle.zip" + ZIP_FILE: "files/google-political-ads-transparency-bundle.zip" + CSV_FILE: "google-political-ads-top-keywords-history.csv" + TARGET_FILE: "files/data_output.csv" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/google_political_ads/top_keywords_history/data_output.csv" + TABLE_NAME: "top_keywords_history" + CSV_HEADERS: >- + ["election_cycle","report_date","keyword_1","spend_usd_1","keyword_2","spend_usd_2","keyword_3","spend_usd_3","keyword_4","spend_usd_4","keyword_5","spend_usd_5","keyword_6","spend_usd_6","region","elections"] + RENAME_MAPPINGS: >- + {"Election_Cycle": "election_cycle","Report_Date": "report_date","Keyword_1": "keyword_1","Spend_USD_1": "spend_usd_1","Keyword_2": "keyword_2","Spend_USD_2": "spend_usd_2","Keyword_3": "keyword_3","Spend_USD_3": "spend_usd_3","Keyword_4": "keyword_4","Spend_USD_4": "spend_usd_4","Keyword_5": "keyword_5","Spend_USD_5": "spend_usd_5","Keyword_6": "keyword_6","Spend_USD_6": "spend_usd_6","Region": "region","Elections": "elections"} + resources: + request_memory: "1G" + request_cpu: "200m" + + - operator: "GoogleCloudStorageToBigQueryOperator" + description: "Task to load CSV data to a BigQuery table" + args: + task_id: 
"load_top_keywords_history_to_bq" + bucket: "{{ var.value.composer_bucket }}" + source_objects: ["data/google_political_ads/top_keywords_history/data_output.csv"] + source_format: "CSV" + destination_project_dataset_table: "google_political_ads.top_keywords_history" + skip_leading_rows: 1 + write_disposition: "WRITE_TRUNCATE" + schema_fields: + - name: "election_cycle" + type: "string" + description: "[DEPRECATED] This field is deprecated in favor of the Region and Elections field. It will be deleted some time after July 2019." + mode: "nullable" + - name: "report_date" + type: "date" + description: "[DEPRECATED] The start date for the week where the spending was reported." + mode: "nullable" + - name: "keyword_1" + type: "string" + description: " [DEPRECATED] Keyword with the most spend by advertisers for political ads" + mode: "nullable" + - name: "spend_usd_1" + type: "integer" + description: "[DEPRECATED] Total spend in USD for Keyword_1." + mode: "nullable" + - name: "keyword_2" + type: "string" + description: "[DEPRECATED] Keyword with the next most spend by advertisers for political ads" + mode: "nullable" + - name: "spend_usd_2" + type: "integer" + description: "[DEPRECATED] Total spend in USD for Keyword_2." + mode: "nullable" + - name: "keyword_3" + type: "string" + description: "[DEPRECATED] Keyword with the next most spend by advertisers for political ads" + mode: "nullable" + - name: "spend_usd_3" + type: "integer" + description: "[DEPRECATED] Total spend in USD for Keyword_3." + mode: "nullable" + - name: "keyword_4" + type: "string" + description: "[DEPRECATED] Keyword with the next most spend by advertisers for political ads" + mode: "nullable" + - name: "spend_usd_4" + type: "integer" + description: "[DEPRECATED] Total spend in USD for Keyword_4." 
+ mode: "nullable" + - name: "keyword_5" + type: "string" + description: "[DEPRECATED] Keyword with the next most spend by advertisers for political ads" + mode: "nullable" + - name: "spend_usd_5" + type: "integer" + description: "[DEPRECATED] Total spend in USD for Keyword_5." + mode: "nullable" + - name: "keyword_6" + type: "string" + description: "[DEPRECATED] Keyword with the next most spend by advertisers for political ads" + mode: "nullable" + - name: "spend_usd_6" + type: "integer" + description: "[DEPRECATED] Total spend in USD for Keyword_6." + mode: "nullable" + - name: "region" + type: "string" + description: "[DEPRECATED] The region where advertisers used these keywords." + mode: "nullable" + - name: "elections" + type: "string" + description: "[DEPRECATED] The elections during which these keywords were used." + mode: "nullable" + + graph_paths: + - "download_zip_file_to_composer_bucket >> [transform_advertiser_declared_stats_csv, transform_advertiser_geo_spend_csv, transform_advertiser_stats_csv, transform_advertiser_weekly_spend_csv, transform_campaign_targeting_csv, transform_creative_stats_csv, transform_geo_spend_csv, transform_last_updated_csv, transform_top_keywords_history_csv]" + - "transform_advertiser_declared_stats_csv >> load_advertiser_declared_stats_to_bq" + - "transform_advertiser_geo_spend_csv >> load_advertiser_geo_spend_to_bq" + - "transform_advertiser_stats_csv >> load_advertiser_stats_to_bq" + - "transform_advertiser_weekly_spend_csv >> load_advertiser_weekly_spend_to_bq" + - "transform_campaign_targeting_csv >> load_campaign_targeting_to_bq" + - "transform_creative_stats_csv >> load_creative_stats_to_bq" + - "transform_geo_spend_csv >> load_geo_spend_to_bq" + - "transform_last_updated_csv >> load_last_updated_to_bq" + - "transform_top_keywords_history_csv >> load_top_keywords_history_to_bq" diff --git a/datasets/google_political_ads/pipelines/process_csvs_and_load_to_bq/process_csvs_and_load_to_bq_dag.py 
b/datasets/google_political_ads/pipelines/process_csvs_and_load_to_bq/process_csvs_and_load_to_bq_dag.py new file mode 100644 index 000000000..9f75a4446 --- /dev/null +++ b/datasets/google_political_ads/pipelines/process_csvs_and_load_to_bq/process_csvs_and_load_to_bq_dag.py @@ -0,0 +1,1287 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from airflow import DAG +from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.transfers import gcs_to_bigquery, gcs_to_gcs + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="google_political_ads.process_csvs_and_load_to_bq", + default_args=default_args, + max_active_runs=1, + schedule_interval="@daily", + catchup=False, + default_view="graph", +) as dag: + + # Task to copy the source ZIP bundle into the Composer bucket + download_zip_file_to_composer_bucket = gcs_to_gcs.GCSToGCSOperator( + task_id="download_zip_file_to_composer_bucket", + source_bucket="political-csv", + source_object="google-political-ads-transparency-bundle.zip", + destination_bucket="{{ var.value.composer_bucket }}", + destination_object="data/google_political_ads/google-political-ads-transparency-bundle.zip", + impersonation_chain="{{ var.json.google_political_ads.service_account }}", + move_object=False, + ) + + # Run CSV transform within kubernetes pod + transform_advertiser_declared_stats_csv =
kubernetes_pod.KubernetesPodOperator( + task_id="transform_advertiser_declared_stats_csv", + startup_timeout_seconds=600, + name="advertiser_declared_stats", + namespace="composer", + service_account_name="datasets", + image_pull_policy="Always", + image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "SOURCE_GCS_OBJECT": "data/google_political_ads/google-political-ads-transparency-bundle.zip", + "ZIP_FILE": "files/google-political-ads-transparency-bundle.zip", + "CSV_FILE": "google-political-ads-advertiser-declared-stats.csv", + "TARGET_FILE": "files/data_output.csv", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/google_political_ads/advertiser_declared_stats/data_output.csv", + "TABLE_NAME": "advertiser_declared_stats", + "CSV_HEADERS": '["advertiser_id","advertiser_declared_name","advertiser_declared_regulatory_id","advertiser_declared_scope","advertiser_declared_promoter_name","advertiser_declared_promoter_address"]', + "RENAME_MAPPINGS": '{"Advertiser_ID" : "advertiser_id","Advertiser_Declared_Name" : "advertiser_declared_name","Advertiser_Declared_Regulatory_ID" : "advertiser_declared_regulatory_id","Advertiser_Declared_Scope" : "advertiser_declared_scope","Advertiser_Declared_Promoter_Name" : "advertiser_declared_promoter_name","Advertiser_Declared_Promoter_Address" : "advertiser_declared_promoter_address"}', + }, + resources={"request_memory": "1G", "request_cpu": "200m"}, + ) + + # Task to load CSV data to a BigQuery table + load_advertiser_declared_stats_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_advertiser_declared_stats_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/google_political_ads/advertiser_declared_stats/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="google_political_ads.advertiser_declared_stats", + 
skip_leading_rows=1, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "advertiser_id", + "type": "string", + "description": "ID of the advertiser who purchased the ad.", + "mode": "nullable", + }, + { + "name": "advertiser_declared_name", + "type": "string", + "description": "The advertiser's committee declared name.", + "mode": "nullable", + }, + { + "name": "advertiser_declared_regulatory_id", + "type": "string", + "description": "Committee declared identification number.", + "mode": "nullable", + }, + { + "name": "advertiser_declared_scope", + "type": "string", + "description": "Committee-provided information about the candidate and office or ballot proposition and jurisdiction to which the advertisement refers which is separate from our verification process.", + "mode": "nullable", + }, + { + "name": "advertiser_declared_promoter_name", + "type": "string", + "description": "The New Zealand advertiser's declared Promoter Statement name.", + "mode": "nullable", + }, + { + "name": "advertiser_declared_promoter_address", + "type": "string", + "description": "The New Zealand advertiser's declared Promoter Statement address.", + "mode": "nullable", + }, + ], + ) + + # Run CSV transform within kubernetes pod + transform_advertiser_geo_spend_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_advertiser_geo_spend_csv", + startup_timeout_seconds=600, + name="advertiser_geo_spend", + namespace="composer", + service_account_name="datasets", + image_pull_policy="Always", + image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "SOURCE_GCS_OBJECT": "data/google_political_ads/google-political-ads-transparency-bundle.zip", + "ZIP_FILE": "files/google-political-ads-transparency-bundle.zip", + "CSV_FILE": "google-political-ads-advertiser-geo-spend.csv", + "TARGET_FILE": "files/data_output.csv", + "TARGET_GCS_BUCKET": "{{ 
var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/google_political_ads/advertiser_geo_spend/data_output.csv", + "TABLE_NAME": "advertiser_geo_spend", + "CSV_HEADERS": '["advertiser_id","advertiser_name","country","country_subdivision_primary","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"]', + "RENAME_MAPPINGS": '{"Advertiser_ID" : "advertiser_id" ,"Advertiser_Name" : "advertiser_name" ,"Country" : "country" ,"Country_Subdivision_Primary" : "country_subdivision_primary" ,"Spend_USD" : "spend_usd" ,"Spend_EUR" : "spend_eur" ,"Spend_INR" : "spend_inr" ,"Spend_BGN" : "spend_bgn" ,"Spend_HRK" : "spend_hrk" ,"Spend_CZK" : "spend_czk" ,"Spend_DKK" : "spend_dkk" ,"Spend_HUF" : "spend_huf" ,"Spend_PLN" : "spend_pln" ,"Spend_RON" : "spend_ron" ,"Spend_SEK" : "spend_sek" ,"Spend_GBP" : "spend_gbp" ,"Spend_NZD" : "spend_nzd"}', + }, + resources={"request_memory": "1G", "request_cpu": "200m"}, + ) + + # Task to load CSV data to a BigQuery table + load_advertiser_geo_spend_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_advertiser_geo_spend_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/google_political_ads/advertiser_geo_spend/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="google_political_ads.advertiser_geo_spend", + skip_leading_rows=1, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "advertiser_id", + "type": "string", + "description": "Unique ID for an advertiser verified to run election ads on Google Ads Services.", + "mode": "nullable", + }, + { + "name": "advertiser_name", + "type": "string", + "description": "Name of the advertiser.", + "mode": "nullable", + }, + { + "name": "country", + "type": "string", + "description": 'The country where election ads were served specified in the ISO 3166-1 alpha-2 standard code. 
For example: "US" for United States.', + "mode": "nullable", + }, + { + "name": "country_subdivision_primary", + "type": "string", + "description": 'The primary subdivision of the country where election ads were served specified by the ISO 3166-2 standard code. For example: "US-CA" for California state in United States', + "mode": "nullable", + }, + { + "name": "spend_usd", + "type": "integer", + "description": "Total amount in USD spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_eur", + "type": "integer", + "description": "Total amount in EUR spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_inr", + "type": "integer", + "description": "Total amount in INR spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_bgn", + "type": "integer", + "description": "Total amount in BGN spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_hrk", + "type": "integer", + "description": "Total amount in HRK spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_czk", + "type": "integer", + "description": "Total amount in CZK spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_dkk", + "type": "integer", + "description": "Total amount in DKK spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_huf", + "type": "integer", + "description": "Total amount in HUF spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_pln", + "type": "integer", + "description": "Total amount in PLN spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_ron", + "type": "integer", + "description": "Total amount in RON spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_sek", + "type": "integer", + "description": "Total amount in SEK spent on election ads in this 
region.", + "mode": "nullable", + }, + { + "name": "spend_gbp", + "type": "integer", + "description": "Total amount in GBP spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_nzd", + "type": "integer", + "description": "Total amount in NZD spent on election ads in this region.", + "mode": "nullable", + }, + ], + ) + + # Run CSV transform within kubernetes pod + transform_advertiser_stats_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_advertiser_stats_csv", + startup_timeout_seconds=600, + name="advertiser_stats", + namespace="composer", + service_account_name="datasets", + image_pull_policy="Always", + image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "SOURCE_GCS_OBJECT": "data/google_political_ads/google-political-ads-transparency-bundle.zip", + "ZIP_FILE": "files/google-political-ads-transparency-bundle.zip", + "CSV_FILE": "google-political-ads-advertiser-stats.csv", + "TARGET_FILE": "files/data_output.csv", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/google_political_ads/advertiser_stats/data_output.csv", + "TABLE_NAME": "advertiser_stats", + "CSV_HEADERS": '["advertiser_id","advertiser_name","public_ids_list","regions","elections","total_creatives","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"]', + "RENAME_MAPPINGS": '{"Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name","Public_IDs_List": "public_ids_list","Regions": "regions","Elections": "elections","Total_Creatives": "total_creatives","Spend_USD": "spend_usd","Spend_EUR": "spend_eur","Spend_INR": "spend_inr","Spend_BGN": "spend_bgn","Spend_HRK": "spend_hrk","Spend_CZK": "spend_czk","Spend_DKK": "spend_dkk","Spend_HUF": "spend_huf","Spend_PLN": "spend_pln","Spend_RON": 
"spend_ron","Spend_SEK": "spend_sek","Spend_GBP": "spend_gbp","Spend_NZD": "spend_nzd"}', + }, + resources={"request_memory": "1G", "request_cpu": "200m"}, + ) + + # Task to load CSV data to a BigQuery table + load_advertiser_stats_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_advertiser_stats_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=["data/google_political_ads/advertiser_stats/data_output.csv"], + source_format="CSV", + destination_project_dataset_table="google_political_ads.advertiser_stats", + skip_leading_rows=1, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "advertiser_id", + "type": "string", + "description": "Unique ID for an advertiser verified to run election ads on Google Ads Services.", + "mode": "nullable", + }, + { + "name": "advertiser_name", + "type": "string", + "description": "Name of advertiser.", + "mode": "nullable", + }, + { + "name": "public_ids_list", + "type": "string", + "description": "List of public IDs used to identify the advertiser if available.", + "mode": "nullable", + }, + { + "name": "regions", + "type": "string", + "description": "The list of regions where the ads of this advertiser were served", + "mode": "nullable", + }, + { + "name": "elections", + "type": "string", + "description": "The list of elections that this advertiser participated in based on the regions.", + "mode": "nullable", + }, + { + "name": "total_creatives", + "type": "integer", + "description": "Total number of election ads the advertiser ran with at least one impression.", + "mode": "nullable", + }, + { + "name": "spend_usd", + "type": "integer", + "description": "Total amount in USD spent on election ads by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_eur", + "type": "integer", + "description": "Total amount in EUR spent on election ads by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_inr", + "type": "integer", + "description": "Total amount in INR 
spent on election ads by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_bgn", + "type": "integer", + "description": "Total amount in BGN spent on election ads by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_hrk", + "type": "integer", + "description": "Total amount in HRK spent on election ads by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_czk", + "type": "integer", + "description": "Total amount in CZK spent on election ads by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_dkk", + "type": "integer", + "description": "Total amount in DKK spent on election ads by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_huf", + "type": "integer", + "description": "Total amount in HUF spent on election ads by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_pln", + "type": "integer", + "description": "Total amount in PLN spent on election ads by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_ron", + "type": "integer", + "description": "Total amount in RON spent on election ads by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_sek", + "type": "integer", + "description": "Total amount in SEK spent on election ads by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_gbp", + "type": "integer", + "description": "Total amount in GBP spent on election ads by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_nzd", + "type": "integer", + "description": "Total amount in NZD spent on election ads by the advertiser.", + "mode": "nullable", + }, + ], + ) + + # Run CSV transform within kubernetes pod + transform_advertiser_weekly_spend_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_advertiser_weekly_spend_csv", + startup_timeout_seconds=600, + name="advertiser_weekly_spend", + namespace="composer", + service_account_name="datasets", + image_pull_policy="Always", + image="{{ 
var.json.google_political_ads.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "SOURCE_GCS_OBJECT": "data/google_political_ads/google-political-ads-transparency-bundle.zip", + "ZIP_FILE": "files/google-political-ads-transparency-bundle.zip", + "CSV_FILE": "google-political-ads-advertiser-weekly-spend.csv", + "TARGET_FILE": "files/data_output.csv", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/google_political_ads/advertiser_weekly_spend/data_output.csv", + "TABLE_NAME": "advertiser_weekly_spend", + "CSV_HEADERS": '["advertiser_id","advertiser_name","election_cycle","week_start_date","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"]', + "RENAME_MAPPINGS": '{"Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name","Election_Cycle": "election_cycle","Week_Start_Date": "week_start_date","Spend_USD": "spend_usd","Spend_EUR": "spend_eur","Spend_INR": "spend_inr","Spend_BGN": "spend_bgn","Spend_HRK": "spend_hrk","Spend_CZK": "spend_czk","Spend_DKK": "spend_dkk","Spend_HUF": "spend_huf","Spend_PLN": "spend_pln","Spend_RON": "spend_ron","Spend_SEK": "spend_sek","Spend_GBP": "spend_gbp","Spend_NZD": "spend_nzd"}', + }, + resources={"request_memory": "1G", "request_cpu": "200m"}, + ) + + # Task to load CSV data to a BigQuery table + load_advertiser_weekly_spend_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_advertiser_weekly_spend_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/google_political_ads/advertiser_weekly_spend/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="google_political_ads.advertiser_weekly_spend", + skip_leading_rows=1, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "advertiser_id", + "type": "string", + "description": "Unique ID 
for an advertiser verified to run election ads on Google Ads Services.", + "mode": "nullable", + }, + { + "name": "advertiser_name", + "type": "string", + "description": "Name of advertiser.", + "mode": "nullable", + }, + { + "name": "election_cycle", + "type": "string", + "description": "[DEPRECATED] This field is deprecated in favor of the Elections column in advertiser_stats table. It will be deleted some time after July 2019.", + "mode": "nullable", + }, + { + "name": "week_start_date", + "type": "date", + "description": "The start date for the week where spending occurred.", + "mode": "nullable", + }, + { + "name": "spend_usd", + "type": "integer", + "description": "The amount in USD spent on election ads during the given week by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_eur", + "type": "integer", + "description": "The amount in EUR spent on election ads during the given week by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_inr", + "type": "integer", + "description": "The amount in INR spent on election ads during the given week by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_bgn", + "type": "integer", + "description": "The amount in BGN spent on election ads during the given week by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_hrk", + "type": "integer", + "description": "The amount in HRK spent on election ads during the given week by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_czk", + "type": "integer", + "description": "The amount in CZK spent on election ads during the given week by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_dkk", + "type": "integer", + "description": "The amount in DKK spent on election ads during the given week by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_huf", + "type": "integer", + "description": "The amount in HUF spent on election ads during the given week by the advertiser.", + 
"mode": "nullable", + }, + { + "name": "spend_pln", + "type": "integer", + "description": "The amount in PLN spent on election ads during the given week by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_ron", + "type": "integer", + "description": "The amount in RON spent on election ads during the given week by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_sek", + "type": "integer", + "description": "The amount in SEK spent on election ads during the given week by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_gbp", + "type": "integer", + "description": "The amount in GBP spent on election ads during the given week by the advertiser.", + "mode": "nullable", + }, + { + "name": "spend_nzd", + "type": "integer", + "description": "The amount in NZD spent on election ads during the given week by the advertiser.", + "mode": "nullable", + }, + ], + ) + + # Run CSV transform within kubernetes pod + transform_campaign_targeting_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_campaign_targeting_csv", + startup_timeout_seconds=600, + name="campaign_targeting", + namespace="composer", + service_account_name="datasets", + image_pull_policy="Always", + image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "SOURCE_GCS_OBJECT": "data/google_political_ads/google-political-ads-transparency-bundle.zip", + "ZIP_FILE": "files/google-political-ads-transparency-bundle.zip", + "CSV_FILE": "google-political-ads-campaign-targeting.csv", + "TARGET_FILE": "files/data_output.csv", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/google_political_ads/campaign_targeting/data_output.csv", + "TABLE_NAME": "campaign_targeting", + "CSV_HEADERS": 
'["campaign_id","age_targeting","gender_targeting","geo_targeting_included","geo_targeting_excluded","start_date","end_date","ads_list","advertiser_id","advertiser_name"]', + "RENAME_MAPPINGS": '{"Campaign_ID": "campaign_id","Age_Targeting": "age_targeting","Gender_Targeting": "gender_targeting","Geo_Targeting_Included": "geo_targeting_included","Geo_Targeting_Excluded": "geo_targeting_excluded","Start_Date": "start_date","End_Date": "end_date","Ads_List": "ads_list","Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name"}', + }, + resources={"request_memory": "1G", "request_cpu": "200m"}, + ) + + # Task to load CSV data to a BigQuery table + load_campaign_targeting_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_campaign_targeting_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=["data/google_political_ads/campaign_targeting/data_output.csv"], + source_format="CSV", + destination_project_dataset_table="google_political_ads.campaign_targeting", + skip_leading_rows=1, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "campaign_id", + "type": "string", + "description": "[DEPRECATED] Unique ID for a political ad campaign.", + "mode": "nullable", + }, + { + "name": "age_targeting", + "type": "string", + "description": "[DEPRECATED] Age ranges included in the campaign's targeting.", + "mode": "nullable", + }, + { + "name": "gender_targeting", + "type": "string", + "description": "[DEPRECATED] Genders included in the campaign's targeting", + "mode": "nullable", + }, + { + "name": "geo_targeting_included", + "type": "string", + "description": "[DEPRECATED] Geographic locations included in the campaign's targeting.", + "mode": "nullable", + }, + { + "name": "geo_targeting_excluded", + "type": "string", + "description": "[DEPRECATED] Geographic locations excluded from the campaign's targeting.", + "mode": "nullable", + }, + { + "name": "start_date", + "type": "date", + "description": "[DEPRECATED] Start 
date for the campaign.", + "mode": "nullable", + }, + { + "name": "end_date", + "type": "date", + "description": "[DEPRECATED] End date for the campaign.", + "mode": "nullable", + }, + { + "name": "ads_list", + "type": "string", + "description": "[DEPRECATED] List of Ad_IDs for the campaign.", + "mode": "nullable", + }, + { + "name": "advertiser_id", + "type": "string", + "description": "[DEPRECATED] ID of the advertiser who purchased the ad.", + "mode": "nullable", + }, + { + "name": "advertiser_name", + "type": "string", + "description": "[DEPRECATED] Name of advertiser.", + "mode": "nullable", + }, + ], + ) + + # Run CSV transform within kubernetes pod + transform_creative_stats_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_creative_stats_csv", + startup_timeout_seconds=600, + name="creative_stats", + namespace="composer", + service_account_name="datasets", + image_pull_policy="Always", + image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "SOURCE_GCS_OBJECT": "data/google_political_ads/google-political-ads-transparency-bundle.zip", + "ZIP_FILE": "files/google-political-ads-transparency-bundle.zip", + "CSV_FILE": "google-political-ads-creative-stats.csv", + "TARGET_FILE": "files/data_output.csv", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/google_political_ads/creative_stats/data_output.csv", + "TABLE_NAME": "creative_stats", + "CSV_HEADERS": 
'["ad_id","ad_url","ad_type","regions","advertiser_id","advertiser_name","ad_campaigns_list","date_range_start","date_range_end","num_of_days","impressions","spend_usd","first_served_timestamp","last_served_timestamp","age_targeting","gender_targeting","geo_targeting_included","geo_targeting_excluded","spend_range_min_usd","spend_range_max_usd","spend_range_min_eur","spend_range_max_eur","spend_range_min_inr","spend_range_max_inr","spend_range_min_bgn","spend_range_max_bgn","spend_range_min_hrk","spend_range_max_hrk","spend_range_min_czk","spend_range_max_czk","spend_range_min_dkk","spend_range_max_dkk","spend_range_min_huf","spend_range_max_huf","spend_range_min_pln","spend_range_max_pln","spend_range_min_ron","spend_range_max_ron","spend_range_min_sek","spend_range_max_sek","spend_range_min_gbp","spend_range_max_gbp","spend_range_min_nzd","spend_range_max_nzd"]', + "RENAME_MAPPINGS": '{"Ad_ID": "ad_id","Ad_URL": "ad_url","Ad_Type": "ad_type","Regions": "regions","Advertiser_ID": "advertiser_id","Advertiser_Name": "advertiser_name","Ad_Campaigns_List": "ad_campaigns_list","Date_Range_Start": "date_range_start","Date_Range_End": "date_range_end","Num_of_Days": "num_of_days","Impressions": "impressions","Spend_USD": "spend_usd","Spend_Range_Min_USD": "spend_range_min_usd","Spend_Range_Max_USD": "spend_range_max_usd","Spend_Range_Min_EUR": "spend_range_min_eur","Spend_Range_Max_EUR": "spend_range_max_eur","Spend_Range_Min_INR": "spend_range_min_inr","Spend_Range_Max_INR": "spend_range_max_inr","Spend_Range_Min_BGN": "spend_range_min_bgn","Spend_Range_Max_BGN": "spend_range_max_bgn","Spend_Range_Min_HRK": "spend_range_min_hrk","Spend_Range_Max_HRK": "spend_range_max_hrk","Spend_Range_Min_CZK": "spend_range_min_czk","Spend_Range_Max_CZK": "spend_range_max_czk","Spend_Range_Min_DKK": "spend_range_min_dkk","Spend_Range_Max_DKK": "spend_range_max_dkk","Spend_Range_Min_HUF": "spend_range_min_huf","Spend_Range_Max_HUF": "spend_range_max_huf","Spend_Range_Min_PLN": 
"spend_range_min_pln","Spend_Range_Max_PLN": "spend_range_max_pln","Spend_Range_Min_RON": "spend_range_min_ron","Spend_Range_Max_RON": "spend_range_max_ron","Spend_Range_Min_SEK": "spend_range_min_sek","Spend_Range_Max_SEK": "spend_range_max_sek","Spend_Range_Min_GBP": "spend_range_min_gbp","Spend_Range_Max_GBP": "spend_range_max_gbp","Spend_Range_Min_NZD": "spend_range_min_nzd","Spend_Range_Max_NZD": "spend_range_max_nzd","Age_Targeting": "age_targeting","Gender_Targeting": "gender_targeting","Geo_Targeting_Included": "geo_targeting_included","Geo_Targeting_Excluded": "geo_targeting_excluded","First_Served_Timestamp": "first_served_timestamp","Last_Served_Timestamp": "last_served_timestamp"}', + }, + resources={ + "request_memory": "16G", + "request_cpu": "2", + "request_ephemeral_storage": "10G", + }, + ) + + # Task to load CSV data to a BigQuery table + load_creative_stats_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_creative_stats_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=["data/google_political_ads/creative_stats/data_output.csv"], + source_format="CSV", + destination_project_dataset_table="google_political_ads.creative_stats", + skip_leading_rows=1, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "ad_id", + "type": "string", + "description": "Unique id for a specific election ad.", + "mode": "nullable", + }, + { + "name": "ad_url", + "type": "string", + "description": "URL to view the election ad in the election Advertising on Google report.", + "mode": "nullable", + }, + { + "name": "ad_type", + "type": "string", + "description": "The type of the ad. 
Can be TEXT VIDEO or IMAGE.", + "mode": "nullable", + }, + { + "name": "regions", + "type": "string", + "description": "The regions that this ad is verified for or were served in.", + "mode": "nullable", + }, + { + "name": "advertiser_id", + "type": "string", + "description": "ID of the advertiser who purchased the ad.", + "mode": "nullable", + }, + { + "name": "advertiser_name", + "type": "string", + "description": "Name of advertiser.", + "mode": "nullable", + }, + { + "name": "ad_campaigns_list", + "type": "string", + "description": "IDs of all election ad campaigns that included the ad.", + "mode": "nullable", + }, + { + "name": "date_range_start", + "type": "date", + "description": "First day a election ad ran and had an impression.", + "mode": "nullable", + }, + { + "name": "date_range_end", + "type": "date", + "description": "Most recent day a election ad ran and had an impression.", + "mode": "nullable", + }, + { + "name": "num_of_days", + "type": "integer", + "description": "Total number of days a election ad ran and had an impression.", + "mode": "nullable", + }, + { + "name": "impressions", + "type": "string", + "description": "Number of impressions for the election ad. 
Impressions are grouped into several buckets ≤ 10k 10k-100k 100k-1M 1M-10M > 10M.", + "mode": "nullable", + }, + { + "name": "spend_usd", + "type": "string", + "description": "[DEPRECATED] This field is deprecated in favor of specifying the lower and higher spend bucket bounds in separate Spend_Range_Min and Spend_Range_Max columns.", + "mode": "nullable", + }, + { + "name": "first_served_timestamp", + "type": "timestamp", + "description": "The timestamp of the earliest impression for this ad.", + "mode": "nullable", + }, + { + "name": "last_served_timestamp", + "type": "timestamp", + "description": "The timestamp of the most recent impression for this ad.", + "mode": "nullable", + }, + { + "name": "age_targeting", + "type": "string", + "description": "Age ranges included in the ad's targeting", + "mode": "nullable", + }, + { + "name": "gender_targeting", + "type": "string", + "description": "Genders included in the ad's targeting.", + "mode": "nullable", + }, + { + "name": "geo_targeting_included", + "type": "string", + "description": "Geographic locations included in the ad's targeting.", + "mode": "nullable", + }, + { + "name": "geo_targeting_excluded", + "type": "string", + "description": "Geographic locations excluded in the ad's targeting.", + "mode": "nullable", + }, + { + "name": "spend_range_min_usd", + "type": "integer", + "description": "Lower bound of the amount in USD spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_max_usd", + "type": "integer", + "description": "Upper bound of the amount in USD spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_min_eur", + "type": "integer", + "description": "Lower bound of the amount in EUR spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_max_eur", + "type": "integer", + "description": "Upper bound of the amount in EUR spent by the advertiser on the election ad.", + 
"mode": "nullable", + }, + { + "name": "spend_range_min_inr", + "type": "integer", + "description": "Lower bound of the amount in INR spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_max_inr", + "type": "integer", + "description": "Upper bound of the amount in INR spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_min_bgn", + "type": "integer", + "description": "Lower bound of the amount in BGN spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_max_bgn", + "type": "integer", + "description": "Upper bound of the amount in BGN spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_min_hrk", + "type": "integer", + "description": "Lower bound of the amount in HRK spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_max_hrk", + "type": "integer", + "description": "Upper bound of the amount in HRK spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_min_czk", + "type": "integer", + "description": "Lower bound of the amount in CZK spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_max_czk", + "type": "integer", + "description": "Upper bound of the amount in CZK spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_min_dkk", + "type": "integer", + "description": "Lower bound of the amount in DKK spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_max_dkk", + "type": "integer", + "description": "Upper bound of the amount in DKK spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_min_huf", + "type": "integer", + "description": "Lower bound of the amount in HUF spent by the advertiser on the election ad.", 
+ "mode": "nullable", + }, + { + "name": "spend_range_max_huf", + "type": "integer", + "description": "Upper bound of the amount in HUF spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_min_pln", + "type": "integer", + "description": "Lower bound of the amount in PLN spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_max_pln", + "type": "integer", + "description": "Upper bound of the amount in PLN spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_min_ron", + "type": "integer", + "description": "Lower bound of the amount in RON spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_max_ron", + "type": "integer", + "description": "Upper bound of the amount in RON spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_min_sek", + "type": "integer", + "description": "Lower bound of the amount in SEK spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_max_sek", + "type": "integer", + "description": "Upper bound of the amount in SEK spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_min_gbp", + "type": "integer", + "description": "Lower bound of the amount in GBP spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_max_gbp", + "type": "integer", + "description": "Upper bound of the amount in GBP spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_min_nzd", + "type": "integer", + "description": "Lower bound of the amount in NZD spent by the advertiser on the election ad.", + "mode": "nullable", + }, + { + "name": "spend_range_max_nzd", + "type": "integer", + "description": "Upper bound of the amount in NZD spent by the advertiser on the election 
ad.", + "mode": "nullable", + }, + ], + ) + + # Run CSV transform within kubernetes pod + transform_geo_spend_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_geo_spend_csv", + startup_timeout_seconds=600, + name="geo_spend", + namespace="composer", + service_account_name="datasets", + image_pull_policy="Always", + image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "SOURCE_GCS_OBJECT": "data/google_political_ads/google-political-ads-transparency-bundle.zip", + "ZIP_FILE": "files/google-political-ads-transparency-bundle.zip", + "CSV_FILE": "google-political-ads-geo-spend.csv", + "TARGET_FILE": "files/data_output.csv", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/google_political_ads/geo_spend/data_output.csv", + "TABLE_NAME": "geo_spend", + "CSV_HEADERS": '["country","country_subdivision_primary","country_subdivision_secondary","spend_usd","spend_eur","spend_inr","spend_bgn","spend_hrk","spend_czk","spend_dkk","spend_huf","spend_pln","spend_ron","spend_sek","spend_gbp","spend_nzd"]', + "RENAME_MAPPINGS": '{"Country": "country","Country_Subdivision_Primary": "country_subdivision_primary","Country_Subdivision_Secondary": "country_subdivision_secondary","Spend_USD": "spend_usd","Spend_EUR": "spend_eur","Spend_INR": "spend_inr","Spend_BGN": "spend_bgn","Spend_HRK": "spend_hrk","Spend_CZK": "spend_czk","Spend_DKK": "spend_dkk","Spend_HUF": "spend_huf","Spend_PLN": "spend_pln","Spend_RON": "spend_ron","Spend_SEK": "spend_sek","Spend_GBP": "spend_gbp","Spend_NZD": "spend_nzd"}', + }, + resources={"request_memory": "1G", "request_cpu": "200m"}, + ) + + # Task to load CSV data to a BigQuery table + load_geo_spend_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_geo_spend_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=["data/google_political_ads/geo_spend/data_output.csv"], + 
source_format="CSV", + destination_project_dataset_table="google_political_ads.geo_spend", + skip_leading_rows=1, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "country", + "type": "string", + "description": 'The country where election ads were served specified in the ISO 3166-1 alpha-2 standard code. For example "US" for United States.', + "mode": "nullable", + }, + { + "name": "country_subdivision_primary", + "type": "string", + "description": 'The primary subdivision of the country where election ads were served specified by the ISO 3166-2 standard code. For example "US-CA" for California state in United States', + "mode": "nullable", + }, + { + "name": "country_subdivision_secondary", + "type": "string", + "description": "The name of the secondary subdivision. For example The name of a US congressional district.", + "mode": "nullable", + }, + { + "name": "spend_usd", + "type": "integer", + "description": "Total amount in USD spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_eur", + "type": "integer", + "description": "Total amount in EUR spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_inr", + "type": "integer", + "description": "Total amount in INR spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_bgn", + "type": "integer", + "description": "Total amount in BGN spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_hrk", + "type": "integer", + "description": "Total amount in HRK spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_czk", + "type": "integer", + "description": "Total amount in CZK spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_dkk", + "type": "integer", + "description": "Total amount in DKK spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_huf", + "type": "integer", 
+ "description": "Total amount in HUF spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_pln", + "type": "integer", + "description": "Total amount in PLN spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_ron", + "type": "integer", + "description": "Total amount in RON spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_sek", + "type": "integer", + "description": "Total amount in SEK spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_gbp", + "type": "integer", + "description": "Total amount in GBP spent on election ads in this region.", + "mode": "nullable", + }, + { + "name": "spend_nzd", + "type": "integer", + "description": "Total amount in NZD spent on election ads in this region.", + "mode": "nullable", + }, + ], + ) + + # Run CSV transform within kubernetes pod + transform_last_updated_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_last_updated_csv", + startup_timeout_seconds=600, + name="last_updated", + namespace="composer", + service_account_name="datasets", + image_pull_policy="Always", + image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "SOURCE_GCS_OBJECT": "data/google_political_ads/google-political-ads-transparency-bundle.zip", + "ZIP_FILE": "files/google-political-ads-transparency-bundle.zip", + "CSV_FILE": "google-political-ads-updated.csv", + "TARGET_FILE": "files/data_output.csv", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/google_political_ads/last_updated/data_output.csv", + "TABLE_NAME": "last_updated", + "CSV_HEADERS": '["report_data_updated_time"]', + "RENAME_MAPPINGS": '{"Report_Data_Updated_Time (PT)": "report_data_updated_time"}', + }, + resources={"request_memory": "128M", "request_cpu": "200m"}, + ) + + # Task to load CSV data to 
a BigQuery table + load_last_updated_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_last_updated_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=["data/google_political_ads/last_updated/data_output.csv"], + source_format="CSV", + destination_project_dataset_table="google_political_ads.last_updated", + skip_leading_rows=1, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "report_data_updated_time", + "type": "datetime", + "description": "The time the report data was most recently updated", + "mode": "nullable", + } + ], + ) + + # Run CSV transform within kubernetes pod + transform_top_keywords_history_csv = kubernetes_pod.KubernetesPodOperator( + task_id="transform_top_keywords_history_csv", + startup_timeout_seconds=600, + name="top_keywords_history", + namespace="composer", + service_account_name="datasets", + image_pull_policy="Always", + image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", + env_vars={ + "SOURCE_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "SOURCE_GCS_OBJECT": "data/google_political_ads/google-political-ads-transparency-bundle.zip", + "ZIP_FILE": "files/google-political-ads-transparency-bundle.zip", + "CSV_FILE": "google-political-ads-top-keywords-history.csv", + "TARGET_FILE": "files/data_output.csv", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/google_political_ads/top_keywords_history/data_output.csv", + "TABLE_NAME": "top_keywords_history", + "CSV_HEADERS": '["election_cycle","report_date","keyword_1","spend_usd_1","keyword_2","spend_usd_2","keyword_3","spend_usd_3","keyword_4","spend_usd_4","keyword_5","spend_usd_5","keyword_6","spend_usd_6","region","elections"]', + "RENAME_MAPPINGS": '{"Election_Cycle": "election_cycle","Report_Date": "report_date","Keyword_1": "keyword_1","Spend_USD_1": "spend_usd_1","Keyword_2": "keyword_2","Spend_USD_2": "spend_usd_2","Keyword_3": "keyword_3","Spend_USD_3": 
"spend_usd_3","Keyword_4": "keyword_4","Spend_USD_4": "spend_usd_4","Keyword_5": "keyword_5","Spend_USD_5": "spend_usd_5","Keyword_6": "keyword_6","Spend_USD_6": "spend_usd_6","Region": "region","Elections": "elections"}', + }, + resources={"request_memory": "1G", "request_cpu": "200m"}, + ) + + # Task to load CSV data to a BigQuery table + load_top_keywords_history_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( + task_id="load_top_keywords_history_to_bq", + bucket="{{ var.value.composer_bucket }}", + source_objects=[ + "data/google_political_ads/top_keywords_history/data_output.csv" + ], + source_format="CSV", + destination_project_dataset_table="google_political_ads.top_keywords_history", + skip_leading_rows=1, + write_disposition="WRITE_TRUNCATE", + schema_fields=[ + { + "name": "election_cycle", + "type": "string", + "description": "[DEPRECATED] This field is deprecated in favor of the Region and Elections field. It will be deleted some time after July 2019.", + "mode": "nullable", + }, + { + "name": "report_date", + "type": "date", + "description": "[DEPRECATED] The start date for the week where the spending was reported.", + "mode": "nullable", + }, + { + "name": "keyword_1", + "type": "string", + "description": " [DEPRECATED] Keyword with the most spend by advertisers for political ads", + "mode": "nullable", + }, + { + "name": "spend_usd_1", + "type": "integer", + "description": "[DEPRECATED] Total spend in USD for Keyword_1.", + "mode": "nullable", + }, + { + "name": "keyword_2", + "type": "string", + "description": "[DEPRECATED] Keyword with the next most spend by advertisers for political ads", + "mode": "nullable", + }, + { + "name": "spend_usd_2", + "type": "integer", + "description": "[DEPRECATED] Total spend in USD for Keyword_2.", + "mode": "nullable", + }, + { + "name": "keyword_3", + "type": "string", + "description": "[DEPRECATED] Keyword with the next most spend by advertisers for political ads", + "mode": "nullable", + }, + { + "name": 
"spend_usd_3", + "type": "integer", + "description": "[DEPRECATED] Total spend in USD for Keyword_3.", + "mode": "nullable", + }, + { + "name": "keyword_4", + "type": "string", + "description": "[DEPRECATED] Keyword with the next most spend by advertisers for political ads", + "mode": "nullable", + }, + { + "name": "spend_usd_4", + "type": "integer", + "description": "[DEPRECATED] Total spend in USD for Keyword_4.", + "mode": "nullable", + }, + { + "name": "keyword_5", + "type": "string", + "description": "[DEPRECATED] Keyword with the next most spend by advertisers for political ads", + "mode": "nullable", + }, + { + "name": "spend_usd_5", + "type": "integer", + "description": "[DEPRECATED] Total spend in USD for Keyword_5.", + "mode": "nullable", + }, + { + "name": "keyword_6", + "type": "string", + "description": "[DEPRECATED] Keyword with the next most spend by advertisers for political ads", + "mode": "nullable", + }, + { + "name": "spend_usd_6", + "type": "integer", + "description": "[DEPRECATED] Total spend in USD for Keyword_6.", + "mode": "nullable", + }, + { + "name": "region", + "type": "string", + "description": "[DEPRECATED] The region where advertisers used these keywords.", + "mode": "nullable", + }, + { + "name": "elections", + "type": "string", + "description": "[DEPRECATED] The elections during which these keywords were used.", + "mode": "nullable", + }, + ], + ) + + download_zip_file_to_composer_bucket >> [ + transform_advertiser_declared_stats_csv, + transform_advertiser_geo_spend_csv, + transform_advertiser_stats_csv, + transform_advertiser_weekly_spend_csv, + transform_campaign_targeting_csv, + transform_creative_stats_csv, + transform_geo_spend_csv, + transform_last_updated_csv, + transform_top_keywords_history_csv, + ] + transform_advertiser_declared_stats_csv >> load_advertiser_declared_stats_to_bq + transform_advertiser_geo_spend_csv >> load_advertiser_geo_spend_to_bq + transform_advertiser_stats_csv >> load_advertiser_stats_to_bq + 
transform_advertiser_weekly_spend_csv >> load_advertiser_weekly_spend_to_bq + transform_campaign_targeting_csv >> load_campaign_targeting_to_bq + transform_creative_stats_csv >> load_creative_stats_to_bq + transform_geo_spend_csv >> load_geo_spend_to_bq + transform_last_updated_csv >> load_last_updated_to_bq + transform_top_keywords_history_csv >> load_top_keywords_history_to_bq diff --git a/datasets/google_political_ads/pipelines/top_keywords_history/pipeline.yaml b/datasets/google_political_ads/pipelines/top_keywords_history/pipeline.yaml deleted file mode 100644 index b3037faa0..000000000 --- a/datasets/google_political_ads/pipelines/top_keywords_history/pipeline.yaml +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -resources: - - - type: bigquery_table - # Required Properties: - table_id: top_keywords_history - - # Description of the table - description: "The “Top Keywords” section of the US report was removed and updates to this table were terminated in December 2019. The table reflects historical data. This table contains the information for the top six keywords on which political advertisers have spent money during an election cycle. This data is only provided for US elections. The primary key is election_cycle, report_date." 
- -dag: - airflow_version: 2 - initialize: - dag_id: top_keywords_history - default_args: - owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded - depends_on_past: False - start_date: '2021-03-01' - max_active_runs: 1 - schedule_interval: "@daily" - catchup: False - default_view: graph - - tasks: - - operator: "KubernetesPodOperator" - - # Task description - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "top_keywords_history_transform_csv" - - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id - name: "top_keywords_history" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - namespace: "composer" - service_account_name: "datasets" - - image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. - image: "{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. 
- env_vars: - SOURCE_URL: "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip" - SOURCE_FILE: "files/data.zip" - FILE_NAME: "google-political-ads-transparency-bundle/google-political-ads-top-keywords-history.csv" - TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/google_political_ads/top_keywords_history/data_output.csv" - PIPELINE_NAME: "top_keywords_history" - CSV_HEADERS: >- - ["election_cycle","report_date","keyword_1","spend_usd_1","keyword_2","spend_usd_2","keyword_3","spend_usd_3","keyword_4","spend_usd_4","keyword_5","spend_usd_5","keyword_6","spend_usd_6","region","elections"] - RENAME_MAPPINGS: >- - {"Election_Cycle": "election_cycle","Report_Date": "report_date","Keyword_1": "keyword_1","Spend_USD_1": "spend_usd_1","Keyword_2": "keyword_2","Spend_USD_2": "spend_usd_2","Keyword_3": "keyword_3","Spend_USD_3": "spend_usd_3","Keyword_4": "keyword_4","Spend_USD_4": "spend_usd_4","Keyword_5": "keyword_5","Spend_USD_5": "spend_usd_5","Keyword_6": "keyword_6","Spend_USD_6": "spend_usd_6","Region": "region","Elections": "elections"} - - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - request_ephemeral_storage: "5G" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_top_keywords_history_to_bq" - - # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file - source_objects: ["data/google_political_ads/top_keywords_history/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "google_political_ads.top_keywords_history" - - # Use this if your CSV file contains a header row - skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition - write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - - schema_fields: - - name: "election_cycle" - type: "string" - description: "[DEPRECATED] This field is deprecated in favor of the Region and Elections field. It will be deleted some time after July 2019." - mode: "nullable" - - name: "report_date" - type: "date" - description: "[DEPRECATED] The start date for the week where the spending was reported." - mode: "nullable" - - name: "keyword_1" - type: "string" - description: " [DEPRECATED] Keyword with the most spend by advertisers for political ads" - mode: "nullable" - - name: "spend_usd_1" - type: "integer" - description: "[DEPRECATED] Total spend in USD for Keyword_1." - mode: "nullable" - - name: "keyword_2" - type: "string" - description: "[DEPRECATED] Keyword with the next most spend by advertisers for political ads" - mode: "nullable" - - name: "spend_usd_2" - type: "integer" - description: "[DEPRECATED] Total spend in USD for Keyword_2." - mode: "nullable" - - name: "keyword_3" - type: "string" - description: "[DEPRECATED] Keyword with the next most spend by advertisers for political ads" - mode: "nullable" - - name: "spend_usd_3" - type: "integer" - description: "[DEPRECATED] Total spend in USD for Keyword_3." 
- mode: "nullable" - - name: "keyword_4" - type: "string" - description: "[DEPRECATED] Keyword with the next most spend by advertisers for political ads" - mode: "nullable" - - name: "spend_usd_4" - type: "integer" - description: "[DEPRECATED] Total spend in USD for Keyword_4." - mode: "nullable" - - name: "keyword_5" - type: "string" - description: "[DEPRECATED] Keyword with the next most spend by advertisers for political ads" - mode: "nullable" - - name: "spend_usd_5" - type: "integer" - description: "[DEPRECATED] Total spend in USD for Keyword_5." - mode: "nullable" - - name: "keyword_6" - type: "string" - description: "[DEPRECATED] Keyword with the next most spend by advertisers for political ads" - mode: "nullable" - - name: "spend_usd_6" - type: "integer" - description: "[DEPRECATED] Total spend in USD for Keyword_6." - mode: "nullable" - - name: "region" - type: "string" - description: "[DEPRECATED] The region where advertisers used these keywords." - mode: "nullable" - - name: "elections" - type: "string" - description: "[DEPRECATED] The elections during which these keywords were used." - mode: "nullable" - graph_paths: - - "top_keywords_history_transform_csv >> load_top_keywords_history_to_bq" diff --git a/datasets/google_political_ads/pipelines/top_keywords_history/top_keywords_history_dag.py b/datasets/google_political_ads/pipelines/top_keywords_history/top_keywords_history_dag.py deleted file mode 100644 index 4e4935649..000000000 --- a/datasets/google_political_ads/pipelines/top_keywords_history/top_keywords_history_dag.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-03-01", -} - - -with DAG( - dag_id="google_political_ads.top_keywords_history", - default_args=default_args, - max_active_runs=1, - schedule_interval="@daily", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - top_keywords_history_transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="top_keywords_history_transform_csv", - startup_timeout_seconds=600, - name="top_keywords_history", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.google_political_ads.container_registry.run_csv_transform_kub }}", - env_vars={ - "SOURCE_URL": "https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip", - "SOURCE_FILE": "files/data.zip", - "FILE_NAME": "google-political-ads-transparency-bundle/google-political-ads-top-keywords-history.csv", - "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/google_political_ads/top_keywords_history/data_output.csv", - "PIPELINE_NAME": "top_keywords_history", - "CSV_HEADERS": 
'["election_cycle","report_date","keyword_1","spend_usd_1","keyword_2","spend_usd_2","keyword_3","spend_usd_3","keyword_4","spend_usd_4","keyword_5","spend_usd_5","keyword_6","spend_usd_6","region","elections"]', - "RENAME_MAPPINGS": '{"Election_Cycle": "election_cycle","Report_Date": "report_date","Keyword_1": "keyword_1","Spend_USD_1": "spend_usd_1","Keyword_2": "keyword_2","Spend_USD_2": "spend_usd_2","Keyword_3": "keyword_3","Spend_USD_3": "spend_usd_3","Keyword_4": "keyword_4","Spend_USD_4": "spend_usd_4","Keyword_5": "keyword_5","Spend_USD_5": "spend_usd_5","Keyword_6": "keyword_6","Spend_USD_6": "spend_usd_6","Region": "region","Elections": "elections"}', - }, - resources={ - "request_memory": "2G", - "request_cpu": "1", - "request_ephemeral_storage": "5G", - }, - ) - - # Task to load CSV data to a BigQuery table - load_top_keywords_history_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_top_keywords_history_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=[ - "data/google_political_ads/top_keywords_history/data_output.csv" - ], - source_format="CSV", - destination_project_dataset_table="google_political_ads.top_keywords_history", - skip_leading_rows=1, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - { - "name": "election_cycle", - "type": "string", - "description": "[DEPRECATED] This field is deprecated in favor of the Region and Elections field. 
It will be deleted some time after July 2019.", - "mode": "nullable", - }, - { - "name": "report_date", - "type": "date", - "description": "[DEPRECATED] The start date for the week where the spending was reported.", - "mode": "nullable", - }, - { - "name": "keyword_1", - "type": "string", - "description": " [DEPRECATED] Keyword with the most spend by advertisers for political ads", - "mode": "nullable", - }, - { - "name": "spend_usd_1", - "type": "integer", - "description": "[DEPRECATED] Total spend in USD for Keyword_1.", - "mode": "nullable", - }, - { - "name": "keyword_2", - "type": "string", - "description": "[DEPRECATED] Keyword with the next most spend by advertisers for political ads", - "mode": "nullable", - }, - { - "name": "spend_usd_2", - "type": "integer", - "description": "[DEPRECATED] Total spend in USD for Keyword_2.", - "mode": "nullable", - }, - { - "name": "keyword_3", - "type": "string", - "description": "[DEPRECATED] Keyword with the next most spend by advertisers for political ads", - "mode": "nullable", - }, - { - "name": "spend_usd_3", - "type": "integer", - "description": "[DEPRECATED] Total spend in USD for Keyword_3.", - "mode": "nullable", - }, - { - "name": "keyword_4", - "type": "string", - "description": "[DEPRECATED] Keyword with the next most spend by advertisers for political ads", - "mode": "nullable", - }, - { - "name": "spend_usd_4", - "type": "integer", - "description": "[DEPRECATED] Total spend in USD for Keyword_4.", - "mode": "nullable", - }, - { - "name": "keyword_5", - "type": "string", - "description": "[DEPRECATED] Keyword with the next most spend by advertisers for political ads", - "mode": "nullable", - }, - { - "name": "spend_usd_5", - "type": "integer", - "description": "[DEPRECATED] Total spend in USD for Keyword_5.", - "mode": "nullable", - }, - { - "name": "keyword_6", - "type": "string", - "description": "[DEPRECATED] Keyword with the next most spend by advertisers for political ads", - "mode": "nullable", - }, - 
{ - "name": "spend_usd_6", - "type": "integer", - "description": "[DEPRECATED] Total spend in USD for Keyword_6.", - "mode": "nullable", - }, - { - "name": "region", - "type": "string", - "description": "[DEPRECATED] The region where advertisers used these keywords.", - "mode": "nullable", - }, - { - "name": "elections", - "type": "string", - "description": "[DEPRECATED] The elections during which these keywords were used.", - "mode": "nullable", - }, - ], - ) - - top_keywords_history_transform_csv >> load_top_keywords_history_to_bq