From 0f1c696c73a1ccc16ded61de39db6aa8d06806e3 Mon Sep 17 00:00:00 2001 From: Nicholas Large <84149918+nlarge-google@users.noreply.github.com> Date: Fri, 10 Jun 2022 12:07:22 -0500 Subject: [PATCH] feat: Onboard NOAA datasets (#353) --- datasets/noaa/infra/noaa_pipeline.tf | 160 +++ datasets/noaa/infra/variables.tf | 3 + .../_images/ghcnd_by_year_schema.json | 62 + .../_images/ghcnd_countries_schema.json | 24 + .../_images/ghcnd_hurricanes_schema.json | 920 +++++++++++++++ .../_images/ghcnd_inventory_schema.json | 44 + .../_images/ghcnd_states_schema.json | 24 + .../_images/ghcnd_stations_schema.json | 59 + .../_images/gsod_stations_schema.json | 69 ++ .../noaa_lightning_strikes_schema.json | 32 + .../Dockerfile | 17 - .../run_csv_transform_kub/csv_transform.py | 1034 +++++++++++++++++ .../run_csv_transform_kub/requirements.txt | 5 + .../csv_transform.py | 259 ----- .../requirements.txt | 3 - .../Dockerfile | 38 - .../csv_transform.py | 142 --- .../requirements.txt | 3 - .../gsod_stations/gsod_stations_dag.py | 81 -- .../pipelines/gsod_stations/pipeline.yaml | 164 --- datasets/noaa/pipelines/noaa/noaa_dag.py | 370 ++++++ datasets/noaa/pipelines/noaa/pipeline.yaml | 957 +++++++++++++++ 22 files changed, 3763 insertions(+), 707 deletions(-) create mode 100644 datasets/noaa/infra/noaa_pipeline.tf create mode 100644 datasets/noaa/pipelines/_images/ghcnd_by_year_schema.json create mode 100644 datasets/noaa/pipelines/_images/ghcnd_countries_schema.json create mode 100644 datasets/noaa/pipelines/_images/ghcnd_hurricanes_schema.json create mode 100644 datasets/noaa/pipelines/_images/ghcnd_inventory_schema.json create mode 100644 datasets/noaa/pipelines/_images/ghcnd_states_schema.json create mode 100644 datasets/noaa/pipelines/_images/ghcnd_stations_schema.json create mode 100644 datasets/noaa/pipelines/_images/gsod_stations_schema.json create mode 100644 datasets/noaa/pipelines/_images/noaa_lightning_strikes_schema.json rename datasets/noaa/pipelines/_images/{run_csv_transform_kub_gsod_stations => run_csv_transform_kub}/Dockerfile (54%) create mode 100644 datasets/noaa/pipelines/_images/run_csv_transform_kub/csv_transform.py create mode 100644 datasets/noaa/pipelines/_images/run_csv_transform_kub/requirements.txt delete mode 100644 datasets/noaa/pipelines/_images/run_csv_transform_kub_gsod_stations/csv_transform.py delete mode 100644 datasets/noaa/pipelines/_images/run_csv_transform_kub_gsod_stations/requirements.txt delete mode 100644 datasets/noaa/pipelines/_images/run_csv_transform_kub_lightning_strikes_by_year/Dockerfile delete mode 100644 datasets/noaa/pipelines/_images/run_csv_transform_kub_lightning_strikes_by_year/csv_transform.py delete mode 100644 datasets/noaa/pipelines/_images/run_csv_transform_kub_lightning_strikes_by_year/requirements.txt delete mode 100644 datasets/noaa/pipelines/gsod_stations/gsod_stations_dag.py delete mode 100644 datasets/noaa/pipelines/gsod_stations/pipeline.yaml create mode 100644 datasets/noaa/pipelines/noaa/noaa_dag.py create mode 100644 datasets/noaa/pipelines/noaa/pipeline.yaml diff --git a/datasets/noaa/infra/noaa_pipeline.tf b/datasets/noaa/infra/noaa_pipeline.tf new file mode 100644 index 000000000..9a745c853 --- /dev/null +++ b/datasets/noaa/infra/noaa_pipeline.tf @@ -0,0 +1,160 @@ +/** + * Copyright 2021 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +resource "google_bigquery_table" "noaa_ghcnd_by_year" { + project = var.project_id + dataset_id = "noaa" + table_id = "ghcnd_by_year" + description = "noaaspc" + depends_on = [ + google_bigquery_dataset.noaa + ] +} + +output "bigquery_table-noaa_ghcnd_by_year-table_id" { + value = google_bigquery_table.noaa_ghcnd_by_year.table_id +} + +output "bigquery_table-noaa_ghcnd_by_year-id" { + value = google_bigquery_table.noaa_ghcnd_by_year.id +} + +resource "google_bigquery_table" "noaa_ghcnd_countries" { + project = var.project_id + dataset_id = "noaa" + table_id = "ghcnd_countries" + description = "noaaspc" + depends_on = [ + google_bigquery_dataset.noaa + ] +} + +output "bigquery_table-noaa_ghcnd_countries-table_id" { + value = google_bigquery_table.noaa_ghcnd_countries.table_id +} + +output "bigquery_table-noaa_ghcnd_countries-id" { + value = google_bigquery_table.noaa_ghcnd_countries.id +} + +resource "google_bigquery_table" "noaa_ghcnd_inventory" { + project = var.project_id + dataset_id = "noaa" + table_id = "ghcnd_inventory" + description = "noaaspc" + depends_on = [ + google_bigquery_dataset.noaa + ] +} + +output "bigquery_table-noaa_ghcnd_inventory-table_id" { + value = google_bigquery_table.noaa_ghcnd_inventory.table_id +} + +output "bigquery_table-noaa_ghcnd_inventory-id" { + value = google_bigquery_table.noaa_ghcnd_inventory.id +} + +resource "google_bigquery_table" "noaa_ghcnd_states" { + project = var.project_id + dataset_id = "noaa" + table_id = "ghcnd_states" + description = "noaaspc" + depends_on = [ + google_bigquery_dataset.noaa + ] +} + +output "bigquery_table-noaa_ghcnd_states-table_id" { + value = google_bigquery_table.noaa_ghcnd_states.table_id +} + +output "bigquery_table-noaa_ghcnd_states-id" { + value = google_bigquery_table.noaa_ghcnd_states.id +} + +resource "google_bigquery_table" "noaa_ghcnd_stations" { + project = var.project_id + dataset_id = "noaa" + table_id = "ghcnd_stations" + description = "noaaspc" + depends_on = [ + google_bigquery_dataset.noaa + ] +} + +output "bigquery_table-noaa_ghcnd_stations-table_id" { + value = google_bigquery_table.noaa_ghcnd_stations.table_id +} + +output "bigquery_table-noaa_ghcnd_stations-id" { + value = google_bigquery_table.noaa_ghcnd_stations.id +} + +resource "google_bigquery_table" "noaa_gsod_stations" { + project = var.project_id + dataset_id = "noaa" + table_id = "gsod_stations" + description = "noaaspc" + depends_on = [ + google_bigquery_dataset.noaa + ] +} + +output "bigquery_table-noaa_gsod_stations-table_id" { + value = google_bigquery_table.noaa_gsod_stations.table_id +} + +output "bigquery_table-noaa_gsod_stations-id" { + value = google_bigquery_table.noaa_gsod_stations.id +} + +resource "google_bigquery_table" "noaa_hurricanes" { + project = var.project_id + dataset_id = "noaa" + table_id = "hurricanes" + description = "noaaspc" + depends_on = [ + google_bigquery_dataset.noaa + ] +} + +output "bigquery_table-noaa_hurricanes-table_id" { + value = google_bigquery_table.noaa_hurricanes.table_id +} + +output "bigquery_table-noaa_hurricanes-id" { + value = 
google_bigquery_table.noaa_hurricanes.id +} + +resource "google_bigquery_table" "noaa_lightning_strikes_by_year" { + project = var.project_id + dataset_id = "noaa" + table_id = "lightning_strikes_by_year" + description = "noaaspc" + depends_on = [ + google_bigquery_dataset.noaa + ] +} + +output "bigquery_table-noaa_lightning_strikes_by_year-table_id" { + value = google_bigquery_table.noaa_lightning_strikes_by_year.table_id +} + +output "bigquery_table-noaa_lightning_strikes_by_year-id" { + value = google_bigquery_table.noaa_lightning_strikes_by_year.id +} diff --git a/datasets/noaa/infra/variables.tf b/datasets/noaa/infra/variables.tf index c3ec7c506..53f483735 100644 --- a/datasets/noaa/infra/variables.tf +++ b/datasets/noaa/infra/variables.tf @@ -20,4 +20,7 @@ variable "bucket_name_prefix" {} variable "impersonating_acct" {} variable "region" {} variable "env" {} +variable "iam_policies" { + default = {} +} diff --git a/datasets/noaa/pipelines/_images/ghcnd_by_year_schema.json b/datasets/noaa/pipelines/_images/ghcnd_by_year_schema.json new file mode 100644 index 000000000..9abc8760e --- /dev/null +++ b/datasets/noaa/pipelines/_images/ghcnd_by_year_schema.json @@ -0,0 +1,62 @@ +[ + { + "name": "id", + "type": "STRING", + "description": "", + "mode": "REQUIRED" + }, + { + "name": "date", + "type": "DATE", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "element", + "type": "STRING", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "value", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "mflag", + "type": "STRING", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "qflag", + "type": "STRING", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "sflag", + "type": "STRING", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "time", + "type": "STRING", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "source_url", + "type": "STRING", + "description": "Source ", + "mode": "NULLABLE" + }, + { + "name": "etl_timestamp", + "type": "TIMESTAMP", + "description": "Load time for this data row", + "mode": "NULLABLE" + } +] diff --git a/datasets/noaa/pipelines/_images/ghcnd_countries_schema.json b/datasets/noaa/pipelines/_images/ghcnd_countries_schema.json new file mode 100644 index 000000000..acbb78ff5 --- /dev/null +++ b/datasets/noaa/pipelines/_images/ghcnd_countries_schema.json @@ -0,0 +1,24 @@ +[ + { + "name": "code", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "name", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "source_url", + "type": "STRING", + "description": "Source ", + "mode": "NULLABLE" + }, + { + "name": "etl_timestamp", + "type": "TIMESTAMP", + "description": "Load time for this data row", + "mode": "NULLABLE" + } +] diff --git a/datasets/noaa/pipelines/_images/ghcnd_hurricanes_schema.json b/datasets/noaa/pipelines/_images/ghcnd_hurricanes_schema.json new file mode 100644 index 000000000..19d711172 --- /dev/null +++ b/datasets/noaa/pipelines/_images/ghcnd_hurricanes_schema.json @@ -0,0 +1,920 @@ +[ + { + "name": "sid", + "type": "STRING", + "description": "Storm Identifier.", + "mode": "NULLABLE" + }, + { + "name": "season", + "type": "STRING", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "number", + "type": "INTEGER", + "description": "The cardinal number of the system for that season. 
The count includes all basins/nso this will not be continuous for basin files.", + "mode": "NULLABLE" + }, + { + "name": "basin", + "type": "STRING", + "description": "Basins include: NA - North Atlantic EP - Eastern North Pacific WP - Western North Pacific NI - North Indian SI - South Indian SP - Southern Pacific SA - South Atlantic MM - Missing - should not appear in final IBTrACS product", + "mode": "NULLABLE" + }, + { + "name": "subbasin", + "type": "STRING", + "description": "Subbasins include: MM - missing - no sub basin for this basin (no subbasins provided for WP/nSI) CS - Caribbean Sea GM - Gulf of Mexico CP - Central Pacific BB - Bay of Bengal AS - Arabian Sea WA - Western Australia EA - Eastern Australia", + "mode": "NULLABLE" + }, + { + "name": "name", + "type": "STRING", + "description": "Name provided by the agency. IBTrACS ignores most names that include digits or abbreviations.", + "mode": "NULLABLE" + }, + { + "name": "iso_time", + "type": "TIMESTAMP", + "description": "ISO Time provided in Universal Time Coordinates (UTC). Format is YYYY-MM-DD HH:mm:ss Most points are provided at 6 hour intervals. Some agencies provided 3 hour points (e.g./nNew Delhi) or times at important observations (e.g./nlandfall times in the North Atlantic/netc.).", + "mode": "NULLABLE" + }, + { + "name": "nature", + "type": "STRING", + "description": "Combined storm type. This is assigned based on all available storm types. They include: DS - Disturbance TS - Tropical ET - Extratropical SS - Subtropical NR - Not reported MX - Mixture (contradicting nature reports from different agencies)", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "wmo_wind", + "type": "INTEGER", + "description": "Maximum sustained wind speed from the WMO agency for the current location. NO adjustment is made for differences in wind speed averaging periods. hurdat/atcf = North Atlantic - U.S. Miami (NOAA NHC) - 1-minute winds tokyo = RSMC Tokyo (JMA) - 10-minute newdelhi = RSMC New Delhi (IMD) - 3-minute reunion = RSMC La Reunion (MFLR) - 10 minute bom = Australian TCWCs (TCWC Perth/nDarwin/nBrisbane) - 10-minute nadi = RSMC Nadi (FMS) - 10 minute wellington = TCWC Wellington (NZMS) - 10-minute", + "mode": "NULLABLE" + }, + { + "name": "wmo_pressure", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "wmo_agency", + "type": "STRING", + "description": "This is the reporting agency responsible for the basin as currently listed. It should be noted that many of the agencies did not accept official WMO responsibility until relatively recently/ne.g./nLa Reunion in 1993 or IMD in 1990. Therefore the WMO agency is used loosely todescribe the currently reponsible agency.", + "mode": "NULLABLE" + }, + { + "name": "track_type", + "type": "STRING", + "description": "Track type Tropical storms can interact. This identifies : PROVISIONAL - Real time data used to populate the position and other parameters of this system. This is a provisional track that will be replaced when reanalysis of the storm is performed. (Usually within 2 years of the storm's occurence) main - primary track associated with a storm system. spur - usually short lived tracks associated with a main track and either represent alternate positions at the beginning of a system. 
Can also represent actual system interactions (e.g./nFujiwhara interactions).", + "mode": "NULLABLE" + }, + { + "name": "dist2land", + "type": "INTEGER", + "description": "Distance to land from the current position. The land dataset includes all continents and any islands larger than XX. The distance is the nearest at the present time in any direction.", + "mode": "NULLABLE" + }, + { + "name": "landfall", + "type": "INTEGER", + "description": "Nearest location to land within next 6 hours. This can be thought of a landfall flag: =0 -- Landfall within 6 hours. >0 -- No landfall within next 6 hours. Calculations are based on storm center (columns 9,10). Values less than 60 nmile likely are impacted by the system even though the center of the system is not over land. The uses the same land mask as DIST2LAND.", + "mode": "NULLABLE" + }, + { + "name": "iflag", + "type": "STRING", + "description": "Interpolation Flag A 14 character flag string which denotes the source of each agency's report: Interpolation Flags include: _ == missing reports. No information provided. O == original report as provided by the agency. P == position was interpolated (all variables were interpolated/filled/nincluding intensity) I == Position was provided/nbut Intensity variables (and likely other variables) were interpolated/filled V = Position and intensity variables are original but some variables were interpolated/filled. The order of the 14 characters refers to the following 14 datasets: 1 - USA Agency (see column 18) 2 - Tokyo 3 - CMA 4 - HKO 5 - NewDelhi 6 - Reunion 7 - BoM 8 - Nadi 9 - Wellington 10 - ds824 11 - TD9636 12 - TD9635 13 - Neumann Southern Hemisphere data set 14 - M.L. Chenoweth N Atlantic Historic dataset", + "mode": "NULLABLE" + }, + { + "name": "usa_agency", + "type": "STRING", + "description": "The agency file providing the information: The representative US agency data is derived from a hierarchical selection: the first dataset in the following list to provide information at the given time is used as the USA_agency: - HURDAT_ATL - HURSAT_EPA - ATCF (for NA and EP basins only) - JTWC_WP - JTWC_IO - JTWC_EP - JTWC_CP - JTWC_SH - CPHC [separate file provided by CPHC for years TBD] - tcvitals - THIS INDICATES THAT THE DATA ARE PRELIMINARY While these agencies are generally orthogonal/nthere are cases where a system is provided in more than one source. In this case/nthe report from the highest source is used. ATCF format info from: https://www.nrlmry.navy.mil/atcf_web/docs/database/new/abdeck.txt HURDAT2 info from: http://www.nhc.noaa.gov/data/hurdat/hurdat2-format-atlantic.pdf", + "mode": "NULLABLE" + }, + { + "name": "usa_latitude", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "usa_longitude", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "usa_record", + "type": "STRING", + "description": "Record identifier (see notes below) C – Closest approach to a coast/nnot followed by a landfall G – Genesis I – An intensity peak in terms of both pressure and wind L – Landfall (center of system crossing a coastline) P – Minimum in central pressure R – Provides additional detail on the intensity of the cyclone when rapid changes are underway S – Change of status of the system T – Provides additional detail on the track (position) of the cyclone W – Maximum sustained wind speed", + "mode": "NULLABLE" + }, + { + "name": "usa_status", + "type": "STRING", + "description": "Status of system. 
Options are: DB - disturbance/n TD - tropical depression/n TS - tropical storm/n TY - typhoon/n ST - super typhoon/n TC - tropical cyclone/n HU,HR - hurricane/n SD - subtropical depression/n SS - subtropical storm/n EX - extratropical systems/n PT - post tropical/n IN - inland/n DS - dissipating/n LO - low/n WV - tropical wave/n ET - extrapolated/n MD - monsoon depression/n XX - unknown.", + "mode": "NULLABLE" + }, + { + "name": "usa_wind", + "type": "INTEGER", + "description": "Maximum sustained wind speed in knots: 0 - 300 kts.", + "mode": "NULLABLE" + }, + { + "name": "usa_pressure", + "type": "INTEGER", + "description": "Minimum central pressure (mb)", + "mode": "NULLABLE" + }, + { + "name": "usa_sshs", + "type": "INTEGER", + "description": "Saffir-Simpson Hurricane Scale information based on the wind speed provided by the US agency wind speed (US agencies provide 1-minute wind speeds) -5 = Unknown [XX] -4 = Post-tropical [EX/nET/nPT] -3 = Miscellaneous disturbances [WV/nLO/nDB/nDS/nIN/nMD] -2 = Subtropical [SS/nSD] Tropical systems classified based on wind speeds [TD/nTS/nHU/nTY,/nTC/nST/nHR] -1 = Tropical depression (W<34) 0 = Tropical storm [34= 137]", + "mode": "NULLABLE" + }, + { + "name": "usa_r34_ne", + "type": "INTEGER", + "description": "– 34 kt wind radii maximum extent in northeastern quadrant", + "mode": "NULLABLE" + }, + { + "name": "usa_r34_se", + "type": "INTEGER", + "description": "34 kt wind radii maximum extent in southeastern quadrant", + "mode": "NULLABLE" + }, + { + "name": "usa_r34_sw", + "type": "INTEGER", + "description": "– 34 kt wind radii maximum extent in southwestern quadrant", + "mode": "NULLABLE" + }, + { + "name": "usa_r34_nw", + "type": "INTEGER", + "description": "– 34 kt wind radii maximum extent in northwestern quadrant", + "mode": "NULLABLE" + }, + { + "name": "usa_r50_ne", + "type": "INTEGER", + "description": "50 kt wind radii maximum extent in northeastern quadrant", + "mode": "NULLABLE" + }, + { + "name": "usa_r50_se", + "type": "INTEGER", + "description": "– 50 kt wind radii maximum extent in southeastern quadrant", + "mode": "NULLABLE" + }, + { + "name": "usa_r50_sw", + "type": "INTEGER", + "description": "– 50 kt wind radii maximum extent in southwestern quadrant", + "mode": "NULLABLE" + }, + { + "name": "usa_r50_nw", + "type": "INTEGER", + "description": "50 kt wind radii maximum extent in northwestern quadrant", + "mode": "NULLABLE" + }, + { + "name": "usa_r64_ne", + "type": "INTEGER", + "description": "– 64 kt wind radii maximum extent in northeastern quadrant", + "mode": "NULLABLE" + }, + { + "name": "usa_r64_se", + "type": "INTEGER", + "description": "64 kt wind radii maximum extent in southeastern quadrant", + "mode": "NULLABLE" + }, + { + "name": "usa_r64_sw", + "type": "INTEGER", + "description": "– 64 kt wind radii maximum extent in southwestern quadrant", + "mode": "NULLABLE" + }, + { + "name": "usa_r64_nw", + "type": "INTEGER", + "description": "64 kt wind radii maximum extent in northwestern quadrant", + "mode": "NULLABLE" + }, + { + "name": "usa_poci", + "type": "INTEGER", + "description": "pressure in millibars of the last closed isobar/n900 - 1050 mb NOT BEST-TRACKED (not reanalyzed)", + "mode": "NULLABLE" + }, + { + "name": "usa_roci", + "type": "INTEGER", + "description": "radius of the last closed isobar/n0 - 999 n mi. NOT BEST TRACKED (not reanalyzed)", + "mode": "NULLABLE" + }, + { + "name": "usa_rmw", + "type": "INTEGER", + "description": "radius of max winds/n0 - 999 n mi. 
NOT BEST TRACKED (not reanalyzed)", + "mode": "NULLABLE" + }, + { + "name": "usa_eye", + "type": "STRING", + "description": "eye diameter/n0 - 120 n mi. NOT BEST TRACKED (not reanalyzed)", + "mode": "NULLABLE" + }, + { + "name": "tokyo_latitude", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "tokyo_longitude", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "tokyo_grade", + "type": "INTEGER", + "description": " 1 : Not used 2 : Tropical Depression (TD) 3 : Tropical Storm (TS) 4 : Severe Tropical Storm (STS) 5 : Typhoon (TY) 6 : Extratropical Cyclone (L) 7 : Just entering into the responsible area of Japan Meteorological Agency (JMA) 8 : Not used 9 : Tropical Cyclone of TS intensity or higher", + "mode": "NULLABLE" + }, + { + "name": "tokyo_wind", + "type": "INTEGER", + "description": "Maximum sustained wind speed [10-min averaging period]", + "mode": "NULLABLE" + }, + { + "name": "tokyo_pressure", + "type": "INTEGER", + "description": "Central pressure", + "mode": "NULLABLE" + }, + { + "name": "tokyo_r50_dir", + "type": "INTEGER", + "description": "1 : Northeast (NE) 2 : East (E) 3 : Southeast (SE) 4 : South (S) 5 : Southwest (SW) 6 : West (W) 7 : Northwest (NW) 8 : North (N) 9 : (symmetric circle)", + "mode": "NULLABLE" + }, + { + "name": "tokyo_r50_longitude", + "type": "INTEGER", + "description": "The longest radius of 50kt winds or greater", + "mode": "NULLABLE" + }, + { + "name": "tokyo_r50_short", + "type": "INTEGER", + "description": "The shortest radius of 50kt winds or greater", + "mode": "NULLABLE" + }, + { + "name": "tokyo_r30_dir", + "type": "INTEGER", + "description": "1 : Northeast (NE) 2 : East (E) 3 : Southeast (SE) 4 : South (S) 5 : Southwest (SW) 6 : West (W) 7 : Northwest (NW) 8 : North (N) 9 : (symmetric circle)", + "mode": "NULLABLE" + }, + { + "name": "tokyo_r30_long", + "type": "INTEGER", + "description": "The longest radius of 30kt winds or greater", + "mode": "NULLABLE" + }, + { + "name": "tokyo_r30_short", + "type": "INTEGER", + "description": "The shortest radius of 30kt winds or greater", + "mode": "NULLABLE" + }, + { + "name": "tokyo_land", + "type": "INTEGER", + "description": " Landfall or passage over the Japanese islands occurred within one hour after the time of the analysis with this indicator.", + "mode": "NULLABLE" + }, + { + "name": "cma_latitude", + "type": "FLOAT", + "description": "Latitude from Chinese Meteorological Administration data from Shanghai Typhoon Institute", + "mode": "NULLABLE" + }, + { + "name": "cma_longitude", + "type": "FLOAT", + "description": "Longitude from Chinese Meteorological Administration data from Shanghai Typhoon Institute", + "mode": "NULLABLE" + }, + { + "name": "cma_cat", + "type": "INTEGER", + "description": "Intensity category according to the Chinese National Standard for Grade of Tropical Cyclones (which has been used since 15 June 2006): 0 –– Weaker than Tropical Depression or unknown intensity; 1 –– Tropical Depression (TD: 10.8–17.1 m/s); 2 –– Tropical Storm (TS:17.2–24.4 m/s); 3 –– Severe Tropical Storm (STS: 24.5–32.6 m/s); 4 –– Typhoon (TY: 32.7–41.4 m/s); 5 –– Severe Typhoon (STY: 41.5–50.9 m/s); 6 –– Super Typhoon (SuperTY: ≥51.0 m/s); 9 –– Extratropical Cyclone (ET) stage.", + "mode": "NULLABLE" + }, + { + "name": "cma_wind", + "type": "INTEGER", + "description": "Two-minute mean maximum sustained wind (MSW; m/s) near the TC center. 
WND = 9 indicates MSW < 10 m/s/nWND = 0 indicates unknown intensity", + "mode": "NULLABLE" + }, + { + "name": "cma_pressure", + "type": "INTEGER", + "description": "Minimum pressure (hPa) near the TC center.", + "mode": "NULLABLE" + }, + { + "name": "hko_latitude", + "type": "STRING", + "description": "Latitude from Hong Kong Observatory", + "mode": "NULLABLE" + }, + { + "name": "hko_longitude", + "type": "FLOAT", + "description": "Longitude from Hong Kong Observatory", + "mode": "NULLABLE" + }, + { + "name": "hko_cat", + "type": "STRING", + "description": "After 2009/nwe further classified two more storm types above typhoon/nso there are in total 7 storm types LW (Low) <22 kt TD (Tropical Depression) 22 – 33 kt TS (Tropical Storm) 34 – 47 kt STS (Severe Tropical Storm) 48 – 63 kt T (Typhoon) 64 – 80 kt ST (Severe Typhoon) 81 – 99 kt SuperT (Super Typhoon) >= 100 kt", + "mode": "NULLABLE" + }, + { + "name": "hko_wind", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "hko_pressure", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "newdelhi_latitude", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "newdelhi_longitude", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "newdelhi_grade", + "type": "STRING", + "description": "Types of disturbances: Low pressure area W<17 knots D - Depression 17<=W<28 DD - Deep Depression 28<=W<34 CS - Cyclonic Storm 34<=W<48 SCS - Severe Cyclonic Storm 48<=W<64 VSCS - Very Severe Cyclonic Storm 64<=W<120 SCS - Super Cyclonic Storm W>=120 knots", + "mode": "NULLABLE" + }, + { + "name": "newdelhi_wind", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "newdelhi_pressure", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "newdelhi_ci", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "newdelhi_dp", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "newdelhi_poci", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "reunion_latitude", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "reunion_longitude", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "reunion_type", + "type": "INTEGER", + "description": "01= tropics; disturbance ( no closed isobars) 02= <34 knot winds/n<17m/s winds and at least one closed isobar 03= 34-63 knots/n17-32m/s 04= >63 knots/n>32m/s 05= extratropical 06= dissipating 07= subtropical cyclone (nonfrontal/nlow pressure system that comprises initially baroclinic circulation developing over subtropical water) 08= overland 09= unknown", + "mode": "NULLABLE" + }, + { + "name": "reunion_wind", + "type": "INTEGER", + "description": "Maximum average wind speed", + "mode": "NULLABLE" + }, + { + "name": "reunion_pressure", + "type": "INTEGER", + "description": "Central pressure", + "mode": "NULLABLE" + }, + { + "name": "reunion_tnum", + "type": "FLOAT", + "description": "Dvorak T-number", + "mode": "NULLABLE" + }, + { + "name": "reunion_ci", + "type": "FLOAT", + "description": "Dvorak CI-number", + "mode": "NULLABLE" + }, + { + "name": "reunion_rmw", + "type": "INTEGER", + "description": "Radius of maximum winds", + "mode": "NULLABLE" + }, + { + "name": "reunion_r34_ne", + "type": "INTEGER", + "description": "34 kt wind radii maximum extent in northeastern quadrant", + "mode": "NULLABLE" + }, + { + 
"name": "reunion_r34_se", + "type": "INTEGER", + "description": "34 kt wind radii maximum extent in southeastern quadrant", + "mode": "NULLABLE" + }, + { + "name": "reunion_r34_sw", + "type": "INTEGER", + "description": "34 kt wind radii maximum extent in southwestern quadrant", + "mode": "NULLABLE" + }, + { + "name": "reunion_r34_nw", + "type": "INTEGER", + "description": "34 kt wind radii maximum extent in northwestern quadrant", + "mode": "NULLABLE" + }, + { + "name": "reunion_r50_ne", + "type": "INTEGER", + "description": "50 kt wind radii maximum extent in northeastern quadrant", + "mode": "NULLABLE" + }, + { + "name": "reunion_r50_se", + "type": "INTEGER", + "description": "50 kt wind radii maximum extent in southeastern quadrant", + "mode": "NULLABLE" + }, + { + "name": "reunion_r50_sw", + "type": "INTEGER", + "description": "50 kt wind radii maximum extent in southwestern quadrant", + "mode": "NULLABLE" + }, + { + "name": "reunion_r50_nw", + "type": "INTEGER", + "description": "50 kt wind radii maximum extent in northwestern quadrant", + "mode": "NULLABLE" + }, + { + "name": "reunion_r64_ne", + "type": "INTEGER", + "description": "64 kt wind radii maximum extent in northeastern quadrant", + "mode": "NULLABLE" + }, + { + "name": "reunion_r64_se", + "type": "INTEGER", + "description": "64 kt wind radii maximum extent in southeastern quadrant", + "mode": "NULLABLE" + }, + { + "name": "reunion_r64_sw", + "type": "INTEGER", + "description": "64 kt wind radii maximum extent in southwestern quadrant", + "mode": "NULLABLE" + }, + { + "name": "reunion_r64_nw", + "type": "INTEGER", + "description": "64 kt wind radii maximum extent in northwestern quadrant", + "mode": "NULLABLE" + }, + { + "name": "bom_latitude", + "type": "FLOAT", + "description": "Latitude from Australian Bureau of Meterology", + "mode": "NULLABLE" + }, + { + "name": "bom_longitude", + "type": "FLOAT", + "description": "Longitude from Australian Bureau of Meterology", + "mode": "NULLABLE" + }, + { + "name": "bom_type", + "type": "INTEGER", + "description": "This indicates the type of system that this cyclone was at the time of the observation. Note that cyclones can evolve during their lifetimes and hence change type mid-stream (e.g. Extratropical transition (ETT))", + "mode": "NULLABLE" + }, + { + "name": "bom_wind", + "type": "INTEGER", + "description": "This is the estimated maximum mean wind around the cyclone – that is in the vicinity of the centre.", + "mode": "NULLABLE" + }, + { + "name": "bom_pressure", + "type": "INTEGER", + "description": "Central pressure of the cyclone", + "mode": "NULLABLE" + }, + { + "name": "bom_tnum", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "bom_ci", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "bom_rmw", + "type": "INTEGER", + "description": "This is the mean radius (from the system centre) of the maximum mean wind.", + "mode": "NULLABLE" + }, + { + "name": "bom_r34_ne", + "type": "INTEGER", + "description": "This is the mean radius (from the system centre) of the extent of winds; gale-force (17m/s) or above. The four sectors show the mean extent in the respective quadrant centred on the cardinal point. Northeast quadrant", + "mode": "NULLABLE" + }, + { + "name": "bom_r34_se", + "type": "INTEGER", + "description": "This is the mean radius (from the system centre) of the extent of winds; gale-force (17m/s) or above. The four sectors show the mean extent in the respective quadrant centred on the cardinal point. 
Southeast quadrant", + "mode": "NULLABLE" + }, + { + "name": "bom_r34_sw", + "type": "INTEGER", + "description": "This is the mean radius (from the system centre) of the extent of winds; gale-force (17m/s) or above. The four sectors show the mean extent in the respective quadrant centred on the cardinal point. Southwest quadrant", + "mode": "NULLABLE" + }, + { + "name": "bom_r34_nw", + "type": "INTEGER", + "description": "This is the mean radius (from the system centre) of the extent of winds; gale-force (17m/s) or above. The four sectors show the mean extent in the respective quadrant centred on the cardinal point. Northwest quadrant", + "mode": "NULLABLE" + }, + { + "name": "bom_r50_ne", + "type": "INTEGER", + "description": "These are the mean radius (from the system centre) of the extent of winds; storm-force (25m/s) or above. Northeast quadrant.", + "mode": "NULLABLE" + }, + { + "name": "bom_r50_se", + "type": "INTEGER", + "description": "These are the mean radius (from the system centre) of the extent of winds; storm-force (25m/s) or above. Southeast quadrant.", + "mode": "NULLABLE" + }, + { + "name": "bom_r50_sw", + "type": "INTEGER", + "description": "These are the mean radius (from the system centre) of the extent of winds; storm-force (25m/s) or above. Southwest quadrant.", + "mode": "NULLABLE" + }, + { + "name": "bom_r50_nw", + "type": "INTEGER", + "description": "These are the mean radius (from the system centre) of the extent of winds; storm-force (25m/s) or above. Northwest quadrant.", + "mode": "NULLABLE" + }, + { + "name": "bom_r64_ne", + "type": "INTEGER", + "description": "These are the mean radius (from the system centre) of the extent of winds; hurricane-force (33m/s) or above. Northeast quadrant", + "mode": "NULLABLE" + }, + { + "name": "bom_r64_se", + "type": "INTEGER", + "description": "These are the mean radius (from the system centre) of the extent of winds; hurricane-force (33m/s) or above. Southeast quadrant", + "mode": "NULLABLE" + }, + { + "name": "bom_r64_sw", + "type": "INTEGER", + "description": "These are the mean radius (from the system centre) of the extent of winds; hurricane-force (33m/s) or above. Southwest quadrant", + "mode": "NULLABLE" + }, + { + "name": "bom_r64_nw", + "type": "INTEGER", + "description": "These are the mean radius (from the system centre) of the extent of winds; hurricane-force (33m/s) or above. Northwest quadrant", + "mode": "NULLABLE" + }, + { + "name": "bom_roci", + "type": "INTEGER", + "description": "The estimated mean radius of the outermost closed isobar (1-hPa spacing).", + "mode": "NULLABLE" + }, + { + "name": "bom_poci", + "type": "INTEGER", + "description": "Environmental pressure in which the cyclone is embedded", + "mode": "NULLABLE" + }, + { + "name": "bom_eye", + "type": "INTEGER", + "description": "Mean radius of the cyclone eye.", + "mode": "NULLABLE" + }, + { + "name": "bom_pos_method", + "type": "INTEGER", + "description": "This indicates the tools that were used to derive the centre location of the system. 
ADAM Code Method to derive position NULL Default - unknown 1 no sat/nno rad/nno obs 2 no sat/nno rad/nobs only 3 Sat IR/Vis; no clear eye 4 Sat IR/Vis; clearly defined eye 5 aircraft radar report 6 land-based radar report 7 Sat IR/Vis & rad & obs 8 report inside eye 10 Sat- Scatterometer 11 Sat- Microwave 12 Manned Aircraft Reconnaissance 13 UAV Aircraft Reconnaissance", + "mode": "NULLABLE" + }, + { + "name": "bom_pressure_method", + "type": "INTEGER", + "description": "This code may need to be expanded to handle new systems in the future/nand also to differentiate between pressure-wind relationships used to derive the central pressure. ADAM code Method WMO Code NULL Unknown or N/A 1 Aircraft or Dropsonde observation 1 2 Over water observation (e.g. buoy) 2 3 Over land observation 3 4 Instrument – unknown type 5 5 Derived Directly from DVORAK 4 6 Derived from wind via a P-W equation 5 7 Estimate from surrounding obs 5 8 Extrapolation from radar 5 9 Other 5", + "mode": "NULLABLE" + }, + { + "name": "wellington_latitude", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "wellington_longitude", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "wellington_wind", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "wellington_pressure", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "nadi_latitude", + "type": "FLOAT", + "description": "Latitude from Fiji Meteorological Service data from RSMC Hadi", + "mode": "NULLABLE" + }, + { + "name": "nadi_longitude", + "type": "FLOAT", + "description": "Longitude from Fiji Meteorological Service data from RSMC Hadi", + "mode": "NULLABLE" + }, + { + "name": "nadi_cat", + "type": "INTEGER", + "description": "Nadi assigned category", + "mode": "NULLABLE" + }, + { + "name": "nadi_wind", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "nadi_pressure", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "ds824_latitude", + "type": "FLOAT", + "description": "Latitude from DataSet 824 - A historic dataset with data from the 1800s through 1980(ish)", + "mode": "NULLABLE" + }, + { + "name": "ds824_longitude", + "type": "FLOAT", + "description": "Longitude from DataSet 824 - A historic dataset with data from the 1800s through 1980(ish)", + "mode": "NULLABLE" + }, + { + "name": "ds824_stage", + "type": "STRING", + "description": "TC - Tropical cyclone", + "mode": "NULLABLE" + }, + { + "name": "ds824_wind", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "ds824_pressure", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "td9636_latitude", + "type": "FLOAT", + "description": "Latitude from Dataset of a collection of global storms (1842-1980)", + "mode": "NULLABLE" + }, + { + "name": "td9636_longitude", + "type": "FLOAT", + "description": "Longitude from Dataset of a collection of global storms (1842-1980)", + "mode": "NULLABLE" + }, + { + "name": "td9636_stage", + "type": "INTEGER", + "description": "This field gives an estimate of the highest winds occurring in the storm at the time and location indicated. The entire storm was coded as to the highest stage reached for some of the earlier years. 
0 - Tropical disturbance (1969 onward) 1 - depression < 34 [some variation in definition for S Indian] 2 - Storm 34-63 [with some variation in definition for S Indian] 3 - point where wind reached 64 knots [except N Indian where it is wind 43-47 knots] 4 - Hurricane > 64 [except in N Indian/nWind > 48] 5 - Extratropical 6 - Dissipating 7 - Unknown Intensity or doubtful track", + "mode": "NULLABLE" + }, + { + "name": "td9636_wind", + "type": "INTEGER", + "description": "Estimated highest wind speed at the time indicated. These estimates are subjective and must be interpreted with caution.", + "mode": "NULLABLE" + }, + { + "name": "td9636_pressure", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "td9635_latitude", + "type": "FLOAT", + "description": "Latitude from Dataset of a collection of western Pacific Storms (~1945-1976)", + "mode": "NULLABLE" + }, + { + "name": "td9635_longitude", + "type": "FLOAT", + "description": "Longitude from Dataset of a collection of western Pacific Storms (~1945-1976)", + "mode": "NULLABLE" + }, + { + "name": "td9635_wind", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "td9635_pressure", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "td9635_roci", + "type": "INTEGER", + "description": "Size. (Radius of system)", + "mode": "NULLABLE" + }, + { + "name": "neumann_latitude", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "neumann_longitude", + "type": "FLOAT", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "neumann_class", + "type": "STRING", + "description": "EX - Extratropical TC - Tropical MM - Missing", + "mode": "NULLABLE" + }, + { + "name": "neumann_wind", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "neumann_pressure", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "mlc_latitude", + "type": "FLOAT", + "description": "Latitude from Mike Chenoweth data with updated data for the North Atlantic for the 1800s.", + "mode": "NULLABLE" + }, + { + "name": "mlc_longitude", + "type": "FLOAT", + "description": "Longitude from Mike Chenoweth data with updated data for the North Atlantic for the 1800s.", + "mode": "NULLABLE" + }, + { + "name": "mlc_class", + "type": "STRING", + "description": "Storm classification EX - Extratropical HU - Hurricane LO - Low MH SD - Subtropical depression SS - Subtropical storm TD - Tropical Depression TS - Tropical Storm TW WV - Open Wave", + "mode": "NULLABLE" + }, + { + "name": "mlc_wind", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "mlc_pressure", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "usa_atcf_id", + "type": "STRING", + "description": "The ATCF ID is assigned by US agencies and can be used to comparethe storm with other US cyclone-related datasets. If two (or more) ATCF tracks make up one storm/nthen the IDs are separated by a colon. The format of the ATCF ID is B where bb is the basin ID/nnn is the number of the storm in that basin and yyyy is the year. 
Possible basin values are: AL: North Atlantic/nSL: South Atlantic/nEP: East Pacific/nWP: West Pacific/nSH: Southern Hemisphere/nIO: North Indian For the provisional data/nother basin identifiers were provided that include: CP: Central Pacific/nSP: South Pacific/nSI: South Indian/nAS: Arabian Sea (North Indian) and BB: Bay of Bengal (North Indian)", + "mode": "NULLABLE" + }, + { + "name": "source_url", + "type": "STRING", + "description": "Source ", + "mode": "NULLABLE" + }, + { + "name": "etl_timestamp", + "type": "TIMESTAMP", + "description": "Load time for this data row", + "mode": "NULLABLE" + } +] diff --git a/datasets/noaa/pipelines/_images/ghcnd_inventory_schema.json b/datasets/noaa/pipelines/_images/ghcnd_inventory_schema.json new file mode 100644 index 000000000..8bd38ade2 --- /dev/null +++ b/datasets/noaa/pipelines/_images/ghcnd_inventory_schema.json @@ -0,0 +1,44 @@ +[ + { + "name": "id", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "mode": "NULLABLE" + }, + { + "name": "element", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "firstyear", + "type": "INTEGER", + "mode": "NULLABLE" + }, + { + "name": "lastyear", + "type": "INTEGER", + "mode": "NULLABLE" + }, + { + "name": "source_url", + "type": "STRING", + "description": "Source ", + "mode": "NULLABLE" + }, + { + "name": "etl_timestamp", + "type": "TIMESTAMP", + "description": "Load time for this data row", + "mode": "NULLABLE" + } +] diff --git a/datasets/noaa/pipelines/_images/ghcnd_states_schema.json b/datasets/noaa/pipelines/_images/ghcnd_states_schema.json new file mode 100644 index 000000000..480843c19 --- /dev/null +++ b/datasets/noaa/pipelines/_images/ghcnd_states_schema.json @@ -0,0 +1,24 @@ +[ + { + "name": "code", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "name", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "source_url", + "type": "STRING", + "description": "Source ", + "mode": "NULLABLE" + }, + { + "name": "etl_timestamp", + "type": "TIMESTAMP", + "description": "Load time for this data row", + "mode": "NULLABLE" + } +] diff --git a/datasets/noaa/pipelines/_images/ghcnd_stations_schema.json b/datasets/noaa/pipelines/_images/ghcnd_stations_schema.json new file mode 100644 index 000000000..f8068f0f4 --- /dev/null +++ b/datasets/noaa/pipelines/_images/ghcnd_stations_schema.json @@ -0,0 +1,59 @@ +[ + { + "name": "id", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "latitude", + "type": "FLOAT", + "mode": "NULLABLE" + }, + { + "name": "longitude", + "type": "FLOAT", + "mode": "NULLABLE" + }, + { + "name": "elevation", + "type": "FLOAT", + "mode": "NULLABLE" + }, + { + "name": "state", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "name", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "gsn_flag", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "hcn_cm_flag", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "wmoid", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "source_url", + "type": "STRING", + "description": "Source ", + "mode": "NULLABLE" + }, + { + "name": "etl_timestamp", + "type": "TIMESTAMP", + "description": "Load time for this data row", + "mode": "NULLABLE" + } +] diff --git a/datasets/noaa/pipelines/_images/gsod_stations_schema.json b/datasets/noaa/pipelines/_images/gsod_stations_schema.json new file mode 100644 index 000000000..e4f230e67 --- /dev/null +++ 
b/datasets/noaa/pipelines/_images/gsod_stations_schema.json @@ -0,0 +1,69 @@ +[ + { + "name": "usaf", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "wban", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "name", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "country", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "state", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "call", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "lat", + "type": "FLOAT", + "mode": "NULLABLE" + }, + { + "name": "lon", + "type": "FLOAT", + "mode": "NULLABLE" + }, + { + "name": "elev", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "begin", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "end", + "type": "STRING", + "mode": "NULLABLE" + }, + { + "name": "source_url", + "type": "STRING", + "description": "Source ", + "mode": "NULLABLE" + }, + { + "name": "etl_timestamp", + "type": "TIMESTAMP", + "description": "Load time for this data row", + "mode": "NULLABLE" + } +] diff --git a/datasets/noaa/pipelines/_images/noaa_lightning_strikes_schema.json b/datasets/noaa/pipelines/_images/noaa_lightning_strikes_schema.json new file mode 100644 index 000000000..da465b9c1 --- /dev/null +++ b/datasets/noaa/pipelines/_images/noaa_lightning_strikes_schema.json @@ -0,0 +1,32 @@ +[ + { + "name": "date", + "type": "TIMESTAMP", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "number_of_strikes", + "type": "INTEGER", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "center_point_geom", + "type": "GEOGRAPHY", + "description": "", + "mode": "NULLABLE" + }, + { + "name": "source_url", + "type": "STRING", + "description": "Source ", + "mode": "NULLABLE" + }, + { + "name": "etl_timestamp", + "type": "TIMESTAMP", + "description": "Load time for this data row", + "mode": "NULLABLE" + } +] diff --git a/datasets/noaa/pipelines/_images/run_csv_transform_kub_gsod_stations/Dockerfile b/datasets/noaa/pipelines/_images/run_csv_transform_kub/Dockerfile similarity index 54% rename from datasets/noaa/pipelines/_images/run_csv_transform_kub_gsod_stations/Dockerfile rename to datasets/noaa/pipelines/_images/run_csv_transform_kub/Dockerfile index 85af90570..748bc3bec 100644 --- a/datasets/noaa/pipelines/_images/run_csv_transform_kub_gsod_stations/Dockerfile +++ b/datasets/noaa/pipelines/_images/run_csv_transform_kub/Dockerfile @@ -12,27 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -# The base image for this build -# FROM gcr.io/google.com/cloudsdktool/cloud-sdk:slim FROM python:3.8 - -# Allow statements and log messages to appear in Cloud logs ENV PYTHONUNBUFFERED True - -# Copy the requirements file into the image COPY requirements.txt ./ - -# Install the packages specified in the requirements file RUN python3 -m pip install --no-cache-dir -r requirements.txt - -# The WORKDIR instruction sets the working directory for any RUN, CMD, -# ENTRYPOINT, COPY and ADD instructions that follow it in the Dockerfile. -# If the WORKDIR doesn’t exist, it will be created even if it’s not used in -# any subsequent Dockerfile instruction WORKDIR /custom - -# Copy the specific data processing script/s in the image under /custom/* COPY ./csv_transform.py . 
- -# Command to run the data processing script when the container is run CMD ["python3", "csv_transform.py"] diff --git a/datasets/noaa/pipelines/_images/run_csv_transform_kub/csv_transform.py b/datasets/noaa/pipelines/_images/run_csv_transform_kub/csv_transform.py new file mode 100644 index 000000000..852be8a07 --- /dev/null +++ b/datasets/noaa/pipelines/_images/run_csv_transform_kub/csv_transform.py @@ -0,0 +1,1034 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import datetime +import ftplib +import gzip +import json +import logging +import os +import pathlib +import re +import time +import typing +from urllib.request import Request, urlopen + +import pandas as pd +import requests +from bs4 import BeautifulSoup +from google.api_core.exceptions import NotFound +from google.cloud import bigquery, storage + + +def main( + pipeline_name: str, + source_url: str, + source_file: pathlib.Path, + target_file: pathlib.Path, + chunksize: str, + ftp_host: str, + ftp_dir: str, + project_id: str, + dataset_id: str, + table_id: str, + target_gcs_bucket: str, + target_gcs_path: str, + schema_path: str, + drop_dest_table: str, + input_field_delimiter: str, + ftp_batch_size: str, + ftp_batch_sleep_time: str, + full_data_load: str, + start_year: str, + input_csv_headers: typing.List[str], + data_dtypes: dict, + reorder_headers_list: typing.List[str], + null_rows_list: typing.List[str], + date_format_list: typing.List[str], + slice_column_list: dict, + regex_list: dict, + rename_headers_list: dict, + remove_source_file: str, + delete_target_file: str, + number_of_header_rows: str, + int_date_list: typing.List[str], + gen_location_list: dict, +) -> None: + logging.info(f"{pipeline_name} process started") + pathlib.Path("./files").mkdir(parents=True, exist_ok=True) + execute_pipeline( + pipeline_name=pipeline_name, + source_url=source_url, + source_file=source_file, + target_file=target_file, + chunksize=chunksize, + ftp_host=ftp_host, + ftp_dir=ftp_dir, + project_id=project_id, + dataset_id=dataset_id, + destination_table=table_id, + target_gcs_bucket=target_gcs_bucket, + target_gcs_path=target_gcs_path, + schema_path=schema_path, + drop_dest_table=drop_dest_table, + input_field_delimiter=input_field_delimiter, + ftp_batch_size=ftp_batch_size, + ftp_batch_sleep_time=ftp_batch_sleep_time, + full_data_load=full_data_load, + start_year=start_year, + input_csv_headers=input_csv_headers, + data_dtypes=data_dtypes, + reorder_headers_list=reorder_headers_list, + null_rows_list=null_rows_list, + date_format_list=date_format_list, + slice_column_list=slice_column_list, + regex_list=regex_list, + rename_headers_list=rename_headers_list, + remove_source_file=(remove_source_file == "Y"), + delete_target_file=(delete_target_file == "Y"), + number_of_header_rows=int(number_of_header_rows), + int_date_list=int_date_list, + gen_location_list=gen_location_list, + ) + logging.info(f"{pipeline_name} process completed") + + +def execute_pipeline( + pipeline_name: str, + source_url: 
str, + source_file: pathlib.Path, + target_file: pathlib.Path, + chunksize: str, + ftp_host: str, + ftp_dir: str, + project_id: str, + dataset_id: str, + destination_table: str, + target_gcs_bucket: str, + target_gcs_path: str, + schema_path: str, + drop_dest_table: str, + input_field_delimiter: str, + ftp_batch_size: str, + ftp_batch_sleep_time: str, + full_data_load: str, + start_year: str, + input_csv_headers: typing.List[str], + data_dtypes: dict, + reorder_headers_list: typing.List[str], + null_rows_list: typing.List[str], + date_format_list: typing.List[str], + slice_column_list: dict, + regex_list: dict, + remove_source_file: bool, + rename_headers_list: dict, + delete_target_file: bool, + number_of_header_rows: int, + int_date_list: typing.List[str], + gen_location_list: dict, +) -> None: + if pipeline_name == "GHCND by year": + if full_data_load == "N": + start = str(datetime.datetime.now().year - 6) + else: + start = start_year + ftp_batch = 1 + for yr in range(int(start), datetime.datetime.now().year + 1): + yr_str = str(yr) + source_zipfile = str.replace(str(source_file), ".csv", f"_{yr_str}.csv.gz") + source_file_unzipped = str.replace(str(source_zipfile), ".csv.gz", ".csv") + target_file_year = str.replace(str(target_file), ".csv", f"_{yr_str}.csv") + destination_table_year = f"{destination_table}_{yr_str}" + source_url_year = str.replace(source_url, ".csv.gz", f"{yr_str}.csv.gz") + target_gcs_path_year = str.replace( + target_gcs_path, ".csv", f"_{yr_str}.csv" + ) + if ftp_batch == int(ftp_batch_size): + logging.info("Sleeping...") + time.sleep(int(ftp_batch_sleep_time)) + ftp_batch = 1 + else: + ftp_batch += 1 + download_file_ftp( + ftp_host=ftp_host, + ftp_dir=ftp_dir, + ftp_filename=f"{yr_str}.csv.gz", + local_file=source_zipfile, + source_url=source_url_year, + ) + gz_decompress( + infile=source_zipfile, tofile=source_file_unzipped, delete_zipfile=True + ) + process_and_load_table( + source_file=source_file_unzipped, + target_file=target_file_year, + pipeline_name=pipeline_name, + source_url=source_url_year, + chunksize=chunksize, + project_id=project_id, + dataset_id=dataset_id, + destination_table=destination_table_year, + target_gcs_bucket=target_gcs_bucket, + target_gcs_path=target_gcs_path_year, + schema_path=schema_path, + drop_dest_table=drop_dest_table, + input_field_delimiter=input_field_delimiter, + input_csv_headers=input_csv_headers, + data_dtypes=data_dtypes, + reorder_headers_list=reorder_headers_list, + null_rows_list=null_rows_list, + date_format_list=date_format_list, + slice_column_list=slice_column_list, + regex_list=regex_list, + rename_headers_list=rename_headers_list, + remove_source_file=remove_source_file, + delete_target_file=delete_target_file, + int_date_list=int_date_list, + gen_location_list=gen_location_list, + ) + if pipeline_name in [ + "GHCND countries", + "GHCND inventory", + "GHCND states", + "GHCND stations", + "GSOD stations", + ]: + ftp_filename = os.path.split(source_url)[1] + download_file_ftp(ftp_host, ftp_dir, ftp_filename, source_file, source_url) + if number_of_header_rows > 0: + remove_header_rows(source_file, number_of_header_rows=number_of_header_rows) + else: + pass + process_and_load_table( + source_file=source_file, + target_file=target_file, + pipeline_name=pipeline_name, + source_url=source_url, + chunksize=chunksize, + project_id=project_id, + dataset_id=dataset_id, + destination_table=destination_table, + target_gcs_bucket=target_gcs_bucket, + target_gcs_path=target_gcs_path, + schema_path=schema_path, + 
drop_dest_table=drop_dest_table, + input_field_delimiter=input_field_delimiter, + input_csv_headers=input_csv_headers, + data_dtypes=data_dtypes, + reorder_headers_list=reorder_headers_list, + null_rows_list=null_rows_list, + date_format_list=date_format_list, + slice_column_list=slice_column_list, + regex_list=regex_list, + rename_headers_list=rename_headers_list, + remove_source_file=remove_source_file, + delete_target_file=delete_target_file, + int_date_list=int_date_list, + gen_location_list=gen_location_list, + ) + if pipeline_name == "GHCND hurricanes": + download_file(source_url, source_file) + if number_of_header_rows > 0: + remove_header_rows(source_file, number_of_header_rows=number_of_header_rows) + else: + pass + process_and_load_table( + source_file=source_file, + target_file=target_file, + pipeline_name=pipeline_name, + source_url=source_url, + chunksize=chunksize, + project_id=project_id, + dataset_id=dataset_id, + destination_table=destination_table, + target_gcs_bucket=target_gcs_bucket, + target_gcs_path=target_gcs_path, + schema_path=schema_path, + drop_dest_table=drop_dest_table, + input_field_delimiter=input_field_delimiter, + input_csv_headers=input_csv_headers, + data_dtypes=data_dtypes, + reorder_headers_list=reorder_headers_list, + null_rows_list=null_rows_list, + date_format_list=date_format_list, + slice_column_list=slice_column_list, + regex_list=regex_list, + rename_headers_list=rename_headers_list, + remove_source_file=remove_source_file, + delete_target_file=delete_target_file, + int_date_list=int_date_list, + gen_location_list=gen_location_list, + ) + if pipeline_name == "NOAA lightning strikes by year": + url_path = os.path.split(source_url)[0] + file_pattern = str.split(os.path.split(source_url)[1], "*")[0] + url_list = url_directory_list(f"{url_path}/", file_pattern) + if full_data_load == "N": + start = datetime.datetime.now().year - 6 + else: + start = int(start_year) + for yr in range(start, datetime.datetime.now().year): + for url in url_list: + url_file_name = os.path.split(url)[1] + if str(url_file_name).find(f"{file_pattern}{yr}") >= 0: + source_file_path = os.path.split(source_file)[0] + source_file_zipped = f"{source_file_path}/{url_file_name}" + source_file_year = str.replace( + str(source_file), ".csv", f"_{yr}.csv" + ) + target_file_year = str.replace( + str(target_file), ".csv", f"_{yr}.csv" + ) + download_file(url, source_file_zipped) + gz_decompress( + infile=source_file_zipped, + tofile=source_file_year, + delete_zipfile=True, + ) + if number_of_header_rows > 0: + remove_header_rows( + source_file_year, + number_of_header_rows=number_of_header_rows, + ) + else: + pass + if not full_data_load: + delete_source_file_data_from_bq( + project_id=project_id, + dataset_id=dataset_id, + table_id=destination_table, + source_url=url, + ) + process_and_load_table( + source_file=source_file_year, + target_file=target_file_year, + pipeline_name=pipeline_name, + source_url=url, + chunksize=chunksize, + project_id=project_id, + dataset_id=dataset_id, + destination_table=destination_table, + target_gcs_bucket=target_gcs_bucket, + target_gcs_path=target_gcs_path, + schema_path=schema_path, + drop_dest_table=drop_dest_table, + input_field_delimiter=input_field_delimiter, + input_csv_headers=input_csv_headers, + data_dtypes=data_dtypes, + reorder_headers_list=reorder_headers_list, + null_rows_list=null_rows_list, + date_format_list=date_format_list, + slice_column_list=slice_column_list, + regex_list=regex_list, + rename_headers_list=rename_headers_list, + 
remove_source_file=remove_source_file, + delete_target_file=delete_target_file, + int_date_list=int_date_list, + gen_location_list=gen_location_list, + ) + + +def process_and_load_table( + source_file: str, + target_file: str, + pipeline_name: str, + source_url: str, + chunksize: str, + project_id: str, + dataset_id: str, + destination_table: str, + target_gcs_bucket: str, + target_gcs_path: str, + schema_path: str, + drop_dest_table: str, + input_field_delimiter: str, + input_csv_headers: typing.List[str], + data_dtypes: dict, + reorder_headers_list: typing.List[str], + null_rows_list: typing.List[str], + date_format_list: typing.List[str], + slice_column_list: dict, + regex_list: dict, + rename_headers_list: dict, + remove_source_file: bool, + delete_target_file: bool, + int_date_list: typing.List[str], + gen_location_list: dict, + encoding: str = "utf-8", +) -> None: + process_source_file( + source_url=source_url, + source_file=source_file, + pipeline_name=pipeline_name, + chunksize=chunksize, + input_csv_headers=input_csv_headers, + data_dtypes=data_dtypes, + target_file=target_file, + reorder_headers_list=reorder_headers_list, + null_rows_list=null_rows_list, + date_format_list=date_format_list, + input_field_delimiter=input_field_delimiter, + slice_column_list=slice_column_list, + regex_list=regex_list, + rename_headers_list=rename_headers_list, + remove_source_file=remove_source_file, + int_date_list=int_date_list, + gen_location_list=gen_location_list, + encoding=encoding, + ) + if os.path.exists(target_file): + upload_file_to_gcs( + file_path=target_file, + target_gcs_bucket=target_gcs_bucket, + target_gcs_path=target_gcs_path, + ) + if drop_dest_table == "Y": + drop_table = True + else: + drop_table = False + table_exists = create_dest_table( + project_id=project_id, + dataset_id=dataset_id, + table_id=destination_table, + schema_filepath=schema_path, + bucket_name=target_gcs_bucket, + drop_table=drop_table, + ) + if table_exists: + load_data_to_bq( + project_id=project_id, + dataset_id=dataset_id, + table_id=destination_table, + file_path=target_file, + truncate_table=True, + field_delimiter="|", + ) + else: + error_msg = f"Error: Data was not loaded because the destination table {project_id}.{dataset_id}.{destination_table} does not exist and/or could not be created." + raise ValueError(error_msg) + if delete_target_file: + logging.info(f"Removing target file {target_file}") + os.remove(target_file) + else: + logging.info( + f"Informational: The data file {target_file} was not generated because no data file was available. Continuing." 
+ ) + + +def process_source_file( + source_file: str, + chunksize: str, + input_csv_headers: str, + pipeline_name: str, + data_dtypes: str, + source_url: str, + target_file: str, + reorder_headers_list: typing.List[str], + null_rows_list: typing.List[str], + date_format_list: typing.List[str], + input_field_delimiter: str, + slice_column_list: dict, + regex_list: dict, + rename_headers_list: dict, + int_date_list: typing.List[str], + gen_location_list: dict, + encoding: str = "utf8", + remove_source_file: bool = False, +) -> None: + logging.info(f"Opening source file {source_file}") + csv.field_size_limit(512 << 10) + csv.register_dialect( + "TabDialect", quotechar='"', delimiter=input_field_delimiter, strict=True + ) + with open(source_file, encoding=encoding, mode="r") as reader: + data = [] + chunk_number = 1 + for index, line in enumerate( + csv.reader((line.replace("\0", "") for line in reader), "TabDialect"), 0 + ): + data.append(line) + if index % int(chunksize) == 0 and index > 0: + process_dataframe_chunk( + data=data, + pipeline_name=pipeline_name, + input_csv_headers=input_csv_headers, + data_dtypes=data_dtypes, + source_url=source_url, + target_file=target_file, + chunk_number=chunk_number, + reorder_headers_list=reorder_headers_list, + date_format_list=date_format_list, + null_rows_list=null_rows_list, + slice_column_list=slice_column_list, + regex_list=regex_list, + rename_headers_list=rename_headers_list, + int_date_list=int_date_list, + gen_location_list=gen_location_list, + ) + data = [] + chunk_number += 1 + + if data: + process_dataframe_chunk( + data=data, + pipeline_name=pipeline_name, + input_csv_headers=input_csv_headers, + data_dtypes=data_dtypes, + source_url=source_url, + target_file=target_file, + chunk_number=chunk_number, + reorder_headers_list=reorder_headers_list, + date_format_list=date_format_list, + null_rows_list=null_rows_list, + slice_column_list=slice_column_list, + regex_list=regex_list, + rename_headers_list=rename_headers_list, + int_date_list=int_date_list, + gen_location_list=gen_location_list, + ) + if remove_source_file: + os.remove(source_file) + + +def process_dataframe_chunk( + data: typing.List[str], + pipeline_name: str, + input_csv_headers: typing.List[str], + data_dtypes: dict, + source_url: str, + target_file: str, + chunk_number: int, + reorder_headers_list: typing.List[str], + date_format_list: typing.List[str], + null_rows_list: typing.List[str], + slice_column_list: dict, + regex_list: dict, + rename_headers_list: dict, + int_date_list: typing.List[str], + gen_location_list: dict, +) -> None: + logging.info(f"Processing chunk #{chunk_number}") + df = pd.DataFrame(data, columns=input_csv_headers) + set_df_datatypes(df, data_dtypes) + target_file_batch = str(target_file).replace( + ".csv", "-" + str(chunk_number) + ".csv" + ) + process_chunk( + df=df, + source_url=source_url, + target_file_batch=target_file_batch, + target_file=target_file, + skip_header=(not chunk_number == 1), + pipeline_name=pipeline_name, + reorder_headers_list=reorder_headers_list, + date_format_list=date_format_list, + null_rows_list=null_rows_list, + slice_column_list=slice_column_list, + regex_list=regex_list, + rename_headers_list=rename_headers_list, + int_date_list=int_date_list, + gen_location_list=gen_location_list, + ) + + +def set_df_datatypes(df: pd.DataFrame, data_dtypes: dict) -> pd.DataFrame: + logging.info("Setting data types") + for key, item in data_dtypes.items(): + df[key] = df[key].astype(item) + return df + + +def process_chunk( + df: 
pd.DataFrame, + source_url: str, + target_file_batch: str, + target_file: str, + skip_header: bool, + pipeline_name: str, + reorder_headers_list: dict, + null_rows_list: typing.List[str], + date_format_list: typing.List[str], + slice_column_list: dict, + regex_list: dict, + rename_headers_list: dict, + int_date_list: typing.List[str], + gen_location_list: dict, +) -> None: + if pipeline_name == "GHCND by year": + df = filter_null_rows(df, null_rows_list=null_rows_list) + df = add_metadata_cols(df, source_url=source_url) + df = source_convert_date_formats(df, date_format_list=date_format_list) + df = reorder_headers(df, reorder_headers_list=reorder_headers_list) + if pipeline_name in [ + "GHCND countries", + "GHCND inventory", + "GHCND states", + "GHCND stations", + ]: + df = slice_column(df, slice_column_list) + df = add_metadata_cols(df, source_url=source_url) + df = reorder_headers(df, reorder_headers_list=reorder_headers_list) + if pipeline_name == "GSOD stations": + df = slice_column(df, slice_column_list) + df = filter_null_rows(df, null_rows_list=null_rows_list) + df = add_metadata_cols(df, source_url=source_url) + df = reorder_headers(df, reorder_headers_list=reorder_headers_list) + df["lat"] = df["lat"].astype(str) + df["lon"] = df["lon"].astype(str) + df = apply_regex(df, regex_list) + if pipeline_name == "GHCND hurricanes": + df.columns = df.columns.str.lower() + df = rename_headers(df, rename_headers_list=rename_headers_list) + df = add_metadata_cols(df, source_url=source_url) + df = reorder_headers(df, reorder_headers_list=reorder_headers_list) + if pipeline_name == "NOAA lightning strikes by year": + df.columns = df.columns.str.lower() + df = rename_headers(df, rename_headers_list=rename_headers_list) + df = convert_date_from_int(df, int_date_list=int_date_list) + df = generate_location(df, gen_location_list=gen_location_list) + df = add_metadata_cols(df, source_url=source_url) + df = reorder_headers(df, reorder_headers_list=reorder_headers_list) + save_to_new_file(df, file_path=str(target_file_batch)) + append_batch_file(target_file_batch, target_file, skip_header, not (skip_header)) + + +def convert_date_from_int(df: pd.DataFrame, int_date_list: dict) -> pd.DataFrame: + logging.info("Converting dates from integers") + for key, values in int_date_list.items(): + dt_col = key + dt_int_col = values + df[dt_col] = ( + pd.to_datetime( + (df[dt_int_col][:].astype("string") + "000000"), "raise", False, True + ).astype("string") + + " 00:00:00" + ) + return df + + +def generate_location(df: pd.DataFrame, gen_location_list: dict) -> pd.DataFrame: + logging.info("Generating location data") + for key, values in gen_location_list.items(): + df[key] = df[[values[0], values[1]]].apply( + lambda x: f"POINT({x[0]} {x[1]})", axis=1 + ) + return df + + +def url_directory_list( + source_url_path: str, file_pattern: str = "" +) -> typing.List[str]: + rtn_list = [] + url = source_url_path.replace(" ", "%20") + req = Request(url) + a = urlopen(req).read() + soup = BeautifulSoup(a, "html.parser") + x = soup.find_all("a") + for i in x: + file_name = i.extract().get_text() + url_new = url + file_name + url_new = url_new.replace(" ", "%20") + if file_pattern == "": + rtn_list.append(url_new) + else: + if re.search("" + file_pattern, file_name): + rtn_list.append(url_new) + else: + pass + return rtn_list + + +def rename_headers(df: pd.DataFrame, rename_headers_list: dict) -> pd.DataFrame: + df.rename(columns=rename_headers_list, inplace=True) + return df + + +def remove_header_rows(source_file: str, 
number_of_header_rows: int) -> None: + logging.info(f"Removing header from {source_file}") + os.system(f"sed -i '1,{number_of_header_rows}d' {source_file} ") + + +def add_metadata_cols(df: pd.DataFrame, source_url: str) -> pd.DataFrame: + logging.info("Adding metadata columns") + df["source_url"] = source_url + df["etl_timestamp"] = pd.to_datetime( + datetime.datetime.now(), format="%Y-%m-%d %H:%M:%S", infer_datetime_format=True + ) + return df + + +def reorder_headers( + df: pd.DataFrame, reorder_headers_list: typing.List[str] +) -> pd.DataFrame: + logging.info("Reordering headers..") + return df[reorder_headers_list] + + +def gz_decompress(infile: str, tofile: str, delete_zipfile: bool = False) -> None: + logging.info(f"Decompressing {infile}") + with open(infile, "rb") as inf, open(tofile, "w", encoding="utf8") as tof: + decom_str = gzip.decompress(inf.read()).decode("utf-8") + tof.write(decom_str) + if delete_zipfile: + os.remove(infile) + + +def filter_null_rows( + df: pd.DataFrame, null_rows_list: typing.List[str] +) -> pd.DataFrame: + logging.info("Removing rows with blank id's..") + for fld in null_rows_list: + df = df[df[fld] != ""] + return df + + +def convert_dt_format(dt_str: str) -> str: + if not dt_str or dt_str.lower() == "nan": + return dt_str + else: + return str( + datetime.datetime.strptime(dt_str, "%Y%m%d").date().strftime("%Y-%m-%d") + ) + + +def source_convert_date_formats( + df: pd.DataFrame, date_format_list: typing.List[str] +) -> pd.DataFrame: + logging.info("Converting Date Format..") + for fld in date_format_list: + df[fld] = df[fld].apply(convert_dt_format) + return df + + +def slice_column( + df: pd.DataFrame, slice_column_list: dict, pipeline_name: str = "" +) -> pd.DataFrame: + logging.info("Extracting column data..") + for key, values in slice_column_list.items(): + src_col = values[0] + dest_col = key + start_pos = values[1] + end_pos = values[2] + if pipeline_name == "GHCND states": + if dest_col == "name": + # Work-around for Alabama - bad data + df[dest_col] = df[src_col].apply( + lambda x: "ALABAMA" + if str(x)[0:2] == "AL" + else str(x)[int(start_pos) :].strip() + ) + else: + if end_pos == "": + df[dest_col] = df[src_col].apply( + lambda x: str(x)[int(start_pos) :].strip() + ) + else: + df[dest_col] = df[src_col].apply( + lambda x: str(x)[int(start_pos) : int(end_pos)].strip() + ) + else: + if end_pos == "": + df[dest_col] = df[src_col].apply( + lambda x: str(x)[int(start_pos) :].strip() + ) + else: + df[dest_col] = df[src_col].apply( + lambda x: str(x)[int(start_pos) : int(end_pos)].strip() + ) + return df + + +def get_column_country_code(col_val: str) -> str: + return col_val.strip().split(" ")[0] + + +def get_column_country_name(col_val: str) -> str: + len_code = len(str.split(str.strip(col_val), " ")[0]) + strmain1 = str.strip(col_val) + len_main = len(str.strip(col_val)) + len_out = len_main - len_code + return str.strip((strmain1[::-1])[0:(len_out)][::-1]) + + +def apply_regex(df: pd.DataFrame, regex_list: dict) -> pd.DataFrame: + logging.info("Applying RegEx") + for key, values in regex_list.items(): + regex_expr = values[0] + replace_expr = values[1] + isregex = values[2] == "True" + df[key][:].replace(regex_expr, replace_expr, regex=isregex, inplace=True) + return df + + +def load_data_to_bq( + project_id: str, + dataset_id: str, + table_id: str, + file_path: str, + truncate_table: bool, + field_delimiter: str = "|", +) -> None: + logging.info( + f"Loading data from {file_path} into {project_id}.{dataset_id}.{table_id} started" + ) + client = 
bigquery.Client(project=project_id) + table_ref = client.dataset(dataset_id).table(table_id) + job_config = bigquery.LoadJobConfig() + job_config.source_format = bigquery.SourceFormat.CSV + job_config.field_delimiter = field_delimiter + if truncate_table: + job_config.write_disposition = "WRITE_TRUNCATE" + else: + job_config.write_disposition = "WRITE_APPEND" + job_config.skip_leading_rows = 1 # ignore the header + job_config.autodetect = False + with open(file_path, "rb") as source_file: + job = client.load_table_from_file(source_file, table_ref, job_config=job_config) + job.result() + logging.info( + f"Loading data from {file_path} into {project_id}.{dataset_id}.{table_id} completed" + ) + + +def create_dest_table( + project_id: str, + dataset_id: str, + table_id: str, + schema_filepath: list, + bucket_name: str, + drop_table: bool = False, +) -> bool: + table_ref = f"{project_id}.{dataset_id}.{table_id}" + logging.info(f"Attempting to create table {table_ref} if it doesn't already exist") + client = bigquery.Client() + table_exists = False + try: + table = client.get_table(table_ref) + table_exists_id = table.table_id + logging.info(f"Table {table_exists_id} currently exists.") + if drop_table: + logging.info("Dropping existing table") + client.delete_table(table) + table = None + except NotFound: + table = None + if not table: + logging.info( + ( + f"Table {table_ref} currently does not exist. Attempting to create table." + ) + ) + if check_gcs_file_exists(schema_filepath, bucket_name): + schema = create_table_schema([], bucket_name, schema_filepath) + table = bigquery.Table(table_ref, schema=schema) + client.create_table(table) + print(f"Table {table_ref} was created".format(table_id)) + table_exists = True + else: + file_name = os.path.split(schema_filepath)[1] + file_path = os.path.split(schema_filepath)[0] + logging.info( + f"Error: Unable to create table {table_ref} because schema file {file_name} does not exist in location {file_path} in bucket {bucket_name}" + ) + table_exists = False + else: + table_exists = True + return table_exists + + +def check_gcs_file_exists(file_path: str, bucket_name: str) -> bool: + storage_client = storage.Client() + bucket = storage_client.bucket(bucket_name) + exists = storage.Blob(bucket=bucket, name=file_path).exists(storage_client) + return exists + + +def delete_source_file_data_from_bq( + project_id: str, dataset_id: str, table_id: str, source_url: str +) -> None: + logging.info( + f"Deleting data from {project_id}.{dataset_id}.{table_id} where source_url = '{source_url}'" + ) + client = bigquery.Client() + query = f""" + DELETE + FROM {project_id}.{dataset_id}.{table_id} + WHERE source_url = '@source_url' + """ + job_config = bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter("project_id", "STRING", project_id), + bigquery.ScalarQueryParameter("dataset_id", "STRING", dataset_id), + bigquery.ScalarQueryParameter("table_id", "STRING", table_id), + bigquery.ScalarQueryParameter("source_url", "STRING", source_url), + ] + ) + query_job = client.query(query, job_config=job_config) # Make an API request. + query_job.result() + + +def create_table_schema( + schema_structure: list, bucket_name: str = "", schema_filepath: str = "" +) -> list: + logging.info(f"Defining table schema... {bucket_name} ... 
{schema_filepath}") + schema = [] + if not (schema_filepath): + schema_struct = schema_structure + else: + storage_client = storage.Client() + bucket = storage_client.get_bucket(bucket_name) + blob = bucket.blob(schema_filepath) + schema_struct = json.loads(blob.download_as_bytes(client=None)) + for schema_field in schema_struct: + fld_name = schema_field["name"] + fld_type = schema_field["type"] + try: + fld_descr = schema_field["description"] + except KeyError: + fld_descr = "" + fld_mode = schema_field["mode"] + schema.append( + bigquery.SchemaField( + name=fld_name, field_type=fld_type, mode=fld_mode, description=fld_descr + ) + ) + return schema + + +def save_to_new_file(df: pd.DataFrame, file_path: str, sep: str = "|") -> None: + logging.info(f"Saving data to target file.. {file_path} ...") + df.to_csv(file_path, index=False, sep=sep) + + +def append_batch_file( + batch_file_path: str, target_file_path: str, skip_header: bool, truncate_file: bool +) -> None: + with open(batch_file_path, "r") as data_file: + if truncate_file: + target_file = open(target_file_path, "w+").close() + with open(target_file_path, "a+") as target_file: + if skip_header: + logging.info( + f"Appending batch file {batch_file_path} to {target_file_path} with skip header" + ) + next(data_file) + else: + logging.info( + f"Appending batch file {batch_file_path} to {target_file_path}" + ) + target_file.write(data_file.read()) + if os.path.exists(batch_file_path): + os.remove(batch_file_path) + + +def download_file(source_url: str, source_file: pathlib.Path) -> None: + logging.info(f"Downloading {source_url} to {source_file}") + r = requests.get(source_url, stream=True) + if r.status_code == 200: + with open(source_file, "wb") as f: + for chunk in r: + f.write(chunk) + else: + logging.error(f"Couldn't download {source_url}: {r.text}") + + +def download_file_ftp( + ftp_host: str, + ftp_dir: str, + ftp_filename: str, + local_file: pathlib.Path, + source_url: str, +) -> None: + logging.info(f"Downloading {source_url} into {local_file}") + for retry in range(1, 3): + if not download_file_ftp_single_try( + ftp_host, ftp_dir, ftp_filename, local_file + ): + logging.info(f"FTP file download failed. Retrying #{retry} in 60 seconds") + time.sleep(60) + else: + break + + +def download_file_ftp_single_try( + ftp_host: str, ftp_dir: str, ftp_filename: str, local_file: pathlib.Path +) -> bool: + # try: + with ftplib.FTP(ftp_host, timeout=60) as ftp_conn: + ftp_conn.login("", "") + ftp_conn.cwd(ftp_dir) + ftp_conn.encoding = "utf-8" + with open(local_file, "wb") as dest_file: + ftp_conn.retrbinary("RETR %s" % ftp_filename, dest_file.write) + ftp_conn.quit() + return True + # except: + # return True + + +def upload_file_to_gcs( + file_path: pathlib.Path, target_gcs_bucket: str, target_gcs_path: str +) -> None: + if os.path.exists(file_path): + logging.info( + f"Uploading output file to gs://{target_gcs_bucket}/{target_gcs_path}" + ) + storage_client = storage.Client() + bucket = storage_client.bucket(target_gcs_bucket) + blob = bucket.blob(target_gcs_path) + blob.upload_from_filename(file_path) + else: + logging.info( + f"Cannot upload file to gs://{target_gcs_bucket}/{target_gcs_path} as it does not exist." 
+ ) + + +if __name__ == "__main__": + logging.getLogger().setLevel(logging.INFO) + + main( + pipeline_name=os.environ.get("PIPELINE_NAME", ""), + source_url=os.environ.get("SOURCE_URL", ""), + source_file=pathlib.Path(os.environ.get("SOURCE_FILE", "")).expanduser(), + target_file=pathlib.Path(os.environ.get("TARGET_FILE", "")).expanduser(), + chunksize=os.environ.get("CHUNKSIZE", "100000"), + ftp_host=os.environ.get("FTP_HOST", ""), + ftp_dir=os.environ.get("FTP_DIR", ""), + project_id=os.environ.get("PROJECT_ID", ""), + dataset_id=os.environ.get("DATASET_ID", ""), + table_id=os.environ.get("TABLE_ID", ""), + drop_dest_table=os.environ.get("DROP_DEST_TABLE", "N"), + schema_path=os.environ.get("SCHEMA_PATH", ""), + target_gcs_bucket=os.environ.get("TARGET_GCS_BUCKET", ""), + target_gcs_path=os.environ.get("TARGET_GCS_PATH", ""), + input_field_delimiter=os.environ.get("INPUT_FIELD_DELIMITER", "N"), + ftp_batch_size=os.environ.get("FTP_BATCH_SIZE", "20"), + ftp_batch_sleep_time=os.environ.get("FTP_BATCH_SLEEP_TIME", "30"), + full_data_load=os.environ.get("FULL_DATA_LOAD", "N"), + start_year=os.environ.get("START_YEAR", ""), + input_csv_headers=json.loads(os.environ.get("INPUT_CSV_HEADERS", r"[]")), + data_dtypes=json.loads(os.environ.get("DATA_DTYPES", r"{}")), + reorder_headers_list=json.loads(os.environ.get("REORDER_HEADERS_LIST", r"[]")), + null_rows_list=json.loads(os.environ.get("NULL_ROWS_LIST", r"[]")), + date_format_list=json.loads(os.environ.get("DATE_FORMAT_LIST", r"[]")), + slice_column_list=json.loads(os.environ.get("SLICE_COLUMN_LIST", r"{}")), + rename_headers_list=json.loads(os.environ.get("RENAME_HEADERS_LIST", r"{}")), + remove_source_file=os.environ.get("REMOVE_SOURCE_FILE", "N"), + delete_target_file=os.environ.get("DELETE_TARGET_FILE", "N"), + number_of_header_rows=os.environ.get("NUMBER_OF_HEADER_ROWS", "0"), + regex_list=json.loads(os.environ.get("REGEX_LIST", r"{}")), + int_date_list=json.loads(os.environ.get("INT_DATE_LIST", r"[]")), + gen_location_list=json.loads(os.environ.get("GEN_LOCATION_LIST", r"{}")), + ) diff --git a/datasets/noaa/pipelines/_images/run_csv_transform_kub/requirements.txt b/datasets/noaa/pipelines/_images/run_csv_transform_kub/requirements.txt new file mode 100644 index 000000000..4c7f177b1 --- /dev/null +++ b/datasets/noaa/pipelines/_images/run_csv_transform_kub/requirements.txt @@ -0,0 +1,5 @@ +bs4 +google-cloud-bigquery +google-cloud-storage +numpy +pandas diff --git a/datasets/noaa/pipelines/_images/run_csv_transform_kub_gsod_stations/csv_transform.py b/datasets/noaa/pipelines/_images/run_csv_transform_kub_gsod_stations/csv_transform.py deleted file mode 100644 index 37c0b18f5..000000000 --- a/datasets/noaa/pipelines/_images/run_csv_transform_kub_gsod_stations/csv_transform.py +++ /dev/null @@ -1,259 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
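For orientation: the consolidated csv_transform.py above is driven entirely by environment variables, and the structured options (SLICE_COLUMN_LIST, REGEX_LIST, and so on) arrive as JSON strings that the entrypoint decodes with json.loads. The short standalone sketch below is not part of the patch; the SLICE_COLUMN_LIST value is the same mapping the ghcnd_countries task passes, and the two sample rows are hypothetical, added only to show the [source_column, start, end] slicing convention in isolation.

# Standalone sketch (not part of the patch): decode a JSON-encoded env var
# and apply the same column slicing that slice_column() performs.
import json

import pandas as pd

slice_column_list = json.loads(
    '{"code": ["textdata", "0", "2"], "name": ["textdata", "3", ""]}'
)

# Illustrative rows in the fixed-width layout of ghcnd-countries.txt.
df = pd.DataFrame({"textdata": ["AC Antigua and Barbuda", "AE United Arab Emirates"]})

for dest_col, (src_col, start_pos, end_pos) in slice_column_list.items():
    if end_pos == "":
        df[dest_col] = df[src_col].apply(lambda x: str(x)[int(start_pos):].strip())
    else:
        df[dest_col] = df[src_col].apply(
            lambda x: str(x)[int(start_pos):int(end_pos)].strip()
        )

print(df[["code", "name"]])
# expected (roughly):
#   code                  name
# 0   AC   Antigua and Barbuda
# 1   AE  United Arab Emirates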
- -import gzip -import logging -import os -import pathlib -import re -import urllib.request -from ftplib import FTP - -import pandas as pd -import requests -from google.cloud import storage - - -def main( - source_url: str, - ftp_host: str, - ftp_dir: str, - ftp_filename: str, - source_file: pathlib.Path, - target_file: pathlib.Path, - target_gcs_bucket: str, - target_gcs_path: str, -): - - # source_url STRING -> The full url of the source file to transform - # ftp_host STRING -> The host IP of the ftp file (IP only) - # ftp_dir STRING -> The remote working directory that the FTP file resides in (directory only) - # ftp_filename STRING -> The name of the file to pull from the FTP site - # source_file PATHLIB.PATH -> The (local) path pertaining to the downloaded source file - # target_file PATHLIB.PATH -> The (local) target transformed file + filename - # target_gcs_bucket STRING -> The target GCS bucket to place the output (transformed) file - # target_gcs_path STRING -> The target GCS path ( within the GCS bucket ) to place the output (transformed) file - - logging.info("NOAA GSOD Stations By Year process started") - - logging.info(f"starting processing {source_url}") - - if url_is_reachable(source_url): - - logging.info("creating 'files' folder") - pathlib.Path("./files").mkdir(parents=True, exist_ok=True) - - logging.info(f"Downloading FTP file {source_url} from {ftp_host}") - download_file_ftp(ftp_host, ftp_dir, ftp_filename, source_file, source_url) - - logging.info(f"Removing unnecessary header in {source_file}") - os.system(f"tail -n +21 {source_file}.bak > {source_file}.1") - os.system(f"sed '2d' {source_file}.1 > {source_file}") - os.unlink(str(source_file) + ".bak") - os.unlink(str(source_file) + ".1") - - logging.info(f"Opening source file {source_file}") - colspecs = [ - (0, 6), # usaf - (7, 12), # wban - (13, 42), # name - (43, 45), # country - (48, 50), # state - (51, 56), # call - (57, 64), # lat - (65, 74), # lon - (75, 81), # elev - (82, 90), # begin - (91, 99), # end - ] - df = pd.read_fwf(str(source_file), colspecs=colspecs) - - logging.info(f"Transform: Renaming Headers.. {source_file}") - df.columns = [ - "usaf", - "wban", - "name", - "country", - "state", - "call", - "lat", - "lon", - "elev", - "begin", - "end", - ] - - # remove rows with empty (usaf) data - df = df[df.usaf != ""] - - # execute reg-ex replacements - logging.info(f"Transform: Executing Reg Ex.. {source_file}") - logging.info(f" Executing Reg Ex.. (lat) {source_file}") - df["lat"] = df["lat"].astype(str) - df["lat"][:].replace("^(-[0]+)(.*)", "-$2", regex=True, inplace=True) - df["lat"][:].replace("^(\\s+)$", "", regex=True, inplace=True) - df["lat"][:].replace( - "^(\\+\\d+\\.\\d+[0-9])\\s+", "$1", regex=True, inplace=True - ) - df["lat"][:].replace("^(-\\d+\\.\\d+[0-9])\\s+", "$1", regex=True, inplace=True) - df["lat"][:].replace("nan", "", regex=False, inplace=True) - - logging.info(f" Executing Reg Ex.. (lon) {source_file}") - df["lon"] = df["lon"].astype(str) - df["lon"][:].replace("^(-[0]+)(.*)", "-$2", regex=True, inplace=True) - df["lon"][:].replace("^(\\s+)$", "", regex=True, inplace=True) - df["lon"][:].replace( - "^(\\+\\d+\\.\\d+[0-9])\\s+", "$1", regex=True, inplace=True - ) - df["lon"][:].replace("^(-\\d+\\.\\d+[0-9])\\s+", "$1", regex=True, inplace=True) - df["lon"][:].replace("nan", "", regex=False, inplace=True) - - logging.info(f" Executing Reg Ex.. 
(usaf) {source_file}") - df["usaf"][:].replace("(\\d{1,})(\\s{1,})$", "$1", regex=True, inplace=True) - - logging.info(f" Executing Reg Ex.. (name) {source_file}") - df["name"][:].replace("^\\s{1,}([a-zA-Z]\\D+)", "$1", regex=True, inplace=True) - df["name"][:].replace("^(\\D+[a-zA-Z])\\s{1,}$", "$1", regex=True, inplace=True) - df["name"][:].replace("^(\\s+)$", "", regex=True, inplace=True) - - logging.info(f" Executing Reg Ex.. (call) {source_file}") - df["call"][:].replace("^(\\s+)$", "", regex=True, inplace=True) - df["call"][:].replace("^([a-zA-Z]+)\\s+", "$1", regex=True, inplace=True) - - logging.info(f" Executing Reg Ex.. (elev) {source_file}") - df["elev"][:].replace("^(\\s+)$", "", regex=True, inplace=True) - - logging.info(f" Executing Reg Ex.. (state) {source_file}") - df["state"][:].replace("^(\\s+)$", "", regex=True, inplace=True) - - logging.info(f" Executing Reg Ex.. (country) {source_file}") - df["country"][:].replace("^(\\s+)$", "", regex=True, inplace=True) - - logging.info(f"Transform: Saving to output file.. {target_file}") - df.to_csv(target_file, index=False) - - logging.info(f"completed processing {source_url}") - - logging.info( - f"Uploading output file to.. gs://{target_gcs_bucket}/{target_gcs_path}" - ) - upload_file_to_gcs(target_file, target_gcs_bucket, target_gcs_path) - - logging.info("NOAA GSOD Stations process completed") - - else: - - logging.info(f"Error: Unable to reach url: {source_url}") - logging.info("Process failed!") - - -def replace_value(val: str) -> str: - if val is None or len(val) == 0: - return val - else: - if val.find("\n") > 0: - return re.sub(r"(^\d):(\d{2}:\d{2})", "0$1:$2", val) - else: - return val - - -def replace_values_regex(df: pd.DataFrame) -> None: - header_names = {"checkout_time"} - - for dt_col in header_names: - if (df[dt_col] is not None) & (df[dt_col].str.len() > 0): - df[dt_col] = df[dt_col].apply(replace_value) - - -def download_file_ftp( - ftp_host: str, - ftp_dir: str, - ftp_filename: str, - local_file: pathlib.Path, - source_url: str, -) -> None: - - # ftp_host -> host ip (eg. 
123.123.0.1) - # ftp_dir -> working directory in ftp host where file is located - # ftp_filename -> filename of FTP file to download - # source_file -> local file (including path) to create containing ftp content - - logging.info(f"Downloading {source_url} into {local_file}") - ftp_conn = FTP(ftp_host) - ftp_conn.login("", "") - ftp_conn.cwd(ftp_dir) - - try: - bak_local_file = str(local_file) + ".bak" - dest_file = open(bak_local_file, "wb") - ftp_conn.encoding = "utf-8" - ftp_conn.retrbinary( - cmd="RETR " + ftp_filename, - callback=dest_file.write, - blocksize=1024, - rest=None, - ) - ftp_conn.quit() - dest_file.close() - except Exception as e: - logging.error(f"Error saving output file: {e}.") - - -def gz_decompress(infile: str, tofile: str) -> None: - with open(infile, "rb") as inf, open(tofile, "w", encoding="utf8") as tof: - decom_str = gzip.decompress(inf.read()).decode("utf-8") - tof.write(decom_str) - - -def url_is_reachable(url: str) -> bool: - - request = urllib.request.Request(url) - request.get_method = lambda: "HEAD" - - try: - urllib.request.urlopen(request) - return True - except urllib.request.HTTPError: - return False - - -def download_file(source_url: str, source_file: pathlib.Path) -> None: - r = requests.get(source_url, stream=True) - if r.status_code == 200: - with open(source_file, "wb") as f: - for chunk in r: - f.write(chunk) - else: - logging.error(f"Couldn't download {source_url}: {r.text}") - - -def upload_file_to_gcs(file_path: pathlib.Path, gcs_bucket: str, gcs_path: str) -> None: - storage_client = storage.Client() - bucket = storage_client.bucket(gcs_bucket) - blob = bucket.blob(gcs_path) - blob.upload_from_filename(file_path) - - -if __name__ == "__main__": - logging.getLogger().setLevel(logging.INFO) - main( - source_url=os.environ["SOURCE_URL"], - ftp_host=os.environ["FTP_HOST"], - ftp_dir=os.environ["FTP_DIR"], - ftp_filename=os.environ["FTP_FILENAME"], - source_file=pathlib.Path(os.environ["SOURCE_FILE"]).expanduser(), - target_file=pathlib.Path(os.environ["TARGET_FILE"]).expanduser(), - target_gcs_bucket=os.environ["TARGET_GCS_BUCKET"], - target_gcs_path=os.environ["TARGET_GCS_PATH"], - ) diff --git a/datasets/noaa/pipelines/_images/run_csv_transform_kub_gsod_stations/requirements.txt b/datasets/noaa/pipelines/_images/run_csv_transform_kub_gsod_stations/requirements.txt deleted file mode 100644 index f36704793..000000000 --- a/datasets/noaa/pipelines/_images/run_csv_transform_kub_gsod_stations/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -requests -pandas -google-cloud-storage diff --git a/datasets/noaa/pipelines/_images/run_csv_transform_kub_lightning_strikes_by_year/Dockerfile b/datasets/noaa/pipelines/_images/run_csv_transform_kub_lightning_strikes_by_year/Dockerfile deleted file mode 100644 index 85af90570..000000000 --- a/datasets/noaa/pipelines/_images/run_csv_transform_kub_lightning_strikes_by_year/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
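Both the retired per-dataset scripts and the consolidated transform decompress archives by reading the entire .gz into memory (gzip.decompress(inf.read())). That is fine for the small station and country files, but the GHCND by-year archives are much larger (the ghcnd_by_year task requests 16G of ephemeral storage), so a streaming variant may be worth considering. A minimal standalone sketch, not part of the patch, using only the standard library:

# Standalone sketch (not part of the patch): stream-decompress a .gz file
# in fixed-size chunks instead of holding the whole archive in memory.
import gzip
import os
import shutil


def gz_decompress_streaming(infile: str, tofile: str, delete_zipfile: bool = False) -> None:
    with gzip.open(infile, "rb") as inf, open(tofile, "wb") as tof:
        shutil.copyfileobj(inf, tof, length=16 * 1024 * 1024)  # copy in 16 MB chunks
    if delete_zipfile:
        os.remove(infile)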
- -# The base image for this build -# FROM gcr.io/google.com/cloudsdktool/cloud-sdk:slim -FROM python:3.8 - -# Allow statements and log messages to appear in Cloud logs -ENV PYTHONUNBUFFERED True - -# Copy the requirements file into the image -COPY requirements.txt ./ - -# Install the packages specified in the requirements file -RUN python3 -m pip install --no-cache-dir -r requirements.txt - -# The WORKDIR instruction sets the working directory for any RUN, CMD, -# ENTRYPOINT, COPY and ADD instructions that follow it in the Dockerfile. -# If the WORKDIR doesn’t exist, it will be created even if it’s not used in -# any subsequent Dockerfile instruction -WORKDIR /custom - -# Copy the specific data processing script/s in the image under /custom/* -COPY ./csv_transform.py . - -# Command to run the data processing script when the container is run -CMD ["python3", "csv_transform.py"] diff --git a/datasets/noaa/pipelines/_images/run_csv_transform_kub_lightning_strikes_by_year/csv_transform.py b/datasets/noaa/pipelines/_images/run_csv_transform_kub_lightning_strikes_by_year/csv_transform.py deleted file mode 100644 index 2244a729d..000000000 --- a/datasets/noaa/pipelines/_images/run_csv_transform_kub_lightning_strikes_by_year/csv_transform.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gzip -import logging -import os -import pathlib -import urllib.request - -import pandas as pd -import requests -from google.cloud import storage - - -def main( - source_url: str, - source_file: pathlib.Path, - target_file: pathlib.Path, - target_gcs_bucket: str, - target_gcs_path: str, -): - - logging.info("NOAA Lightning Strikes By Year process started") - - if url_is_reachable(source_url): - - logging.info("creating 'files' folder") - pathlib.Path("./files").mkdir(parents=True, exist_ok=True) - - source_file_zipped = str(source_file) + ".gz" - source_file_unzipped = str(source_file) + ".1" - - logging.info(f"Downloading source file {source_url}") - download_file(source_url, source_file_zipped) - - logging.info(f"Decompressing {source_file_unzipped}") - gz_decompress(source_file_zipped, source_file_unzipped) - - logging.info(f"Removing unnecessary header in {source_file_unzipped}") - os.system(f"echo 'DATE,LONGITUDE,LATITUDE,TOTAL_COUNT' > {source_file}") - os.system(f"tail -n +4 {source_file_unzipped} >> {source_file}") - os.unlink(source_file_unzipped) - os.unlink(source_file_zipped) - - logging.info(f"Opening source file {source_file}") - df = pd.read_csv(str(source_file)) - - logging.info(f"Transform: Renaming Headers.. 
{source_file}") - df.columns = ["day_int", "centerlon", "centerlat", "number_of_strikes"] - - logging.info(f"Converting datetime format in {source_file}") - df["day"] = ( - pd.to_datetime( - (df["day_int"][:].astype("string") + "000000"), "raise", False, True - ).astype(str) - + " 00:00:00" - ) - - df["center_point"] = ( - "POINT(" - + df["centerlon"][:].astype("string") - + " " - + df["centerlat"][:].astype("string") - + ")" - ) - - logging.info(f"Reordering columns in {source_file}") - df = df[["day", "number_of_strikes", "center_point"]] - - logging.info(f"Transform: Saving to output file.. {target_file}") - df.to_csv(target_file, index=False) - - logging.info(f"completed processing {source_url}") - logging.info( - f"Uploading output file to.. gs://{target_gcs_bucket}/{target_gcs_path}" - ) - upload_file_to_gcs(target_file, target_gcs_bucket, target_gcs_path) - - logging.info("NOAA Lightning Strikes By Year process completed") - - else: - - logging.info(f"Error: Unable to reach url: {source_url}") - logging.info("Process failed!") - - -def gz_decompress(infile: str, tofile: str) -> None: - with open(infile, "rb") as inf, open(tofile, "w", encoding="utf8") as tof: - decom_str = gzip.decompress(inf.read()).decode("utf-8") - tof.write(decom_str) - - -def url_is_reachable(url: str) -> bool: - - request = urllib.request.Request(url) - request.get_method = lambda: "HEAD" - - try: - urllib.request.urlopen(request) - return True - except urllib.request.HTTPError: - return False - - -def download_file(source_url: str, source_file: pathlib.Path) -> None: - r = requests.get(source_url, stream=True) - if r.status_code == 200: - with open(source_file, "wb") as f: - for chunk in r: - f.write(chunk) - else: - logging.error(f"Couldn't download {source_url}: {r.text}") - - -def upload_file_to_gcs(file_path: pathlib.Path, gcs_bucket: str, gcs_path: str) -> None: - storage_client = storage.Client() - bucket = storage_client.bucket(gcs_bucket) - blob = bucket.blob(gcs_path) - blob.upload_from_filename(file_path) - - -if __name__ == "__main__": - logging.getLogger().setLevel(logging.INFO) - - main( - source_url=os.environ["SOURCE_URL"], - source_file=pathlib.Path(os.environ["SOURCE_FILE"]).expanduser(), - target_file=pathlib.Path(os.environ["TARGET_FILE"]).expanduser(), - target_gcs_bucket=os.environ["TARGET_GCS_BUCKET"], - target_gcs_path=os.environ["TARGET_GCS_PATH"], - ) diff --git a/datasets/noaa/pipelines/_images/run_csv_transform_kub_lightning_strikes_by_year/requirements.txt b/datasets/noaa/pipelines/_images/run_csv_transform_kub_lightning_strikes_by_year/requirements.txt deleted file mode 100644 index f36704793..000000000 --- a/datasets/noaa/pipelines/_images/run_csv_transform_kub_lightning_strikes_by_year/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -requests -pandas -google-cloud-storage diff --git a/datasets/noaa/pipelines/gsod_stations/gsod_stations_dag.py b/datasets/noaa/pipelines/gsod_stations/gsod_stations_dag.py deleted file mode 100644 index 0faedc1e4..000000000 --- a/datasets/noaa/pipelines/gsod_stations/gsod_stations_dag.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod -from airflow.providers.google.cloud.transfers import gcs_to_bigquery - -default_args = { - "owner": "Google", - "depends_on_past": False, - "start_date": "2021-12-30", -} - - -with DAG( - dag_id="noaa.gsod_stations", - default_args=default_args, - max_active_runs=1, - schedule_interval="@yearly", - catchup=False, - default_view="graph", -) as dag: - - # Run CSV transform within kubernetes pod - transform_csv = kubernetes_pod.KubernetesPodOperator( - task_id="transform_csv", - name="gsod_stations", - namespace="composer", - service_account_name="datasets", - image_pull_policy="Always", - image="{{ var.json.noaa_gsod_stations.container_registry.run_csv_transform_kub_gsod_stations }}", - env_vars={ - "SOURCE_URL": "ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.txt", - "FTP_HOST": "ftp.ncdc.noaa.gov", - "FTP_DIR": "/pub/data/noaa", - "FTP_FILENAME": "isd-history.txt", - "SOURCE_FILE": "files/data.csv", - "TARGET_FILE": "files/data_output.csv", - "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", - "TARGET_GCS_PATH": "data/noaa/gsod_stations/data_output.csv", - }, - resources={"limit_memory": "2G", "limit_cpu": "1"}, - ) - - # Task to load CSV data to a BigQuery table - load_to_bq = gcs_to_bigquery.GCSToBigQueryOperator( - task_id="load_to_bq", - bucket="{{ var.value.composer_bucket }}", - source_objects=["data/noaa/gsod_stations/data_output.csv"], - source_format="CSV", - destination_project_dataset_table="noaa.gsod_stations", - skip_leading_rows=1, - write_disposition="WRITE_TRUNCATE", - schema_fields=[ - {"name": "usaf", "type": "STRING", "mode": "NULLABLE"}, - {"name": "wban", "type": "STRING", "mode": "NULLABLE"}, - {"name": "name", "type": "STRING", "mode": "NULLABLE"}, - {"name": "country", "type": "STRING", "mode": "NULLABLE"}, - {"name": "state", "type": "STRING", "mode": "NULLABLE"}, - {"name": "call", "type": "STRING", "mode": "NULLABLE"}, - {"name": "lat", "type": "FLOAT", "mode": "NULLABLE"}, - {"name": "lon", "type": "FLOAT", "mode": "NULLABLE"}, - {"name": "elev", "type": "STRING", "mode": "NULLABLE"}, - {"name": "begin", "type": "STRING", "mode": "NULLABLE"}, - {"name": "end", "type": "STRING", "mode": "NULLABLE"}, - ], - ) - - transform_csv >> load_to_bq diff --git a/datasets/noaa/pipelines/gsod_stations/pipeline.yaml b/datasets/noaa/pipelines/gsod_stations/pipeline.yaml deleted file mode 100644 index 079ac0c24..000000000 --- a/datasets/noaa/pipelines/gsod_stations/pipeline.yaml +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - ---- -resources: - - - type: bigquery_table - # Required Properties: - table_id: gsod_stations - - # Description of the table - description: "noaaspc" - -dag: - airflow_version: 2 - initialize: - dag_id: gsod_stations - default_args: - owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded - depends_on_past: False - start_date: '2021-12-30' - max_active_runs: 1 - schedule_interval: "@yearly" - catchup: False - default_view: graph - - tasks: - - - operator: "GKECreateClusterOperator" - args: - task_id: "create_cluster" - project_id: "{{ var.value.gcp_project }}" - location: "us-central1-c" - body: - name: noaa--gsod-stations - initial_node_count: 1 - network: "{{ var.value.vpc_network }}" - node_config: - machine_type: e2-small - oauth_scopes: - - https://www.googleapis.com/auth/devstorage.read_write - - https://www.googleapis.com/auth/cloud-platform - - - operator: "GKEStartPodOperator" - - # Task description - description: "Run CSV transform within kubernetes pod" - - args: - - task_id: "transform_csv" - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id - name: "gsod_stations" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - project_id: "{{ var.value.gcp_project }}" - location: "us-central1-c" - cluster_name: noaa--gsod-stations - namespace: "default" - - image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. - image: "{{ var.json.noaa_gsod_stations.container_registry.run_csv_transform_kub_gsod_stations }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. - env_vars: - SOURCE_URL: "ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.txt" - FTP_HOST: "ftp.ncdc.noaa.gov" - FTP_DIR: "/pub/data/noaa" - FTP_FILENAME: "isd-history.txt" - SOURCE_FILE: "files/data.csv" - TARGET_FILE: "files/data_output.csv" - TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" - TARGET_GCS_PATH: "data/noaa/gsod_stations/data_output.csv" - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - limit_memory: "2G" - limit_cpu: "1" - - - operator: "GoogleCloudStorageToBigQueryOperator" - description: "Task to load CSV data to a BigQuery table" - - args: - task_id: "load_to_bq" - - # The GCS bucket where the CSV file is located in. 
- bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file - source_objects: ["data/noaa/gsod_stations/data_output.csv"] - source_format: "CSV" - destination_project_dataset_table: "noaa.gsod_stations" - - # Use this if your CSV file contains a header row - skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition - write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - # types: "INTEGER", "TIMESTAMP", "STRING" - schema_fields: - - name: "usaf" - type: "STRING" - mode: "NULLABLE" - - name: "wban" - type: "STRING" - mode: "NULLABLE" - - name: "name" - type: "STRING" - mode: "NULLABLE" - - name: "country" - type: "STRING" - mode: "NULLABLE" - - name: "state" - type: "STRING" - mode: "NULLABLE" - - name: "call" - type: "STRING" - mode: "NULLABLE" - - name: "lat" - type: "FLOAT" - mode: "NULLABLE" - - name: "lon" - type: "FLOAT" - mode: "NULLABLE" - - name: "elev" - type: "STRING" - mode: "NULLABLE" - - name: "begin" - type: "STRING" - mode: "NULLABLE" - - name: "end" - type: "STRING" - mode: "NULLABLE" - - - operator: "GKEDeleteClusterOperator" - args: - task_id: "delete_cluster" - project_id: "{{ var.value.gcp_project }}" - location: "us-central1-c" - name: noaa--gsod-stations - - graph_paths: - - create_cluster >> transform_csv >> load_to_bq >> delete_cluster diff --git a/datasets/noaa/pipelines/noaa/noaa_dag.py b/datasets/noaa/pipelines/noaa/noaa_dag.py new file mode 100644 index 000000000..6ca2ae573 --- /dev/null +++ b/datasets/noaa/pipelines/noaa/noaa_dag.py @@ -0,0 +1,370 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
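Each task in the DAG below hands its structured options to the container as JSON strings inside env_vars. One behavior worth illustrating: a JSON object cannot carry two entries under the same key, so a REGEX_LIST like the one attached to the gsod_stations task further down (which repeats "lat", "lon", "name" and "call") collapses to a single pattern per column once the container calls json.loads. A small standalone check, not part of the patch, with an abridged two-entry example:

# Standalone sketch (not part of the patch): json.loads keeps only the last
# occurrence of a repeated key, so stacking several patterns under "lat"
# leaves just one of them by the time the transform sees the value.
import json

regex_list = json.loads(
    '{"lat": ["^(-[0]+)(.*)", "-$2", "True"], "lat": ["nan", "", "False"]}'
)
print(regex_list)
# {'lat': ['nan', '', 'False']}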
+ + +from airflow import DAG +from airflow.providers.google.cloud.operators import kubernetes_engine + +default_args = { + "owner": "Google", + "depends_on_past": False, + "start_date": "2021-03-01", +} + + +with DAG( + dag_id="noaa.noaa", + default_args=default_args, + max_active_runs=1, + schedule_interval="0 1 0 0 6", + catchup=False, + default_view="graph", +) as dag: + create_cluster = kubernetes_engine.GKECreateClusterOperator( + task_id="create_cluster", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + body={ + "name": "noaa", + "initial_node_count": 2, + "network": "{{ var.value.vpc_network }}", + "node_config": { + "machine_type": "e2-standard-16", + "oauth_scopes": [ + "https://www.googleapis.com/auth/devstorage.read_write", + "https://www.googleapis.com/auth/cloud-platform", + ], + }, + }, + ) + + # Run NOAA load processes + ghcnd_by_year = kubernetes_engine.GKEStartPodOperator( + task_id="ghcnd_by_year", + name="noaa.ghcnd_by_year", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="noaa", + namespace="default", + image_pull_policy="Always", + image="{{ var.json.noaa.container_registry.run_csv_transform_kub }}", + env_vars={ + "PIPELINE_NAME": "GHCND by year", + "SOURCE_URL": "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/.csv.gz", + "SOURCE_FILE": "files/data_ghcnd_by_year.csv", + "TARGET_FILE": "files/data_output_ghcnd_by_year.csv", + "CHUNKSIZE": "750000", + "FTP_HOST": "ftp.ncdc.noaa.gov", + "FTP_DIR": "pub/data/ghcn/daily/by_year", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "ghcn_d", + "TABLE_ID": "ghcnd", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/noaa/ghcnd_by_year/data_output.csv", + "SCHEMA_PATH": "data/noaa/schema/ghcnd_by_year_schema.json", + "DROP_DEST_TABLE": "N", + "INPUT_FIELD_DELIMITER": ",", + "FTP_BATCH_SIZE": "10", + "FTP_BATCH_SLEEP_TIME": "60", + "FULL_DATA_LOAD": "N", + "START_YEAR": "1763", + "REMOVE_SOURCE_FILE": "Y", + "DELETE_TARGET_FILE": "Y", + "INPUT_CSV_HEADERS": '[\n "id",\n "date",\n "element",\n "value",\n "mflag",\n "qflag",\n "sflag",\n "time"\n]', + "DATA_DTYPES": '{\n "id": "str",\n "date": "str",\n "element": "str",\n "value": "str",\n "mflag": "str",\n "qflag": "str",\n "sflag": "str",\n "time": "str"\n}', + "REORDER_HEADERS_LIST": '[\n "id",\n "date",\n "element",\n "value",\n "mflag",\n "qflag",\n "sflag",\n "time",\n "source_url",\n "etl_timestamp"\n]', + "NULL_ROWS_LIST": '[\n "id"\n]', + "DATE_FORMAT_LIST": '[\n "date"\n]', + }, + resources={"request_ephemeral_storage": "16G", "limit_cpu": "3"}, + ) + + # Run NOAA load processes + ghcnd_countries = kubernetes_engine.GKEStartPodOperator( + task_id="ghcnd_countries", + name="noaa.ghcnd_countries", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="noaa", + namespace="default", + image_pull_policy="Always", + image="{{ var.json.noaa.container_registry.run_csv_transform_kub }}", + env_vars={ + "PIPELINE_NAME": "GHCND countries", + "SOURCE_URL": "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-countries.txt", + "SOURCE_FILE": "files/data_ghcnd_countries.csv", + "TARGET_FILE": "files/data_output_ghcnd_countries.csv", + "CHUNKSIZE": "750000", + "FTP_HOST": "ftp.ncdc.noaa.gov", + "FTP_DIR": "pub/data/ghcn/daily", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "ghcn_d", + "TABLE_ID": "ghcnd_countries", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": 
"data/noaa/ghcnd_countries/data_output.csv", + "SCHEMA_PATH": "data/noaa/schema/ghcnd_countries_schema.json", + "DROP_DEST_TABLE": "N", + "INPUT_FIELD_DELIMITER": "|", + "REMOVE_SOURCE_FILE": "N", + "DELETE_TARGET_FILE": "N", + "INPUT_CSV_HEADERS": '[\n "textdata"\n]', + "DATA_DTYPES": '{\n "textdata": "str"\n}', + "REORDER_HEADERS_LIST": '[\n "code",\n "name",\n "source_url",\n "etl_timestamp"\n]', + "SLICE_COLUMN_LIST": '{\n "code": ["textdata", "0", "2"],\n "name": ["textdata", "3", ""]\n}', + }, + resources={"request_ephemeral_storage": "4G", "limit_cpu": "3"}, + ) + + # Run NOAA load processes + ghcnd_inventory = kubernetes_engine.GKEStartPodOperator( + task_id="ghcnd_inventory", + name="noaa.ghcnd_inventory", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="noaa", + namespace="default", + image_pull_policy="Always", + image="{{ var.json.noaa.container_registry.run_csv_transform_kub }}", + env_vars={ + "PIPELINE_NAME": "GHCND inventory", + "SOURCE_URL": "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-inventory.txt", + "SOURCE_FILE": "files/data_ghcnd_inventory.csv", + "TARGET_FILE": "files/data_output_ghcnd_inventory.csv", + "CHUNKSIZE": "750000", + "FTP_HOST": "ftp.ncdc.noaa.gov", + "FTP_DIR": "pub/data/ghcn/daily", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "ghcn_d", + "TABLE_ID": "ghcnd_inventory", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/noaa/ghcnd_inventory/data_output.csv", + "SCHEMA_PATH": "data/noaa/schema/ghcnd_inventory_schema.json", + "DROP_DEST_TABLE": "N", + "INPUT_FIELD_DELIMITER": "|", + "REMOVE_SOURCE_FILE": "Y", + "DELETE_TARGET_FILE": "Y", + "INPUT_CSV_HEADERS": '[\n "textdata"\n]', + "DATA_DTYPES": '{\n "textdata": "str"\n}', + "REORDER_HEADERS_LIST": '[\n "id",\n "latitude",\n "longitude",\n "element",\n "firstyear",\n "lastyear",\n "source_url",\n "etl_timestamp"\n]', + "SLICE_COLUMN_LIST": '{\n "id": ["textdata", "0", "11"],\n "latitude": ["textdata", "12", "20"],\n "longitude": ["textdata", "21", "30"],\n "element": ["textdata", "31", "35"],\n "firstyear": ["textdata", "36", "40"],\n "lastyear": ["textdata", "41", "45"]\n}', + }, + resources={"request_ephemeral_storage": "4G", "limit_cpu": "3"}, + ) + + # Run NOAA load processes + ghcnd_states = kubernetes_engine.GKEStartPodOperator( + task_id="ghcnd_states", + name="noaa.ghcnd_states", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="noaa", + namespace="default", + image_pull_policy="Always", + image="{{ var.json.noaa.container_registry.run_csv_transform_kub }}", + env_vars={ + "PIPELINE_NAME": "GHCND states", + "SOURCE_URL": "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-states.txt", + "SOURCE_FILE": "files/data_ghcnd_states.csv", + "TARGET_FILE": "files/data_output_ghcnd_states.csv", + "CHUNKSIZE": "750000", + "FTP_HOST": "ftp.ncdc.noaa.gov", + "FTP_DIR": "pub/data/ghcn/daily", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "ghcn_d", + "TABLE_ID": "ghcnd_states", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/noaa/ghcnd_states/data_output.csv", + "SCHEMA_PATH": "data/noaa/schema/ghcnd_states_schema.json", + "DROP_DEST_TABLE": "N", + "INPUT_FIELD_DELIMITER": "|", + "REMOVE_SOURCE_FILE": "Y", + "DELETE_TARGET_FILE": "Y", + "INPUT_CSV_HEADERS": '[\n "textdata"\n]', + "DATA_DTYPES": '{\n "textdata": "str"\n}', + "REORDER_HEADERS_LIST": '[\n "code",\n "name",\n "source_url",\n "etl_timestamp"\n]', + 
"SLICE_COLUMN_LIST": '{\n "code": ["textdata", "0", "2"],\n "name": ["textdata", "3", ""]\n}', + }, + resources={"request_ephemeral_storage": "4G", "limit_cpu": "3"}, + ) + + # Run NOAA load processes + ghcnd_stations = kubernetes_engine.GKEStartPodOperator( + task_id="ghcnd_stations", + name="noaa.ghcnd_stations", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="noaa", + namespace="default", + image_pull_policy="Always", + image="{{ var.json.noaa.container_registry.run_csv_transform_kub }}", + env_vars={ + "PIPELINE_NAME": "GHCND stations", + "SOURCE_URL": "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt", + "SOURCE_FILE": "files/data_ghcnd_stations.csv", + "TARGET_FILE": "files/data_output_ghcnd_stations.csv", + "CHUNKSIZE": "750000", + "FTP_HOST": "ftp.ncdc.noaa.gov", + "FTP_DIR": "pub/data/ghcn/daily", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "ghcn_d", + "TABLE_ID": "ghcnd_stations", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/noaa/ghcnd_stations/data_output.csv", + "SCHEMA_PATH": "data/noaa/schema/ghcnd_stations_schema.json", + "DROP_DEST_TABLE": "N", + "INPUT_FIELD_DELIMITER": "|", + "REMOVE_SOURCE_FILE": "Y", + "DELETE_TARGET_FILE": "Y", + "INPUT_CSV_HEADERS": '[\n "textdata"\n]', + "DATA_DTYPES": '{\n "textdata": "str"\n}', + "REORDER_HEADERS_LIST": '[\n "id",\n "latitude",\n "longitude",\n "elevation",\n "state",\n "name",\n "gsn_flag",\n "hcn_cm_flag",\n "wmoid",\n "source_url",\n "etl_timestamp"\n]', + "SLICE_COLUMN_LIST": '{\n "id": ["textdata", "0", "11"],\n "latitude": ["textdata", "12", "20"],\n "longitude": ["textdata", "21", "30"],\n "elevation": ["textdata", "31", "37"],\n "state": ["textdata", "38", "40"],\n "name": ["textdata", "41", "71"],\n "gsn_flag": ["textdata", "72", "75"],\n "hcn_cm_flag": ["textdata", "76", "79"],\n "wmoid": ["textdata", "80", "85"]\n}', + }, + resources={"request_ephemeral_storage": "4G", "limit_cpu": "3"}, + ) + + # Run NOAA load processes + gsod_stations = kubernetes_engine.GKEStartPodOperator( + task_id="gsod_stations", + name="noaa.gsod_stations", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="noaa", + namespace="default", + image_pull_policy="Always", + image="{{ var.json.noaa.container_registry.run_csv_transform_kub }}", + env_vars={ + "PIPELINE_NAME": "GSOD stations", + "SOURCE_URL": "ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.txt", + "SOURCE_FILE": "files/data_gsod_stations.csv", + "TARGET_FILE": "files/data_output_gsod_stations.csv", + "CHUNKSIZE": "750000", + "FTP_HOST": "ftp.ncdc.noaa.gov", + "FTP_DIR": "pub/data/noaa", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "noaa", + "TABLE_ID": "gsod_stations", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/noaa/gsod_stations/data_output.csv", + "SCHEMA_PATH": "data/noaa/schema/gsod_stations_schema.json", + "DROP_DEST_TABLE": "N", + "INPUT_FIELD_DELIMITER": "|", + "REMOVE_SOURCE_FILE": "Y", + "DELETE_TARGET_FILE": "Y", + "NUMBER_OF_HEADER_ROWS": "21", + "REGEX_LIST": '{\n "lat": ["^(-[0]+)(.*)", "-$2", "True"],\n "lat": ["^(\\\\s+)$", "", "True"],\n "lat": ["^(\\\\+\\\\d+\\\\.\\\\d+[0-9])\\\\s+", "$1", "True"],\n "lat": ["^(-\\\\d+\\\\.\\\\d+[0-9])\\\\s+", "$1", "True"],\n "lat": ["nan", "", "False"],\n "lon": ["^(-[0]+)(.*)", "-$2", "True"],\n "lon": ["^(\\\\s+)$", "", "True"],\n "lon": ["^(\\\\+\\\\d+\\\\.\\\\d+[0-9])\\\\s+", "$1", "True"],\n "lon": 
["^(-\\\\d+\\\\.\\\\d+[0-9])\\\\s+", "$1", "True"],\n "lon": ["nan", "", "False"],\n "usaf": ["(\\\\d{1,})(\\\\s{1,})$", "$1", "True"],\n "name": ["^\\\\s{1,}([a-zA-Z]\\\\D+)", "$1", "True"],\n "name": ["^(\\\\D+[a-zA-Z])\\\\s{1,}$", "$1", "True"],\n "name": ["^(\\\\s+)$", "", "True"],\n "call": ["^(\\\\s+)$", "", "True"],\n "call": ["^([a-zA-Z]+)\\\\s+", "$1", "True"],\n "elev": ["^(\\\\s+)$", "", "True"],\n "state": ["^(\\\\s+)$", "", "True"],\n "country": ["^(\\\\s+)$", "", "True"]\n}', + "INPUT_CSV_HEADERS": '[\n "textdata"\n]', + "DATA_DTYPES": '{\n "textdata": "str"\n}', + "REORDER_HEADERS_LIST": '[\n "usaf",\n "wban",\n "name",\n "country",\n "state",\n "call",\n "lat",\n "lon",\n "elev",\n "begin",\n "end",\n "source_url",\n "etl_timestamp"\n]', + "NULL_ROWS_LIST": '[\n "usaf"\n]', + "SLICE_COLUMN_LIST": '{\n "usaf": ["textdata", "0", "6"],\n "wban": ["textdata", "7", "12"],\n "name": ["textdata", "13", "42"],\n "country": ["textdata", "43", "45"],\n "state": ["textdata", "48", "50"],\n "call": ["textdata", "51", "56"],\n "lat": ["textdata", "57", "64"],\n "lon": ["textdata", "65", "74"],\n "elev": ["textdata", "75", "81"],\n "begin": ["textdata", "82", "90"],\n "end": ["textdata", "91", "99"]\n}', + }, + resources={"request_ephemeral_storage": "4G", "limit_cpu": "3"}, + ) + + # Run NOAA load processes + ghcnd_hurricanes = kubernetes_engine.GKEStartPodOperator( + task_id="ghcnd_hurricanes", + name="noaa.ghcnd_hurricanes", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="noaa", + namespace="default", + image_pull_policy="Always", + image="{{ var.json.noaa.container_registry.run_csv_transform_kub }}", + env_vars={ + "PIPELINE_NAME": "GHCND hurricanes", + "SOURCE_URL": "https://www.ncei.noaa.gov/data/international-best-track-archive-for-climate-stewardship-ibtracs/v04r00/access/csv/ibtracs.ALL.list.v04r00.csv", + "SOURCE_FILE": "files/data_ghcnd_hurricanes.csv", + "TARGET_FILE": "files/data_output_ghcnd_hurricanes.csv", + "CHUNKSIZE": "750000", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "noaa", + "TABLE_ID": "hurricanes", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/noaa/ghcnd_hurricanes/data_output_ghcnd_hurricanes.csv", + "SCHEMA_PATH": "data/noaa/schema/ghcnd_hurricanes_schema.json", + "DROP_DEST_TABLE": "N", + "INPUT_FIELD_DELIMITER": ",", + "NUMBER_OF_HEADER_ROWS": "2", + "REMOVE_SOURCE_FILE": "Y", + "DELETE_TARGET_FILE": "Y", + "INPUT_CSV_HEADERS": '[\n "sid",\n "season",\n "number",\n "basin",\n "subbasin",\n "name",\n "iso_time",\n "nature",\n "lat",\n "lon",\n "wmo_wind",\n "wmo_pres",\n "wmo_agency",\n "track_type",\n "dist2land",\n "landfall",\n "iflag",\n "usa_agency",\n "usa_atcf_id",\n "usa_lat",\n "usa_lon",\n "usa_record",\n "usa_status",\n "usa_wind",\n "usa_pres",\n "usa_sshs",\n "usa_r34_ne",\n "usa_r34_se",\n "usa_r34_sw",\n "usa_r34_nw",\n "usa_r50_ne",\n "usa_r50_se",\n "usa_r50_sw",\n "usa_r50_nw",\n "usa_r64_ne",\n "usa_r64_se",\n "usa_r64_sw",\n "usa_r64_nw",\n "usa_poci",\n "usa_roci",\n "usa_rmw",\n "usa_eye",\n "tokyo_lat",\n "tokyo_lon",\n "tokyo_grade",\n "tokyo_wind",\n "tokyo_pres",\n "tokyo_r50_dir",\n "tokyo_r50_long",\n "tokyo_r50_short",\n "tokyo_r30_dir",\n "tokyo_r30_long",\n "tokyo_r30_short",\n "tokyo_land",\n "cma_lat",\n "cma_lon",\n "cma_cat",\n "cma_wind",\n "cma_pres",\n "hko_lat",\n "hko_lon",\n "hko_cat",\n "hko_wind",\n "hko_pres",\n "newdelhi_lat",\n "newdelhi_lon",\n "newdelhi_grade",\n "newdelhi_wind",\n "newdelhi_pres",\n 
"newdelhi_ci",\n "newdelhi_dp",\n "newdelhi_poci",\n "reunion_lat",\n "reunion_lon",\n "reunion_type",\n "reunion_wind",\n "reunion_pres",\n "reunion_tnum",\n "reunion_ci",\n "reunion_rmw",\n "reunion_r34_ne",\n "reunion_r34_se",\n "reunion_r34_sw",\n "reunion_r34_nw",\n "reunion_r50_ne",\n "reunion_r50_se",\n "reunion_r50_sw",\n "reunion_r50_nw",\n "reunion_r64_ne",\n "reunion_r64_se",\n "reunion_r64_sw",\n "reunion_r64_nw",\n "bom_lat",\n "bom_lon",\n "bom_type",\n "bom_wind",\n "bom_pres",\n "bom_tnum",\n "bom_ci",\n "bom_rmw",\n "bom_r34_ne",\n "bom_r34_se",\n "bom_r34_sw",\n "bom_r34_nw",\n "bom_r50_ne",\n "bom_r50_se",\n "bom_r50_sw",\n "bom_r50_nw",\n "bom_r64_ne",\n "bom_r64_se",\n "bom_r64_sw",\n "bom_r64_nw",\n "bom_roci",\n "bom_poci",\n "bom_eye",\n "bom_pos_method",\n "bom_pres_method",\n "nadi_lat",\n "nadi_lon",\n "nadi_cat",\n "nadi_wind",\n "nadi_pres",\n "wellington_lat",\n "wellington_lon",\n "wellington_wind",\n "wellington_pres",\n "ds824_lat",\n "ds824_lon",\n "ds824_stage",\n "ds824_wind",\n "ds824_pres",\n "td9636_lat",\n "td9636_lon",\n "td9636_stage",\n "td9636_wind",\n "td9636_pres",\n "td9635_lat",\n "td9635_lon",\n "td9635_wind",\n "td9635_pres",\n "td9635_roci",\n "neumann_lat",\n "neumann_lon",\n "neumann_class",\n "neumann_wind",\n "neumann_pres",\n "mlc_lat",\n "mlc_lon",\n "mlc_class",\n "mlc_wind",\n "mlc_pres",\n "usa_gust",\n "bom_gust",\n "bom_gust_per",\n "reunion_gust",\n "reunion_gust_per",\n "usa_seahgt",\n "usa_searad_ne",\n "usa_searad_se",\n "usa_searad_sw",\n "usa_searad_nw",\n "storm_speed",\n "storm_dir"\n]', + "REORDER_HEADERS_LIST": '[\n "sid",\n "season",\n "number",\n "basin",\n "subbasin",\n "name",\n "iso_time",\n "nature",\n "latitude",\n "longitude",\n "wmo_wind",\n "wmo_pressure",\n "wmo_agency",\n "track_type",\n "dist2land",\n "landfall",\n "iflag",\n "usa_agency",\n "usa_latitude",\n "usa_longitude",\n "usa_record",\n "usa_status",\n "usa_wind",\n "usa_pressure",\n "usa_sshs",\n "usa_r34_ne",\n "usa_r34_se",\n "usa_r34_sw",\n "usa_r34_nw",\n "usa_r50_ne",\n "usa_r50_se",\n "usa_r50_sw",\n "usa_r50_nw",\n "usa_r64_ne",\n "usa_r64_se",\n "usa_r64_sw",\n "usa_r64_nw",\n "usa_poci",\n "usa_roci",\n "usa_rmw",\n "usa_eye",\n "tokyo_latitude",\n "tokyo_longitude",\n "tokyo_grade",\n "tokyo_wind",\n "tokyo_pressure",\n "tokyo_r50_dir",\n "tokyo_r50_longitude",\n "tokyo_r50_short",\n "tokyo_r30_dir",\n "tokyo_r30_long",\n "tokyo_r30_short",\n "tokyo_land",\n "cma_latitude",\n "cma_longitude",\n "cma_cat",\n "cma_wind",\n "cma_pressure",\n "hko_latitude",\n "hko_longitude",\n "hko_cat",\n "hko_wind",\n "hko_pressure",\n "newdelhi_latitude",\n "newdelhi_longitude",\n "newdelhi_grade",\n "newdelhi_wind",\n "newdelhi_pressure",\n "newdelhi_ci",\n "newdelhi_dp",\n "newdelhi_poci",\n "reunion_latitude",\n "reunion_longitude",\n "reunion_type",\n "reunion_wind",\n "reunion_pressure",\n "reunion_tnum",\n "reunion_ci",\n "reunion_rmw",\n "reunion_r34_ne",\n "reunion_r34_se",\n "reunion_r34_sw",\n "reunion_r34_nw",\n "reunion_r50_ne",\n "reunion_r50_se",\n "reunion_r50_sw",\n "reunion_r50_nw",\n "reunion_r64_ne",\n "reunion_r64_se",\n "reunion_r64_sw",\n "reunion_r64_nw",\n "bom_latitude",\n "bom_longitude",\n "bom_type",\n "bom_wind",\n "bom_pressure",\n "bom_tnum",\n "bom_ci",\n "bom_rmw",\n "bom_r34_ne",\n "bom_r34_se",\n "bom_r34_sw",\n "bom_r34_nw",\n "bom_r50_ne",\n "bom_r50_se",\n "bom_r50_sw",\n "bom_r50_nw",\n "bom_r64_ne",\n "bom_r64_se",\n "bom_r64_sw",\n "bom_r64_nw",\n "bom_roci",\n "bom_poci",\n "bom_eye",\n "bom_pos_method",\n 
"bom_pressure_method",\n "wellington_latitude",\n "wellington_longitude",\n "wellington_wind",\n "wellington_pressure",\n "nadi_latitude",\n "nadi_longitude",\n "nadi_cat",\n "nadi_wind",\n "nadi_pressure",\n "ds824_latitude",\n "ds824_longitude",\n "ds824_stage",\n "ds824_wind",\n "ds824_pressure",\n "td9636_latitude",\n "td9636_longitude",\n "td9636_stage",\n "td9636_wind",\n "td9636_pressure",\n "td9635_latitude",\n "td9635_longitude",\n "td9635_wind",\n "td9635_pressure",\n "td9635_roci",\n "neumann_latitude",\n "neumann_longitude",\n "neumann_class",\n "neumann_wind",\n "neumann_pressure",\n "mlc_latitude",\n "mlc_longitude",\n "mlc_class",\n "mlc_wind",\n "mlc_pressure",\n "usa_atcf_id",\n "source_url",\n "etl_timestamp"\n]', + "RENAME_HEADERS_LIST": '{\n "lat": "latitude",\n "lon": "longitude",\n "wmo_pres": "wmo_pressure",\n "usa_lat": "usa_latitude",\n "usa_lon": "usa_longitude",\n "usa_pres": "usa_pressure",\n "tokyo_lat": "tokyo_latitude",\n "tokyo_lon": "tokyo_longitude",\n "tokyo_pres": "tokyo_pressure",\n "tokyo_r50_long": "tokyo_r50_longitude",\n "cma_lat": "cma_latitude",\n "cma_lon": "cma_longitude",\n "cma_pres": "cma_pressure",\n "hko_lat": "hko_latitude",\n "hko_lon": "hko_longitude",\n "hko_pres": "hko_pressure",\n "newdelhi_lat": "newdelhi_latitude",\n "newdelhi_lon": "newdelhi_longitude",\n "newdelhi_pres": "newdelhi_pressure",\n "reunion_lat": "reunion_latitude",\n "reunion_lon": "reunion_longitude",\n "reunion_pres": "reunion_pressure",\n "bom_lat": "bom_latitude",\n "bom_lon": "bom_longitude",\n "bom_pres": "bom_pressure",\n "bom_pres_method": "bom_pressure_method",\n "wellington_lat": "wellington_latitude",\n "wellington_lon": "wellington_longitude",\n "wellington_pres": "wellington_pressure",\n "nadi_lat": "nadi_latitude",\n "nadi_lon": "nadi_longitude",\n "nadi_pres": "nadi_pressure",\n "ds824_lat": "ds824_latitude",\n "ds824_lon": "ds824_longitude",\n "ds824_pres": "ds824_pressure",\n "td9636_lat": "td9636_latitude",\n "td9636_lon": "td9636_longitude",\n "td9636_pres": "td9636_pressure",\n "td9635_lat": "td9635_latitude",\n "td9635_lon": "td9635_longitude",\n "td9635_pres": "td9635_pressure",\n "neumann_lat": "neumann_latitude",\n "neumann_lon": "neumann_longitude",\n "neumann_pres": "neumann_pressure",\n "mlc_lat": "mlc_latitude",\n "mlc_lon": "mlc_longitude",\n "mlc_pres": "mlc_pressure"\n}', + }, + resources={"request_ephemeral_storage": "16G", "limit_cpu": "3"}, + ) + + # Run NOAA load processes + lightning_strikes_by_year = kubernetes_engine.GKEStartPodOperator( + task_id="lightning_strikes_by_year", + name="noaa.lightning_strikes_by_year", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="noaa", + namespace="default", + image_pull_policy="Always", + image="{{ var.json.noaa.container_registry.run_csv_transform_kub }}", + env_vars={ + "PIPELINE_NAME": "NOAA lightning strikes by year", + "SOURCE_URL": "https://www1.ncdc.noaa.gov/pub/data/swdi/database-csv/v2/nldn-tiles-*.csv.gz", + "SOURCE_FILE": "files/data_lightning_strikes.csv", + "TARGET_FILE": "files/data_output_lightning_strikes.csv", + "CHUNKSIZE": "1000000", + "PROJECT_ID": "{{ var.value.gcp_project }}", + "DATASET_ID": "noaa", + "TABLE_ID": "lightning_strikes", + "TARGET_GCS_BUCKET": "{{ var.value.composer_bucket }}", + "TARGET_GCS_PATH": "data/noaa/lightning_strikes/data_output.csv", + "SCHEMA_PATH": "data/noaa/schema/noaa_lightning_strikes_schema.json", + "DROP_DEST_TABLE": "N", + "INPUT_FIELD_DELIMITER": ",", + "HTTP_BATCH_SIZE": "10", + 
"HTTP_BATCH_SLEEP_TIME": "60", + "FULL_DATA_LOAD": "Y", + "REMOVE_SOURCE_FILE": "Y", + "DELETE_TARGET_FILE": "Y", + "START_YEAR": "1990", + "NUMBER_OF_HEADER_ROWS": "3", + "INT_DATE_LIST": '{\n "date": "day_int"\n}', + "GEN_LOCATION_LIST": '{\n "center_point_geom": ["centerlon", "centerlat"]\n}', + "INPUT_CSV_HEADERS": '[\n "ZDAY",\n "CENTERLON",\n "CENTERLAT",\n "TOTAL_COUNT"\n]', + "DATA_DTYPES": '{\n "ZDAY": "str",\n "CENTERLON": "str",\n "CENTERLAT": "str",\n "TOTAL_COUNT": "str"\n}', + "REORDER_HEADERS_LIST": '[\n "date",\n "number_of_strikes",\n "center_point_geom",\n "source_url",\n "etl_timestamp"\n]', + "RENAME_HEADERS_LIST": '{\n "zday": "day_int",\n "total_count": "number_of_strikes"\n}', + }, + resources={"request_ephemeral_storage": "16G", "limit_cpu": "3"}, + ) + delete_cluster = kubernetes_engine.GKEDeleteClusterOperator( + task_id="delete_cluster", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + name="noaa", + ) + + ( + create_cluster + >> [ + ghcnd_by_year, + ghcnd_countries, + ghcnd_inventory, + ghcnd_states, + ghcnd_stations, + gsod_stations, + ghcnd_hurricanes, + lightning_strikes_by_year, + ] + >> delete_cluster + ) diff --git a/datasets/noaa/pipelines/noaa/pipeline.yaml b/datasets/noaa/pipelines/noaa/pipeline.yaml new file mode 100644 index 000000000..402496d31 --- /dev/null +++ b/datasets/noaa/pipelines/noaa/pipeline.yaml @@ -0,0 +1,957 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +resources: + - type: bigquery_table + table_id: "ghcnd_by_year" + description: "noaaspc" + - type: bigquery_table + table_id: "ghcnd_countries" + description: "noaaspc" + - type: bigquery_table + table_id: "ghcnd_inventory" + description: "noaaspc" + - type: bigquery_table + table_id: "ghcnd_states" + description: "noaaspc" + - type: bigquery_table + table_id: "ghcnd_stations" + description: "noaaspc" + - type: bigquery_table + table_id: "gsod_stations" + description: "noaaspc" + - type: bigquery_table + table_id: "hurricanes" + description: "noaaspc" + - type: bigquery_table + table_id: "lightning_strikes_by_year" + description: "noaaspc" + +dag: + airflow_version: 2 + initialize: + dag_id: noaa + default_args: + owner: "Google" + depends_on_past: False + start_date: '2021-03-01' + max_active_runs: 1 + schedule_interval: "0 1 0 0 6" + catchup: False + default_view: graph + + tasks: + - operator: "GKECreateClusterOperator" + args: + task_id: "create_cluster" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + body: + name: noaa + initial_node_count: 2 + network: "{{ var.value.vpc_network }}" + node_config: + machine_type: e2-standard-16 + oauth_scopes: + - https://www.googleapis.com/auth/devstorage.read_write + - https://www.googleapis.com/auth/cloud-platform + - operator: "GKEStartPodOperator" + description: "Run NOAA load processes" + args: + task_id: "ghcnd_by_year" + name: "noaa.ghcnd_by_year" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: noaa + namespace: "default" + image_pull_policy: "Always" + image: "{{ var.json.noaa.container_registry.run_csv_transform_kub }}" + env_vars: + PIPELINE_NAME: "GHCND by year" + SOURCE_URL: "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/.csv.gz" + SOURCE_FILE: "files/data_ghcnd_by_year.csv" + TARGET_FILE: "files/data_output_ghcnd_by_year.csv" + CHUNKSIZE: "750000" + FTP_HOST: "ftp.ncdc.noaa.gov" + FTP_DIR: "pub/data/ghcn/daily/by_year" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "ghcn_d" + TABLE_ID: "ghcnd" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/noaa/ghcnd_by_year/data_output.csv" + SCHEMA_PATH: "data/noaa/schema/ghcnd_by_year_schema.json" + DROP_DEST_TABLE: "N" + INPUT_FIELD_DELIMITER: "," + FTP_BATCH_SIZE: "10" + FTP_BATCH_SLEEP_TIME: "60" + FULL_DATA_LOAD: "N" + START_YEAR: "1763" + REMOVE_SOURCE_FILE: "Y" + DELETE_TARGET_FILE: "Y" + INPUT_CSV_HEADERS: >- + [ + "id", + "date", + "element", + "value", + "mflag", + "qflag", + "sflag", + "time" + ] + DATA_DTYPES: >- + { + "id": "str", + "date": "str", + "element": "str", + "value": "str", + "mflag": "str", + "qflag": "str", + "sflag": "str", + "time": "str" + } + REORDER_HEADERS_LIST: >- + [ + "id", + "date", + "element", + "value", + "mflag", + "qflag", + "sflag", + "time", + "source_url", + "etl_timestamp" + ] + NULL_ROWS_LIST: >- + [ + "id" + ] + DATE_FORMAT_LIST: >- + [ + "date" + ] + resources: + request_ephemeral_storage: "16G" + limit_cpu: "3" + - operator: "GKEStartPodOperator" + description: "Run NOAA load processes" + args: + task_id: "ghcnd_countries" + name: "noaa.ghcnd_countries" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: noaa + namespace: "default" + image_pull_policy: "Always" + image: "{{ var.json.noaa.container_registry.run_csv_transform_kub }}" + env_vars: + PIPELINE_NAME: "GHCND countries" + SOURCE_URL: "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-countries.txt" + SOURCE_FILE: 
"files/data_ghcnd_countries.csv" + TARGET_FILE: "files/data_output_ghcnd_countries.csv" + CHUNKSIZE: "750000" + FTP_HOST: "ftp.ncdc.noaa.gov" + FTP_DIR: "pub/data/ghcn/daily" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "ghcn_d" + TABLE_ID: "ghcnd_countries" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/noaa/ghcnd_countries/data_output.csv" + SCHEMA_PATH: "data/noaa/schema/ghcnd_countries_schema.json" + DROP_DEST_TABLE: "N" + INPUT_FIELD_DELIMITER: "|" + REMOVE_SOURCE_FILE: "N" + DELETE_TARGET_FILE: "N" + INPUT_CSV_HEADERS: >- + [ + "textdata" + ] + DATA_DTYPES: >- + { + "textdata": "str" + } + REORDER_HEADERS_LIST: >- + [ + "code", + "name", + "source_url", + "etl_timestamp" + ] + SLICE_COLUMN_LIST: >- + { + "code": ["textdata", "0", "2"], + "name": ["textdata", "3", ""] + } + resources: + request_ephemeral_storage: "4G" + limit_cpu: "3" + - operator: "GKEStartPodOperator" + description: "Run NOAA load processes" + args: + task_id: "ghcnd_inventory" + name: "noaa.ghcnd_inventory" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: noaa + namespace: "default" + image_pull_policy: "Always" + image: "{{ var.json.noaa.container_registry.run_csv_transform_kub }}" + env_vars: + PIPELINE_NAME: "GHCND inventory" + SOURCE_URL: "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-inventory.txt" + SOURCE_FILE: "files/data_ghcnd_inventory.csv" + TARGET_FILE: "files/data_output_ghcnd_inventory.csv" + CHUNKSIZE: "750000" + FTP_HOST: "ftp.ncdc.noaa.gov" + FTP_DIR: "pub/data/ghcn/daily" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "ghcn_d" + TABLE_ID: "ghcnd_inventory" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/noaa/ghcnd_inventory/data_output.csv" + SCHEMA_PATH: "data/noaa/schema/ghcnd_inventory_schema.json" + DROP_DEST_TABLE: "N" + INPUT_FIELD_DELIMITER: "|" + REMOVE_SOURCE_FILE: "Y" + DELETE_TARGET_FILE: "Y" + INPUT_CSV_HEADERS: >- + [ + "textdata" + ] + DATA_DTYPES: >- + { + "textdata": "str" + } + REORDER_HEADERS_LIST: >- + [ + "id", + "latitude", + "longitude", + "element", + "firstyear", + "lastyear", + "source_url", + "etl_timestamp" + ] + SLICE_COLUMN_LIST: >- + { + "id": ["textdata", "0", "11"], + "latitude": ["textdata", "12", "20"], + "longitude": ["textdata", "21", "30"], + "element": ["textdata", "31", "35"], + "firstyear": ["textdata", "36", "40"], + "lastyear": ["textdata", "41", "45"] + } + resources: + request_ephemeral_storage: "4G" + limit_cpu: "3" + - operator: "GKEStartPodOperator" + description: "Run NOAA load processes" + args: + task_id: "ghcnd_states" + name: "noaa.ghcnd_states" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: noaa + namespace: "default" + image_pull_policy: "Always" + image: "{{ var.json.noaa.container_registry.run_csv_transform_kub }}" + env_vars: + PIPELINE_NAME: "GHCND states" + SOURCE_URL: "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-states.txt" + SOURCE_FILE: "files/data_ghcnd_states.csv" + TARGET_FILE: "files/data_output_ghcnd_states.csv" + CHUNKSIZE: "750000" + FTP_HOST: "ftp.ncdc.noaa.gov" + FTP_DIR: "pub/data/ghcn/daily" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "ghcn_d" + TABLE_ID: "ghcnd_states" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/noaa/ghcnd_states/data_output.csv" + SCHEMA_PATH: "data/noaa/schema/ghcnd_states_schema.json" + DROP_DEST_TABLE: "N" + INPUT_FIELD_DELIMITER: "|" + REMOVE_SOURCE_FILE: "Y" + 
DELETE_TARGET_FILE: "Y" + INPUT_CSV_HEADERS: >- + [ + "textdata" + ] + DATA_DTYPES: >- + { + "textdata": "str" + } + REORDER_HEADERS_LIST: >- + [ + "code", + "name", + "source_url", + "etl_timestamp" + ] + SLICE_COLUMN_LIST: >- + { + "code": ["textdata", "0", "2"], + "name": ["textdata", "3", ""] + } + resources: + request_ephemeral_storage: "4G" + limit_cpu: "3" + - operator: "GKEStartPodOperator" + description: "Run NOAA load processes" + args: + task_id: "ghcnd_stations" + name: "noaa.ghcnd_stations" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: noaa + namespace: "default" + image_pull_policy: "Always" + image: "{{ var.json.noaa.container_registry.run_csv_transform_kub }}" + env_vars: + PIPELINE_NAME: "GHCND stations" + SOURCE_URL: "ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt" + SOURCE_FILE: "files/data_ghcnd_stations.csv" + TARGET_FILE: "files/data_output_ghcnd_stations.csv" + CHUNKSIZE: "750000" + FTP_HOST: "ftp.ncdc.noaa.gov" + FTP_DIR: "pub/data/ghcn/daily" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "ghcn_d" + TABLE_ID: "ghcnd_stations" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/noaa/ghcnd_stations/data_output.csv" + SCHEMA_PATH: "data/noaa/schema/ghcnd_stations_schema.json" + DROP_DEST_TABLE: "N" + INPUT_FIELD_DELIMITER: "|" + REMOVE_SOURCE_FILE: "Y" + DELETE_TARGET_FILE: "Y" + INPUT_CSV_HEADERS: >- + [ + "textdata" + ] + DATA_DTYPES: >- + { + "textdata": "str" + } + REORDER_HEADERS_LIST: >- + [ + "id", + "latitude", + "longitude", + "elevation", + "state", + "name", + "gsn_flag", + "hcn_cm_flag", + "wmoid", + "source_url", + "etl_timestamp" + ] + SLICE_COLUMN_LIST: >- + { + "id": ["textdata", "0", "11"], + "latitude": ["textdata", "12", "20"], + "longitude": ["textdata", "21", "30"], + "elevation": ["textdata", "31", "37"], + "state": ["textdata", "38", "40"], + "name": ["textdata", "41", "71"], + "gsn_flag": ["textdata", "72", "75"], + "hcn_cm_flag": ["textdata", "76", "79"], + "wmoid": ["textdata", "80", "85"] + } + resources: + request_ephemeral_storage: "4G" + limit_cpu: "3" + - operator: "GKEStartPodOperator" + description: "Run NOAA load processes" + args: + task_id: "gsod_stations" + name: "noaa.gsod_stations" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: noaa + namespace: "default" + image_pull_policy: "Always" + image: "{{ var.json.noaa.container_registry.run_csv_transform_kub }}" + env_vars: + PIPELINE_NAME: "GSOD stations" + SOURCE_URL: "ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.txt" + SOURCE_FILE: "files/data_gsod_stations.csv" + TARGET_FILE: "files/data_output_gsod_stations.csv" + CHUNKSIZE: "750000" + FTP_HOST: "ftp.ncdc.noaa.gov" + FTP_DIR: "pub/data/noaa" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "noaa" + TABLE_ID: "gsod_stations" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/noaa/gsod_stations/data_output.csv" + SCHEMA_PATH: "data/noaa/schema/gsod_stations_schema.json" + DROP_DEST_TABLE: "N" + INPUT_FIELD_DELIMITER: "|" + REMOVE_SOURCE_FILE: "Y" + DELETE_TARGET_FILE: "Y" + NUMBER_OF_HEADER_ROWS: "21" + REGEX_LIST: >- + { + "lat": ["^(-[0]+)(.*)", "-$2", "True"], + "lat": ["^(\\s+)$", "", "True"], + "lat": ["^(\\+\\d+\\.\\d+[0-9])\\s+", "$1", "True"], + "lat": ["^(-\\d+\\.\\d+[0-9])\\s+", "$1", "True"], + "lat": ["nan", "", "False"], + "lon": ["^(-[0]+)(.*)", "-$2", "True"], + "lon": ["^(\\s+)$", "", "True"], + "lon": ["^(\\+\\d+\\.\\d+[0-9])\\s+", "$1", 
"True"], + "lon": ["^(-\\d+\\.\\d+[0-9])\\s+", "$1", "True"], + "lon": ["nan", "", "False"], + "usaf": ["(\\d{1,})(\\s{1,})$", "$1", "True"], + "name": ["^\\s{1,}([a-zA-Z]\\D+)", "$1", "True"], + "name": ["^(\\D+[a-zA-Z])\\s{1,}$", "$1", "True"], + "name": ["^(\\s+)$", "", "True"], + "call": ["^(\\s+)$", "", "True"], + "call": ["^([a-zA-Z]+)\\s+", "$1", "True"], + "elev": ["^(\\s+)$", "", "True"], + "state": ["^(\\s+)$", "", "True"], + "country": ["^(\\s+)$", "", "True"] + } + INPUT_CSV_HEADERS: >- + [ + "textdata" + ] + DATA_DTYPES: >- + { + "textdata": "str" + } + REORDER_HEADERS_LIST: >- + [ + "usaf", + "wban", + "name", + "country", + "state", + "call", + "lat", + "lon", + "elev", + "begin", + "end", + "source_url", + "etl_timestamp" + ] + NULL_ROWS_LIST: >- + [ + "usaf" + ] + SLICE_COLUMN_LIST: >- + { + "usaf": ["textdata", "0", "6"], + "wban": ["textdata", "7", "12"], + "name": ["textdata", "13", "42"], + "country": ["textdata", "43", "45"], + "state": ["textdata", "48", "50"], + "call": ["textdata", "51", "56"], + "lat": ["textdata", "57", "64"], + "lon": ["textdata", "65", "74"], + "elev": ["textdata", "75", "81"], + "begin": ["textdata", "82", "90"], + "end": ["textdata", "91", "99"] + } + resources: + request_ephemeral_storage: "4G" + limit_cpu: "3" + - operator: "GKEStartPodOperator" + description: "Run NOAA load processes" + args: + task_id: "ghcnd_hurricanes" + name: "noaa.ghcnd_hurricanes" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: noaa + namespace: "default" + image_pull_policy: "Always" + image: "{{ var.json.noaa.container_registry.run_csv_transform_kub }}" + env_vars: + PIPELINE_NAME: "GHCND hurricanes" + SOURCE_URL: "https://www.ncei.noaa.gov/data/international-best-track-archive-for-climate-stewardship-ibtracs/v04r00/access/csv/ibtracs.ALL.list.v04r00.csv" + SOURCE_FILE: "files/data_ghcnd_hurricanes.csv" + TARGET_FILE: "files/data_output_ghcnd_hurricanes.csv" + CHUNKSIZE: "750000" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "noaa" + TABLE_ID: "hurricanes" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/noaa/ghcnd_hurricanes/data_output_ghcnd_hurricanes.csv" + SCHEMA_PATH: "data/noaa/schema/ghcnd_hurricanes_schema.json" + DROP_DEST_TABLE: "N" + INPUT_FIELD_DELIMITER: "," + NUMBER_OF_HEADER_ROWS: "2" + REMOVE_SOURCE_FILE: "Y" + DELETE_TARGET_FILE: "Y" + INPUT_CSV_HEADERS: >- + [ + "sid", + "season", + "number", + "basin", + "subbasin", + "name", + "iso_time", + "nature", + "lat", + "lon", + "wmo_wind", + "wmo_pres", + "wmo_agency", + "track_type", + "dist2land", + "landfall", + "iflag", + "usa_agency", + "usa_atcf_id", + "usa_lat", + "usa_lon", + "usa_record", + "usa_status", + "usa_wind", + "usa_pres", + "usa_sshs", + "usa_r34_ne", + "usa_r34_se", + "usa_r34_sw", + "usa_r34_nw", + "usa_r50_ne", + "usa_r50_se", + "usa_r50_sw", + "usa_r50_nw", + "usa_r64_ne", + "usa_r64_se", + "usa_r64_sw", + "usa_r64_nw", + "usa_poci", + "usa_roci", + "usa_rmw", + "usa_eye", + "tokyo_lat", + "tokyo_lon", + "tokyo_grade", + "tokyo_wind", + "tokyo_pres", + "tokyo_r50_dir", + "tokyo_r50_long", + "tokyo_r50_short", + "tokyo_r30_dir", + "tokyo_r30_long", + "tokyo_r30_short", + "tokyo_land", + "cma_lat", + "cma_lon", + "cma_cat", + "cma_wind", + "cma_pres", + "hko_lat", + "hko_lon", + "hko_cat", + "hko_wind", + "hko_pres", + "newdelhi_lat", + "newdelhi_lon", + "newdelhi_grade", + "newdelhi_wind", + "newdelhi_pres", + "newdelhi_ci", + "newdelhi_dp", + "newdelhi_poci", + "reunion_lat", + "reunion_lon", + 
"reunion_type", + "reunion_wind", + "reunion_pres", + "reunion_tnum", + "reunion_ci", + "reunion_rmw", + "reunion_r34_ne", + "reunion_r34_se", + "reunion_r34_sw", + "reunion_r34_nw", + "reunion_r50_ne", + "reunion_r50_se", + "reunion_r50_sw", + "reunion_r50_nw", + "reunion_r64_ne", + "reunion_r64_se", + "reunion_r64_sw", + "reunion_r64_nw", + "bom_lat", + "bom_lon", + "bom_type", + "bom_wind", + "bom_pres", + "bom_tnum", + "bom_ci", + "bom_rmw", + "bom_r34_ne", + "bom_r34_se", + "bom_r34_sw", + "bom_r34_nw", + "bom_r50_ne", + "bom_r50_se", + "bom_r50_sw", + "bom_r50_nw", + "bom_r64_ne", + "bom_r64_se", + "bom_r64_sw", + "bom_r64_nw", + "bom_roci", + "bom_poci", + "bom_eye", + "bom_pos_method", + "bom_pres_method", + "nadi_lat", + "nadi_lon", + "nadi_cat", + "nadi_wind", + "nadi_pres", + "wellington_lat", + "wellington_lon", + "wellington_wind", + "wellington_pres", + "ds824_lat", + "ds824_lon", + "ds824_stage", + "ds824_wind", + "ds824_pres", + "td9636_lat", + "td9636_lon", + "td9636_stage", + "td9636_wind", + "td9636_pres", + "td9635_lat", + "td9635_lon", + "td9635_wind", + "td9635_pres", + "td9635_roci", + "neumann_lat", + "neumann_lon", + "neumann_class", + "neumann_wind", + "neumann_pres", + "mlc_lat", + "mlc_lon", + "mlc_class", + "mlc_wind", + "mlc_pres", + "usa_gust", + "bom_gust", + "bom_gust_per", + "reunion_gust", + "reunion_gust_per", + "usa_seahgt", + "usa_searad_ne", + "usa_searad_se", + "usa_searad_sw", + "usa_searad_nw", + "storm_speed", + "storm_dir" + ] + REORDER_HEADERS_LIST: >- + [ + "sid", + "season", + "number", + "basin", + "subbasin", + "name", + "iso_time", + "nature", + "latitude", + "longitude", + "wmo_wind", + "wmo_pressure", + "wmo_agency", + "track_type", + "dist2land", + "landfall", + "iflag", + "usa_agency", + "usa_latitude", + "usa_longitude", + "usa_record", + "usa_status", + "usa_wind", + "usa_pressure", + "usa_sshs", + "usa_r34_ne", + "usa_r34_se", + "usa_r34_sw", + "usa_r34_nw", + "usa_r50_ne", + "usa_r50_se", + "usa_r50_sw", + "usa_r50_nw", + "usa_r64_ne", + "usa_r64_se", + "usa_r64_sw", + "usa_r64_nw", + "usa_poci", + "usa_roci", + "usa_rmw", + "usa_eye", + "tokyo_latitude", + "tokyo_longitude", + "tokyo_grade", + "tokyo_wind", + "tokyo_pressure", + "tokyo_r50_dir", + "tokyo_r50_longitude", + "tokyo_r50_short", + "tokyo_r30_dir", + "tokyo_r30_long", + "tokyo_r30_short", + "tokyo_land", + "cma_latitude", + "cma_longitude", + "cma_cat", + "cma_wind", + "cma_pressure", + "hko_latitude", + "hko_longitude", + "hko_cat", + "hko_wind", + "hko_pressure", + "newdelhi_latitude", + "newdelhi_longitude", + "newdelhi_grade", + "newdelhi_wind", + "newdelhi_pressure", + "newdelhi_ci", + "newdelhi_dp", + "newdelhi_poci", + "reunion_latitude", + "reunion_longitude", + "reunion_type", + "reunion_wind", + "reunion_pressure", + "reunion_tnum", + "reunion_ci", + "reunion_rmw", + "reunion_r34_ne", + "reunion_r34_se", + "reunion_r34_sw", + "reunion_r34_nw", + "reunion_r50_ne", + "reunion_r50_se", + "reunion_r50_sw", + "reunion_r50_nw", + "reunion_r64_ne", + "reunion_r64_se", + "reunion_r64_sw", + "reunion_r64_nw", + "bom_latitude", + "bom_longitude", + "bom_type", + "bom_wind", + "bom_pressure", + "bom_tnum", + "bom_ci", + "bom_rmw", + "bom_r34_ne", + "bom_r34_se", + "bom_r34_sw", + "bom_r34_nw", + "bom_r50_ne", + "bom_r50_se", + "bom_r50_sw", + "bom_r50_nw", + "bom_r64_ne", + "bom_r64_se", + "bom_r64_sw", + "bom_r64_nw", + "bom_roci", + "bom_poci", + "bom_eye", + "bom_pos_method", + "bom_pressure_method", + "wellington_latitude", + "wellington_longitude", + 
"wellington_wind", + "wellington_pressure", + "nadi_latitude", + "nadi_longitude", + "nadi_cat", + "nadi_wind", + "nadi_pressure", + "ds824_latitude", + "ds824_longitude", + "ds824_stage", + "ds824_wind", + "ds824_pressure", + "td9636_latitude", + "td9636_longitude", + "td9636_stage", + "td9636_wind", + "td9636_pressure", + "td9635_latitude", + "td9635_longitude", + "td9635_wind", + "td9635_pressure", + "td9635_roci", + "neumann_latitude", + "neumann_longitude", + "neumann_class", + "neumann_wind", + "neumann_pressure", + "mlc_latitude", + "mlc_longitude", + "mlc_class", + "mlc_wind", + "mlc_pressure", + "usa_atcf_id", + "source_url", + "etl_timestamp" + ] + RENAME_HEADERS_LIST: >- + { + "lat": "latitude", + "lon": "longitude", + "wmo_pres": "wmo_pressure", + "usa_lat": "usa_latitude", + "usa_lon": "usa_longitude", + "usa_pres": "usa_pressure", + "tokyo_lat": "tokyo_latitude", + "tokyo_lon": "tokyo_longitude", + "tokyo_pres": "tokyo_pressure", + "tokyo_r50_long": "tokyo_r50_longitude", + "cma_lat": "cma_latitude", + "cma_lon": "cma_longitude", + "cma_pres": "cma_pressure", + "hko_lat": "hko_latitude", + "hko_lon": "hko_longitude", + "hko_pres": "hko_pressure", + "newdelhi_lat": "newdelhi_latitude", + "newdelhi_lon": "newdelhi_longitude", + "newdelhi_pres": "newdelhi_pressure", + "reunion_lat": "reunion_latitude", + "reunion_lon": "reunion_longitude", + "reunion_pres": "reunion_pressure", + "bom_lat": "bom_latitude", + "bom_lon": "bom_longitude", + "bom_pres": "bom_pressure", + "bom_pres_method": "bom_pressure_method", + "wellington_lat": "wellington_latitude", + "wellington_lon": "wellington_longitude", + "wellington_pres": "wellington_pressure", + "nadi_lat": "nadi_latitude", + "nadi_lon": "nadi_longitude", + "nadi_pres": "nadi_pressure", + "ds824_lat": "ds824_latitude", + "ds824_lon": "ds824_longitude", + "ds824_pres": "ds824_pressure", + "td9636_lat": "td9636_latitude", + "td9636_lon": "td9636_longitude", + "td9636_pres": "td9636_pressure", + "td9635_lat": "td9635_latitude", + "td9635_lon": "td9635_longitude", + "td9635_pres": "td9635_pressure", + "neumann_lat": "neumann_latitude", + "neumann_lon": "neumann_longitude", + "neumann_pres": "neumann_pressure", + "mlc_lat": "mlc_latitude", + "mlc_lon": "mlc_longitude", + "mlc_pres": "mlc_pressure" + } + resources: + request_ephemeral_storage: "16G" + limit_cpu: "3" + - operator: "GKEStartPodOperator" + description: "Run NOAA load processes" + args: + task_id: "lightning_strikes_by_year" + name: "noaa.lightning_strikes_by_year" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: noaa + namespace: "default" + image_pull_policy: "Always" + image: "{{ var.json.noaa.container_registry.run_csv_transform_kub }}" + env_vars: + PIPELINE_NAME: "NOAA lightning strikes by year" + SOURCE_URL: "https://www1.ncdc.noaa.gov/pub/data/swdi/database-csv/v2/nldn-tiles-*.csv.gz" + SOURCE_FILE: "files/data_lightning_strikes.csv" + TARGET_FILE: "files/data_output_lightning_strikes.csv" + CHUNKSIZE: "1000000" + PROJECT_ID: "{{ var.value.gcp_project }}" + DATASET_ID: "noaa" + TABLE_ID: "lightning_strikes" + TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}" + TARGET_GCS_PATH: "data/noaa/lightning_strikes/data_output.csv" + SCHEMA_PATH: "data/noaa/schema/noaa_lightning_strikes_schema.json" + DROP_DEST_TABLE: "N" + INPUT_FIELD_DELIMITER: "," + HTTP_BATCH_SIZE: "10" + HTTP_BATCH_SLEEP_TIME: "60" + FULL_DATA_LOAD: "Y" + REMOVE_SOURCE_FILE: "Y" + DELETE_TARGET_FILE: "Y" + START_YEAR: "1990" + NUMBER_OF_HEADER_ROWS: "3" + 
INT_DATE_LIST: >- + { + "date": "day_int" + } + GEN_LOCATION_LIST: >- + { + "center_point_geom": ["centerlon", "centerlat"] + } + INPUT_CSV_HEADERS: >- + [ + "ZDAY", + "CENTERLON", + "CENTERLAT", + "TOTAL_COUNT" + ] + DATA_DTYPES: >- + { + "ZDAY": "str", + "CENTERLON": "str", + "CENTERLAT": "str", + "TOTAL_COUNT": "str" + } + REORDER_HEADERS_LIST: >- + [ + "date", + "number_of_strikes", + "center_point_geom", + "source_url", + "etl_timestamp" + ] + RENAME_HEADERS_LIST: >- + { + "zday": "day_int", + "total_count": "number_of_strikes" + } + resources: + request_ephemeral_storage: "16G" + limit_cpu: "3" + - operator: "GKEDeleteClusterOperator" + args: + task_id: "delete_cluster" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + name: noaa + + graph_paths: + - "create_cluster >> [ghcnd_by_year, ghcnd_countries, ghcnd_inventory, ghcnd_states, ghcnd_stations, gsod_stations, ghcnd_hurricanes, lightning_strikes_by_year] >> delete_cluster"
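
Reference note (not part of the patch): several tasks above configure fixed-width parsing of a single "textdata" input column via SLICE_COLUMN_LIST, where each destination column is cut out by character offsets. The snippet below is a minimal, self-contained sketch of one plausible way such a mapping could be applied with pandas. The apply_slices helper, and the assumption that an empty end offset means "slice to end of line", are illustrative only; they are not taken from the csv_transform.py added by this change.

    # Illustrative sketch only. Offsets below mirror the ghcnd_states task
    # (code = chars 0-2, name = chars 3 to end of line); the helper itself
    # is hypothetical, not the pipeline's actual implementation.
    import json

    import pandas as pd

    slice_column_list = json.loads(
        '{ "code": ["textdata", "0", "2"], "name": ["textdata", "3", ""] }'
    )

    def apply_slices(df: pd.DataFrame, slices: dict) -> pd.DataFrame:
        """Cut fixed-width fields out of a source column by character offsets."""
        for dest_col, (src_col, start, end) in slices.items():
            start = int(start)
            # Assumption: an empty end offset means "take the rest of the line".
            stop = int(end) if end else None
            df[dest_col] = df[src_col].str.slice(start, stop).str.strip()
        return df

    df = pd.DataFrame({"textdata": ["AL Alabama", "AK Alaska"]})
    print(apply_slices(df, slice_column_list)[["code", "name"]])

Under these assumptions the two-character state code and the trailing name are split into separate columns, which is consistent with the REORDER_HEADERS_LIST ("code", "name", ...) used by the ghcnd_states and ghcnd_countries tasks.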