diff --git a/datasets/fashion_mnist/infra/fashion_mnist_dataset.tf b/datasets/fashion_mnist/infra/fashion_mnist_dataset.tf
new file mode 100644
index 000000000..a0fbe913c
--- /dev/null
+++ b/datasets/fashion_mnist/infra/fashion_mnist_dataset.tf
@@ -0,0 +1,32 @@
+/**
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+resource "google_storage_bucket" "fashion-mnist" { # Storage bucket declared for the fashion_mnist dataset.
+  name                        = "${var.bucket_name_prefix}-fashion-mnist" # Caller-supplied prefix followed by "-fashion-mnist".
+  force_destroy               = true # Provider option: lets the bucket be destroyed while it still contains objects.
+  location                    = "US"
+  uniform_bucket_level_access = true
+  lifecycle {
+    ignore_changes = [ # Differences in the attributes listed here do not trigger an update.
+      logging,
+    ]
+  }
+}
+
+output "storage_bucket-fashion-mnist-name" { # Exposes the created bucket's name as a Terraform output.
+  value = google_storage_bucket.fashion-mnist.name
+}
diff --git a/datasets/fashion_mnist/infra/provider.tf b/datasets/fashion_mnist/infra/provider.tf
new file mode 100644
index 000000000..23ab87dcd
--- /dev/null
+++ b/datasets/fashion_mnist/infra/provider.tf
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+provider "google" { # Google provider; every setting below comes from an input variable.
+  project                     = var.project_id
+  impersonate_service_account = var.impersonating_acct # Service account the provider impersonates.
+  region                      = var.region
+}
+
+data "google_client_openid_userinfo" "me" {} # Userinfo lookup for the credentials the provider is using.
+
+output "impersonating-account" { # Surfaces the email address returned by the userinfo lookup.
+  value = data.google_client_openid_userinfo.me.email
+}
diff --git a/datasets/fashion_mnist/infra/variables.tf b/datasets/fashion_mnist/infra/variables.tf
new file mode 100644
index 000000000..53f483735
--- /dev/null
+++ b/datasets/fashion_mnist/infra/variables.tf
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2021 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+variable "project_id" {} # Required: project handed to the google provider.
+variable "bucket_name_prefix" {} # Required: prefix of the bucket name built in fashion_mnist_dataset.tf.
+variable "impersonating_acct" {} # Required: service account the google provider impersonates.
+variable "region" {} # Required: region handed to the google provider.
+variable "env" {} # Required (no default); not referenced by the .tf files in this patch.
+variable "iam_policies" { # Optional; not referenced by the .tf files in this patch.
+  default = {}
+}
+
diff --git a/datasets/fashion_mnist/pipelines/dataset.yaml b/datasets/fashion_mnist/pipelines/dataset.yaml
new file mode 100644
index 000000000..259fb7944
--- /dev/null
+++ b/datasets/fashion_mnist/pipelines/dataset.yaml
@@ -0,0 +1,26 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+dataset:  # Dataset-level metadata.
+  name: fashion_mnist  # Same identifier as the datasets/fashion_mnist directory.
+  friendly_name: fashion_mnist
+  description: ~  # "~" is YAML null: no description has been provided.
+  dataset_sources: ~  # Null: not provided.
+  terms_of_use: ~  # Null: not provided.
+
+resources:  # Infrastructure declared for this dataset.
+  - type: storage_bucket  # Same name, access setting and location as the bucket in infra/fashion_mnist_dataset.tf.
+    name: fashion-mnist
+    uniform_bucket_level_access: True
+    location: US
diff --git a/datasets/fashion_mnist/pipelines/fashion_mnist/fashion_mnist_dag.py b/datasets/fashion_mnist/pipelines/fashion_mnist/fashion_mnist_dag.py
new file mode 100644
index 000000000..26bb05347
--- /dev/null
+++ b/datasets/fashion_mnist/pipelines/fashion_mnist/fashion_mnist_dag.py
@@ -0,0 +1,48 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from airflow import DAG
+from airflow.operators import bash
+
+default_args = {  # Defaults handed to the DAG below.
+    "owner": "Google",
+    "depends_on_past": False,
+    "start_date": "2022-06-10",
+}
+
+
+with DAG(
+    dag_id="fashion_mnist.fashion_mnist",
+    default_args=default_args,
+    max_active_runs=1,
+    schedule_interval="@weekly",
+    catchup=False,
+    default_view="graph",
+) as dag:
+
+    # Task to copy the Fashion-MNIST `.gz` archives to GCS; `set -e` + `curl --fail` make any failed download fail the task
+    download_fashion_mnist_zip_files = bash.BashOperator(
+        task_id="download_fashion_mnist_zip_files",  # Same id as the task declared in pipeline.yaml.
+        bash_command="set -e\nmkdir -p $data_dir/fashion-mnist\ncurl --fail -o $data_dir/fashion-mnist/t10k-images-idx3-ubyte.gz -L $fashion_mnist_test\ncurl --fail -o $data_dir/fashion-mnist/train-images-idx3-ubyte.gz -L $fashion_mnist_train\ncurl --fail -o $data_dir/fashion-mnist/train-labels-idx1-ubyte.gz -L $fashion_mnist_train_labels\ncurl --fail -o $data_dir/fashion-mnist/t10k-labels-idx1-ubyte.gz -L $fashion_mnist_test_labels\n",
+        env={  # Variables referenced by bash_command.
+            "data_dir": "/home/airflow/gcs/data/fashion-mnist",  # NOTE(review): bash_command appends another "fashion-mnist" to this path - confirm the nesting is intended.
+            "fashion_mnist_test": "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz",
+            "fashion_mnist_train": "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz",
+            "fashion_mnist_train_labels": "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz",
+            "fashion_mnist_test_labels": "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz",
+        },
+    )
+
+    download_fashion_mnist_zip_files
diff --git a/datasets/fashion_mnist/pipelines/fashion_mnist/pipeline.yaml b/datasets/fashion_mnist/pipelines/fashion_mnist/pipeline.yaml
new file mode 100644
index 000000000..a0057b5b4
--- /dev/null
+++ b/datasets/fashion_mnist/pipelines/fashion_mnist/pipeline.yaml
@@ -0,0 +1,50 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+resources: ~  # Null: no pipeline-level resources declared.
+dag:
+  airflow_version: 2
+  initialize:
+    dag_id: fashion_mnist  # Appears as "fashion_mnist.fashion_mnist" in fashion_mnist_dag.py.
+    default_args:
+      owner: "Google"
+      depends_on_past: False
+      start_date: '2022-06-10'
+    max_active_runs: 1
+    schedule_interval: "@weekly"
+    catchup: False
+    default_view: graph
+
+  tasks:
+    - operator: BashOperator
+      description: "Task to copy `fashion-mnist.gz` from FASHION MNIST Database to GCS"
+      args:
+        task_id: "download_fashion_mnist_zip_files"
+        bash_command: |  # set -e + curl --fail: stop at the first failed download instead of reporting success.
+          set -e
+          mkdir -p $data_dir/fashion-mnist
+          curl --fail -o $data_dir/fashion-mnist/t10k-images-idx3-ubyte.gz -L $fashion_mnist_test
+          curl --fail -o $data_dir/fashion-mnist/train-images-idx3-ubyte.gz -L $fashion_mnist_train
+          curl --fail -o $data_dir/fashion-mnist/train-labels-idx1-ubyte.gz -L $fashion_mnist_train_labels
+          curl --fail -o $data_dir/fashion-mnist/t10k-labels-idx1-ubyte.gz -L $fashion_mnist_test_labels
+        env:  # Variables referenced by bash_command.
+          data_dir: /home/airflow/gcs/data/fashion-mnist  # NOTE(review): bash_command appends another "fashion-mnist" to this path - confirm the nesting is intended.
+          fashion_mnist_test: http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
+          fashion_mnist_train: http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
+          fashion_mnist_train_labels: http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
+          fashion_mnist_test_labels: http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
+
+  graph_paths:  # Single task, so there are no dependencies to chain.
+    - "download_fashion_mnist_zip_files"