diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 1ba1661e..00000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,165 +0,0 @@ -# References -# https://docs.docker.com/build/ci/github-actions/ -# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images - -name: build - -on: - push: - branches: - - "main" - pull_request: - branches: - - "main" - -jobs: - build: - runs-on: ubuntu-latest - - strategy: - matrix: - service: ["api", "datawarehouse", "pipeline"] - - env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }}-${{ matrix.service }} - - permissions: - contents: read - packages: write - - defaults: - run: - working-directory: ${{ matrix.service }} - - steps: - - uses: actions/checkout@v4 - - - name: Log in to the Container registry - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - tags: | - type=ref,event=branch - type=ref,event=tag - type=ref,event=pr - type=sha,format=long,prefix= - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Build and export to Docker - uses: docker/build-push-action@v5 - with: - context: ./${{ matrix.service }} - load: true - tags: ${{ steps.meta.outputs.tags }} - cache-from: type=gha - cache-to: type=gha,mode=max - labels: ${{ steps.meta.outputs.labels }} - - - name: Run tests - if: matrix.service == 'api' - env: - API_ENV: test - run: | - docker compose run --entrypoint pytest api -p no:cacheprovider -vv - - - name: Run tests - if: matrix.service == 'pipeline' - run: | - echo #TODO - - - name: Push image to GitHub registry - uses: docker/build-push-action@v5 - with: - context: ./${{ matrix.service }} - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - - deploy: - needs: build - runs-on: ubuntu-20.04 - environment: staging - - defaults: - run: - working-directory: deployment - - container: - image: hashicorp/terraform:1.6.1 - env: - TF_IN_AUTOMATION: true - - AWS_ACCESS_KEY_ID: ${{ vars.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - - # `TF_VAR_*` are case sensitive and must match the case of variables - TF_VAR_scaleway_application_id: ${{ vars.SCALEWAY_APPLICATION_ID }} - TF_VAR_datawarehouse_admin_password: ${{ secrets.DATAWAREHOUSE_ADMIN_PASSWORD }} - TF_VAR_datawarehouse_admin_username: ${{ vars.DATAWAREHOUSE_ADMIN_USERNAME }} - TF_VAR_datawarehouse_di_database: ${{ vars.DATAWAREHOUSE_DI_DATABASE }} - TF_VAR_datawarehouse_di_password: ${{ secrets.DATAWAREHOUSE_DI_PASSWORD }} - TF_VAR_datawarehouse_di_username: ${{ vars.DATAWAREHOUSE_DI_USERNAME }} - TF_VAR_scaleway_access_key: ${{ vars.SCALEWAY_ACCESS_KEY }} - TF_VAR_scaleway_project_id: ${{ vars.SCALEWAY_PROJECT_ID }} - TF_VAR_scaleway_secret_key: ${{ secrets.SCALEWAY_SECRET_KEY }} - TF_VAR_environment: ${{ vars.ENVIRONMENT }} - TF_VAR_api_scw_application_id: ${{ vars.API_SCW_APPLICATION_ID }} - TF_VAR_airflow_application_id: ${{ vars.AIRFLOW_APPLICATION_ID }} - TF_VAR_airflow_access_key: ${{ vars.AIRFLOW_ACCESS_KEY }} - TF_VAR_airflow_secret_key: ${{ secrets.AIRFLOW_SECRET_KEY }} - TF_VAR_airflow_admin_password: ${{ secrets.AIRFLOW_ADMIN_PASSWORD }} - TF_VAR_api_secret_key: ${{ secrets.API_SECRET_KEY }} - TF_VAR_stack_version: ${{ github.sha }} - 
TF_VAR_ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} - TF_VAR_dns_zone: ${{ vars.DNS_ZONE }} - TF_VAR_dns_subdomain: ${{ vars.DNS_SUBDOMAIN }} - TF_VAR_airflow__core__fernet_key: ${{ secrets.AIRFLOW__CORE__FERNET_KEY }} - TF_VAR_api_token_enabled: ${{ vars.API_TOKEN_ENABLED }} - TF_VAR_airflow_conn_s3_sources: ${{ secrets.AIRFLOW_CONN_S3_SOURCES }} - TF_VAR_brevo_api_key: ${{ secrets.BREVO_API_KEY }} - TF_VAR_datagouv_api_key: ${{ secrets.DATAGOUV_API_KEY }} - TF_VAR_dora_api_url: ${{ vars.DORA_API_URL }} - TF_VAR_dora_api_token: ${{ secrets.DORA_API_TOKEN }} - TF_VAR_ft_api_token: ${{ secrets.FT_API_TOKEN }} - TF_VAR_dora_preprod_api_token: ${{ secrets.DORA_PREPROD_API_TOKEN }} - TF_VAR_emplois_api_token: ${{ secrets.EMPLOIS_API_TOKEN }} - TF_VAR_grist_api_token: ${{ secrets.GRIST_API_TOKEN }} - TF_VAR_mes_aides_airtable_key: ${{ secrets.MES_AIDES_AIRTABLE_KEY }} - TF_VAR_soliguide_api_token: ${{ secrets.SOLIGUIDE_API_TOKEN }} - TF_VAR_metabase_secret_key: ${{ secrets.METABASE_SECRET_KEY }} - ENV: ${{ vars.ENVIRONMENT }} - volumes: - .:/deployment - options: --workdir /deployment - - steps: - - uses: actions/checkout@v4 - - - name: tf init - run: | - terraform init \ - -backend-config "bucket=data-inclusion-tf-states" \ - -backend-config "key=${ENV}" - - - name: tf validate - run: | - terraform validate - - - name: tf plan - run: | - terraform plan - - - name: tf apply - run: | - terraform apply -auto-approve diff --git a/.github/workflows/build_deploy.yml b/.github/workflows/build_deploy.yml new file mode 100644 index 00000000..ac248440 --- /dev/null +++ b/.github/workflows/build_deploy.yml @@ -0,0 +1,149 @@ +# References +# https://docs.docker.com/build/ci/github-actions/ +# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images + +name: build_deploy + +on: + push: + branches: [main] + pull_request: + branches: [main] + # default types + ready_for_review + types: [opened, synchronize, reopened, ready_for_review] + +jobs: + build: + runs-on: ubuntu-latest + + if: ${{ !(github.event_name == 'pull_request' && github.event.pull_request.draft) }} + + strategy: + matrix: + service: ["api", "datawarehouse", "pipeline"] + + env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }}-${{ matrix.service }} + + permissions: + contents: read + packages: write + + defaults: + run: + working-directory: ${{ matrix.service }} + + steps: + - uses: actions/checkout@v4 + + - name: Log in to the Container registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=ref,event=branch + type=ref,event=tag + type=ref,event=pr + type=sha,format=long,prefix= + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build and export to Docker + uses: docker/build-push-action@v5 + with: + context: ./${{ matrix.service }} + load: true + tags: ${{ steps.meta.outputs.tags }} + cache-from: type=gha + cache-to: type=gha,mode=max + labels: ${{ steps.meta.outputs.labels }} + + - name: Run tests + if: matrix.service == 'api' + env: + API_ENV: test + run: | + docker compose run --entrypoint pytest api -p no:cacheprovider -vv + + - name: Run tests + if: matrix.service == 'pipeline' + run: | + echo #TODO + + - name: Push image to GitHub registry + uses: docker/build-push-action@v5 + with: + context: 
./${{ matrix.service }} + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + + deploy: + needs: build + runs-on: ubuntu-latest + + strategy: + matrix: + environment: [staging, prod] + + # prevent deployment failure in an environment to interrupt other deployments + fail-fast: true + + environment: ${{ matrix.environment }} + + env: + ENV: ${{ vars.ENVIRONMENT }} + AWS_ACCESS_KEY_ID: ${{ vars.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + TF_VARS_BASE64: ${{ secrets.TF_VARS_BASE64 }} + TF_VAR_stack_version: ${{ github.sha }} + + defaults: + run: + working-directory: deployment + + steps: + - uses: actions/checkout@v4 + + - uses: hashicorp/setup-terraform@v3 + with: + terraform_version: "1.8.1" + + - name: mask tf variables + run: | + echo "${TF_VARS_BASE64}" \ + | base64 --decode \ + | jq 'to_entries | map(.value // empty) | .[]' \ + | xargs -I{} echo '::add-mask::{}' + + - name: tf init + run: | + terraform init \ + -backend-config "bucket=data-inclusion-tf-states" \ + -backend-config "key=${ENV}" + + - name: tf validate + run: | + terraform validate + + - name: tf plan + run: | + trap "rm -f terraform.tfvars.json" EXIT + echo "${TF_VARS_BASE64}" | base64 --decode > terraform.tfvars.json + terraform plan -input=false + + - name: tf apply + run: | + trap "rm -f terraform.tfvars.json" EXIT + echo "${TF_VARS_BASE64}" | base64 --decode > terraform.tfvars.json + terraform apply -input=false -auto-approve diff --git a/deployment/.tool-versions b/deployment/.tool-versions index daaf02fd..fad6fadc 100644 --- a/deployment/.tool-versions +++ b/deployment/.tool-versions @@ -1 +1 @@ -terraform 1.6.1 +terraform 1.8.1 diff --git a/deployment/README.md b/deployment/README.md index ef4c5129..6392d49e 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -1,5 +1,11 @@ # deployment +Only the pipeline is currently managed by this terraform configuration. + +The pipeline is deployed on scaleway, from compiled images, using terraform in a github workflow. + +The api and metabase are deployed on scalingo, from sources, using the scalingo github integration. + ## provisioning ### prerequisites @@ -8,27 +14,40 @@ #### for the state backend -* A scaleway project `terraform` -* A policy `terraform--manual--edit-tf-states` allowing edits to object storage -* An IAM application `terraform--manual--github-ci` with the `terraform--manual--edit-tf-states` policy assigned -* An API key for this application +The state backend is shared by environments. -The state backend is shared by environments. There should already be a project `terraform`. +| resource type | resource name | description | details | +|-----------------------|------------------------------------|-------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| scaleway project | `data-inclusion-terraform` | Common to all environments | | +| object storage bucket | `data-inclusion-tf-states` | Store terraform states for all environments | | +| IAM group | `data-inclusion-terraform-editors` | Contains all apps or users who want to deploy | A team member who wants to deploy using the `terraform` cli locally must add himself to this group | +| IAM policy | `data-inclusion-terraform-editors` | Assigned to the IAM group of the same name. 
Allow it to provision the resources required. | `IAMReadOnly`, `ProjectReadOnly` in the organization AND `InstancesFullAccess`, `ObjectStorageFullAccess`, `RelationalDatabasesFullAccess` and `DomainsDNSFullAccess` in the target project | +| IAM application | `data-inclusion-terraform-github` | Used by GH action to provision on SCW | Must be in the `data-inclusion-terraform-editors` group | | +| API key | - | Creds for `data-inclusion-terraform-github` | Set secret `AWS_SECRET_ACCESS_KEY` and variable `AWS_ACCESS_KEY_ID` in the target GH environment ([here](https://github.com/gip-inclusion/data-inclusion/settings/environments)) | #### to provision an environment -1. A scaleway project dedicated for that environment (`prod`, `staging`, etc.) -2. An IAM application `--manual--github-ci` that will be used to provision scaleway resources in the project, together with the API key (access key + secret key) for this application. This application needs a policy `--manual--edit-stack-data-resources` attached that gives it: - * `IAMReadOnly`, `ProjectReadOnly` in the organization; - * `InstancesFullAccess`, `ObjectStorageFullAccess`, `RelationalDatabasesFullAccess` and `DomainsDNSFullAccess` in the target project scope -3. Another IAM application `--manual--airflow` that will be used for object storage by Airflow, together with the API key (access key + secret key) for this application. This application needs a policy `--manual--airflow-object-storage-access` attached that gives it `ObjectStorageFullAccess` in the target project. -4. Another IAM application `--manual--api` used by the api to load the data·inclusion dataset regularly, together with the API key (access key + secret key) for this application. This application needs a policy `--manual--api-object-storage-access` attached that gives it `ObjectStorageReadOnly` in the target project. -5. A SSH key pair: - * generated with `ssh-keygen -t ed25519 -C -f /tmp/ -N ''`) - * the public key must have been uploaded to scaleway -6. A domain (`.data.inclusion.beta.gouv.fr`) registered on AlwaysData and declared on scaleway as an external domain in the environment project. +The following resources must be created manually **before** running `terraform apply`. + +Replace `` with the identifier for the target environment : `prod`, `staging`, etc. -💡 IAM applications are not created with terraform, because it would require the `--manual--github-ci` application to have full access to IAM management at the organization level. +| resource type | resource name | description | details | +|------------------|---------------------------------------|-----------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| scaleway project | `data-inclusion-` | | | +| IAM application | `data-inclusion--github` | Used by GH action to provision on SCW | | +| IAM policy | `data-inclusion--github` | Assigned to the IAM app of the same name. Allow it to provision the resources required. 
| `IAMReadOnly`, `ProjectReadOnly` in the organization AND `InstancesFullAccess`, `ObjectStorageFullAccess`, `RelationalDatabasesFullAccess` and `DomainsDNSFullAccess` in the target project | +| API key | - | Creds for `data-inclusion--github` | Set `scaleway_access_key` and `scaleway_secret_key` in the terraform config for that env stored in bitwarden | +| IAM application | `data-inclusion--airflow` | Used by airflow to read from & write to the datalake | +| IAM policy | `data-inclusion--airflow` | Assigned to the IAM app of the same name. Allow it to read from & write to object storage. | `ObjectStorageFullAccess` in the target project | +| API key | - | Creds for `data-inclusion--airflow` | Set `airflow_access_key` and `airflow_secret_key` in the terraform config for that env stored in bitwarden | +| IAM application | `data-inclusion--api` | Used by the api to read in the `data/marts` directory in the datalake | +| IAM policy | `data-inclusion--api` | Assigned to the IAM app of the same name. Allow it to read from object storage. | `ObjectStorageReadOnly` in the target project | +| API key | - | Creds for `data-inclusion--api` | Set `DATALAKE_SECRET_KEY` and `DATALAKE_ACCESS_KEY` in the scalingo app `data-inclusion-api-` | +| SSH key | - | Used by GH action to connect to the server and deploy docker services | Generated with `ssh-keygen -t ed25519 -C -f /tmp/ -N ''`. The public key must be uploaded to SCW. | +| External domain | `.data.inclusion.beta.gouv.fr` | Used to generate subdomains pointing to the server | The `data.inclusion.beta.gouv.fr` domain is registered on AlwaysData | + + +💡 IAM applications must be created manually, because it would require the `data-inclusion--github` application to have full access to IAM management at the organization level. #### links @@ -79,6 +98,22 @@ terraform plan terraform apply ``` +### prod and staging on github + +Each environment has a dedicated github environment. In each of these gh environment, a `TF_VARS_BASE64` secret contains a base64 encoded version of `terraform.tfvars.json`, with all the variables for a given environment. + +The files are stored on bitwarden as `.terraform.tfvars.json`. + +Use the following gh cli command to update this file on github: + +```bash +# using gh cli +# replace +base64 -w0 .terraform.tfvars.json | gh secret --repo gip-inclusion/data-inclusion --environment set TF_VARS_BASE64 +``` + +Make sure to update the config on bitwarden. + ### formatting `.tf` files A pre-commit hook is provided, make sure to `pre-commit install`! 
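The `prod and staging on github` section above relies on one base64-encoded secret per github environment. A minimal round-trip sketch, assuming an authenticated `gh` CLI, `staging` as the target environment and `staging.terraform.tfvars.json` as the local name of the bitwarden copy (both are placeholders, adjust to the environment at hand):

```bash
#!/usr/bin/env bash
set -euo pipefail

TFVARS_FILE="staging.terraform.tfvars.json"  # assumed local copy of the bitwarden file
TARGET_ENV="staging"                         # assumed github environment name

# Encode the whole JSON file and store it as the environment-scoped secret.
# `gh secret set` reads the secret value from stdin when no --body is given.
base64 -w0 "${TFVARS_FILE}" \
    | gh secret set TF_VARS_BASE64 \
        --repo gip-inclusion/data-inclusion \
        --env "${TARGET_ENV}"

# Sanity check: confirm the secret exists and list the variable names
# (never the values) that the deploy job will decode into terraform.tfvars.json.
gh secret list --repo gip-inclusion/data-inclusion --env "${TARGET_ENV}"
jq -r 'keys[]' "${TFVARS_FILE}"
```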
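The same plan as the workflow can be reproduced locally before pushing. A sketch under the following assumptions: a personal Scaleway API key belonging to the `data-inclusion-terraform-editors` group, and the tfvars file from bitwarden saved locally as `staging.terraform.tfvars.json` (hypothetical filename):

```bash
#!/usr/bin/env bash
set -euo pipefail

# S3-compatible credentials for the shared state backend (bucket data-inclusion-tf-states).
export AWS_ACCESS_KEY_ID="<scaleway access key>"
export AWS_SECRET_ACCESS_KEY="<scaleway secret key>"

ENV="staging"  # or prod

cd deployment

terraform init \
    -backend-config "bucket=data-inclusion-tf-states" \
    -backend-config "key=${ENV}"

terraform validate

# The CI job writes the decoded secret to terraform.tfvars.json (auto-loaded by terraform);
# locally, point terraform at the bitwarden copy instead.
terraform plan -input=false -var-file="${ENV}.terraform.tfvars.json"
```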
diff --git a/deployment/docker-compose.yml b/deployment/docker-compose.yml index f3a9b4ae..8da1fbe9 100644 --- a/deployment/docker-compose.yml +++ b/deployment/docker-compose.yml @@ -11,29 +11,35 @@ x-airflow-common: AIRFLOW__CORE__DEFAULT_TIMEZONE: Europe/Paris AIRFLOW__CORE__EXECUTOR: LocalExecutor AIRFLOW__CORE__FERNET_KEY: ${AIRFLOW__CORE__FERNET_KEY} - AIRFLOW__CORE__LOAD_EXAMPLES: 'false' AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS: 'false' + AIRFLOW__CORE__LOAD_EXAMPLES: 'false' + AIRFLOW__CORE__PARALLELISM: 4 AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql://airflow:airflow@airflow-db:5432/airflow - AIRFLOW__WEBSERVER__BASE_URL: https://${AIRFLOW_HOSTNAME} + AIRFLOW__SCHEDULER__MAX_TIS_PER_QUERY: 0 + AIRFLOW__SENTRY__SENTRY_DSN: ${AIRFLOW__SENTRY__SENTRY_DSN} + AIRFLOW__SENTRY__SENTRY_ON: 'true' + AIRFLOW__WEBSERVER__BASE_URL: ${AIRFLOW__WEBSERVER__BASE_URL} # Connections + AIRFLOW_CONN_MATTERMOST: ${AIRFLOW_CONN_MATTERMOST} + AIRFLOW_CONN_PG_API: ${AIRFLOW_CONN_PG_API} AIRFLOW_CONN_PG: ${AIRFLOW_CONN_PG} - AIRFLOW_CONN_S3: ${AIRFLOW_CONN_S3} AIRFLOW_CONN_S3_SOURCES: ${AIRFLOW_CONN_S3_SOURCES} + AIRFLOW_CONN_S3: ${AIRFLOW_CONN_S3} + AIRFLOW_CONN_SSH_API: ${AIRFLOW_CONN_SSH_API} # Variables AIRFLOW_VAR_BREVO_API_KEY: ${AIRFLOW_VAR_BREVO_API_KEY} AIRFLOW_VAR_DATAGOUV_API_KEY: ${AIRFLOW_VAR_DATAGOUV_API_KEY} AIRFLOW_VAR_DORA_API_TOKEN: ${AIRFLOW_VAR_DORA_API_TOKEN} - AIRFLOW_VAR_DORA_PREPROD_API_TOKEN: ${AIRFLOW_VAR_DORA_PREPROD_API_TOKEN} - AIRFLOW_VAR_FT_API_TOKEN: ${AIRFLOW_VAR_FT_API_TOKEN} + AIRFLOW_VAR_DORA_API_URL: ${AIRFLOW_VAR_DORA_API_URL} AIRFLOW_VAR_EMPLOIS_API_TOKEN: ${AIRFLOW_VAR_EMPLOIS_API_TOKEN} + AIRFLOW_VAR_FT_API_TOKEN: ${AIRFLOW_VAR_FT_API_TOKEN} AIRFLOW_VAR_GRIST_API_TOKEN: ${AIRFLOW_VAR_GRIST_API_TOKEN} AIRFLOW_VAR_MES_AIDES_AIRTABLE_KEY: ${AIRFLOW_VAR_MES_AIDES_AIRTABLE_KEY} + AIRFLOW_VAR_SIAO_FILE_URL: ${AIRFLOW_VAR_SIAO_FILE_URL} AIRFLOW_VAR_SOLIGUIDE_API_TOKEN: ${AIRFLOW_VAR_SOLIGUIDE_API_TOKEN} - AIRFLOW_VAR_DORA_API_URL: ${AIRFLOW_VAR_DORA_API_URL} - AIRFLOW_VAR_DBT_LOG_PATH: /opt/airflow/dbt-runtime/logs AIRFLOW_VAR_DBT_TARGET_PATH: /opt/airflow/dbt-runtime/target @@ -151,49 +157,10 @@ services: - /var/run/docker.sock:/var/run/docker.sock:ro - letsencrypt-data:/letsencrypt - metabase-db: - image: postgres:14 - restart: always - healthcheck: - test: [ "CMD", "pg_isready", "-U", "metabase" ] - interval: 5s - retries: 5 - environment: - - POSTGRES_DB=metabase - - POSTGRES_USER=metabase - - POSTGRES_PASSWORD=metabase - volumes: - - metabase-data:/var/lib/postgresql/data - - metabase: - image: metabase/metabase:v0.48.6 - restart: always - healthcheck: - test: curl --fail -I http://localhost:3000/api/health || exit 1 - interval: 15s - timeout: 5s - retries: 10 - ports: - - 3000:3000 - labels: - - "traefik.enable=true" - - "traefik.http.routers.metabase.rule=Host(`${METABASE_HOSTNAME}`)" - - "traefik.http.routers.metabase.entrypoints=websecure" - - "traefik.http.routers.metabase.tls.certresolver=main" - environment: - - JAVA_TIMEZONE=Europe/Paris - - MB_DB_TYPE=postgres - - MB_DB_CONNECTION_URI=postgresql://metabase:metabase@metabase-db:5432/metabase - - MB_ENCRYPTION_SECRET_KEY=${METABASE_SECRET_KEY} - - MB_ANON_TRACKING_ENABLED=false - - MB_SITE_URL=${METABASE_HOSTNAME} - - MB_REDIRECT_ALL_REQUESTS_TO_HTTPS=true - volumes: airflow-logs: datawarehouse-data: letsencrypt-data: - metabase-data: networks: default: diff --git a/deployment/main.tf b/deployment/main.tf index bd4705c2..5cc300f3 100644 --- a/deployment/main.tf +++ b/deployment/main.tf @@ -19,20 +19,17 @@ 
resource "scaleway_instance_security_group" "main" { action = "accept" port = 443 } - inbound_rule { - action = "accept" - port = 5432 - } } resource "scaleway_instance_server" "main" { - type = var.scaleway_instance_type + type = var.environment == "prod" ? "POP2-HM-2C-16G" : "GP1-XS" image = "docker" ip_id = scaleway_instance_ip.main.id routed_ip_enabled = true security_group_id = scaleway_instance_security_group.main.id root_volume { + size_in_gb = var.environment == "prod" ? 150 : 50 delete_on_termination = false } } @@ -41,6 +38,38 @@ resource "random_pet" "datalake_bucket_suffix" {} resource "scaleway_object_bucket" "main" { name = "data-inclusion-datalake-${var.environment}-${random_pet.datalake_bucket_suffix.id}" + + lifecycle_rule { + id = "archive-raw-data-after-30-days" + prefix = "data/raw" + enabled = true + + transition { + days = 30 + storage_class = "GLACIER" + } + } + + lifecycle_rule { + id = "archive-marts-data-after-7-days" + prefix = "data/marts" + enabled = true + + transition { + days = 7 + storage_class = "GLACIER" + } + } + + lifecycle_rule { + id = "expire-marts-data-after-30-days" + prefix = "data/marts" + enabled = true + + expiration { + days = 30 + } + } } data "scaleway_account_project" "main" { @@ -127,20 +156,24 @@ locals { base_hostname = "${var.dns_subdomain != "" ? "${var.dns_subdomain}." : ""}${var.dns_zone}" - airflow_hostname = "airflow.${local.base_hostname}" - metabase_hostname = "metabase.${local.base_hostname}" + airflow_hostname = "airflow.${local.base_hostname}" work_dir = "/root/data-inclusion" } resource "scaleway_domain_record" "dns" { - for_each = toset([local.airflow_hostname, local.metabase_hostname]) + for_each = toset( + [ + "", + local.airflow_hostname + ] + ) dns_zone = var.dns_zone name = replace(each.key, ".${var.dns_zone}", "") type = "A" data = scaleway_instance_server.main.public_ip - ttl = 60 + ttl = 3600 } resource "null_resource" "up" { @@ -164,34 +197,39 @@ resource "null_resource" "up" { provisioner "file" { content = sensitive(<<-EOT - METABASE_HOSTNAME=${local.metabase_hostname} - METABASE_SECRET_KEY=${var.metabase_secret_key} - - # common configuration - AIRFLOW__CORE__FERNET_KEY=${var.airflow__core__fernet_key} - AIRFLOW_CONN_PG=${local.airflow_conn_pg} - AIRFLOW_CONN_S3=${local.airflow_conn_s3} - AIRFLOW_HOSTNAME=${local.airflow_hostname} - AIRFLOW_WWW_USER_PASSWORD=${var.airflow_admin_password} - DATAWAREHOUSE_DI_DATABASE=${var.datawarehouse_di_database} - DATAWAREHOUSE_DI_PASSWORD=${var.datawarehouse_di_password} - DATAWAREHOUSE_DI_USERNAME=${var.datawarehouse_di_username} - STACK_VERSION=${var.stack_version} - - # pipeline secrets - AIRFLOW_CONN_S3_SOURCES=${var.airflow_conn_s3_sources} - AIRFLOW_VAR_BREVO_API_KEY=${var.brevo_api_key} - AIRFLOW_VAR_DATAGOUV_API_KEY=${var.datagouv_api_key} - AIRFLOW_VAR_DORA_API_TOKEN=${var.dora_api_token} - AIRFLOW_VAR_FT_API_TOKEN=${var.ft_api_token} - AIRFLOW_VAR_DORA_PREPROD_API_TOKEN=${var.dora_preprod_api_token} - AIRFLOW_VAR_EMPLOIS_API_TOKEN=${var.emplois_api_token} - AIRFLOW_VAR_GRIST_API_TOKEN=${var.grist_api_token} - AIRFLOW_VAR_MES_AIDES_AIRTABLE_KEY=${var.mes_aides_airtable_key} - AIRFLOW_VAR_SOLIGUIDE_API_TOKEN=${var.soliguide_api_token} - - # overrides - AIRFLOW_VAR_DORA_API_URL=${var.dora_api_url} + STACK_VERSION='${var.stack_version}' + AIRFLOW_HOSTNAME='${local.airflow_hostname}' + + # Datawarehouse + DATAWAREHOUSE_DI_DATABASE='${var.datawarehouse_di_database}' + DATAWAREHOUSE_DI_PASSWORD='${var.datawarehouse_di_password}' + 
DATAWAREHOUSE_DI_USERNAME='${var.datawarehouse_di_username}' + + # Airflow settings + AIRFLOW_WWW_USER_PASSWORD='${var.airflow_admin_password}' + AIRFLOW__CORE__FERNET_KEY='${var.airflow__core__fernet_key}' + AIRFLOW__SENTRY__SENTRY_DSN='${var.airflow__sentry__sentry_dsn}' + AIRFLOW__WEBSERVER__BASE_URL='https://${local.airflow_hostname}' + + # Airflow connections + AIRFLOW_CONN_MATTERMOST='${var.airflow_conn_mattermost}' + AIRFLOW_CONN_PG_API='${var.airflow_conn_pg_api}' + AIRFLOW_CONN_PG='${local.airflow_conn_pg}' + AIRFLOW_CONN_S3_SOURCES='${var.airflow_conn_s3_sources}' + AIRFLOW_CONN_S3='${local.airflow_conn_s3}' + AIRFLOW_CONN_SSH_API='${var.airflow_conn_ssh_api}' + + # Airflow variables + AIRFLOW_VAR_BREVO_API_KEY='${var.brevo_api_key}' + AIRFLOW_VAR_DATAGOUV_API_KEY='${var.datagouv_api_key}' + AIRFLOW_VAR_DORA_API_TOKEN='${var.dora_api_token}' + AIRFLOW_VAR_DORA_API_URL='${var.dora_api_url}' + AIRFLOW_VAR_EMPLOIS_API_TOKEN='${var.emplois_api_token}' + AIRFLOW_VAR_FT_API_TOKEN='${var.ft_api_token}' + AIRFLOW_VAR_GRIST_API_TOKEN='${var.grist_api_token}' + AIRFLOW_VAR_MES_AIDES_AIRTABLE_KEY='${var.mes_aides_airtable_key}' + AIRFLOW_VAR_SIAO_FILE_URL='${var.siao_file_url}' + AIRFLOW_VAR_SOLIGUIDE_API_TOKEN='${var.soliguide_api_token}' EOT ) destination = "${local.work_dir}/.env" diff --git a/deployment/template.terraform.tfvars.json b/deployment/template.terraform.tfvars.json index 86efa540..af849324 100644 --- a/deployment/template.terraform.tfvars.json +++ b/deployment/template.terraform.tfvars.json @@ -1,32 +1,34 @@ { "airflow__core__fernet_key": null, + "airflow__sentry__sentry_dsn": null, "airflow_access_key": null, "airflow_admin_password": null, "airflow_application_id": null, + "airflow_conn_mattermost": null, + "airflow_conn_pg_api": null, "airflow_conn_s3_sources": null, + "airflow_conn_ssh_api": null, "airflow_secret_key": null, "api_scw_application_id": null, + "brevo_api_key": null, "datagouv_api_key": null, - "datawarehouse_admin_password": null, - "datawarehouse_admin_username": null, "datawarehouse_di_database": null, "datawarehouse_di_password": null, "datawarehouse_di_username": null, "dns_subdomain": null, "dns_zone": null, "dora_api_token": null, - "dora_preprod_api_token": null, "dora_api_url": null, "emplois_api_token": null, "environment": null, "ft_api_token": null, "grist_api_token": null, "mes_aides_airtable_key": null, - "metabase_secret_key": null, "scaleway_access_key": null, "scaleway_application_id": null, "scaleway_project_id": null, "scaleway_secret_key": null, + "siao_file_url": null, "soliguide_api_token": null, "ssh_private_key": null, "stack_version": null diff --git a/deployment/variables.tf b/deployment/variables.tf index ea0f8582..76147040 100644 --- a/deployment/variables.tf +++ b/deployment/variables.tf @@ -55,17 +55,6 @@ variable "brevo_api_key" { default = "" } -variable "datawarehouse_admin_password" { - description = "Password for the first user of the postgres datawarehouse" - type = string - sensitive = true -} - -variable "datawarehouse_admin_username" { - description = "Identifier for the first user of the postgres datawarehouse" - type = string -} - variable "datawarehouse_di_database" { description = "Identifier for the data inclusion database" type = string @@ -147,12 +136,6 @@ variable "mes_aides_airtable_key" { default = "" } -variable "metabase_secret_key" { - description = "Secret key to save connection passwords in the db" - type = string - sensitive = true -} - variable "scaleway_access_key" { description = "Scaleway access key 
(https://console.scaleway.com/iam/api-keys)" type = string @@ -175,12 +158,6 @@ variable "scaleway_project_id" { type = string } -variable "scaleway_instance_type" { - description = "Scaleway instance type (ex. GP1-XS, see https://www.scaleway.com/en/pricing/?tags=compute)" - type = string - default = "GP1-XS" -} - variable "soliguide_api_token" { description = "Used in extraction tasks orchestrated by airflow" type = string @@ -198,3 +175,36 @@ variable "stack_version" { description = "Version (e.g. sha or semver) of the stack services to deploy" type = string } + +variable "airflow__sentry__sentry_dsn" { + description = "Sentry DSN for airflow monitoring" + type = string + default = "" +} + +variable "airflow_conn_pg_api" { + description = "Postgres URI similar to the api scalingo app SCALINGO_POSTGRESQL_URL, but with a dedicated read-only credentials" + type = string + sensitive = true + default = "" +} + +variable "airflow_conn_ssh_api" { + description = "SSH connection string used to open a tunnel to scalingo. The associated private_key must have been uploaded to scalingo" + type = string + sensitive = true + default = "" +} + +variable "airflow_conn_mattermost" { + description = "Mattermost webhook used by airflow to notifications" + type = string + sensitive = true + default = "" +} + +variable "siao_file_url" { + description = "Public URL to the siao export on our s3 bucket" + type = string + default = "" +} diff --git a/pipeline/Dockerfile b/pipeline/Dockerfile index 782fd82b..7fa232de 100644 --- a/pipeline/Dockerfile +++ b/pipeline/Dockerfile @@ -48,6 +48,9 @@ ENV PYTHONDONTWRITEBYTECODE 1 ENV AIRFLOW_VAR_DBT_PROJECT_DIR /opt/airflow/dbt +COPY requirements requirements +RUN pip install -r requirements/airflow/requirements.txt + USER root RUN apt-get update \ diff --git a/pipeline/Procfile b/pipeline/Procfile deleted file mode 100644 index 54b98555..00000000 --- a/pipeline/Procfile +++ /dev/null @@ -1,3 +0,0 @@ -web: ./entrypoint.sh webserver - -scheduler: ./entrypoint.sh scheduler diff --git a/pipeline/defaults.env b/pipeline/defaults.env index ae5439b3..ef5482af 100644 --- a/pipeline/defaults.env +++ b/pipeline/defaults.env @@ -15,12 +15,12 @@ AIRFLOW_VAR_DORA_PREPROD_API_URL=https://api.dora.incubateur.net/api/v2/ AIRFLOW_VAR_EMPLOIS_API_URL=https://emplois.inclusion.beta.gouv.fr/api/v1/structures/ AIRFLOW_VAR_ETAB_PUB_FILE_URL=https://www.data.gouv.fr/fr/datasets/r/73302880-e4df-4d4c-8676-1a61bb997f3d AIRFLOW_VAR_FINESS_FILE_URL=https://www.data.gouv.fr/fr/datasets/r/3dc9b1d5-0157-440d-a7b5-c894fcfdfd45 -AIRFLOW_VAR_IGN_ADMIN_EXPRESS_FILE_URL=http://files.opendatarchives.fr/professionnels.ign.fr/adminexpress/ADMIN-EXPRESS-COG_3-0__SHP__FRA_WM_2021-05-19.7z +AIRFLOW_VAR_IGN_ADMIN_EXPRESS_FILE_URL=https://files.opendatarchives.fr/professionnels.ign.fr/adminexpress/ADMIN-EXPRESS-COG_3-0__SHP__FRA_WM_2021-05-19.7z AIRFLOW_VAR_INSEE_FIRSTNAME_FILE_URL=https://www.insee.fr/fr/statistiques/fichier/2540004/nat2021_csv.zip AIRFLOW_VAR_INSEE_COG_DATASET_URL=https://www.insee.fr/fr/statistiques/fichier/6800675 AIRFLOW_VAR_MEDNUM_API_URL=https://cartographie.societenumerique.gouv.fr/api/v0/ -AIRFLOW_VAR_MES_AIDES_AIDES_URL=https://airtable.com/appoYjASNOp90Ryy5/tblN4m8Ayzxzgxl9W/viw7HRKMxq4hR2f27 -AIRFLOW_VAR_MES_AIDES_GARAGES_URL=https://airtable.com/appEvva5gyqqoQRnr/tblnGf4Y5EUEeVHtJ/viw9ZZAUkexq6uDaI +AIRFLOW_VAR_MES_AIDES_AIDES_URL=https://airtable.com/appRga7C9USklxYiV/tblcAC5yMV3Ftzv5c/viwMte3unsIYXxY9a 
+AIRFLOW_VAR_MES_AIDES_GARAGES_URL=https://airtable.com/appRga7C9USklxYiV/tblfhYoBpcQoJwGIv/viwoJsw0vsAnU0fAo AIRFLOW_VAR_MONENFANT_CRECHES_FILE_URL=https://data-inclusion-lake.s3.fr-par.scw.cloud/sources/monenfant/2023-06-14/creches.json AIRFLOW_VAR_ODSPEP_S3_KEY_PREFIX=sources/odspep/2023-01-23/denormalized/Exports/ AIRFLOW_VAR_RESEAU_ALPHA_URL=https://www.reseau-alpha.org diff --git a/pipeline/entrypoint.sh b/pipeline/entrypoint.sh deleted file mode 100755 index 20b76aa7..00000000 --- a/pipeline/entrypoint.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -# Exit immediately if a command exits with a non-zero status. -set -e - -# Trace execution -[[ "${DEBUG}" ]] && set -x - -if [[ $# -eq 0 ]]; then - echo "No service parameter provided."; - exit 1; -fi - -COMMAND=$1 - -# The `DATABASE_URL` env var is automatically set by Scalingo and uses the depreciated -# scheme `postgres://`. Therefore it is replaced. -export AIRFLOW__DATABASE__SQL_ALCHEMY_CONN="${DATABASE_URL/postgres\:\/\//postgresql\:\/\/}" - -export AIRFLOW_HOME="${HOME}/airflow" -export AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=False -export AIRFLOW__CORE__LOAD_EXAMPLES=False -export AIRFLOW__CORE__EXECUTOR=LocalExecutor -export AIRFLOW__CORE__DEFAULT_TIMEZONE=Europe/Paris -export AIRFLOW__CORE__FERNET_KEY="${SECRET_KEY}" -export AIRFLOW__CORE__DAGS_FOLDER="${HOME}/dags" - -export AIRFLOW__LOGGING__REMOTE_LOGGING=True -export AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER=s3://data-inclusion-lake/logs -export AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID=s3_logs -export AIRFLOW__LOGGING__DELETE_LOCAL_LOGS=True - -if [[ "${_AIRFLOW_DB_MIGRATE}" = "true" ]]; then - airflow db migrate -fi - -if [[ "${COMMAND}" = "webserver" ]]; then - airflow webserver --port "${PORT}" -fi - -if [[ "${COMMAND}" = "scheduler" ]]; then - # Create additional virtualenvs for isolated task executions - VIRTUAL_ENV="${AIRFLOW_HOME}/venvs/python/venv" - python -m venv "${VIRTUAL_ENV}" - "${VIRTUAL_ENV}/bin/python" -m pip install -U pip setuptools wheel - "${VIRTUAL_ENV}/bin/python" -m pip install -r requirements/tasks/python/requirements.txt - "${VIRTUAL_ENV}/bin/python" -m pip install . - - # Create additional virtualenvs for isolated task executions - VIRTUAL_ENV="${AIRFLOW_HOME}/venvs/pipx/venv" - python -m venv "${VIRTUAL_ENV}" - "${VIRTUAL_ENV}/bin/python" -m pip install -U pip setuptools wheel - "${VIRTUAL_ENV}/bin/python" -m pip install -r requirements/tasks/pipx/requirements.txt - - # Create additional virtualenvs for isolated task executions - VIRTUAL_ENV="${AIRFLOW_HOME}/venvs/dbt/venv" - python -m venv "${VIRTUAL_ENV}" - "${VIRTUAL_ENV}/bin/python" -m pip install -U pip setuptools wheel - "${VIRTUAL_ENV}/bin/python" -m pip install -r requirements/tasks/dbt/requirements.txt - - # Install dbt packages (not python packages) - "${VIRTUAL_ENV}/bin/dbt" deps --project-dir "${AIRFLOW_VAR_DBT_PROJECT_DIR}" - - airflow scheduler -fi diff --git a/pipeline/requirements.txt b/pipeline/requirements.txt deleted file mode 100644 index e39d31f8..00000000 --- a/pipeline/requirements.txt +++ /dev/null @@ -1 +0,0 @@ --r requirements/airflow/requirements.txt diff --git a/pipeline/runtime.txt b/pipeline/runtime.txt deleted file mode 100644 index 76b6e496..00000000 --- a/pipeline/runtime.txt +++ /dev/null @@ -1 +0,0 @@ -python-3.11.6
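With the Scalingo `entrypoint.sh` gone, every Airflow setting now reaches the scheduler through the `AIRFLOW_CONN_*` / `AIRFLOW_VAR_*` environment variables defined in `deployment/docker-compose.yml` and the `.env` rendered by `deployment/main.tf`. A quick sanity check against the running stack — a sketch assuming a recent Airflow CLI and that the scheduler service is named `airflow-scheduler` (the compose service names are not shown in this diff):

```bash
# On the instance, from the compose project directory written by terraform.
cd /root/data-inclusion

# Connections supplied purely through the environment (AIRFLOW_CONN_PG,
# AIRFLOW_CONN_S3, AIRFLOW_CONN_MATTERMOST, ...) never appear in the metadata
# database, but the CLI resolves them from the env. Output includes credentials,
# so run this on the server only.
docker compose exec airflow-scheduler airflow connections get pg
docker compose exec airflow-scheduler airflow connections get s3_sources

# AIRFLOW_VAR_* variables resolve the same way (non-secret example):
docker compose exec airflow-scheduler airflow variables get DORA_API_URL
```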
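The lifecycle rules added to `scaleway_object_bucket.main` in `deployment/main.tf` (archive `data/raw` after 30 days, archive then expire `data/marts`) can be checked through the S3-compatible API once applied. A sketch assuming the AWS CLI is configured with the datalake credentials, the `fr-par` region used elsewhere in the repository, and a hypothetical bucket name (the real one carries the random pet suffix generated by terraform):

```bash
BUCKET="data-inclusion-datalake-staging-<random-pet-suffix>"  # hypothetical name
ENDPOINT="https://s3.fr-par.scw.cloud"

# Show the lifecycle configuration actually attached to the bucket.
aws s3api get-bucket-lifecycle-configuration \
    --endpoint-url "${ENDPOINT}" \
    --bucket "${BUCKET}"

# Objects under data/raw/ should report StorageClass GLACIER once transitioned.
aws s3api list-objects-v2 \
    --endpoint-url "${ENDPOINT}" \
    --bucket "${BUCKET}" \
    --prefix data/raw/ \
    --query 'Contents[].{Key: Key, StorageClass: StorageClass}' \
    --max-items 5
```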