removed airflow services (#1043)
HazalCiplak authored Nov 14, 2024
1 parent 32c5307 commit e229cc4
Showing 6 changed files with 17 additions and 158 deletions.
29 changes: 8 additions & 21 deletions Makefile
@@ -58,54 +58,41 @@ dev-watch:
dev-test: dev-lint dev-unittest

build:
$(DOCKER_COMPOSE) build data-hub-dags
$(DOCKER_COMPOSE) build data-hub-pipelines

build-dev:
$(DOCKER_COMPOSE) build data-hub-dags-dev
$(DOCKER_COMPOSE) build data-hub-pipelines-dev

flake8:
$(DOCKER_COMPOSE) run --rm data-hub-dags-dev \
$(DOCKER_COMPOSE) run --rm data-hub-pipelines-dev \
python -m flake8 ejp_xml_pipeline tests

pylint:
$(DOCKER_COMPOSE) run --rm data-hub-dags-dev \
$(DOCKER_COMPOSE) run --rm data-hub-pipelines-dev \
python -m pylint ejp_xml_pipeline tests

mypy:
$(DOCKER_COMPOSE) run --rm data-hub-dags-dev \
$(DOCKER_COMPOSE) run --rm data-hub-pipelines-dev \
python -m mypy ejp_xml_pipeline tests

lint: flake8 pylint mypy

unittest:
$(DOCKER_COMPOSE) run --rm data-hub-dags-dev \
$(DOCKER_COMPOSE) run --rm data-hub-pipelines-dev \
python -m pytest -p no:cacheprovider $(ARGS) tests/unit_test

test: lint unittest

watch:
$(DOCKER_COMPOSE) run --rm data-hub-dags-dev \
$(DOCKER_COMPOSE) run --rm data-hub-pipelines-dev \
python -m pytest_watch -- -p no:cacheprovider $(ARGS) $(PYTEST_WATCH_MODULES)

airflow-start:
$(DOCKER_COMPOSE) up worker webserver

airflow-stop:
$(DOCKER_COMPOSE) down

test-exclude-e2e: build-dev
$(DOCKER_COMPOSE) run --rm data-hub-dags-dev ./run_test.sh
$(DOCKER_COMPOSE) run --rm data-hub-pipelines-dev ./run_test.sh

clean:
$(DOCKER_COMPOSE) down -v

airflow-db-migrate:
$(DOCKER_COMPOSE) run --rm webserver db migrate

airflow-initdb:
$(DOCKER_COMPOSE) run --rm webserver db init


data-hub-pipelines-run-ejp-xml-pipeline:
$(DOCKER_COMPOSE) run --rm data-hub-pipelines \
python -m ejp_xml_pipeline.cli
8 changes: 2 additions & 6 deletions README.md
@@ -1,4 +1,4 @@
# eJP XML Airflow Data Pipeline
# eJP XML Data Pipeline
This repository contains a generic data pipeline used to ETL eJP XML dumps stored in S3 buckets.
Being generic, it needs to be configured for its data source, data sink, and transformations.
The sample configuration for this data pipeline can be found in the `sample_data_config` directory of this project
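
As a purely hypothetical illustration (every key name below is invented for this sketch and is not the actual schema; see `sample_data_config/ci-ejp-xml-data-pipeline.config.yaml` in this repository for a real example), such a configuration might describe an S3 source and a downstream sink roughly like this:

```yaml
# Hypothetical sketch only: key names are illustrative, not the real schema.
ejpXml:
  - pipelineId: ejp-xml-example             # illustrative identifier
    source:
      s3Bucket: example-ejp-dump-bucket     # assumed: S3 bucket holding the eJP XML dumps
      objectKeyPattern: 'ejp_elife_*.zip'   # assumed key pattern
    sink:
      projectName: example-gcp-project      # assumed: project for the data sink
      datasetName: example_dataset
    transformations: []                     # placeholder
```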
@@ -10,10 +10,6 @@ Following are the credentials that you may need to provide
- GCP's service account json key (mandatory for all data pipelines)
- AWS credentials

To run the application locally:

make build-dev airflow-initdb airflow-start

To run the whole test on the application:

make build-dev end2end-test
@@ -42,4 +38,4 @@ To set up the development environment:
## CI/CD

This runs on Jenkins and follows the standard approaches used by the `eLife Data Team` for CI/CD.
Note that as part of the CI/CD, another Jenkins pipeline is always triggered whenever there is a commit to the develop branch. The latest commit reference to a `develop` branch is passed on as a parameter to this Jenkins pipeline to be triggered, and this is used to update the [repo-list.json file](https://github.com/elifesciences/data-hub-airflow-image/blob/develop/repo-list.json) in another repository
Note that, as part of CI/CD, another Jenkins pipeline is always triggered whenever there is a commit to the `develop` branch. The latest commit reference on the `develop` branch is passed as a parameter to the triggered Jenkins pipeline.
18 changes: 0 additions & 18 deletions docker-compose.ci.override.yml
@@ -1,24 +1,6 @@
version: '3.4'

services:
webserver:
ports:
- "8080:8080"
volumes:
- ./credentials.json:/tmp/credentials.json
- ~/.aws/credentials:/home/airflow/.aws/credentials

scheduler:
volumes:
- ./credentials.json:/tmp/credentials.json
- ~/.aws/credentials:/home/airflow/.aws/credentials

worker:
volumes:
- ./sample_data_config/ci-ejp-xml-data-pipeline.config.yaml:/home/airflow/app-config/ejp-xml/ejp-xml-data-pipeline.config.yaml
- ./credentials.json:/tmp/credentials.json
- ~/.aws/credentials:/home/airflow/.aws/credentials

test-client:
volumes:
- ./sample_data_config/ci-ejp-xml-data-pipeline.config.yaml:/home/airflow/app-config/ejp-xml/ejp-xml-data-pipeline.config.yaml
22 changes: 1 addition & 21 deletions docker-compose.dev.override.yml
@@ -1,37 +1,17 @@
version: '3.4'

services:
data-hub-dags-dev:
data-hub-pipelines-dev:
volumes:
- ./tests:/opt/airflow/tests
- ./ejp_xml_pipeline:/opt/airflow/ejp_xml_pipeline

webserver:
ports:
- "8080:8080"
volumes:
- ~/.config/gcloud/application_default_credentials.json:/tmp/credentials.json
- ~/.aws/credentials:/home/airflow/.aws/credentials

scheduler:
volumes:
- ~/.config/gcloud/application_default_credentials.json:/tmp/credentials.json
- ~/.aws/credentials:/home/airflow/.aws/credentials

worker:
volumes:
- ~/.config/gcloud/application_default_credentials.json:/tmp/credentials.json
- ~/.aws/credentials:/home/airflow/.aws/credentials
- ./sample_data_config/ci-ejp-xml-data-pipeline.config.yaml:/home/airflow/app-config/ejp-xml/ejp-xml-data-pipeline.config.yaml


test-client:
volumes:
- ~/.config/gcloud/application_default_credentials.json:/tmp/credentials.json
- ./sample_data_config/ci-ejp-xml-data-pipeline.config.yaml:/home/airflow/app-config/ejp-xml/ejp-xml-data-pipeline.config.yaml
- ~/.aws/credentials:/home/airflow/.aws/credentials


data-hub-pipelines:
volumes:
- ~/.config/gcloud/application_default_credentials.json:/tmp/credentials.json
92 changes: 6 additions & 86 deletions docker-compose.yml
@@ -1,38 +1,22 @@
version: '3.4'

x-airflow-env:
&airflow-env
x-pipeline-env:
&pipeline-env
- LOAD_EX=n
- AIRFLOW_HOST=webserver
- AIRFLOW_PORT=8080
- AIRFLOW__CORE__EXECUTOR=CeleryExecutor
- AIRFLOW__CELERY__BROKER_URL=redis://redis:6379/1
- AIRFLOW__CELERY__RESULT_BACKEND=db+postgresql://airflow:airflow@postgres:5432/airflow
- AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres:5432/airflow
- AIRFLOW__CORE__FERNET_KEY='81HqDtbqAywKSOumSha3BhWNOdQ26slT6K0YaZeZyPs='
- AIRFLOW__WEBSERVER__SECRET_KEY='WmZHRmJwd1dCUEp6Xl4zVA=='
- AIRFLOW__SMTP__SMTP_HOST=smtp-server
- AIRFLOW__SMTP__SMTP_STARTTLS=False
- AIRFLOW__SMTP__SMTP_SSL=False
- AIRFLOW__SMTP__SMTP_PORT=25
- [email protected]
- AIRFLOW__API__ENABLE_EXPERIMENTAL_API=True
- AIRFLOW__API__AUTH_BACKENDS=airflow.api.auth.backend.default
- GOOGLE_APPLICATION_CREDENTIALS=/home/airflow/.config/gcloud/credentials.json
- DEPLOYMENT_ENV=ci
- EJP_XML_CONFIG_FILE_PATH=/home/airflow/app-config/ejp-xml/ejp-xml-data-pipeline.config.yaml
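
The `x-` prefix above marks a Compose extension field that the engine itself ignores; `&pipeline-env` attaches a YAML anchor to its value, and the services further down reuse that same environment list through the alias `*pipeline-env`. A minimal sketch of the mechanism (the variable and service names here are placeholders, not part of this repository):

```yaml
# Minimal anchor/alias sketch; FOO/BAR and example-service are placeholders.
x-pipeline-env: &pipeline-env
  - FOO=1
  - BAR=2

services:
  example-service:
    environment: *pipeline-env   # resolves to the same two-item list defined above
```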


services:
data-hub-dags:
environment:
GOOGLE_APPLICATION_CREDENTIALS: /tmp/credentials.json
data-hub-pipelines:
environment: *pipeline-env
build:
context: .
image: ${IMAGE_REPO}:${IMAGE_TAG}
command: ''

data-hub-dags-dev:
data-hub-pipelines-dev:
build:
context: .
dockerfile: Dockerfile
@@ -42,74 +26,10 @@ services:
command: /bin/sh -c exit 0
entrypoint: []

data-hub-pipelines:
image: ${IMAGE_REPO}:${IMAGE_TAG}
environment: *airflow-env

webserver:
depends_on:
- worker
environment: *airflow-env
volumes:
- ./config/webserver_config.py:/opt/airflow/webserver_config.py
image: ${IMAGE_REPO}-dev:${IMAGE_TAG}
entrypoint: /entrypoint
command: webserver

smtp-server:
restart: always
image: namshi/smtp@sha256:aa63b8de68ce63dfcf848c56f3c1a16d81354f4accd4242a0086c57dd5a91d77

scheduler:
image: ${IMAGE_REPO}-dev:${IMAGE_TAG}
depends_on:
- postgres
environment: *airflow-env
entrypoint: /entrypoint
command: scheduler

worker:
environment: *airflow-env
depends_on:
- smtp-server
- redis
- scheduler
image: ${IMAGE_REPO}-dev:${IMAGE_TAG}
entrypoint: /entrypoint
hostname: worker
command: >
bash -c "sudo install -D /tmp/credentials.json -m 644 -t /home/airflow/.config/gcloud
&& airflow celery worker"
test-client:
image: ${IMAGE_REPO}-dev:${IMAGE_TAG}
environment: *airflow-env
environment: *pipeline-env
command: >
bash -c "sudo install -D /tmp/credentials.json -m 644 -t /home/airflow/.config/gcloud
&& ./run_test.sh with-end-to-end"
postgres:
image: postgres:15
environment:
- POSTGRES_USER=airflow
- POSTGRES_PASSWORD=airflow
- POSTGRES_DB=airflow
healthcheck:
test: ["CMD-SHELL", "pg_isready -U airflow"]
interval: 5s
timeout: 5s
retries: 5

redis:
image: redis:5.0.5
environment:
- ALLOW_EMPTY_PASSWORD=yes

# flower:
# image: ${IMAGE_REPO}-dev:${IMAGE_TAG}
# depends_on:
# - redis
# environment: *airflow-env
# ports:
# - "5555:5555"
# command: celery flower
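
After this change the compose file is reduced to the pipeline image, its dev variant, and the end-to-end test client. Roughly, as a sketch abbreviated from the diff above (collapsed hunks are elided with comments, so this is not the verbatim file):

```yaml
# Abbreviated sketch of the remaining services after removing the Airflow stack
# (webserver, scheduler, worker, postgres, redis, smtp-server and the commented-out flower).
version: '3.4'

x-pipeline-env: &pipeline-env
  - DEPLOYMENT_ENV=ci
  - EJP_XML_CONFIG_FILE_PATH=/home/airflow/app-config/ejp-xml/ejp-xml-data-pipeline.config.yaml

services:
  data-hub-pipelines:
    environment: *pipeline-env
    build:
      context: .
    image: ${IMAGE_REPO}:${IMAGE_TAG}
    command: ''

  data-hub-pipelines-dev:
    build:
      context: .
      dockerfile: Dockerfile
    # ... remaining keys elided (collapsed in the diff above) ...
    command: /bin/sh -c exit 0
    entrypoint: []

  test-client:
    image: ${IMAGE_REPO}-dev:${IMAGE_TAG}
    environment: *pipeline-env
    command: >
      bash -c "sudo install -D /tmp/credentials.json -m 644 -t /home/airflow/.config/gcloud
      && ./run_test.sh with-end-to-end"
```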
6 changes: 0 additions & 6 deletions run_test.sh
@@ -2,12 +2,6 @@

set -e

: "${AIRFLOW__CORE__FERNET_KEY:=${FERNET_KEY:=$(python -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)")}}"
export AIRFLOW__CORE__FERNET_KEY

# to initialize SQLite DB for running non-e2e test and Postgres DB for running e2e test
# airflow initdb

# avoid issues with .pyc/pyo files when mounting source directory
export PYTHONOPTIMIZE=
