diff --git a/.travis.yml b/.travis.yml
index 2497be0b..ea890554 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -22,7 +22,7 @@ jobs:
   include:
     - stage: test
       install:
-        - pip install -e ".[dev]"
+        - pip install -e ".[test]"
       script:
         - python3 -m flake8
     - stage: test
@@ -39,6 +39,8 @@ jobs:
         - sleep 2
         - wget -O - http://localhost:5000/api/v1/health
     - stage: build
+      install:
+        - /bin/true
       script:
         - export IMAGE="kqueen/api:${TRAVIS_BRANCH/\//-}"
         - echo "Building image $IMAGE"
@@ -47,6 +49,8 @@ jobs:
         - docker push $IMAGE
         - docker logout
     - stage: test
+      install:
+        - /bin/true
       script:
         - docker-compose -f docker-compose.etcd-volume.yml up -d
         - docker-compose -f docker-compose.etcd-volume.yml exec etcd etcdctl mkdir /travis/
@@ -62,8 +66,10 @@ jobs:
           echo "$VALUE == $TRAVIS_COMMIT";
           [ "$VALUE" == "$TRAVIS_COMMIT" ] && exit 0 || exit 1
     - stage: publish
+      install:
+        - /bin/true
       script:
-        - "/bin/true"
+        - /bin/true
       deploy:
         provider: pypi
         user: tomkukral
diff --git a/README.rst b/README.rst
index 830d7601..65240f80 100644
--- a/README.rst
+++ b/README.rst
@@ -39,6 +39,14 @@ Development
     etcdctl rm --recursive /kqueen
     ./devenv.py
 
+- Run flask shell
+
+::
+
+    export FLASK_APP=kqueen.server
+    export prometheus_multiproc_dir=$(mktemp -d)
+    flask shell
+
 - Test access token with `curl`
 
 ::
diff --git a/docker-compose.demo.yml b/docker-compose.demo.yml
index 5bab2b44..b71f84a6 100644
--- a/docker-compose.demo.yml
+++ b/docker-compose.demo.yml
@@ -1,6 +1,6 @@
 version: '2'
 services:
-  kqueen:
+  api:
     image: kqueen/api:v0.11
     ports:
       - 127.0.0.1:5000:5000
@@ -17,24 +17,24 @@ services:
       BOOTSTRAP_ADMIN_NAMESPACE: demoorg
     extra_hosts:
       - "ci.mcp.mirantis.net:172.16.48.254"
-  kqueen_mail:
-    image: modularitycontainers/postfix
-    volumes:
-      - /var/spool/postfix:/var/spool/postfix
-      - /var/spool/mail:/var/spool/mail
-    environment:
-      MYHOSTNAME: 'kqueen_mail'
-  kqueen_ui:
+  ui:
     image: kqueen/ui:v0.1
     ports:
       - 127.0.0.1:5080:5080
     depends_on:
-      - kqueen
+      - api
     environment:
       KQUEEN_UI_CONFIG_FILE: config/prod.py
-      KQUEENUI_KQUEEN_API_URL: http://kqueen:5000/api/v1/
-      KQUEENUI_KQUEEN_AUTH_URL: http://kqueen:5000/api/v1/auth
+      KQUEENUI_KQUEEN_API_URL: http://api:5000/api/v1/
+      KQUEENUI_KQUEEN_AUTH_URL: http://api:5000/api/v1/auth
       KQUEENUI_KQUEEN_SERVICE_USER_USERNAME: admin
       KQUEENUI_KQUEEN_SERVICE_USER_PASSWORD: default
-      KQUEENUI_MAIL_SERVER: kqueen_mail
+      KQUEENUI_MAIL_SERVER: mail
       KQUEENUI_MAIL_PORT: 10025
+#  mail:
+#    image: modularitycontainers/postfix
+#    volumes:
+#      - /var/spool/postfix:/var/spool/postfix
+#      - /var/spool/mail:/var/spool/mail
+#    environment:
+#      MYHOSTNAME: 'mail'
diff --git a/docker-compose.production.yml b/docker-compose.production.yml
new file mode 100644
index 00000000..0e6a997c
--- /dev/null
+++ b/docker-compose.production.yml
@@ -0,0 +1,69 @@
+version: '2'
+services:
+  etcd:
+    restart: always
+    volumes:
+      - /mnt/storage/kqueen/etcd/:/0.etcd/  # host path:container path
+  api:
+    image: kqueen/api:v0.14
+    restart: always
+    environment:
+      KQUEEN_CONFIG_FILE: config/prod.py
+      KQUEEN_ETCD_HOST: etcd
+      KQUEEN_PROMETHEUS_WHITELIST: '172.16.238.0/24'
+      # TODO: set SECRET_KEY (random string, >= 16 chars)
+      KQUEEN_SECRET_KEY: ''
+      BOOTSTRAP_ADMIN: 1
+      BOOTSTRAP_ADMIN_USERNAME: admin
+      # TODO: set admin password
+      BOOTSTRAP_ADMIN_PASSWORD:
+      BOOTSTRAP_ADMIN_ORGANIZATION: DemoOrg
+      BOOTSTRAP_ADMIN_NAMESPACE: demoorg
+    volumes:
+      - ./kqueen/config/prod.py:/code/kqueen/config/prod.py
+  proxy:
+    build: ./prod/nginx/
+    restart: always
+    ports:
+      - "443:443"
+      - "80:80"
+    volumes:
+      - /mnt/storage/kqueen/certs/:/mnt/letsencrypt/demo.kqueen.net/:ro
+    volumes_from:
+      - ui:ro
+  ui:
+    image: kqueen/ui:v0.2
+    environment:
+      KQUEENUI_PREFERRED_URL_SCHEME: https
+      KQUEEN_UI_CONFIG_FILE: config/prod.py
+      KQUEENUI_KQUEEN_API_URL: http://api:5000/api/v1/
+      KQUEENUI_KQUEEN_AUTH_URL: http://api:5000/api/v1/auth
+      KQUEENUI_KQUEEN_SERVICE_USER_USERNAME: admin
+      # TODO: set same password as in api BOOTSTRAP_ADMIN_PASSWORD
+      KQUEENUI_KQUEEN_SERVICE_USER_PASSWORD:
+      KQUEENUI_MAIL_SERVER: mail
+      KQUEENUI_MAIL_PORT: 10025
+      KQUEENUI_ENABLE_PUBLIC_REGISTRATION: 1
+      STATIC_DIR: /mnt/static/
+    volumes:
+      - /mnt/static/
+    restart: always
+  mail:
+    image: modularitycontainers/postfix
+    restart: always
+    volumes:
+      - /var/spool/postfix/
+      - /var/spool/mail/
+    environment:
+      MYHOSTNAME: 'mail'
+  prometheus:
+    image: prom/prometheus
+    restart: always
+    ports:
+      - "127.0.0.1:9090:9090"
+    volumes:
+      - ./prod/prometheus/:/etc/prometheus/:Z
+      - /mnt/storage/kqueen/prometheus/:/prometheus/
+    links:
+      - api
+      - etcd
diff --git a/kqueen/config/base.py b/kqueen/config/base.py
index 4cc092bb..e424db9c 100644
--- a/kqueen/config/base.py
+++ b/kqueen/config/base.py
@@ -37,6 +37,8 @@ class BaseConfig:
     PROVISIONER_OK_STATE = 'OK'
     PROVISIONER_UNKNOWN_STATE = 'Not Reachable'
 
+    PROVISIONER_ENGINE_WHITELIST = None
+
     PROMETHEUS_WHITELIST = '127.0.0.0/8'
 
     @classmethod
diff --git a/kqueen/config/prod.py b/kqueen/config/prod.py
index aeb168ed..21429d83 100644
--- a/kqueen/config/prod.py
+++ b/kqueen/config/prod.py
@@ -7,8 +7,8 @@ class Config(BaseConfig):
 
     KQUEEN_HOST = '0.0.0.0'
 
-    # App secret
-    SECRET_KEY = 'secret'
+    # App secret - set this to random string >= 16 chars
+    # SECRET_KEY = 'secret'
 
     # Jenkins engine settings
     JENKINS_API_URL = 'https://ci.mcp.mirantis.net'
diff --git a/prod/nginx/Dockerfile b/prod/nginx/Dockerfile
new file mode 100644
index 00000000..9e3c484b
--- /dev/null
+++ b/prod/nginx/Dockerfile
@@ -0,0 +1,15 @@
+FROM nginx
+
+# environment
+ENV DIR_CONF=/etc/nginx/conf.d/
+ENV DIR_APP=/var/www/app/
+ENV VHOSTNAME=demo.kqueen.net
+
+# flush nginx config
+RUN rm -v /etc/nginx/conf.d/*
+
+# copy config
+COPY vhost.conf $DIR_CONF
+
+# edit vhost.conf: substitute the literal placeholder "vhostname" with $VHOSTNAME
+RUN sed -i "s/vhostname/$VHOSTNAME/g" $DIR_CONF/vhost.conf
diff --git a/prod/nginx/vhost.conf b/prod/nginx/vhost.conf
new file mode 100644
index 00000000..2f363115
--- /dev/null
+++ b/prod/nginx/vhost.conf
@@ -0,0 +1,83 @@
+# upstream app
+upstream django {
+    server ui:5080;
+}
+
+server { #default server
+    listen 80 default_server;
+    server_name _;
+
+    access_log /dev/stdout main;
+    error_log /dev/stdout info;
+
+    root /dev/null;
+}
+
+server { # http://vhostname
+    listen 0.0.0.0:80;
+    server_name vhostname;
+    return 301 https://$server_name$request_uri;
+}
+
+server { # https://vhostname
+
+    listen 0.0.0.0:443 ssl http2;
+    server_name vhostname;
+
+    access_log /dev/stdout main;
+    error_log /dev/stdout info;
+
+    ssl_certificate /mnt/letsencrypt/vhostname/fullchain.cer;
+    ssl_certificate_key /mnt/letsencrypt/vhostname/vhostname.key;
+    ssl_session_timeout 1d;
+    ssl_session_cache shared:SSL:50m;
+
+    # Diffie-Hellman parameter for DHE ciphersuites, recommended 2048 bits
+    #ssl_dhparam /etc/nginx/certs/marast/dhparam.pem;
+
+    # intermediate configuration. tweak to your needs.
+    ssl_protocols TLSv1 TLSv1.1 TLSv1.2;
+    ssl_ciphers 'ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:DHE-DSS-AES128-GCM-SHA256:kEDH+AESGCM:ECDHE-RSA-AES128-SHA256:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA:ECDHE-ECDSA-AES128-SHA:ECDHE-RSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA:ECDHE-ECDSA-AES256-SHA:DHE-RSA-AES128-SHA256:DHE-RSA-AES128-SHA:DHE-DSS-AES128-SHA256:DHE-RSA-AES256-SHA256:DHE-DSS-AES256-SHA:DHE-RSA-AES256-SHA:ECDHE-RSA-DES-CBC3-SHA:ECDHE-ECDSA-DES-CBC3-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA256:AES256-SHA256:AES128-SHA:AES256-SHA:AES:CAMELLIA:DES-CBC3-SHA:!aNULL:!eNULL:!EXPORT:!DES:!RC4:!MD5:!PSK:!aECDH:!EDH-DSS-DES-CBC3-SHA:!EDH-RSA-DES-CBC3-SHA:!KRB5-DES-CBC3-SHA';
+    ssl_prefer_server_ciphers on;
+
+    # HSTS (ngx_http_headers_module is required) (315360000 seconds = 10 years)
+    add_header Strict-Transport-Security max-age=315360000; # 10 years
+
+    # OCSP Stapling ---
+    # fetch OCSP records from URL in ssl_certificate and cache them
+    #ssl_stapling on;
+    #ssl_stapling_verify on;
+
+    ## verify chain of trust of OCSP response using Root CA and Intermediate certs
+    ssl_trusted_certificate /mnt/letsencrypt/vhostname/ca.cer;
+
+    client_max_body_size 64M;
+
+    server_name _;
+
+    root /dev/null/;
+
+    location /static/ {
+        alias /mnt/static/;
+        #gzip_static on;
+        #expires 24h;
+        #add_header Cache-Control public;
+        autoindex on;
+    }
+
+    # letsencrypt validation
+    location /.well-known/ {
+        alias /mnt/letsencrypt/.well-known/;
+    }
+
+    location / {
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header Host $http_host;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        add_header Cache-Control no-cache;
+        proxy_redirect http:// https://;
+        expires -1;
+        proxy_pass http://django;
+    }
+}
diff --git a/prod/prometheus/etcd3_alert.rules.yml b/prod/prometheus/etcd3_alert.rules.yml
new file mode 100644
index 00000000..e68f1a42
--- /dev/null
+++ b/prod/prometheus/etcd3_alert.rules.yml
@@ -0,0 +1,143 @@
+groups:
+- name: etcd3_alert.rules
+  rules:
+  - alert: InsufficientMembers
+    expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      description: If one more etcd member goes down the cluster will be unavailable
+      summary: etcd cluster insufficient members
+  - alert: NoLeader
+    expr: etcd_server_has_leader{job="etcd"} == 0
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      description: etcd member {{ $labels.instance }} has no leader
+      summary: etcd member has no leader
+  - alert: HighNumberOfLeaderChanges
+    expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
+    labels:
+      severity: warning
+    annotations:
+      description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
+        changes within the last hour
+      summary: a high number of leader changes within the etcd cluster are happening
+  - alert: HighNumberOfFailedGRPCRequests
+    expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
+      / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
+        on etcd instance {{ $labels.instance }}'
+      summary: a high number of gRPC requests are failing
+  - alert: HighNumberOfFailedGRPCRequests
+    expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
+      / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
+        on etcd instance {{ $labels.instance }}'
+      summary: a high number of gRPC requests are failing
+  - alert: GRPCRequestsSlow
+    expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
+      > 0.15
+    for: 10m
+    labels:
+      severity: critical
+    annotations:
+      description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
+        }} are slow
+      summary: slow gRPC requests
+  - alert: HighNumberOfFailedHTTPRequests
+    expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
+      BY (method) > 0.01
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
+        instance {{ $labels.instance }}'
+      summary: a high number of HTTP requests are failing
+  - alert: HighNumberOfFailedHTTPRequests
+    expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
+      BY (method) > 0.05
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
+        instance {{ $labels.instance }}'
+      summary: a high number of HTTP requests are failing
+  - alert: HTTPRequestsSlow
+    expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
+      > 0.15
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
+        }} are slow
+      summary: slow HTTP requests
+  - record: instance:fd_utilization
+    expr: process_open_fds / process_max_fds
+  - alert: FdExhaustionClose
+    expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust
+        its file descriptors soon'
+      summary: file descriptors soon exhausted
+  - alert: FdExhaustionClose
+    expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
+    for: 10m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust
+        its file descriptors soon'
+      summary: file descriptors soon exhausted
+  - alert: EtcdMemberCommunicationSlow
+    expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
+      > 0.15
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: etcd instance {{ $labels.instance }} member communication with
+        {{ $labels.To }} is slow
+      summary: etcd member communication is slow
+  - alert: HighNumberOfFailedProposals
+    expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
+    labels:
+      severity: warning
+    annotations:
+      description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
+        failures within the last hour
+      summary: a high number of proposals within the etcd cluster are failing
+  - alert: HighFsyncDurations
+    expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
+      > 0.5
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: etcd instance {{ $labels.instance }} fsync durations are high
+      summary: high fsync durations
+  - alert: HighCommitDurations
+    expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
+      > 0.25
+    for: 10m
+    labels:
+      severity: warning
+    annotations:
+      description: etcd instance {{ $labels.instance }} commit durations are high
+      summary: high commit durations
diff --git a/prod/prometheus/prometheus.yml b/prod/prometheus/prometheus.yml
new file mode 100644
index 00000000..4cfd5ed7
--- /dev/null
+++ b/prod/prometheus/prometheus.yml
@@ -0,0 +1,18 @@
+global:
+  scrape_interval: '30s'
+
+rule_files:
+  - '/etc/prometheus/*.rules.yml'
+
+scrape_configs:
+  - job_name: 'kqueen'
+    static_configs:
+      - targets: ['api:5000']
+
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'etcd'
+    static_configs:
+      - targets: ['etcd:4001']
diff --git a/setup.py b/setup.py
index f4762a93..0312fa07 100644
--- a/setup.py
+++ b/setup.py
@@ -55,6 +55,7 @@
     ],
     tests_require=test_require,
     extras_require={
+        'test': test_require,
         'dev': test_require + [
             'ipython',
             'sphinx',