diff --git a/scripts/deploy-backend.sh b/scripts/deploy-backend.sh index c4ab36e2b..8a66babcc 100755 --- a/scripts/deploy-backend.sh +++ b/scripts/deploy-backend.sh @@ -118,17 +118,6 @@ prepare_promtail() { popd } -update_plg_networking() { - # Need to switch the space after deploy since we're not always in dev space to handle specific networking from dev - # PLG apps to the correct backend app. - cf target -o hhs-acf-ofa -s tanf-dev - cf add-network-policy prometheus "$CGAPPNAME_BACKEND" -s "$CF_SPACE" --protocol tcp --port 8080 - cf target -o hhs-acf-ofa -s "$CF_SPACE" - - # Promtial needs to send logs to Loki - cf add-network-policy "$CGAPPNAME_BACKEND" loki -s "tanf-dev" --protocol tcp --port 8080 -} - update_backend() { cd tdrs-backend || exit @@ -167,9 +156,6 @@ update_backend() # Add network policy to allow frontend to access backend cf add-network-policy "$CGAPPNAME_FRONTEND" "$CGAPPNAME_BACKEND" --protocol tcp --port 8080 - # Add PLG routing - update_plg_networking - if [ "$CF_SPACE" = "tanf-prod" ]; then # Add network policy to allow backend to access tanf-prod services cf add-network-policy "$CGAPPNAME_BACKEND" clamav-rest --protocol tcp --port 9000 diff --git a/scripts/deploy-frontend.sh b/scripts/deploy-frontend.sh index fd7206929..ddb0bcdea 100755 --- a/scripts/deploy-frontend.sh +++ b/scripts/deploy-frontend.sh @@ -13,7 +13,6 @@ CF_SPACE=${5} ENVIRONMENT=${6} env=${CF_SPACE#"tanf-"} -frontend_app_name=$(echo $CGHOSTNAME_FRONTEND | cut -d"-" -f3) # Update the Kibana name to include the environment KIBANA_BASE_URL="${CGAPPNAME_KIBANA}-${env}.apps.internal" diff --git a/tdrs-backend/plg/README.md b/tdrs-backend/plg/README.md new file mode 100644 index 000000000..f0438e8f4 --- /dev/null +++ b/tdrs-backend/plg/README.md @@ -0,0 +1,6 @@ +### Grafana Auth and RBAC Config +Grafana is accessible by any frontend app on a private route to users who have the correct role. 
The Grafana UI is not accessible to any user or application unless they are routed to it via a frontend app. Grafana is also configured to require username and password authentication. This extra layer of authentication is required because the roles defined in Grafana do not align with the roles TDP defines. Assigning users to the appropriate role and teams in Grafana allows for least-privilege access to any information that Grafana might be able to display. + +Grafana has three roles: `Admin`, `Editor`, and `Viewer`. We have also defined two teams (groups) in Grafana, `OFA` and `Raft`, along with several users. The teams are how we manage least-privilege access to Grafana's resources. Upon creation, all users are given one of the base roles. All Raft dev user accounts are given read-only access (`Viewer`) to Grafana, and OFA has one or more user accounts associated with each of the roles. All users who are outside of OFA should always be assigned the `Viewer` role to maintain least privilege. All dashboards in Grafana are viewable by team as opposed to individual users/roles. Dashboard permissions are configured per dashboard, and each team is given read-only access to the appropriate dashboards. The `ofa-admin` user is the only user given direct access to resources. This account is given exclusive admin rights to all of Grafana. + +All Grafana administration is handled under the `Administration` drop-down in the hamburger menu, which is only accessible to `Admin` users. Users can be created, assigned a role, and then associated with a team. As new dashboards are added to Grafana, their permissions need to be configured for least privilege by going to `Dashboards -> <dashboard> -> Settings -> Permissions`. The admin can use the permission configuration of existing dashboards as a reference when finishing the configuration. 
diff --git a/tdrs-backend/plg/deploy.sh b/tdrs-backend/plg/deploy.sh index 11adaebdd..c411f5457 100755 --- a/tdrs-backend/plg/deploy.sh +++ b/tdrs-backend/plg/deploy.sh @@ -1,13 +1,21 @@ #!/bin/bash set -e +DEV_BACKEND_APPS=("tdp-backend-raft" "tdp-backend-qasp" "tdp-backend-a11y") +STAGING_BACKEND_APPS=("tdp-backend-develop" "tdp-backend-staging") +PROD_BACKEND="tdp-backend-prod" + +DEV_FRONTEND_APPS=("tdp-frontend-raft" "tdp-frontend-qasp" "tdp-frontend-a11y") +STAGING_FRONTEND_APPS=("tdp-frontend-develop" "tdp-frontend-staging") +PROD_FRONTEND="tdp-frontend-prod" + help() { echo "Deploy the PLG stack or a Postgres exporter to the Cloud Foundry space you're currently authenticated in." echo "Syntax: deploy.sh [-h|a|p|u|d]" echo "Options:" echo "h Print this help message." echo "a Deploy the entire PLG stack." - echo "p Deploy a postgres exporter. Requires -u and -d" + echo "p Deploy a postgres exporter, expects the environment name (dev, staging, production) to be passed with switch. Requires -u and -d" echo "u Requires -p. The database URI the exporter should connect with." echo "d The Cloud Foundry service name of the RDS instance. Should be included with all deployments." echo @@ -19,6 +27,7 @@ deploy_pg_exporter() { cp manifest.yml $MANIFEST APP_NAME="pg-exporter-$1" + EXPORTER_SPACE=$(cf target | grep -Eo "tanf(.*)") yq eval -i ".applications[0].name = \"$APP_NAME\"" $MANIFEST yq eval -i ".applications[0].env.DATA_SOURCE_NAME = \"$2\"" $MANIFEST @@ -27,9 +36,10 @@ deploy_pg_exporter() { cf push --no-route -f $MANIFEST -t 180 --strategy rolling cf map-route $APP_NAME apps.internal --hostname $APP_NAME - # Add policy to allow prometheus to talk to pg-exporter - # TODO: this logic needs to be updated to allow routing accross spaces based on where we want PLG to live. 
- cf add-network-policy prometheus $APP_NAME -s "tanf-dev" --protocol tcp --port 9187 + # Add policy to allow prometheus to talk to pg-exporter regardless of environment + cf target -o hhs-acf-ofa -s tanf-prod + cf add-network-policy prometheus $APP_NAME -s "$EXPORTER_SPACE" --protocol tcp --port 9187 + cf target -o hhs-acf-ofa -s "$EXPORTER_SPACE" rm $MANIFEST popd } @@ -47,13 +57,21 @@ deploy_grafana() { yq eval -i ".applications[0].services[0] = \"$1\"" $MANIFEST cf push --no-route -f $MANIFEST -t 180 --strategy rolling - # cf map-route $APP_NAME apps.internal --hostname $APP_NAME - # Give Grafana a public route for now. Might be able to swap to internal route later. - cf map-route "$APP_NAME" app.cloud.gov --hostname "${APP_NAME}" + cf map-route $APP_NAME apps.internal --hostname $APP_NAME # Add policy to allow grafana to talk to prometheus and loki cf add-network-policy $APP_NAME prometheus --protocol tcp --port 8080 cf add-network-policy $APP_NAME loki --protocol tcp --port 8080 + + # Add network policies to allow grafana to talk to all frontend apps in all environments + for app in ${DEV_FRONTEND_APPS[@]}; do + cf add-network-policy "grafana" $app -s "tanf-dev" --protocol tcp --port 80 + done + for app in ${STAGING_FRONTEND_APPS[@]}; do + cf add-network-policy "grafana" $app -s "tanf-staging" --protocol tcp --port 80 + done + cf add-network-policy "grafana" $PROD_FRONTEND --protocol tcp --port 80 + rm $DATASOURCES rm $MANIFEST popd @@ -63,6 +81,16 @@ deploy_prometheus() { pushd prometheus cf push --no-route -f manifest.yml -t 180 --strategy rolling cf map-route prometheus apps.internal --hostname prometheus + + # Add network policies to allow prometheus to talk to all backend apps in all environments + for app in ${DEV_BACKEND_APPS[@]}; do + cf add-network-policy prometheus $app -s "tanf-dev" --protocol tcp --port 8080 + done + for app in ${STAGING_BACKEND_APPS[@]}; do + cf add-network-policy prometheus $app -s "tanf-staging" --protocol tcp --port 8080 + 
done + cf add-network-policy prometheus $PROD_BACKEND --protocol tcp --port 8080 + popd } @@ -73,6 +101,25 @@ deploy_loki() { popd } +setup_extra_net_pols() { + # Add network policies to allow frontend/backend to talk to grafana/loki + cf target -o hhs-acf-ofa -s tanf-dev + for i in ${!DEV_BACKEND_APPS[@]}; do + cf add-network-policy ${DEV_FRONTEND_APPS[$i]} grafana -s tanf-prod --protocol tcp --port 8080 + cf add-network-policy ${DEV_BACKEND_APPS[$i]} loki -s tanf-prod --protocol tcp --port 8080 + done + + cf target -o hhs-acf-ofa -s tanf-staging + for i in ${!STAGING_BACKEND_APPS[@]}; do + cf add-network-policy ${STAGING_FRONTEND_APPS[$i]} grafana -s tanf-prod --protocol tcp --port 8080 + cf add-network-policy ${STAGING_BACKEND_APPS[$i]} loki -s tanf-prod --protocol tcp --port 8080 + done + + cf target -o hhs-acf-ofa -s tanf-prod + cf add-network-policy $PROD_FRONTEND grafana -s tanf-prod --protocol tcp --port 8080 + cf add-network-policy $PROD_BACKEND loki -s tanf-prod --protocol tcp --port 8080 +} + err_help_exit() { echo $1 echo @@ -97,6 +144,7 @@ while getopts ":hap:u:d:" option; do DB_SERVICE_NAME=$OPTARG;; \?) # Invalid option echo "Error: Invalid option" + help exit;; esac done @@ -107,13 +155,14 @@ if [ "$#" -eq 0 ]; then fi pushd "$(dirname "$0")" -if [ "$DB_URI" == "" ] || [ "$DB_SERVICE_NAME" == "" ]; then +if [ "$DB_SERVICE_NAME" == "" ]; then err_help_exit "Error: you must include a database service name." 
fi if [ "$DEPLOY" == "plg" ]; then deploy_prometheus deploy_loki - deploy_grafana + deploy_grafana $DB_SERVICE_NAME + setup_extra_net_pols fi if [ "$DEPLOY" == "pg-exporter" ]; then if [ "$DB_URI" == "" ]; then diff --git a/tdrs-backend/plg/grafana/custom.ini b/tdrs-backend/plg/grafana/custom.ini index 7d8be7d57..fef040207 100644 --- a/tdrs-backend/plg/grafana/custom.ini +++ b/tdrs-backend/plg/grafana/custom.ini @@ -40,7 +40,7 @@ http_addr = http_port = 8080 # The public facing domain name used to access grafana from a browser -domain = app.cloud.gov +domain = grafana.apps.internal # Redirect to correct domain if host header does not match domain # Prevents DNS rebinding attacks @@ -553,10 +553,10 @@ login_cookie_name = grafana_session disable_login = false # The maximum lifetime (duration) an authenticated user can be inactive before being required to login at next visit. Default is 7 days (7d). This setting should be expressed as a duration, e.g. 5m (minutes), 6h (hours), 10d (days), 2w (weeks), 1M (month). The lifetime resets at each successful token rotation (token_rotation_interval_minutes). -login_maximum_inactive_lifetime_duration = +login_maximum_inactive_lifetime_duration = 30m # The maximum lifetime (duration) an authenticated user can be logged in since login time before being required to login. Default is 30 days (30d). This setting should be expressed as a duration, e.g. 5m (minutes), 6h (hours), 10d (days), 2w (weeks), 1M (month). -login_maximum_lifetime_duration = +login_maximum_lifetime_duration = 1d # How often should auth tokens be rotated for authenticated users when being active. The default is each 10 minutes. 
token_rotation_interval_minutes = 10 diff --git a/tdrs-backend/plg/grafana/dashboards/dev_logs_dashboard.json b/tdrs-backend/plg/grafana/dashboards/dev_logs_dashboard.json new file mode 100644 index 000000000..7fc27f8d5 --- /dev/null +++ b/tdrs-backend/plg/grafana/dashboards/dev_logs_dashboard.json @@ -0,0 +1,113 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Dashboard allowing log visualization", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "L8E80F9AEF21F6940" + }, + "gridPos": { + "h": 28, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": true, + "showCommonLabels": false, + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "L8E80F9AEF21F6940" + }, + "editorMode": "code", + "expr": "{job=~\"$job\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Job Logs", + "type": "logs" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": [ + "Django", + "Logs", + "Loki" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "loki", + "uid": "L8E80F9AEF21F6940" + }, + "definition": "", + "description": "Filter logs by job.", + "hide": 0, + "includeAll": true, + "label": "Job", + "multi": false, + "name": "job", + "options": [], + "query": { + "label": "job", + "refId": "LokiVariableQueryEditor-VariableQuery", + "stream": "", + "type": 1 + }, + "refresh": 1, + "regex": "^(?!.*[-]prod$).*$", + "skipUrlSync": false, + "sort": 0, + 
"type": "query" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Dev Logs", + "uid": "cdyz6flmh0ttsy", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/tdrs-backend/plg/grafana/dashboards/logs_dashboard.json b/tdrs-backend/plg/grafana/dashboards/logs_dashboard.json index 5b34ecf3a..ef2c34f56 100644 --- a/tdrs-backend/plg/grafana/dashboards/logs_dashboard.json +++ b/tdrs-backend/plg/grafana/dashboards/logs_dashboard.json @@ -21,93 +21,6 @@ "graphTooltip": 0, "links": [], "panels": [ - { - "datasource": { - "default": true, - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "dark-red", - "value": 80 - }, - { - "color": "light-red", - "value": 85 - }, - { - "color": "#EAB839", - "value": 90 - }, - { - "color": "semi-dark-green", - "value": 95 - }, - { - "color": "dark-green", - "value": 100 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 14, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 2, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "center", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.2.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "editorMode": "code", - "expr": "100 * avg_over_time(up{job=~\"$app\"}[$__range])", - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "App Uptime", - "type": "stat" - }, { "datasource": { "type": "loki", @@ -117,7 +30,7 @@ "h": 28, "w": 24, 
"x": 0, - "y": 14 + "y": 0 }, "id": 1, "options": { @@ -157,7 +70,7 @@ "list": [ { "current": { - "selected": true, + "selected": false, "text": "All", "value": "$__all" }, @@ -184,30 +97,6 @@ "skipUrlSync": false, "sort": 0, "type": "query" - }, - { - "current": { - "selected": true, - "text": "All", - "value": "$__all" - }, - "definition": "query_result(up)", - "hide": 0, - "includeAll": true, - "label": "App", - "multi": false, - "name": "app", - "options": [], - "query": { - "qryType": 3, - "query": "query_result(up)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 1, - "regex": "/.*job=\"([^\"]+).*/", - "skipUrlSync": false, - "sort": 0, - "type": "query" } ] }, diff --git a/tdrs-backend/plg/grafana/manifest.yml b/tdrs-backend/plg/grafana/manifest.yml index 2f796535f..1d6be5f3a 100644 --- a/tdrs-backend/plg/grafana/manifest.yml +++ b/tdrs-backend/plg/grafana/manifest.yml @@ -5,12 +5,12 @@ applications: disk_quota: 2G instances: 1 env: - GF_PATHS_PROVISIONING: "/conf/provisioning" + GF_PATHS_PROVISIONING: "conf/provisioning" GF_PATHS_CONFIG: "/home/vcap/app/custom.ini" GF_PATHS_HOME: "/home/vcap/app/grafana-v11.2.0" GF_PATHS_DATA: "/home/vcap/app/data" GF_PATHS_LOGS: "/home/vcap/app/logs" - GF_PATHS_PLUGINS: "/conf/provisioning/plugins" + GF_PATHS_PLUGINS: "conf/provisioning/plugins" GF_SERVER_HTTP_PORT: 8080 GF_DATABASE_TYPE: postgres GF_DATABASE_SSL_MODE: require diff --git a/tdrs-backend/plg/loki/manifest.yml b/tdrs-backend/plg/loki/manifest.yml index ab0d5d532..3f747daf4 100644 --- a/tdrs-backend/plg/loki/manifest.yml +++ b/tdrs-backend/plg/loki/manifest.yml @@ -1,7 +1,7 @@ version: 1 applications: - name: loki - memory: 512M + memory: 1G disk_quota: 7G instances: 1 command: | diff --git a/tdrs-backend/plg/prometheus/prometheus.yml b/tdrs-backend/plg/prometheus/prometheus.yml index 007422b26..66e35c519 100644 --- a/tdrs-backend/plg/prometheus/prometheus.yml +++ b/tdrs-backend/plg/prometheus/prometheus.yml @@ -61,13 +61,6 @@ 
scrape_configs: service: "tdp-backend" env: "production" - - job_name: "celery-exporter-raft" - static_configs: - - targets: ["celery-exporter-raft.apps.internal:9540"] - labels: - service: "celery" - env: "dev" - - job_name: postgres-dev static_configs: - targets: ["pg-exporter-dev.apps.internal:9187"] @@ -95,7 +88,7 @@ scrape_configs: labels: service: "loki" env: "production" - + - job_name: grafana metrics_path: /grafana/metrics static_configs: diff --git a/tdrs-backend/plg/promtail/config.local.yml b/tdrs-backend/plg/promtail/config.local.yml index 9cb617c11..dc6eb0da4 100644 --- a/tdrs-backend/plg/promtail/config.local.yml +++ b/tdrs-backend/plg/promtail/config.local.yml @@ -15,19 +15,26 @@ scrape_configs: - targets: - localhost labels: - job: varlogs + job: varlogs-local __path__: /var/log/*log - - job_name: django + - job_name: backend-local static_configs: - targets: - localhost labels: - job: django + job: backend-local __path__: /logs/django.log - - job_name: nginx + - job_name: backend-prod static_configs: - targets: - localhost labels: - job: nginx + job: backend-prod + __path__: /logs/django.log + - job_name: frontend-local + static_configs: + - targets: + - localhost + labels: + job: frontend-local __path__: /var/log/nginx/*log diff --git a/tdrs-backend/tdpservice/users/api/authorization_check.py b/tdrs-backend/tdpservice/users/api/authorization_check.py index 1d7bed218..1309d5a51 100644 --- a/tdrs-backend/tdpservice/users/api/authorization_check.py +++ b/tdrs-backend/tdpservice/users/api/authorization_check.py @@ -66,10 +66,9 @@ def get(self, request, *args, **kwargs): user_in_valid_group = user.is_ofa_sys_admin or user.is_digit_team if (user.hhs_id is not None and user_in_valid_group) or settings.BYPASS_OFA_AUTH: - logger.debug(f"User: {user} has correct authentication credentials. Allowing access to Kibana.") return HttpResponse(status=200) else: - logger.debug(f"User: {user} has incorrect authentication credentials. 
Not allowing access to Kibana.") + logger.warning(f"User: {user} has incorrect authentication credentials. Not allowing access to Kibana.") return HttpResponse(status=401) class PlgAuthorizationCheck(APIView): @@ -85,11 +84,8 @@ def get(self, request, *args, **kwargs): user_in_valid_group = user.is_ofa_sys_admin or user.is_developer - print("\n\nINSIDE AUTH CHECK\n\n") - if user_in_valid_group: - logger.debug(f"User: {user} has correct authentication credentials. Allowing access to plg.") return HttpResponse(status=200) else: - logger.debug(f"User: {user} has incorrect authentication credentials. Not allowing access to plg.") + logger.warning(f"User: {user} has incorrect authentication credentials. Not allowing access to Grafana.") return HttpResponse(status=401) diff --git a/tdrs-frontend/nginx/cloud.gov/locations.conf b/tdrs-frontend/nginx/cloud.gov/locations.conf index 592063439..2e14fc69f 100644 --- a/tdrs-frontend/nginx/cloud.gov/locations.conf +++ b/tdrs-frontend/nginx/cloud.gov/locations.conf @@ -60,6 +60,42 @@ location = /kibana_auth_check { send_timeout 900; } +location /grafana/ { + auth_request /plg_auth_check; + auth_request_set $auth_status $upstream_status; + + set $grafana http://grafana.apps.internal:8080$request_uri; + proxy_pass $grafana; + proxy_set_header Host $host:3000; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto https; + + proxy_connect_timeout 300; + proxy_read_timeout 300; + proxy_send_timeout 300; + send_timeout 900; + proxy_buffer_size 4k; +} + +location = /plg_auth_check { + internal; + set $endpoint http://{{env "BACKEND_HOST"}}.apps.internal:8080/plg_auth_check/; + proxy_pass $endpoint$1$is_args$args; + proxy_set_header Host $host:3000; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto https; + proxy_set_header Content-Length ""; + 
proxy_set_header X-Original-URI $request_uri; + + proxy_connect_timeout 300; + proxy_read_timeout 300; + proxy_send_timeout 300; + send_timeout 900; + proxy_pass_header x-csrftoken; +} + if ($request_method ~ ^(TRACE|OPTION)$) { return 405; }