Skip to content

Commit

Permalink
feat(backup): logs everything to stdout/err, implement lock file for …
Browse files Browse the repository at this point in the history
…both backup/restore (#1023)
  • Loading branch information
brokenpip3 authored Jun 25, 2024
1 parent 5ef6c73 commit b722ef1
Show file tree
Hide file tree
Showing 10 changed files with 128 additions and 44 deletions.
2 changes: 1 addition & 1 deletion backup/pvc/VERSION.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
v0.3.0
v0.4.0
56 changes: 40 additions & 16 deletions backup/pvc/bin/backup.sh
Original file line number Diff line number Diff line change
@@ -1,39 +1,63 @@
#!/usr/bin/env bash

set -eo pipefail
source "$(dirname "$0")/utils.sh"

# --> Validate input
# Usage: backup.sh BACKUP_NUMBER
# Required env: BACKUP_DIR, JENKINS_HOME
# Optional env: BACKUP_RETRY_COUNT (default 3), BACKUP_RETRY_INTERVAL (default 60, seconds)
#
# Explicit `if` guards are used instead of `cond && _log ... && exit 1`:
# with the chained form a non-zero status from _log silently skips the exit.
if [[ $# -ne 1 ]]; then
    _log "ERROR" "Usage: $0 BACKUP_NUMBER"
    exit 1
fi
if [[ -z "${BACKUP_DIR}" ]]; then
    _log "ERROR" "Required 'BACKUP_DIR' env not set"
    exit 1
fi
if [[ -z "${JENKINS_HOME}" ]]; then
    _log "ERROR" "Required 'JENKINS_HOME' env not set"
    exit 1
fi
BACKUP_RETRY_COUNT=${BACKUP_RETRY_COUNT:-3}
BACKUP_RETRY_INTERVAL=${BACKUP_RETRY_INTERVAL:-60}
BACKUP_NUMBER=$1
# Lock file marking that a backup with this number is in progress
TRAP_FILE="/tmp/_backup_${BACKUP_NUMBER}_is_running"
# --< Done

# --> Check if another backup process is running (operator restart/crash)
for ((i=0; i<BACKUP_RETRY_COUNT; i++)); do
    if [[ ! -f "${TRAP_FILE}" ]]; then
        _log "INFO" "[backup] no other backup process are running"
        break
    fi
    _log "INFO" "[backup] backup is already running. Waiting for ${BACKUP_RETRY_INTERVAL} seconds..."
    sleep "${BACKUP_RETRY_INTERVAL}"
done
# The lock may still be present after the last sleep: give up in that case
if [[ -f "${TRAP_FILE}" ]]; then
    _log "ERROR" "[backup] backup is still running after waiting ${BACKUP_RETRY_COUNT} time ${BACKUP_RETRY_INTERVAL}s. Exiting."
    exit 1
fi
# --< Done

_log "INFO" "[backup] running backup ${BACKUP_NUMBER}"
touch "${TRAP_FILE}"
# create the temp dir on the same filesystem as BACKUP_DIR to be able to use an atomic mv instead of a copy
BACKUP_TMP_DIR=$(mktemp -d --tmpdir="${BACKUP_DIR}")

[[ ! $# -eq 1 ]] && echo "Usage: $0 backup_number" && exit 1;
[[ -z "${BACKUP_DIR}" ]] && echo "Required 'BACKUP_DIR' env not set" && exit 1;
[[ -z "${JENKINS_HOME}" ]] && echo "Required 'JENKINS_HOME' env not set" && exit 1;
# Remove the temporary backup directory and the lock file, if they exist.
# Globals:  BACKUP_TMP_DIR (read), TRAP_FILE (read)
# Returns:  0 when there is nothing to remove; otherwise the status of rm.
#           `if` is used instead of `test ... && rm ...` because the chained
#           form makes the function return 1 when the file is already gone,
#           which aborts the caller under `set -e`.
_clean(){
    if [[ -d "${BACKUP_TMP_DIR}" ]]; then
        rm -fr -- "${BACKUP_TMP_DIR}"
    fi
    if [[ -f "${TRAP_FILE}" ]]; then
        rm -f -- "${TRAP_FILE}"
    fi
}

# create the temp dir on the same filesystem as BACKUP_DIR to be able to use an atomic mv instead of a copy
BACKUP_TMP_DIR=$(mktemp -d --tmpdir=${BACKUP_DIR})
trap "test -d "${BACKUP_TMP_DIR}" && rm -fr "${BACKUP_TMP_DIR}"" EXIT SIGINT SIGTERM
# Signal handler: clean up, report the failure and stop the script.
# Without the explicit exit, bash resumes the interrupted script after the
# handler returns, with the temp dir and lock file already removed.
_trap(){
    # Best effort: a failing cleanup must not prevent the error from being logged
    _clean || true
    _log "ERROR" "[backup] something wrong happened, check the logs"
    exit 1
}

backup_number=$1
echo "Running backup"
trap '_trap' SIGQUIT SIGINT SIGTERM

# config.xml in a job directory is a config file that shouldn't be backed up
# config.xml in child directories is state that should. For example-
# branches/myorg/branches/myrepo/branches/master/config.xml should be retained while
# branches/myorg/config.xml should not
tar --zstd -C "${JENKINS_HOME}" -cf "${BACKUP_TMP_DIR}/${backup_number}.tar.zstd" \
tar --zstd -C "${JENKINS_HOME}" -cf "${BACKUP_TMP_DIR}/${BACKUP_NUMBER}.tar.zstd" \
--exclude jobs/*/workspace* \
--no-wildcards-match-slash --anchored \
--ignore-failed-read \
--exclude jobs/*/config.xml -c jobs || ret=$?

if [[ "$ret" -eq 0 ]]; then
echo "Backup was completed without warnings"
_log "INFO" "[backup] backup ${BACKUP_NUMBER} was completed without warnings"
elif [[ "$ret" -eq 1 ]]; then
echo "Backup was completed with some warnings"
_log "INFO" "[backup] backup ${BACKUP_NUMBER} was completed with some warnings"
fi

# atomically create a backup file
mv "${BACKUP_TMP_DIR}/${backup_number}.tar.zstd" "${BACKUP_DIR}/${backup_number}.tar.zstd"
mv "${BACKUP_TMP_DIR}/${BACKUP_NUMBER}.tar.zstd" "${BACKUP_DIR}/${BACKUP_NUMBER}.tar.zstd"

rm -rf "${BACKUP_TMP_DIR}"
[[ ! -s ${BACKUP_DIR}/${backup_number}.tar.zstd ]] && echo "backup file '${BACKUP_DIR}/${backup_number}.tar.zstd' is empty" && exit 1;
_log "INFO" "[backup] cleaning ${BACKUP_TMP_DIR} and trap file ${TRAP_FILE}"
_clean
[[ ! -s ${BACKUP_DIR}/${BACKUP_NUMBER}.tar.zstd ]] && _log "ERROR" "[backup] file '${BACKUP_DIR}/${BACKUP_NUMBER}.tar.zstd' is empty" && exit 1

echo Done
_log "INFO" "[backup] ${BACKUP_NUMBER} done"
exit 0
8 changes: 2 additions & 6 deletions backup/pvc/bin/get-latest.sh
Original file line number Diff line number Diff line change
@@ -1,30 +1,26 @@
#!/usr/bin/env bash

set -eo pipefail
source "$(dirname "$0")/utils.sh"

# Tell whether a directory contains no backup archive.
# Arguments: $1 - directory to inspect
# Returns:   0 (true) when nothing matches "<dir>/*.tar.*" (including when
#            the directory itself does not exist), 1 when at least one
#            matching entry exists.
# The check uses a glob instead of `ls`, so shell options such as `set -e`
# are never toggled and the caller's settings are left untouched.
is_backup_not_exist() {
    local backup_dir="$1"
    local candidate

    for candidate in "${backup_dir}"/*.tar.*; do
        # An unmatched glob stays literal and fails both tests;
        # -L keeps dangling symlinks counted as existing entries.
        if [[ -e "${candidate}" || -L "${candidate}" ]]; then
            return 1
        fi
    done

    return 0
}

[[ -z "${BACKUP_DIR}" ]] && { echo "Required 'BACKUP_DIR' env not set"; exit 1; }
[[ -z "${BACKUP_DIR}" ]] && { _log "ERROR" "Required 'BACKUP_DIR' env not set"; exit 1; }

# Check if we have any backup
if is_backup_not_exist "${BACKUP_DIR}"; then
Expand Down
44 changes: 31 additions & 13 deletions backup/pvc/bin/restore.sh
Original file line number Diff line number Diff line change
@@ -1,29 +1,47 @@
#!/usr/bin/env bash

set -eo pipefail
source "$(dirname "$0")/utils.sh"

[[ ! $# -eq 1 ]] && echo "Usage: $0 backup_number" && exit 1
[[ -z "${BACKUP_DIR}" ]] && echo "Required 'BACKUP_DIR' env not set" && exit 1;
[[ -z "${JENKINS_HOME}" ]] && echo "Required 'JENKINS_HOME' env not set" && exit 1;
[[ ! $# -eq 1 ]] && _log "ERROR" "Usage: $0 <backup number>" && exit 1
[[ -z "${BACKUP_DIR}" ]] && _log "ERROR" "Required 'BACKUP_DIR' env not set" && exit 1
[[ -z "${JENKINS_HOME}" ]] && _log "ERROR" "Required 'JENKINS_HOME' env not set" && exit 1
BACKUP_NUMBER=$1
RESTORE_RETRY_COUNT=${RESTORE_RETRY_COUNT:-10}
RESTORE_RETRY_INTERVAL=${RESTORE_RETRY_INTERVAL:-10}

backup_number=$1
backup_file="${BACKUP_DIR}/${backup_number}"
echo "Running restore backup with backup number #${backup_number}"
# --> Check if another restore process is running (operator restart/crash)
TRAP_FILE="/tmp/_restore_${BACKUP_NUMBER}_is_running"
trap "rm -f ${TRAP_FILE}" SIGINT SIGTERM

if [[ -f "$backup_file.tar.gz" ]]; then
echo "Old format tar.gz found, restoring it"
for ((i=0; i<RESTORE_RETRY_COUNT; i++)); do
[[ ! -f "${TRAP_FILE}" ]] && _log "INFO" "[restore] no other process are running, restoring" && break
_log "INFO" "[restore] is already running. Waiting for ${RESTORE_RETRY_INTERVAL} seconds..."
sleep "${RESTORE_RETRY_INTERVAL}"
done
[[ -f "${TRAP_FILE}" ]] && { _log "ERROR" "[restore] is still running after waiting ${RESTORE_RETRY_COUNT} time ${RESTORE_RETRY_INTERVAL}s. Exiting."; exit 1; }
# --< Done

_log "INFO" "[restore] restore backup with backup number #${BACKUP_NUMBER}"
touch "${TRAP_FILE}"
BACKUP_FILE="${BACKUP_DIR}/${BACKUP_NUMBER}"

if [[ -f "$BACKUP_FILE.tar.gz" ]]; then
_log "INFO" "[restore] old format tar.gz found, restoring it"
OPTS=""
EXT="tar.gz"
elif [[ -f "$backup_file.tar.zstd" ]]; then
echo "Backup file found, proceeding"
elif [[ -f "$BACKUP_FILE.tar.zstd" ]]; then
_log "INFO" "[restore] Backup file found, proceeding"
OPTS="--zstd"
EXT="tar.zstd"
else
echo "ERR: Backup file not found: $backup_file"
_log "ERROR" "[restore] backup file not found: $BACKUP_FILE"
exit 1
fi

tar $OPTS -C "${JENKINS_HOME}" -xf "${BACKUP_DIR}/${backup_number}.${EXT}"
tar $OPTS -C "${JENKINS_HOME}" -xf "${BACKUP_DIR}/${BACKUP_NUMBER}.${EXT}"

echo Done
_log "INFO" "[restore] deleting lock file ${TRAP_FILE}"
test -f "${TRAP_FILE}" && rm -f "${TRAP_FILE}"
_log "INFO" "[restore] restoring ${BACKUP_NUMBER} Done"
exit 0
11 changes: 6 additions & 5 deletions backup/pvc/bin/run.sh
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
#!/usr/bin/env bash

set -eo pipefail
source "$(dirname "$0")/utils.sh"

# Use 60 as the default in case BACKUP_CLEANUP_INTERVAL is not set
BACKUP_CLEANUP_INTERVAL=${BACKUP_CLEANUP_INTERVAL:=60}

# Ensure required environment variables are set
# Abort the script with exit status 1 when a required variable is empty or unset.
# Arguments: $1 - NAME of the variable to check (not its value)
check_env_var() {
# ${!1} is bash indirect expansion: the value of the variable whose name is in $1
if [[ -z "${!1}" ]]; then
echo "Required '$1' environment variable is not set"
_log "ERROR" "Required '$1' environment variable is not set"
exit 1
fi
}
Expand Down Expand Up @@ -41,7 +42,7 @@ find_exceeding_backups() {
local backup_count="$2"
# Check if we have any backup
if is_backup_not_exist "${backup_dir}"; then
echo "backups not found in ${backup_dir}" >&2
_log "ERROR" "[run] backups not found in ${backup_dir}"
return
fi
find "${backup_dir}"/*.tar.zstd -maxdepth 0 -exec basename {} \; | sort -gr | tail -n +$((backup_count +1))
Expand All @@ -51,9 +52,9 @@ check_env_var "BACKUP_DIR"
check_env_var "JENKINS_HOME"

if [[ -z "${BACKUP_COUNT}" ]]; then
echo "ATTENTION! No BACKUP_COUNT set, it means you MUST delete old backups manually or by custom script"
_log "WARNING" "[run] no BACKUP_COUNT set, it means you MUST delete old backups manually or by custom script"
else
echo "Retaining only the ${BACKUP_COUNT} most recent backups, cleanup occurs every ${BACKUP_CLEANUP_INTERVAL} seconds"
_log "INFO" "[run] retaining only the ${BACKUP_COUNT} most recent backups, cleanup occurs every ${BACKUP_CLEANUP_INTERVAL} seconds"
fi

while true;
Expand All @@ -62,7 +63,7 @@ do
if [[ -n "${BACKUP_COUNT}" ]]; then
exceeding_backups=$(find_exceeding_backups "${BACKUP_DIR}" "${BACKUP_COUNT}")
if [[ -n "$exceeding_backups" ]]; then
echo "Removing backups: $(echo "$exceeding_backups" | tr '\n' ', ' | sed 's/,$//')"
_log "INFO" "[run] removing backups: $(echo "$exceeding_backups" | tr '\n' ', ' | sed 's/,$//')"
echo "$exceeding_backups" | while read -r file; do
rm "${BACKUP_DIR}/${file}"
done
Expand Down
14 changes: 14 additions & 0 deletions backup/pvc/bin/utils.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Common utils

# Write a timestamped log line.
# Arguments: $1 - level (ERROR|ERR|error|err are treated as errors)
#            $2 - message
# Outputs:   error levels go to the stderr of PID 1 (/proc/1/fd/2), falling
#            back to this script's stderr when that target cannot be written;
#            every other level goes to the stdout of PID 1 and to this
#            script's stderr.
# Returns:   always 0. Logging is best effort, so `_log ... && exit 1`
#            callers still reach their exit and `set -e` callers are not
#            aborted by an unwritable log target.
_log() {
    local level="$1"
    local message="$2"
    local timestamp line
    # Assigned separately from `local` so the status of date is not masked
    timestamp=$(date '+%Y-%m-%d %H:%M:%S') || timestamp=""
    line="${timestamp} - ${level} - ${message}"
    if [[ "$level" =~ ^(ERROR|ERR|error|err)$ ]]; then
        echo "${line}" > /proc/1/fd/2 || echo "${line}" >&2
    else
        echo "${line}" > /proc/1/fd/1 || true
        echo "${line}" >&2
    fi
    return 0
}
2 changes: 1 addition & 1 deletion chart/jenkins-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ Kubernetes native operator which fully manages Jenkins on Kubernetes
| jenkins.backup.env[2].name | string | `"BACKUP_COUNT"` | |
| jenkins.backup.env[2].value | string | `"3"` | |
| jenkins.backup.getLatestAction[0] | string | `"/home/user/bin/get-latest.sh"` | |
| jenkins.backup.image | string | `"quay.io/jenkins-kubernetes-operator/backup-pvc:v0.2.6"` | |
| jenkins.backup.image | string | `"quay.io/jenkins-kubernetes-operator/backup-pvc:v0.4.1"` | |
| jenkins.backup.interval | int | `30` | |
| jenkins.backup.makeBackupBeforePodDeletion | bool | `true` | |
| jenkins.backup.pvc.className | string | `""` | |
Expand Down
16 changes: 15 additions & 1 deletion chart/jenkins-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ jenkins:

# image used by backup feature
# By default using prebuilt backup PVC image
image: quay.io/jenkins-kubernetes-operator/backup-pvc:v0.2.6
image: quay.io/jenkins-kubernetes-operator/backup-pvc:v0.4.1

# containerName is backup container name
containerName: backup
Expand Down Expand Up @@ -262,13 +262,27 @@ jenkins:
# BACKUP_DIR - path for storing backup files (default: "/backup")
# JENKINS_HOME - path to jenkins home (default: "/jenkins-home")
# BACKUP_COUNT - defines how many recent backups will be kept
# Optional in case you want to modify the backup and restore retry logic
# BACKUP_RETRY_COUNT
# BACKUP_RETRY_INTERVAL
# RESTORE_RETRY_COUNT
# RESTORE_RETRY_INTERVAL
env:
- name: BACKUP_DIR
value: /backup
- name: JENKINS_HOME
value: /jenkins-home
- name: BACKUP_COUNT
value: "3" # keep only the 3 most recent backups
#- name: BACKUP_RETRY_COUNT
# value: "3"
#- name: BACKUP_RETRY_INTERVAL
# value: "60"
#- name: RESTORE_RETRY_COUNT
# value: "10"
#- name: RESTORE_RETRY_INTERVAL
# value: "10"


# volumeMounts holds the mount points for volumes
volumeMounts:
Expand Down
1 change: 1 addition & 0 deletions nix/website-shell.nix
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ let
devShellPackages = [
hugo_099_pkgs.hugo #hugo pre-v100
pkgs.nodejs_21 #Node 1.21
pkgs.helm-docs
];
baseUrl = ((builtins.fromTOML (builtins.readFile ../website/config.toml)).baseURL);
in
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
title: "Configuring backup and restore"
linkTitle: "Configuring backup and restore"
weight: 5
date: 2023-01-08
date: 2024-06-25
description: >
Prevent loss of job history
---
Expand Down Expand Up @@ -115,3 +115,19 @@ spec:
command:
- /home/user/bin/get-latest.sh # this command is invoked on "backup" container to get last backup number before pod deletion; not having it in the CR may cause loss of data
```
#### Customizing pvc backup behaviour
To handle situations where the operator crashes or gets killed during a backup or restore process, retry logic has been implemented: a new backup or restore waits for a previous one that is still running before giving up.
This logic can be customized by adjusting the following environment variables:
* **Backup**: total wait time before giving up, by default: 180s
* `BACKUP_RETRY_COUNT`: by default is `3`
* `BACKUP_RETRY_INTERVAL`: by default is `60`

* **Restore**: total wait time before giving up, by default: 100s
* `RESTORE_RETRY_COUNT`: by default is `10`
* `RESTORE_RETRY_INTERVAL`: by default is `10`

You can adjust the retry logic based on the size of your backup and the duration of the restore process.

0 comments on commit b722ef1

Please sign in to comment.