Skip to content

Commit

Permalink
memory-monitor: Move file cleanup to exit handler.
Browse files Browse the repository at this point in the history
This update moves the file cleanup process to an exit handler to ensure
it runs even if the script encounters an error. By trapping the cleanup
function, the script reliably archives old files, deletes temporary
files, and maintains the total archive size below 100 MB. This change
addresses a previous issue where errors could lead to leftover files not
being removed, enhancing the overall reliability and resource
management.

Signed-off-by: Nikolay Martyanov <[email protected]>
  • Loading branch information
OhmSpectator committed Oct 16, 2024
1 parent 0a6aedf commit e71af37
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 44 deletions.
1 change: 1 addition & 0 deletions pkg/memory-monitor/src/monitor/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

// File name of the handler shell script.
#define HANDLER_SCRIPT "memory-monitor-handler.sh"

// The paths here should correspond to the paths in src/monitor/memory-monitor-handler.sh
// Directory the handler script changes into for its cleanup (`cd output`).
#define LOG_DIR "output"
// Event log file name; matches EVENT_LOG_FILE in the handler script.
#define EVENT_LOG_FILE "events.log"
// Handler log file name; matches MEMORY_MONITOR_HANDLER_LOG_FILE in the handler script.
#define HANDLER_LOG_FILE "memory-monitor-handler.log"
Expand Down
108 changes: 64 additions & 44 deletions pkg/memory-monitor/src/monitor/memory-monitor-handler.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,63 @@
# Trace every command and abort on the first unhandled failure
set -x
set -e

# The file names here should correspond to the ones in src/monitor/config.h
MEMORY_MONITOR_HANDLER_LOG_FILE="memory-monitor-handler.log"
EVENT_LOG_FILE="events.log"
PSI_FILE="psi.txt"

# Upper bound for the data kept in the output directory. Once it is exceeded,
# the oldest archives are removed by cleanup().
# The value was 1 (contradicting the "100 MB" intent and the previously
# hard-coded 102400 KB limit), which made cleanup prune archives at 1 MB.
MAX_OUTPUT_SIZE_MB=100
MAX_OUTPUT_SIZE_KB=$((MAX_OUTPUT_SIZE_MB * 1024))

# Archive every sub-directory of the current directory except the one named
# "$timestamp", then remove the archived directory.
# It's necessary as one output directory takes around 15 MB. In archive, it's
# compressed to 1-2 MB. The latest output is kept as is for easy access.
# Globals:   timestamp (read) - name of the directory that must be kept
# Returns:   0 if everything was archived, 1 if at least one archive failed
tar_old_output() {
  tar_old_output_failed=0
  for dir in */; do
    # When nothing matches, the glob stays the literal "*/": skip it instead
    # of creating a bogus "*.tar.gz"
    [ -d "$dir" ] || continue
    # Keep the latest output untouched
    [ "$dir" != "$timestamp/" ] || continue
    # Remove / from the end of the directory name
    tar_name=${dir%/}
    # NOTE(review): the file list is NUL-delimited but tar is not given
    # --null; confirm the tar on the target handles this as intended.
    if find "$dir" -type f -print0 | tar -czf "$tar_name.tar.gz" --files-from=-; then
      # Delete the directory only once its archive has been written
      rm -rf -- "$dir"
    else
      # Do not leave a partial archive behind, and keep the original data
      rm -f -- "$tar_name.tar.gz"
      tar_old_output_failed=1
    fi
  done
  return "$tar_old_output_failed"
}

# Print the size, in KB, of the current directory that counts against the
# archive quota: the total disk usage minus the handler log and minus the PSI
# file (its size is regulated by the PSICollector).
# Globals: MEMORY_MONITOR_HANDLER_LOG_FILE, PSI_FILE (read)
managed_output_size_kb() {
  managed_kb=$(du -s | awk '{print $1}') # Size in KB
  managed_kb=${managed_kb:-0}
  for unmanaged_file in "$MEMORY_MONITOR_HANDLER_LOG_FILE" "$PSI_FILE"; do
    # stat reports bytes, so convert to KB; a missing file contributes nothing
    if [ -f "$unmanaged_file" ]; then
      managed_kb=$((managed_kb - $(stat -c %s "$unmanaged_file") / 1024))
    fi
  done
  echo "$managed_kb"
}

# Exit handler: removes the temporary files, archives old output directories
# and deletes the oldest archives until the output fits into the size limit.
# Globals: sorted_eve_processes, sorted_pillar_processes (read, may be unset
#          when the script fails early), EVENT_LOG_FILE, MAX_OUTPUT_SIZE_KB
cleanup() {
  # Disable the script debug messages, so that the caller of the script can print the
  # last lines of the log file that most likely contain the error message
  set +x

  # Clean up the temporary files first, so they are removed even if archiving
  # fails later. The variables are empty when the script exits before mktemp.
  for tmp_file in "${sorted_eve_processes:-}" "${sorted_pillar_processes:-}"; do
    if [ -n "$tmp_file" ]; then
      rm -f -- "$tmp_file"
    fi
  done

  # Nothing to archive or prune without the output directory
  cd output || return 0

  # Deliberately not used in a condition: under `set -e` a failure stops here
  tar_old_output

  # Remove old archives, do not keep more than MAX_OUTPUT_SIZE_KB of data
  total_size=$(managed_output_size_kb)
  while [ "$total_size" -gt "$MAX_OUTPUT_SIZE_KB" ]; do
    oldest_archive=$(find . -type f -name "*.tar.gz" -print | sort -n | head -n 1)
    if [ -z "$oldest_archive" ]; then
      break
    fi
    rm -f -- "$oldest_archive"
    # Remove the first line from the events.log file: it contains the oldest event info
    if [ -f "$EVENT_LOG_FILE" ]; then
      sed -i '1d' "$EVENT_LOG_FILE"
    fi
    # Recompute with the same exclusions as before the loop (the PSI file was
    # previously counted here, which caused extra archives to be deleted)
    total_size=$(managed_output_size_kb)
  done
}

# Register cleanup as the EXIT handler, so it runs on every exit path of the
# script, including an abort caused by `set -e`
trap cleanup EXIT

# Define the function to recursively process each cgroup
find_pids_of_cgroup() {
path=$1
Expand Down Expand Up @@ -113,6 +170,12 @@ pillar_processes=$(mktemp)
# Temporary files; they are removed by the cleanup function
sorted_eve_processes=$(mktemp)
sorted_pillar_processes=$(mktemp)

# Create the output directory if necessary
# The directory path is the first argument of the script
current_output_dir=$1
# Get the timestamp from the directory name (it's the last part of the path)
# tar_old_output compares directory names against this value to decide which
# directory is kept unarchived
timestamp=$(basename "$current_output_dir")
mkdir -p "$current_output_dir"

# Process the cgroup and its subgroups
find_pids_of_cgroup "$cgroup_eve" "$eve_processes"
normalize_pids "$eve_processes" "$sorted_eve_processes"
Expand All @@ -126,12 +189,6 @@ rm "$pillar_processes"
# TODO How to deal with the older eve versions that do not support the debug command?
eve http-debug

# Create the output directory if necessary
current_output_dir=$1
# Get the timestamp from the directory name (it's the last part of the path)
timestamp=$(basename "$current_output_dir")
mkdir -p "$current_output_dir"

# ==== Handle the Pillar memory usage ====

show_pid_mem_usage "eve/services/pillar" "$sorted_pillar_processes" "$current_output_dir/memstat_pillar.out"
Expand All @@ -155,41 +212,4 @@ eve http-debug stop

ln -s /containers/services/pillar/rootfs/opt/zededa/bin/zedbox "$current_output_dir/zedbox"

# Clean up the temporary file
rm "$sorted_eve_processes"
rm "$sorted_pillar_processes"

# Tar directory with previous output to save space, but keep the latest output as is for easy access
# It's necessary as one output directory takes around 15 Mb. In archive, it's compressed to 1-2 Mb.
cd output || exit
for dir in */; do
if [ "$dir" != "$timestamp/" ]; then
#Remove / from the end of the directory name
tar_name=${dir%/}
find "$dir" -type f -print0 | tar -czf "$tar_name.tar.gz" --files-from=-
rm -rf "$dir"
fi
done

MEMORY_MONITOR_HANDLER_LOG_FILE="memory-monitor-handler.log"

# Remove old archives, do not keep more than 100 MB of archives
total_size=$(du -s | awk '{print $1}') # Size in KB
# Subtract the size of the handler log file
total_size=$((total_size - $(stat -c %s "$MEMORY_MONITOR_HANDLER_LOG_FILE") / 1024))
# Subtract the size of the psi.txt file (if it exists) as its size is regulated by the PSICollector
if [ -f psi.txt ]; then
total_size=$((total_size - $(stat -c %s psi.txt) / 1024))
fi
while [ "$total_size" -gt 102400 ]; do
found_archives=$(find . -type f -name "*.tar.gz" -print | sort -n)
if [ -z "$found_archives" ]; then
break
fi
oldest_archive=$(echo "$found_archives" | head -n 1)
rm "$oldest_archive"
# Remove the first line from the events.log file: it contains the oldest event info
sed -i '1d' events.log
total_size=$(du -s | awk '{print $1}')
total_size=$((total_size - $(stat -c %s "$MEMORY_MONITOR_HANDLER_LOG_FILE") / 1024))
done
# cleanup function will be called here automatically

0 comments on commit e71af37

Please sign in to comment.