Add disk full incident response toolkit
This commit is contained in:
Executable
+124
@@ -0,0 +1,124 @@
|
||||
#!/usr/bin/env bash
# Shared configuration for the disk-full toolkit helpers defined below.
# Every setting may be overridden from the environment before start-up.
set -o errexit
set -o nounset
set -o pipefail

# Timestamp used to build the default log file name.
TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d_%H%M%S)}"
# "true" keeps run_cmd and confirm_execute in safe (no-change) mode.
DRY_RUN="${DRY_RUN:-true}"
# File that log, section and run_cmd append their output to.
LOG_FILE="${LOG_FILE:-/tmp/disk_full_${TIMESTAMP}.log}"
# Usage percentages compared against the Use% column reported by df.
WARN_THRESHOLD="${WARN_THRESHOLD:-80}"
CRIT_THRESHOLD="${CRIT_THRESHOLD:-90}"
EMERGENCY_THRESHOLD="${EMERGENCY_THRESHOLD:-95}"

# Export the log location so that scripts started as child processes keep
# appending to the same file instead of deriving a new timestamped name.
export TIMESTAMP LOG_FILE
|
||||
|
||||
#######################################
# Print "<LEVEL>: <message>" to stdout and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - level label; remaining arguments - message words
# Outputs:   the formatted line on stdout
# Returns:   always 0
#######################################
log() {
  local level="$1"
  shift
  local message="$*"

  # Logging is best effort: during a disk-full incident the log file itself
  # may be unwritable. tee still prints the line (and its own error), so do
  # not let that failure abort the caller under errexit/pipefail.
  printf '%s: %s\n' "$level" "$message" | tee -a "$LOG_FILE" || true
}
|
||||
|
||||
# Emit a message at OK level through log().
ok() {
  log OK "$@"
}
|
||||
|
||||
# Emit a message at WARNING level through log().
warning() {
  log WARNING "$@"
}
|
||||
|
||||
# Emit a message at CRITICAL level through log().
critical() {
  log CRITICAL "$@"
}
|
||||
|
||||
#######################################
# Print a "== title ==" heading (preceded by a blank line) to stdout and
# append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - heading text
# Returns:   always 0
#######################################
section() {
  # Best effort, like log(): an unwritable log file must not abort the
  # caller under errexit/pipefail; the heading is still shown on stdout.
  printf '\n== %s ==\n' "$1" | tee -a "$LOG_FILE" || true
}
|
||||
|
||||
#######################################
# Check that a command can be resolved by the shell.
# Arguments: $1 - command name
# Outputs:   a WARNING (via warning) when the command is not available
# Returns:   0 when available, 1 otherwise
#######################################
require_cmd() {
  local cmd="$1"

  if ! command -v "$cmd" >/dev/null 2>&1; then
    warning "Command not available: $cmd"
    return 1
  fi

  return 0
}
|
||||
|
||||
#######################################
# Run a command, or only announce it when DRY_RUN is "true".
# Globals:   DRY_RUN (read), LOG_FILE (read)
# Arguments: the command and its arguments
# Outputs:   a DRY-RUN/RUN line via ok; the command's stdout and stderr are
#            shown and appended to the log file
# Returns:   2 when called without a command, 0 in dry-run mode, otherwise
#            the exit status of the command itself
#######################################
run_cmd() {
  if [[ "$#" -eq 0 ]]; then
    critical "run_cmd called without a command"
    return 2
  fi

  if [[ "$DRY_RUN" == "true" ]]; then
    ok "DRY-RUN: $*"
    return 0
  fi

  ok "RUN: $*"

  # Report the command's own status, not tee's: a log file that cannot be
  # written (e.g. on the full filesystem) must not look like a failed command.
  # The "|| true" keeps errexit from firing before PIPESTATUS is captured.
  local -a stage_status=(0)
  {
    "$@" 2>&1 | tee -a "$LOG_FILE"
    stage_status=("${PIPESTATUS[@]}")
  } || true

  return "${stage_status[0]}"
}
|
||||
|
||||
#######################################
# Ask the operator to type EXECUTE before a destructive action proceeds.
# Globals:   DRY_RUN (read)
# Arguments: $1 - description of the action (default: "disk-full remediation")
# Outputs:   status lines via ok/warning/critical and a prompt on stdout
# Returns:   0 in dry-run mode or after confirmation; exits the shell with
#            status 1 when the confirmation text does not match
#######################################
confirm_execute() {
  local target="${1:-disk-full remediation}"
  # Kept local so the operator's answer does not leak into the caller's scope.
  local confirmation=""

  if [[ "$DRY_RUN" == "true" ]]; then
    ok "Safe mode enabled. No destructive actions will be taken."
    return 0
  fi

  warning "Execution mode requested for: $target"
  warning "Confirm the affected filesystem, application impact, backups, and change approval before continuing."
  printf 'Type EXECUTE to continue: '

  # read returns non-zero on EOF (closed or non-interactive stdin). Without
  # the guard errexit would end the script silently; fall through to the
  # comparison below so a refusal is always reported.
  read -r confirmation || true

  if [[ "$confirmation" != "EXECUTE" ]]; then
    critical "Confirmation failed. Aborting."
    exit 1
  fi

  ok "Execution confirmed by operator."
}
|
||||
|
||||
#######################################
# Check that a path argument is non-empty and exists.
# Arguments: $1 - path to check (a missing argument is treated as empty)
# Outputs:   a CRITICAL message (via critical) describing the problem
# Returns:   0 when the path exists, 2 otherwise
#######################################
validate_path() {
  # Default to empty so a missing argument reaches the explicit check below
  # instead of tripping nounset.
  local path="${1:-}"

  if [[ -z "$path" ]]; then
    critical "Path cannot be empty"
    return 2
  fi

  if [[ ! -e "$path" ]]; then
    critical "Path does not exist: $path"
    return 2
  fi

  return 0
}
|
||||
|
||||
#######################################
# Strip one trailing "%" from a usage value such as "85%".
# Arguments: $1 - value as printed by df
# Outputs:   the value without the trailing percent sign, newline-terminated
#######################################
usage_percent_number() {
  local raw="$1"
  local stripped="${raw%'%'}"

  printf '%s\n' "$stripped"
}
|
||||
|
||||
#######################################
# Map a usage percentage to a status label.
# Globals:   EMERGENCY_THRESHOLD, CRIT_THRESHOLD, WARN_THRESHOLD (read)
# Arguments: $1 - usage percentage without the "%" sign
# Outputs:   CRITICAL at or above the emergency threshold, WARNING at or
#            above the critical or warning threshold, OK otherwise (no
#            trailing newline). A value that is not a plain non-negative
#            integer (df prints "-" for some filesystems) yields WARNING,
#            because its usage cannot be assessed.
#######################################
status_for_percent() {
  local percent="${1:-}"

  # Never feed unvalidated text to (( )): bash evaluates it as an expression,
  # which fails on "-" and would evaluate variable names or subscripts.
  if ! [[ "$percent" =~ ^[0-9]+$ ]]; then
    printf 'WARNING'
    return 0
  fi

  # Force base 10 so a value such as "08" is not rejected as invalid octal.
  percent=$(( 10#$percent ))

  if (( percent >= EMERGENCY_THRESHOLD )); then
    printf 'CRITICAL'
  elif (( percent >= CRIT_THRESHOLD || percent >= WARN_THRESHOLD )); then
    printf 'WARNING'
  else
    printf 'OK'
  fi
}
|
||||
|
||||
#######################################
# Print, one word per line, a find(1) expression fragment naming the
# pseudo-filesystem paths /proc, /sys, /dev, /run and the
# /tmp/systemd-private-* pattern.
# Outputs:   the expression words on stdout
#######################################
safe_find_prune_args() {
  local -a prune_words=(
    -path /proc -o
    -path /sys -o
    -path /dev -o
    -path /run -o
    -path '/tmp/systemd-private-*'
  )

  printf '%s\n' "${prune_words[@]}"
}
|
||||
+47
@@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env bash
# Disk overview: prints df space and inode usage, lists filesystems sorted
# by usage, and summarises each mount against the configured thresholds.
# Exit status: 1 when a filesystem is at or above EMERGENCY_THRESHOLD or df
# is unavailable, 0 otherwise.
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

exit_code=0

section "Disk Space Overview"
if require_cmd df; then
  df -h 2>&1 | tee -a "$LOG_FILE"
else
  critical "df is required for disk overview"
  exit 1
fi

section "Inode Overview"
df -i 2>&1 | tee -a "$LOG_FILE" || warning "Unable to collect inode usage"

section "Filesystems Sorted By Usage"
# The mount point is printed last and rebuilt from field 6 onwards so that a
# mount point containing spaces is not truncated by the field split.
df -P -h |
  awk 'NR > 1 {
    mount = $6
    for (i = 7; i <= NF; i++) mount = mount " " $i
    print $5, $1, $2, $3, $4, mount
  }' |
  sort -rn |
  while read -r used fs size used_space avail mount; do
    percent="$(usage_percent_number "$used")"
    level="$(status_for_percent "$percent")"
    printf '%s: %s used on %s (%s, size=%s used=%s avail=%s)\n' "$level" "$used" "$mount" "$fs" "$size" "$used_space" "$avail" | tee -a "$LOG_FILE"
  done

section "Threshold Summary"
while read -r fs size used avail pct mount; do
  percent="$(usage_percent_number "$pct")"

  # df prints "-" when it cannot compute a percentage; such a value must not
  # reach the arithmetic comparisons below.
  if ! [[ "$percent" =~ ^[0-9]+$ ]]; then
    warning "$mount reports no usage percentage ($pct) on $fs"
    continue
  fi

  if (( percent >= EMERGENCY_THRESHOLD )); then
    critical "$mount is ${pct} full on $fs (size=$size used=$used avail=$avail)"
    exit_code=1
  elif (( percent >= CRIT_THRESHOLD || percent >= WARN_THRESHOLD )); then
    warning "$mount is ${pct} full on $fs (size=$size used=$used avail=$avail)"
  else
    ok "$mount is ${pct} full on $fs"
  fi
done < <(
  df -P -h |
    awk 'NR > 1 {
      mount = $6
      for (i = 7; i <= NF; i++) mount = mount " " $i
      print $1, $2, $3, $4, $5, mount
    }'
)

exit "$exit_code"
|
||||
+63
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env bash
# Read-only scan that lists the largest regular files under a path.
# Usage: [--path <path>] [--top <N>]   (defaults: "/" and 20)
# Exit status: 2 on invalid input, 1 when a required command is missing.
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

SEARCH_PATH="/"
TOP_N=20

usage() {
  printf 'Usage: %s [--path <path>] [--top <N>]\n' "$(basename "$0")"
}

# Exit with status 2 when option $1 has no value left ($2 = remaining argc).
# Without this check "shift 2" fails and errexit ends the script silently.
need_value() {
  if (( $2 < 2 )); then
    critical "Missing value for $1"
    usage
    exit 2
  fi
}

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --path) need_value "$1" "$#"; SEARCH_PATH="$2"; shift 2 ;;
    --top) need_value "$1" "$#"; TOP_N="$2"; shift 2 ;;
    -h|--help) usage; exit 0 ;;
    *) critical "Unknown argument: $1"; usage; exit 2 ;;
  esac
done

# 10# forces base 10 so a value such as "08" is not rejected as octal.
if ! [[ "$TOP_N" =~ ^[0-9]+$ ]] || (( 10#$TOP_N < 1 )); then
  critical "--top must be a positive integer"
  exit 2
fi

validate_path "$SEARCH_PATH" || exit 2
require_cmd find || exit 1
require_cmd sort || exit 1
require_cmd awk || exit 1

section "Largest Files Under $SEARCH_PATH"
warning "Read-only scan. Permission errors can be normal without root access."

# find exits non-zero when it hits unreadable directories, which is expected
# without root; "|| true" keeps pipefail/errexit from ending the script.
# awk applies the --top limit itself and reads all of sort's output, so sort
# is never killed by SIGPIPE the way it can be behind "head".
{
  find "$SEARCH_PATH" -xdev \
    \( -path /proc -o -path /sys -o -path /dev -o -path /run \) -prune -o \
    -type f -printf '%s\t%p\n' 2>/dev/null || true
} |
  sort -rn |
  awk -F '\t' -v top="$TOP_N" '
    # Format a byte count with a binary-scaled unit suffix.
    function human(bytes,    units, value, idx) {
      split("B KB MB GB TB PB", units, " ")
      value = bytes
      idx = 1
      while (value >= 1024 && idx < 6) {
        value = value / 1024
        idx++
      }
      return sprintf("%.1f%s", value, units[idx])
    }
    NR > top { next }
    {
      # Take the path verbatim after the first tab so that whitespace inside
      # file names is preserved.
      printf "%10s %s\n", human($1), substr($0, index($0, "\t") + 1)
    }
  ' |
  tee -a "$LOG_FILE"

ok "No files were modified."
|
||||
@@ -0,0 +1,41 @@
|
||||
#!/usr/bin/env bash
# Read-only check for files that are deleted but still held open by a
# process, using "lsof +L1" when lsof is available.
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

section "Deleted But Open Files"

if ! require_cmd lsof; then
  warning "lsof is not installed or not in PATH. Install lsof or run equivalent tooling with appropriate privileges."
  exit 0
fi

warning "Read-only check. Full results may require elevated privileges."

# lsof exits non-zero when nothing matches; that is not an error here.
deleted_output="$(lsof -nP +L1 2>/dev/null || true)"

if [[ -z "$deleted_output" ]]; then
  ok "No deleted open files detected by lsof."
  exit 0
fi

printf '%s\n' "$deleted_output" |
  awk '
    NR == 1 {
      # Locate the columns from the header instead of hard-coding them:
      # +L1 adds an NLINK column, so NAME is not the ninth field.
      size_col = 7
      name_col = NF
      for (i = 1; i <= NF; i++) {
        if ($i == "SIZE/OFF") {
          size_col = i
        } else if ($i == "NAME") {
          name_col = i
        }
      }
      printf "%-20s %-10s %-12s %s\n", "PROCESS", "PID", "SIZE", "PATH"
      next
    }
    {
      # NOTE(review): rows in which lsof leaves a column blank shift the
      # fields; "lsof -F" output would be needed for exact parsing.
      path = $name_col
      for (i = name_col + 1; i <= NF; i++) {
        path = path " " $i
      }
      printf "%-20s %-10s %-12s %s\n", $1, $2, $size_col, path
    }
  ' | tee -a "$LOG_FILE"

warning "Space from deleted files is released when the owning process closes the file or is safely restarted."
|
||||
+51
@@ -0,0 +1,51 @@
|
||||
#!/usr/bin/env bash
# Read-only scan that lists the largest directories under a path with du.
# Usage: [--path <path>] [--depth <N>] [--top <N>]  (defaults: "/", 2, 25)
# Exit status: 2 on invalid input, 1 when a required command is missing.
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

SEARCH_PATH="/"
DEPTH=2
TOP_N=25

usage() {
  printf 'Usage: %s [--path <path>] [--depth <N>] [--top <N>]\n' "$(basename "$0")"
}

# Exit with status 2 when option $1 has no value left ($2 = remaining argc).
# Without this check "shift 2" fails and errexit ends the script silently.
need_value() {
  if (( $2 < 2 )); then
    critical "Missing value for $1"
    usage
    exit 2
  fi
}

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --path) need_value "$1" "$#"; SEARCH_PATH="$2"; shift 2 ;;
    --depth) need_value "$1" "$#"; DEPTH="$2"; shift 2 ;;
    --top) need_value "$1" "$#"; TOP_N="$2"; shift 2 ;;
    -h|--help) usage; exit 0 ;;
    *) critical "Unknown argument: $1"; usage; exit 2 ;;
  esac
done

if ! [[ "$DEPTH" =~ ^[0-9]+$ ]]; then
  critical "--depth must be a non-negative integer"
  exit 2
fi

# 10# forces base 10 so a value such as "08" is not rejected as octal.
if ! [[ "$TOP_N" =~ ^[0-9]+$ ]] || (( 10#$TOP_N < 1 )); then
  critical "--top must be a positive integer"
  exit 2
fi

validate_path "$SEARCH_PATH" || exit 2
require_cmd du || exit 1
require_cmd sort || exit 1
require_cmd awk || exit 1

section "Top Directories Under $SEARCH_PATH"
warning "Read-only scan. Permission errors can be normal without root access."

# du exits non-zero when it hits unreadable directories, which is expected
# without root; "|| true" keeps pipefail/errexit from ending the script.
# awk applies the --top limit and reads all of sort's output, so sort is
# never killed by SIGPIPE the way it can be behind "head".
{
  du -x -h --max-depth="$DEPTH" -- "$SEARCH_PATH" 2>/dev/null || true
} |
  sort -hr |
  awk -v top="$TOP_N" 'NR <= top' |
  tee -a "$LOG_FILE"

ok "No directories were modified."
|
||||
+97
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env bash
# Log cleanup analysis: lists large log files and old rotated logs under a
# path, and removes the old rotated logs only with --execute plus operator
# confirmation.
# Usage: [--path <path>] [--days-old <N>] [--execute]
# Exit status: 2 on invalid input, 1 when a required command is missing.
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

EXECUTE=false
LOG_PATH="/var/log"
DAYS_OLD=14

usage() {
  printf 'Usage: %s [--path <path>] [--days-old <N>] [--execute]\n' "$(basename "$0")"
}

# Exit with status 2 when option $1 has no value left ($2 = remaining argc).
# Without this check "shift 2" fails and errexit ends the script silently.
need_value() {
  if (( $2 < 2 )); then
    critical "Missing value for $1"
    usage
    exit 2
  fi
}

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --path) need_value "$1" "$#"; LOG_PATH="$2"; shift 2 ;;
    --days-old) need_value "$1" "$#"; DAYS_OLD="$2"; shift 2 ;;
    --execute) EXECUTE=true; DRY_RUN=false; shift ;;
    -h|--help) usage; exit 0 ;;
    *) critical "Unknown argument: $1"; usage; exit 2 ;;
  esac
done

# 10# forces base 10 so a value such as "08" is not rejected as octal.
if ! [[ "$DAYS_OLD" =~ ^[0-9]+$ ]] || (( 10#$DAYS_OLD < 1 )); then
  critical "--days-old must be a positive integer"
  exit 2
fi

validate_path "$LOG_PATH" || exit 2
require_cmd find || exit 1
require_cmd sort || exit 1

section "Large Log Files In $LOG_PATH"
# find exits non-zero on unreadable directories; "|| true" keeps
# pipefail/errexit from ending the script before the review sections run.
{
  find "$LOG_PATH" -xdev -type f \
    \( -name '*.log' -o -name '*log' -o -name 'messages*' -o -name 'syslog*' \) \
    -size +100M -printf '%s\t%p\n' 2>/dev/null || true
} |
  sort -rn |
  awk -F '\t' '
    # Format a byte count with a binary-scaled unit suffix.
    function human(bytes,    units, value, idx) {
      split("B KB MB GB TB", units, " ")
      value = bytes
      idx = 1
      while (value >= 1024 && idx < 5) {
        value = value / 1024
        idx++
      }
      return sprintf("%.1f%s", value, units[idx])
    }
    {
      # Take the path verbatim after the first tab so that whitespace inside
      # file names is preserved.
      printf "%10s %s\n", human($1), substr($0, index($0, "\t") + 1)
    }
  ' | tee -a "$LOG_FILE"

section "Old Rotated Logs Eligible For Review"
# Collect candidates NUL-delimited: with newline-delimited output a file name
# containing a newline would be split into several entries, and one of them
# could name an unrelated file that is then removed.
rotated_logs=()
while IFS= read -r -d '' rotated_log; do
  rotated_logs+=("$rotated_log")
done < <(
  find "$LOG_PATH" -xdev -type f \
    \( -name '*.gz' -o -name '*.1' -o -name '*.old' -o -name '*.bz2' -o -name '*.xz' \) \
    -mtime +"$DAYS_OLD" -print0 2>/dev/null | sort -z
)

if [[ "${#rotated_logs[@]}" -eq 0 ]]; then
  ok "No old rotated logs found under $LOG_PATH with age greater than $DAYS_OLD days."
else
  printf '%s\n' "${rotated_logs[@]}" | tee -a "$LOG_FILE"
fi

section "Suggested Cleanup Commands"
cat <<SUGGESTIONS | tee -a "$LOG_FILE"
# Review large active logs before truncating. Prefer application-aware log rotation:
logrotate -d /etc/logrotate.conf

# Remove old rotated logs only after retention approval:
$(basename "$0") --path "$LOG_PATH" --days-old "$DAYS_OLD" --execute
SUGGESTIONS

if [[ "$EXECUTE" != "true" ]]; then
  ok "Safe mode. No logs were removed."
  exit 0
fi

if [[ "${#rotated_logs[@]}" -eq 0 ]]; then
  ok "Execution requested, but there are no eligible old rotated logs to remove."
  exit 0
fi

confirm_execute "remove old rotated logs from $LOG_PATH"

for file in "${rotated_logs[@]}"; do
  # Re-check right before removal: the file may have changed since the scan.
  if [[ -f "$file" && ! -L "$file" ]]; then
    run_cmd rm -f -- "$file"
  else
    warning "Skipped non-regular file or symlink: $file"
  fi
done

ok "Old rotated log cleanup completed."
|
||||
+78
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env bash
# Emergency quick fixes: truncate one verified log file and/or restart one
# service. Nothing is changed without --execute and operator confirmation.
# Usage: [--truncate-file <path>] [--restart-service <name>] [--execute]
# Exit status: 2 on invalid input, 1 on an operational problem.
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

EXECUTE=false
TRUNCATE_FILE=""
RESTART_SERVICE=""

usage() {
  printf 'Usage: %s [--truncate-file <path>] [--restart-service <name>] [--execute]\n' "$(basename "$0")"
}

# Exit with status 2 when option $1 has no value left ($2 = remaining argc).
# Without this check "shift 2" fails and errexit ends the script silently.
need_value() {
  if (( $2 < 2 )); then
    critical "Missing value for $1"
    usage
    exit 2
  fi
}

# Exit with status 2 unless TRUNCATE_FILE is an existing regular file that is
# not a symlink.
check_truncate_target() {
  validate_path "$TRUNCATE_FILE" || exit 2

  if [[ ! -f "$TRUNCATE_FILE" || -L "$TRUNCATE_FILE" ]]; then
    critical "Refusing to truncate non-regular file or symlink: $TRUNCATE_FILE"
    exit 2
  fi
}

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --truncate-file) need_value "$1" "$#"; TRUNCATE_FILE="$2"; shift 2 ;;
    --restart-service) need_value "$1" "$#"; RESTART_SERVICE="$2"; shift 2 ;;
    --execute) EXECUTE=true; DRY_RUN=false; shift ;;
    -h|--help) usage; exit 0 ;;
    *) critical "Unknown argument: $1"; usage; exit 2 ;;
  esac
done

section "Emergency Disk Full Quick Fix Options"
cat <<OPTIONS | tee -a "$LOG_FILE"
Possible actions after incident commander approval:
1. Truncate a verified active log file:
$0 --truncate-file /path/to/large.log --execute

2. Restart a specific service holding deleted files open:
$0 --restart-service service-name --execute

Review application impact before either action. Truncation preserves the file inode but destroys file contents.
OPTIONS

if [[ -z "$TRUNCATE_FILE" && -z "$RESTART_SERVICE" ]]; then
  ok "No quick fix requested. Printed options only."
  exit 0
fi

if [[ "$EXECUTE" != "true" ]]; then
  warning "Quick fix arguments supplied without --execute. No changes made."
  exit 0
fi

# Validate every request before prompting, so that an invalid service name
# or a missing systemctl is reported before anything has been truncated.
if [[ -n "$TRUNCATE_FILE" ]]; then
  check_truncate_target
fi

if [[ -n "$RESTART_SERVICE" ]]; then
  # A leading "-" would be parsed by systemctl as an option.
  if [[ "$RESTART_SERVICE" == -* || "$RESTART_SERVICE" == *"/"* || "$RESTART_SERVICE" == *".."* ]]; then
    critical "Invalid service name: $RESTART_SERVICE"
    exit 2
  fi

  if ! require_cmd systemctl; then
    critical "systemctl is required to restart services"
    exit 1
  fi
fi

confirm_execute "emergency disk-full quick fix"

if [[ -n "$TRUNCATE_FILE" ]]; then
  # Re-check immediately before truncating: the operator may have paused at
  # the prompt and the path may have changed in the meantime.
  check_truncate_target

  warning "Truncating file contents: $TRUNCATE_FILE"
  : > "$TRUNCATE_FILE"
  ok "Truncated $TRUNCATE_FILE"
fi

if [[ -n "$RESTART_SERVICE" ]]; then
  run_cmd systemctl restart -- "$RESTART_SERVICE"
  ok "Restart requested for service: $RESTART_SERVICE"
fi
|
||||
+90
@@ -0,0 +1,90 @@
|
||||
#!/usr/bin/env bash
# Post-cleanup validation: prints current df space and inode usage, checks
# every filesystem against the thresholds and, with --before-file, compares
# usage percentages against an earlier df capture.
# Usage: [--before-file <df_output_file>]
# Exit status: 1 when a filesystem is still at or above EMERGENCY_THRESHOLD
# or df is missing, 2 on invalid input, 0 otherwise.
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

BEFORE_FILE=""
exit_code=0

usage() {
  printf 'Usage: %s [--before-file <df_output_file>]\n' "$(basename "$0")"
}

# Exit with status 2 when option $1 has no value left ($2 = remaining argc).
# Without this check "shift 2" fails and errexit ends the script silently.
need_value() {
  if (( $2 < 2 )); then
    critical "Missing value for $1"
    usage
    exit 2
  fi
}

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --before-file) need_value "$1" "$#"; BEFORE_FILE="$2"; shift 2 ;;
    -h|--help) usage; exit 0 ;;
    *) critical "Unknown argument: $1"; usage; exit 2 ;;
  esac
done

require_cmd df || exit 1

section "Post-Cleanup Disk Space"
df -h 2>&1 | tee -a "$LOG_FILE"

section "Post-Cleanup Inodes"
df -i 2>&1 | tee -a "$LOG_FILE" || warning "Unable to collect inode usage"

section "Critical Filesystem Check"
while read -r fs size used avail pct mount; do
  percent="$(usage_percent_number "$pct")"

  # df prints "-" when it cannot compute a percentage; such a value must not
  # reach the arithmetic comparisons below.
  if ! [[ "$percent" =~ ^[0-9]+$ ]]; then
    warning "$mount reports no usage percentage ($pct) on $fs"
    continue
  fi

  if (( percent >= EMERGENCY_THRESHOLD )); then
    critical "$mount is still ${pct} full on $fs (size=$size used=$used avail=$avail)"
    exit_code=1
  elif (( percent >= CRIT_THRESHOLD )); then
    warning "$mount remains high at ${pct} on $fs"
  else
    ok "$mount is ${pct} full on $fs"
  fi
done < <(df -P -h | awk 'NR > 1 { print $1, $2, $3, $4, $5, $6 }')

if [[ -n "$BEFORE_FILE" ]]; then
  section "Before And After Comparison"
  if [[ ! -f "$BEFORE_FILE" ]]; then
    warning "Before file not found: $BEFORE_FILE"
  else
    # First input: the earlier capture (column 5 = Use%, column 6 = mount).
    # Second input: the current "df -P -h" output.
    awk '
      NR == FNR && FNR > 1 {
        before[$6] = $5
        next
      }
      FNR > 1 {
        mount = $6
        if (mount in before) {
          before_pct = before[mount]
          after_pct = $5
          gsub(/%/, "", before_pct)
          gsub(/%/, "", after_pct)

          # Adding 0 forces a numeric comparison; as strings "9" would sort
          # after "10" and the result would be reported the wrong way round.
          before_num = before_pct + 0
          after_num = after_pct + 0

          if (after_num < before_num) {
            status = "OK"
            result = "improved"
          } else if (after_num == before_num) {
            status = "WARNING"
            result = "unchanged"
          } else {
            status = "WARNING"
            result = "increased"
          }

          printf "%s: %s before=%s after=%s (%s)\n", status, mount, before[mount], $5, result
        }
      }
    ' "$BEFORE_FILE" <(df -P -h) | tee -a "$LOG_FILE"
  fi
else
  warning "No --before-file supplied. Improvement comparison skipped."
fi

if [[ "$exit_code" -eq 0 ]]; then
  ok "Post-check completed without emergency-threshold filesystems."
else
  critical "One or more filesystems remain at or above ${EMERGENCY_THRESHOLD}%."
fi

exit "$exit_code"
|
||||
@@ -0,0 +1,84 @@
|
||||
# Linux Disk Full Incident Toolkit
|
||||
|
||||
Production-style Bash toolkit for diagnosing and handling a disk full incident on Linux systems. It is intentionally conservative: default mode is safe, cleanup actions require `--execute` and an operator confirmation prompt, and the scripts do not assume root access.
|
||||
|
||||
## Why Disk Full Incidents Happen
|
||||
|
||||
- **Logs** - application, audit, system, or middleware logs can grow faster than rotation policy expects.
- **Temporary files** - failed jobs, installers, archives, and batch workloads often leave large files in `/tmp`, `/var/tmp`, or application work directories.
- **Deleted open files** - a process can keep writing to a file after it has been deleted, hiding disk usage from normal directory listings until the process closes the file.
- **Inode exhaustion** - a filesystem can fail writes even when space is available if it has too many small files and no free inodes.
|
||||
|
||||
## Safety Model
|
||||
|
||||
- Safe dry-run behavior is the default.
- No script blindly deletes files.
- Cleanup operations require `--execute` and confirmation.
- Missing optional commands are reported as `WARNING`.
- Output is formatted with `OK`, `WARNING`, and `CRITICAL` for incident notes.
- The scripts are designed to work without root, while warning when permissions may limit visibility.
|
||||
|
||||
## Scripts
|
||||
|
||||
- `00_env.sh` - shared configuration and helper functions.
- `01_disk_overview.sh` - `df -h`, `df -i`, sorted mount usage, and threshold highlights.
- `02_find_big_files.sh` - read-only largest-file discovery.
- `03_deleted_open_files.sh` - deleted but open file detection with `lsof` when available.
- `04_top_dirs.sh` - largest directory discovery with `du`.
- `05_log_cleanup.sh` - safe log cleanup analysis and optional old rotated log removal.
- `06_quick_fix.sh` - defensive emergency actions for verified truncation or service restart.
- `07_postcheck.sh` - validation after cleanup, with optional before/after comparison.
- `disk_full_runbook.sh` - guided incident workflow.
|
||||
|
||||
## Example Usage
|
||||
|
||||
```bash
cd infra-run/scripts/bash/disk-full

./01_disk_overview.sh
./02_find_big_files.sh --path /var --top 20
./03_deleted_open_files.sh
./04_top_dirs.sh --path /var --depth 2
./05_log_cleanup.sh
./07_postcheck.sh
```
|
||||
|
||||
Run the guided read-only workflow:
|
||||
|
||||
```bash
./disk_full_runbook.sh --path /var --top 20 --depth 2
```
|
||||
|
||||
Review old rotated logs without deleting them:
|
||||
|
||||
```bash
./05_log_cleanup.sh --path /var/log --days-old 14
```
|
||||
|
||||
Remove old rotated logs only after approval:
|
||||
|
||||
```bash
./05_log_cleanup.sh --path /var/log --days-old 14 --execute
```
|
||||
|
||||
Emergency truncation of a verified active log:
|
||||
|
||||
```bash
./06_quick_fix.sh --truncate-file /var/log/app/verified-large.log --execute
```
|
||||
|
||||
Restart a specific service after confirming it is holding deleted files open:
|
||||
|
||||
```bash
./06_quick_fix.sh --restart-service app.service --execute
```
|
||||
|
||||
## Exit Codes
|
||||
|
||||
- `0` - OK
- `1` - operational issue detected or still critical
- `2` - invalid input
|
||||
|
||||
## Production Warning
|
||||
|
||||
Use this toolkit as an incident aid, not an autopilot. Confirm the affected filesystem, application ownership, retention requirements, backup expectations, and change approval before cleanup. In enterprise environments, coordinate service restarts and file truncation with application owners because both can destroy evidence or interrupt production workloads.
|
||||
+67
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env bash
# Guided incident workflow: prints the step-by-step plan, then runs the
# read-only checks (steps 1-5). Destructive steps are never run from here,
# even when --execute is supplied.
# Usage: [--path <path>] [--top <N>] [--depth <N>] [--execute]
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

SEARCH_PATH="/"
TOP_N=20
DEPTH=2
EXECUTE=false

usage() {
  printf 'Usage: %s [--path <path>] [--top <N>] [--depth <N>] [--execute]\n' "$(basename "$0")"
}

# Exit with status 2 when option $1 has no value left ($2 = remaining argc).
# Without this check "shift 2" fails and errexit ends the script silently.
need_value() {
  if (( $2 < 2 )); then
    critical "Missing value for $1"
    usage
    exit 2
  fi
}

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    --path) need_value "$1" "$#"; SEARCH_PATH="$2"; shift 2 ;;
    --top) need_value "$1" "$#"; TOP_N="$2"; shift 2 ;;
    --depth) need_value "$1" "$#"; DEPTH="$2"; shift 2 ;;
    --execute) EXECUTE=true; DRY_RUN=false; shift ;;
    -h|--help) usage; exit 0 ;;
    *) critical "Unknown argument: $1"; usage; exit 2 ;;
  esac
done

section "Disk Full Incident Workflow"
cat <<FLOW | tee -a "$LOG_FILE"
Step 1. Disk overview
$SCRIPT_DIR/01_disk_overview.sh

Step 2. Find largest files
$SCRIPT_DIR/02_find_big_files.sh --path "$SEARCH_PATH" --top "$TOP_N"

Step 3. Check deleted but open files
$SCRIPT_DIR/03_deleted_open_files.sh

Step 4. Identify top directories
$SCRIPT_DIR/04_top_dirs.sh --path "$SEARCH_PATH" --depth "$DEPTH"

Step 5. Review safe log cleanup suggestions
$SCRIPT_DIR/05_log_cleanup.sh

Step 6. Optional emergency quick fix, only after approval
$SCRIPT_DIR/06_quick_fix.sh --truncate-file /path/to/verified.log --execute
$SCRIPT_DIR/06_quick_fix.sh --restart-service service-name --execute

Step 7. Post-check
$SCRIPT_DIR/07_postcheck.sh
FLOW

if [[ "$EXECUTE" == "true" ]]; then
  warning "--execute was supplied to the runbook. Destructive actions are still not run automatically."
fi

section "Running Read-Only Incident Checks"
# Each step may exit non-zero; report it as a warning and continue so that
# one failing check does not hide the findings of the others.
"$SCRIPT_DIR/01_disk_overview.sh" || warning "Disk overview reported critical usage"
"$SCRIPT_DIR/02_find_big_files.sh" --path "$SEARCH_PATH" --top "$TOP_N" || warning "Large-file scan reported an issue"
"$SCRIPT_DIR/03_deleted_open_files.sh" || warning "Deleted-open-file check reported an issue"
"$SCRIPT_DIR/04_top_dirs.sh" --path "$SEARCH_PATH" --depth "$DEPTH" || warning "Top-directory scan reported an issue"
"$SCRIPT_DIR/05_log_cleanup.sh" || warning "Log cleanup suggestion step reported an issue"

section "Next Manual Decision"
ok "Review findings, identify owner and retention requirements, then run a targeted cleanup script with --execute only if approved."
|
||||
Reference in New Issue
Block a user