Add disk full incident response toolkit

This commit is contained in:
Mateusz Suski
2026-05-05 21:44:08 +00:00
parent 5dd8c34952
commit 76e24796bb
10 changed files with 742 additions and 0 deletions
+124
View File
@@ -0,0 +1,124 @@
#!/usr/bin/env bash
# Shared configuration for the toolkit scripts.
# Strict mode: stop on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Every setting below keeps a non-empty value supplied by the caller's
# environment and otherwise takes the default written here.
if [[ -z "${TIMESTAMP:-}" ]]; then
  TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
fi
: "${DRY_RUN:=true}"
: "${LOG_FILE:=/tmp/disk_full_${TIMESTAMP}.log}"
# Usage percentages compared against df capacity values by the helpers below.
: "${WARN_THRESHOLD:=80}"
: "${CRIT_THRESHOLD:=90}"
: "${EMERGENCY_THRESHOLD:=95}"
#######################################
# Print "LEVEL: MESSAGE" to stdout and append the same line to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - level label; remaining arguments are joined into the message
# Outputs:   the formatted line on stdout
# Returns:   0, even when the log file cannot be written
#######################################
log() {
  local level="$1"
  shift
  local message="$*"
  # tee keeps copying to stdout when it cannot write the log file (for
  # example when the log's own filesystem is the one that is full) but exits
  # non-zero. With pipefail and errexit active that would abort the calling
  # script at its first log line, so logging is treated as best-effort.
  printf '%s: %s\n' "$level" "$message" | tee -a "$LOG_FILE" || true
}
# Log the given message at level OK.
ok() { log "OK" "$@"; }
# Log the given message at level WARNING.
warning() { log "WARNING" "$@"; }
# Log the given message at level CRITICAL.
critical() { log "CRITICAL" "$@"; }
#######################################
# Print a blank line followed by a "== TITLE ==" heading, and append both to
# the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - heading text
# Returns:   0, even when the log file cannot be written
#######################################
section() {
  # A tee failure (unwritable or full log filesystem) must not abort the
  # caller under pipefail + errexit; the heading still reaches stdout.
  printf '\n== %s ==\n' "$1" | tee -a "$LOG_FILE" || true
}
#######################################
# Check that a command can be found by the shell.
# Arguments: $1 - command name
# Outputs:   a WARNING line when the command is missing
# Returns:   0 when available, 1 otherwise
#######################################
require_cmd() {
  local tool="$1"
  if ! command -v "$tool" >/dev/null 2>&1; then
    warning "Command not available: $tool"
    return 1
  fi
  return 0
}
#######################################
# Run a command, or only announce it when dry-run mode is active.
# Globals:   DRY_RUN (read), LOG_FILE (read)
# Arguments: the command and its arguments
# Outputs:   an OK line describing the command; in execute mode the
#            command's stdout and stderr, also appended to the log file
# Returns:   2 when called without a command, 0 in dry-run mode, otherwise
#            the status of the command/tee pipeline
#######################################
run_cmd() {
  if (( $# == 0 )); then
    critical "run_cmd called without a command"
    return 2
  fi

  case "$DRY_RUN" in
    true)
      ok "DRY-RUN: $*"
      return 0
      ;;
  esac

  ok "RUN: $*"
  "$@" 2>&1 | tee -a "$LOG_FILE"
}
#######################################
# Gate destructive work behind an explicit operator confirmation.
# In dry-run mode it only reports that safe mode is active. Otherwise it
# prompts on stdout and requires the exact word EXECUTE on stdin.
# Globals:   DRY_RUN (read)
# Arguments: $1 - description of the action (default "disk-full remediation")
# Returns:   0 in dry-run mode or after a successful confirmation;
#            exits the shell with status 1 when confirmation is not given
#######################################
confirm_execute() {
  local target="${1:-disk-full remediation}"
  # Kept local so the operator's answer does not leak into the caller's scope.
  local confirmation=""

  if [[ "$DRY_RUN" == "true" ]]; then
    ok "Safe mode enabled. No destructive actions will be taken."
    return 0
  fi

  warning "Execution mode requested for: $target"
  warning "Confirm the affected filesystem, application impact, backups, and change approval before continuing."
  printf 'Type EXECUTE to continue: '
  # A failed read (EOF, closed stdin, unterminated input) counts as a refusal
  # so the operator gets the explicit abort message below instead of a silent
  # errexit termination.
  if ! read -r confirmation; then
    confirmation=""
  fi

  if [[ "$confirmation" != "EXECUTE" ]]; then
    critical "Confirmation failed. Aborting."
    exit 1
  fi
  ok "Execution confirmed by operator."
}
#######################################
# Reject an empty path or one that does not exist on disk.
# Arguments: $1 - path to check
# Outputs:   a CRITICAL line describing the problem
# Returns:   0 when the path exists, 2 otherwise
#######################################
validate_path() {
  local candidate="$1"

  [[ -n "$candidate" ]] || {
    critical "Path cannot be empty"
    return 2
  }
  [[ -e "$candidate" ]] || {
    critical "Path does not exist: $candidate"
    return 2
  }
}
#######################################
# Strip one trailing percent sign from a value such as "85%".
# Arguments: $1 - capacity value
# Outputs:   the value without the trailing "%", followed by a newline
#######################################
usage_percent_number() {
  local raw="$1"
  local stripped="${raw%[%]}"
  printf '%s\n' "$stripped"
}
#######################################
# Map a usage percentage to a status label.
# Globals:   EMERGENCY_THRESHOLD, CRIT_THRESHOLD, WARN_THRESHOLD (read)
# Arguments: $1 - integer usage percentage
# Outputs:   CRITICAL at or above EMERGENCY_THRESHOLD, WARNING at or above
#            CRIT_THRESHOLD or WARN_THRESHOLD, OK otherwise (no newline)
#######################################
status_for_percent() {
  local pct_value="$1"
  local label='OK'

  if (( pct_value >= EMERGENCY_THRESHOLD )); then
    label='CRITICAL'
  elif (( pct_value >= CRIT_THRESHOLD )) || (( pct_value >= WARN_THRESHOLD )); then
    # Both the critical and the warning band are reported as WARNING.
    label='WARNING'
  fi

  printf '%s' "$label"
}
#######################################
# Emit, one token per line, a find(1) expression fragment that matches the
# paths /proc, /sys, /dev, /run and /tmp/systemd-private-*.
# Outputs:   "-path", a path, and "-o" separators, each on its own line
#######################################
safe_find_prune_args() {
  local -a prune_tokens=(
    -path /proc -o
    -path /sys -o
    -path /dev -o
    -path /run -o
    -path '/tmp/systemd-private-*'
  )
  printf '%s\n' "${prune_tokens[@]}"
}
+47
View File
@@ -0,0 +1,47 @@
#!/usr/bin/env bash
# Read-only disk overview: df -h, df -i, mounts sorted by usage, and a
# per-filesystem threshold summary.
# Exit status: 1 when df is missing or any filesystem is at or above
# EMERGENCY_THRESHOLD, otherwise 0.
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

exit_code=0

section "Disk Space Overview"
if require_cmd df; then
  df -h 2>&1 | tee -a "$LOG_FILE"
else
  critical "df is required for disk overview"
  exit 1
fi

section "Inode Overview"
df -i 2>&1 | tee -a "$LOG_FILE" || warning "Unable to collect inode usage"

section "Filesystems Sorted By Usage"
# The capacity column is moved to the front so sort -rn orders rows by it.
df -P -h |
  awk 'NR == 1 { next } { print $5, $6, $1, $2, $3, $4 }' |
  sort -rn |
  while read -r used mount fs size used_space avail; do
    percent="$(usage_percent_number "$used")"
    # A capacity that is not a plain integer (df can print "-") would make
    # the arithmetic comparison fail and end the loop, so report and skip it.
    if ! [[ "$percent" =~ ^[0-9]+$ ]]; then
      warning "Skipping $mount on $fs: usage value '$used' is not numeric"
      continue
    fi
    level="$(status_for_percent "$percent")"
    printf '%s: %s used on %s (%s, size=%s used=%s avail=%s)\n' \
      "$level" "$used" "$mount" "$fs" "$size" "$used_space" "$avail" |
      tee -a "$LOG_FILE"
  done

section "Threshold Summary"
# Fed through process substitution so exit_code set inside the loop survives.
while read -r fs size used avail pct mount; do
  percent="$(usage_percent_number "$pct")"
  if ! [[ "$percent" =~ ^[0-9]+$ ]]; then
    warning "Skipping $mount on $fs: usage value '$pct' is not numeric"
    continue
  fi
  if (( percent >= EMERGENCY_THRESHOLD )); then
    critical "$mount is ${pct} full on $fs (size=$size used=$used avail=$avail)"
    exit_code=1
  elif (( percent >= CRIT_THRESHOLD || percent >= WARN_THRESHOLD )); then
    # The critical and warning bands share one message and do not change the
    # exit status.
    warning "$mount is ${pct} full on $fs (size=$size used=$used avail=$avail)"
  else
    ok "$mount is ${pct} full on $fs"
  fi
done < <(df -P -h | awk 'NR > 1 { print $1, $2, $3, $4, $5, $6 }')

exit "$exit_code"
+63
View File
@@ -0,0 +1,63 @@
#!/usr/bin/env bash
# Read-only scan that lists the largest regular files under a path.
# Usage: 02_find_big_files.sh [--path <path>] [--top <N>]
# Exit status: 0 on success, 1 when a required command is missing,
# 2 on invalid input.
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

SEARCH_PATH="/"
TOP_N=20

usage() {
  printf 'Usage: %s [--path <path>] [--top <N>]\n' "$(basename "$0")"
}

# Exit with status 2 when option $1 has no value after it.
# $2 is the number of arguments still to be parsed, including the option.
require_value() {
  if (( $2 < 2 )); then
    critical "Option $1 requires a value"
    usage
    exit 2
  fi
}

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    # Without the require_value check, "shift 2" on a trailing option fails
    # and errexit ends the script with no message.
    --path) require_value "$1" "$#"; SEARCH_PATH="$2"; shift 2 ;;
    --top) require_value "$1" "$#"; TOP_N="$2"; shift 2 ;;
    -h|--help) usage; exit 0 ;;
    *) critical "Unknown argument: $1"; usage; exit 2 ;;
  esac
done

if ! [[ "$TOP_N" =~ ^[0-9]+$ ]] || (( TOP_N < 1 )); then
  critical "--top must be a positive integer"
  exit 2
fi

validate_path "$SEARCH_PATH" || exit 2
require_cmd find || exit 1
require_cmd sort || exit 1
require_cmd awk || exit 1

section "Largest Files Under $SEARCH_PATH"
warning "Read-only scan. Permission errors can be normal without root access."

{
  # find exits non-zero when it cannot read a directory. That is expected
  # without root, so its status is ignored instead of letting pipefail and
  # errexit end the script before the final message.
  find "$SEARCH_PATH" -xdev \
    \( -path /proc -o -path /sys -o -path /dev -o -path /run \) -prune -o \
    -type f -printf '%s\t%p\n' 2>/dev/null || true
} |
  sort -rn |
  awk -F '\t' -v top="$TOP_N" '
    # Extra parameters after "bytes" are awk-style local variables.
    function human(bytes,    units, value, idx) {
      split("B KB MB GB TB PB", units, " ")
      value = bytes
      idx = 1
      while (value >= 1024 && idx < 6) {
        value = value / 1024
        idx++
      }
      return sprintf("%.1f%s", value, units[idx])
    }
    # The row limit is applied here rather than with head: awk keeps reading
    # to the end of input, so sort is never killed by SIGPIPE.
    NR > top { next }
    {
      # Take everything after the first tab so whitespace inside the path is
      # kept exactly as find printed it.
      path = substr($0, index($0, "\t") + 1)
      printf "%10s %s\n", human($1), path
    }
  ' | tee -a "$LOG_FILE"

ok "No files were modified."
+41
View File
@@ -0,0 +1,41 @@
#!/usr/bin/env bash
# Read-only check for files that are deleted but still held open, using lsof.
# Exit status: 0, including when lsof is unavailable or nothing is found.
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

section "Deleted But Open Files"
if ! require_cmd lsof; then
  warning "lsof is not installed or not in PATH. Install lsof or run equivalent tooling with appropriate privileges."
  exit 0
fi

warning "Read-only check. Full results may require elevated privileges."

# lsof's exit status is deliberately ignored; only its output is inspected.
deleted_output="$(lsof -nP +L1 2>/dev/null || true)"
if [[ -z "$deleted_output" ]]; then
  ok "No deleted open files detected by lsof."
  exit 0
fi

printf '%s\n' "$deleted_output" |
  awk '
    NR == 1 {
      printf "%-20s %-10s %-12s %s\n", "PROCESS", "PID", "SIZE", "PATH"
      next
    }
    {
      # With +L the listing gains an NLINK column:
      #   COMMAND PID USER FD TYPE DEVICE SIZE/OFF NLINK NODE NAME
      # so NAME starts at field 10; field 9 is the inode number.
      # NOTE(review): rows where lsof leaves a column blank, or builds that
      # add TID/TASKCMD columns, shift these positions - confirm on target hosts.
      path = $10
      for (i = 11; i <= NF; i++) {
        path = path " " $i
      }
      printf "%-20s %-10s %-12s %s\n", $1, $2, $7, path
    }
  ' | tee -a "$LOG_FILE"

warning "Space from deleted files is released when the owning process closes the file or is safely restarted."
+51
View File
@@ -0,0 +1,51 @@
#!/usr/bin/env bash
# Read-only scan that lists the largest directories under a path using du.
# Usage: 04_top_dirs.sh [--path <path>] [--depth <N>] [--top <N>]
# Exit status: 0 on success, 1 when a required command is missing,
# 2 on invalid input.
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

SEARCH_PATH="/"
DEPTH=2
TOP_N=25

usage() {
  printf 'Usage: %s [--path <path>] [--depth <N>] [--top <N>]\n' "$(basename "$0")"
}

# Exit with status 2 when option $1 has no value after it.
# $2 is the number of arguments still to be parsed, including the option.
require_value() {
  if (( $2 < 2 )); then
    critical "Option $1 requires a value"
    usage
    exit 2
  fi
}

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    # Without the require_value check, "shift 2" on a trailing option fails
    # and errexit ends the script with no message.
    --path) require_value "$1" "$#"; SEARCH_PATH="$2"; shift 2 ;;
    --depth) require_value "$1" "$#"; DEPTH="$2"; shift 2 ;;
    --top) require_value "$1" "$#"; TOP_N="$2"; shift 2 ;;
    -h|--help) usage; exit 0 ;;
    *) critical "Unknown argument: $1"; usage; exit 2 ;;
  esac
done

if ! [[ "$DEPTH" =~ ^[0-9]+$ ]]; then
  critical "--depth must be a non-negative integer"
  exit 2
fi
if ! [[ "$TOP_N" =~ ^[0-9]+$ ]] || (( TOP_N < 1 )); then
  critical "--top must be a positive integer"
  exit 2
fi

validate_path "$SEARCH_PATH" || exit 2
require_cmd du || exit 1
require_cmd sort || exit 1
require_cmd awk || exit 1

section "Top Directories Under $SEARCH_PATH"
warning "Read-only scan. Permission errors can be normal without root access."

{
  # du exits non-zero when it cannot read a directory. That is expected
  # without root, so its status is ignored instead of letting pipefail and
  # errexit end the script before the final message.
  du -x -h --max-depth="$DEPTH" "$SEARCH_PATH" 2>/dev/null || true
} |
  sort -hr |
  # awk reads its whole input while printing only the first TOP_N rows, so
  # sort is never killed by SIGPIPE the way it can be in front of head.
  awk -v top="$TOP_N" 'NR <= top' |
  tee -a "$LOG_FILE"

ok "No directories were modified."
+97
View File
@@ -0,0 +1,97 @@
#!/usr/bin/env bash
# Log cleanup analysis: lists large log files and old rotated logs, prints
# suggested commands, and removes the old rotated logs only when --execute is
# given and the operator confirms.
# Usage: 05_log_cleanup.sh [--path <path>] [--days-old <N>] [--execute]
# Exit status: 0 on success, 1 when a required command is missing or the
# confirmation is refused, 2 on invalid input.
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

EXECUTE=false
LOG_PATH="/var/log"
DAYS_OLD=14

usage() {
  printf 'Usage: %s [--path <path>] [--days-old <N>] [--execute]\n' "$(basename "$0")"
}

# Exit with status 2 when option $1 has no value after it.
# $2 is the number of arguments still to be parsed, including the option.
require_value() {
  if (( $2 < 2 )); then
    critical "Option $1 requires a value"
    usage
    exit 2
  fi
}

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    # Without the require_value check, "shift 2" on a trailing option fails
    # and errexit ends the script with no message.
    --path) require_value "$1" "$#"; LOG_PATH="$2"; shift 2 ;;
    --days-old) require_value "$1" "$#"; DAYS_OLD="$2"; shift 2 ;;
    # DRY_RUN is read by the sourced run_cmd and confirm_execute helpers.
    --execute) EXECUTE=true; DRY_RUN=false; shift ;;
    -h|--help) usage; exit 0 ;;
    *) critical "Unknown argument: $1"; usage; exit 2 ;;
  esac
done

if ! [[ "$DAYS_OLD" =~ ^[0-9]+$ ]] || (( DAYS_OLD < 1 )); then
  critical "--days-old must be a positive integer"
  exit 2
fi

validate_path "$LOG_PATH" || exit 2
require_cmd find || exit 1
require_cmd sort || exit 1

section "Large Log Files In $LOG_PATH"
{
  # find exits non-zero when it cannot read a directory. That is expected
  # without root, so its status is ignored instead of letting pipefail and
  # errexit end the script here.
  find "$LOG_PATH" -xdev -type f \
    \( -name '*.log' -o -name '*log' -o -name 'messages*' -o -name 'syslog*' \) \
    -size +100M -printf '%s\t%p\n' 2>/dev/null || true
} |
  sort -rn |
  awk -F '\t' '
    # Extra parameters after "bytes" are awk-style local variables.
    function human(bytes,    units, value, idx) {
      split("B KB MB GB TB", units, " ")
      value = bytes
      idx = 1
      while (value >= 1024 && idx < 5) {
        value = value / 1024
        idx++
      }
      return sprintf("%.1f%s", value, units[idx])
    }
    {
      # Take everything after the first tab so whitespace inside the path is
      # kept exactly as find printed it.
      path = substr($0, index($0, "\t") + 1)
      printf "%10s %s\n", human($1), path
    }
  ' | tee -a "$LOG_FILE"

section "Old Rotated Logs Eligible For Review"
# One path per line; the list is reused by the removal loop below.
mapfile -t rotated_logs < <(
  find "$LOG_PATH" -xdev -type f \
    \( -name '*.gz' -o -name '*.1' -o -name '*.old' -o -name '*.bz2' -o -name '*.xz' \) \
    -mtime +"$DAYS_OLD" -print 2>/dev/null | sort
)

if [[ "${#rotated_logs[@]}" -eq 0 ]]; then
  ok "No old rotated logs found under $LOG_PATH with age greater than $DAYS_OLD days."
else
  printf '%s\n' "${rotated_logs[@]}" | tee -a "$LOG_FILE"
fi

section "Suggested Cleanup Commands"
cat <<SUGGESTIONS | tee -a "$LOG_FILE"
# Review large active logs before truncating. Prefer application-aware log rotation:
logrotate -d /etc/logrotate.conf
# Remove old rotated logs only after retention approval:
$(basename "$0") --path "$LOG_PATH" --days-old "$DAYS_OLD" --execute
SUGGESTIONS

if [[ "$EXECUTE" != "true" ]]; then
  ok "Safe mode. No logs were removed."
  exit 0
fi

if [[ "${#rotated_logs[@]}" -eq 0 ]]; then
  ok "Execution requested, but there are no eligible old rotated logs to remove."
  exit 0
fi

confirm_execute "remove old rotated logs from $LOG_PATH"

for file in "${rotated_logs[@]}"; do
  # Re-check at removal time: only regular files that are not symlinks go.
  if [[ -f "$file" && ! -L "$file" ]]; then
    run_cmd rm -f -- "$file"
  else
    warning "Skipped non-regular file or symlink: $file"
  fi
done

ok "Old rotated log cleanup completed."
+78
View File
@@ -0,0 +1,78 @@
#!/usr/bin/env bash
# Emergency quick-fix options: truncate one verified log file and/or restart
# one service. Nothing is changed unless --execute is given and the operator
# confirms.
# Usage: 06_quick_fix.sh [--truncate-file <path>] [--restart-service <name>] [--execute]
# Exit status: 0 on success or when nothing was requested, 1 when systemctl
# is missing or the confirmation is refused, 2 on invalid input.
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

EXECUTE=false
TRUNCATE_FILE=""
RESTART_SERVICE=""

usage() {
  printf 'Usage: %s [--truncate-file <path>] [--restart-service <name>] [--execute]\n' "$(basename "$0")"
}

# Exit with status 2 when option $1 has no value after it.
# $2 is the number of arguments still to be parsed, including the option.
require_value() {
  if (( $2 < 2 )); then
    critical "Option $1 requires a value"
    usage
    exit 2
  fi
}

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    # Without the require_value check, "shift 2" on a trailing option fails
    # and errexit ends the script with no message.
    --truncate-file) require_value "$1" "$#"; TRUNCATE_FILE="$2"; shift 2 ;;
    --restart-service) require_value "$1" "$#"; RESTART_SERVICE="$2"; shift 2 ;;
    # DRY_RUN is read by the sourced run_cmd and confirm_execute helpers.
    --execute) EXECUTE=true; DRY_RUN=false; shift ;;
    -h|--help) usage; exit 0 ;;
    *) critical "Unknown argument: $1"; usage; exit 2 ;;
  esac
done

section "Emergency Disk Full Quick Fix Options"
cat <<OPTIONS | tee -a "$LOG_FILE"
Possible actions after incident commander approval:
1. Truncate a verified active log file:
$0 --truncate-file /path/to/large.log --execute
2. Restart a specific service holding deleted files open:
$0 --restart-service service-name --execute
Review application impact before either action. Truncation preserves the file inode but destroys file contents.
OPTIONS

if [[ -z "$TRUNCATE_FILE" && -z "$RESTART_SERVICE" ]]; then
  ok "No quick fix requested. Printed options only."
  exit 0
fi

if [[ "$EXECUTE" != "true" ]]; then
  warning "Quick fix arguments supplied without --execute. No changes made."
  exit 0
fi

confirm_execute "emergency disk-full quick fix"

if [[ -n "$TRUNCATE_FILE" ]]; then
  validate_path "$TRUNCATE_FILE" || exit 2
  if [[ ! -f "$TRUNCATE_FILE" || -L "$TRUNCATE_FILE" ]]; then
    critical "Refusing to truncate non-regular file or symlink: $TRUNCATE_FILE"
    exit 2
  fi
  warning "Truncating file contents: $TRUNCATE_FILE"
  # Redirecting the no-op command empties the file in place.
  : > "$TRUNCATE_FILE"
  ok "Truncated $TRUNCATE_FILE"
fi

if [[ -n "$RESTART_SERVICE" ]]; then
  # Reject path-like names, and names starting with "-" that systemctl would
  # otherwise read as one of its own options.
  if [[ "$RESTART_SERVICE" == *"/"* || "$RESTART_SERVICE" == *".."* || "$RESTART_SERVICE" == -* ]]; then
    critical "Invalid service name: $RESTART_SERVICE"
    exit 2
  fi
  if require_cmd systemctl; then
    # "--" ends option parsing so the name is always taken as a unit.
    run_cmd systemctl restart -- "$RESTART_SERVICE"
    ok "Restart requested for service: $RESTART_SERVICE"
  else
    critical "systemctl is required to restart services"
    exit 1
  fi
fi
+90
View File
@@ -0,0 +1,90 @@
#!/usr/bin/env bash
# Post-cleanup validation: prints df -h and df -i, flags filesystems that are
# still above the thresholds, and optionally compares current usage with a
# saved df listing.
# Usage: 07_postcheck.sh [--before-file <df_output_file>]
# The before-file is read with the capacity in column 5 and the mount point
# in column 6, the same columns this script takes from "df -P -h".
# Exit status: 0 when no filesystem is at or above EMERGENCY_THRESHOLD,
# 1 otherwise or when df is missing, 2 on invalid input.
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

BEFORE_FILE=""
exit_code=0

usage() {
  printf 'Usage: %s [--before-file <df_output_file>]\n' "$(basename "$0")"
}

# Exit with status 2 when option $1 has no value after it.
# $2 is the number of arguments still to be parsed, including the option.
require_value() {
  if (( $2 < 2 )); then
    critical "Option $1 requires a value"
    usage
    exit 2
  fi
}

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    # Without the require_value check, "shift 2" on a trailing option fails
    # and errexit ends the script with no message.
    --before-file) require_value "$1" "$#"; BEFORE_FILE="$2"; shift 2 ;;
    -h|--help) usage; exit 0 ;;
    *) critical "Unknown argument: $1"; usage; exit 2 ;;
  esac
done

require_cmd df || exit 1

section "Post-Cleanup Disk Space"
df -h 2>&1 | tee -a "$LOG_FILE"

section "Post-Cleanup Inodes"
df -i 2>&1 | tee -a "$LOG_FILE" || warning "Unable to collect inode usage"

section "Critical Filesystem Check"
# Fed through process substitution so exit_code set inside the loop survives.
while read -r fs size used avail pct mount; do
  percent="$(usage_percent_number "$pct")"
  # A capacity that is not a plain integer (df can print "-") would make the
  # arithmetic comparison fail, so report and skip it.
  if ! [[ "$percent" =~ ^[0-9]+$ ]]; then
    warning "Skipping $mount on $fs: usage value '$pct' is not numeric"
    continue
  fi
  if (( percent >= EMERGENCY_THRESHOLD )); then
    critical "$mount is still ${pct} full on $fs (size=$size used=$used avail=$avail)"
    exit_code=1
  elif (( percent >= CRIT_THRESHOLD )); then
    warning "$mount remains high at ${pct} on $fs"
  else
    ok "$mount is ${pct} full on $fs"
  fi
done < <(df -P -h | awk 'NR > 1 { print $1, $2, $3, $4, $5, $6 }')

if [[ -n "$BEFORE_FILE" ]]; then
  section "Before And After Comparison"
  if [[ ! -f "$BEFORE_FILE" ]]; then
    warning "Before file not found: $BEFORE_FILE"
  else
    awk '
      # Skip the header row of each input.
      FNR == 1 { next }
      # Rows of the first input are the saved "before" snapshot. Matching on
      # FILENAME instead of NR == FNR stays correct when that file is empty.
      FILENAME == ARGV[1] {
        before[$6] = $5
        next
      }
      ($6 in before) {
        mount = $6
        before_pct = before[mount]
        after_pct = $5
        gsub(/%/, "", before_pct)
        gsub(/%/, "", after_pct)
        # gsub leaves strings behind; adding 0 forces a numeric comparison.
        # Compared as strings, "9" sorts above "85" and "100" below "99".
        before_num = before_pct + 0
        after_num = after_pct + 0
        if (after_num < before_num) {
          status = "OK"
          result = "improved"
        } else if (after_num == before_num) {
          status = "WARNING"
          result = "unchanged"
        } else {
          status = "WARNING"
          result = "increased"
        }
        printf "%s: %s before=%s after=%s (%s)\n", status, mount, before[mount], $5, result
      }
    ' "$BEFORE_FILE" <(df -P -h) | tee -a "$LOG_FILE"
  fi
else
  warning "No --before-file supplied. Improvement comparison skipped."
fi

if [[ "$exit_code" -eq 0 ]]; then
  ok "Post-check completed without emergency-threshold filesystems."
else
  critical "One or more filesystems remain at or above ${EMERGENCY_THRESHOLD}%."
fi

exit "$exit_code"
@@ -0,0 +1,84 @@
# Linux Disk Full Incident Toolkit
Production-style Bash toolkit for diagnosing and handling a disk full incident on Linux systems. It is intentionally conservative: default mode is safe, cleanup actions require `--execute` and an operator confirmation prompt, and the scripts do not assume root access.
## Why Disk Full Incidents Happen
- **Logs** - application, audit, system, or middleware logs can grow faster than rotation policy expects.
- **Temporary files** - failed jobs, installers, archives, and batch workloads often leave large files in `/tmp`, `/var/tmp`, or application work directories.
- **Deleted open files** - a process can keep writing to a file after it has been deleted, hiding disk usage from normal directory listings until the process closes the file.
- **Inode exhaustion** - a filesystem can fail writes even when space is available if it has too many small files and no free inodes.
## Safety Model
- Safe dry-run behavior is the default.
- No script blindly deletes files.
- Cleanup operations require `--execute` and confirmation.
- Missing optional commands are reported as `WARNING`.
- Output is formatted with `OK`, `WARNING`, and `CRITICAL` for incident notes.
- The scripts are designed to work without root, while warning when permissions may limit visibility.
## Scripts
- `00_env.sh` - shared configuration and helper functions.
- `01_disk_overview.sh` - `df -h`, `df -i`, sorted mount usage, and threshold highlights.
- `02_find_big_files.sh` - read-only largest-file discovery.
- `03_deleted_open_files.sh` - deleted but open file detection with `lsof` when available.
- `04_top_dirs.sh` - largest directory discovery with `du`.
- `05_log_cleanup.sh` - safe log cleanup analysis and optional old rotated log removal.
- `06_quick_fix.sh` - defensive emergency actions for verified truncation or service restart.
- `07_postcheck.sh` - validation after cleanup, with optional before/after comparison.
- `disk_full_runbook.sh` - guided incident workflow.
## Example Usage
```bash
cd infra-run/scripts/bash/disk-full
./01_disk_overview.sh
./02_find_big_files.sh --path /var --top 20
./03_deleted_open_files.sh
./04_top_dirs.sh --path /var --depth 2
./05_log_cleanup.sh
./07_postcheck.sh
```
Run the guided read-only workflow:
```bash
./disk_full_runbook.sh --path /var --top 20 --depth 2
```
Review old rotated logs without deleting them:
```bash
./05_log_cleanup.sh --path /var/log --days-old 14
```
Remove old rotated logs only after approval:
```bash
./05_log_cleanup.sh --path /var/log --days-old 14 --execute
```
Emergency truncation of a verified active log:
```bash
./06_quick_fix.sh --truncate-file /var/log/app/verified-large.log --execute
```
Restart a specific service after confirming it is holding deleted files open:
```bash
./06_quick_fix.sh --restart-service app.service --execute
```
## Exit Codes
- `0` - OK
- `1` - operational issue detected or still critical
- `2` - invalid input
## Production Warning
Use this toolkit as an incident aid, not an autopilot. Confirm the affected filesystem, application ownership, retention requirements, backup expectations, and change approval before cleanup. In enterprise environments, coordinate service restarts and file truncation with application owners because both can destroy evidence or interrupt production workloads.
+67
View File
@@ -0,0 +1,67 @@
#!/usr/bin/env bash
# Guided incident workflow: prints the step list, then runs the read-only
# check scripts in order. A failing step is reported as a warning and the
# workflow continues. Destructive scripts are listed but never run from here.
# Usage: disk_full_runbook.sh [--path <path>] [--top <N>] [--depth <N>] [--execute]
set -o errexit
set -o nounset
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=00_env.sh
. "$SCRIPT_DIR/00_env.sh"

# The step scripts are separate processes that source 00_env.sh themselves.
# Exporting these lets them append to this run's log file instead of each
# deriving a new name from its own start time.
export TIMESTAMP LOG_FILE

SEARCH_PATH="/"
TOP_N=20
DEPTH=2
EXECUTE=false

usage() {
  printf 'Usage: %s [--path <path>] [--top <N>] [--depth <N>] [--execute]\n' "$(basename "$0")"
}

# Exit with status 2 when option $1 has no value after it.
# $2 is the number of arguments still to be parsed, including the option.
require_value() {
  if (( $2 < 2 )); then
    critical "Option $1 requires a value"
    usage
    exit 2
  fi
}

while [[ "$#" -gt 0 ]]; do
  case "$1" in
    # Without the require_value check, "shift 2" on a trailing option fails
    # and errexit ends the script with no message.
    --path) require_value "$1" "$#"; SEARCH_PATH="$2"; shift 2 ;;
    --top) require_value "$1" "$#"; TOP_N="$2"; shift 2 ;;
    --depth) require_value "$1" "$#"; DEPTH="$2"; shift 2 ;;
    --execute) EXECUTE=true; DRY_RUN=false; shift ;;
    -h|--help) usage; exit 0 ;;
    *) critical "Unknown argument: $1"; usage; exit 2 ;;
  esac
done

section "Disk Full Incident Workflow"
cat <<FLOW | tee -a "$LOG_FILE"
Step 1. Disk overview
$SCRIPT_DIR/01_disk_overview.sh
Step 2. Find largest files
$SCRIPT_DIR/02_find_big_files.sh --path "$SEARCH_PATH" --top "$TOP_N"
Step 3. Check deleted but open files
$SCRIPT_DIR/03_deleted_open_files.sh
Step 4. Identify top directories
$SCRIPT_DIR/04_top_dirs.sh --path "$SEARCH_PATH" --depth "$DEPTH"
Step 5. Review safe log cleanup suggestions
$SCRIPT_DIR/05_log_cleanup.sh
Step 6. Optional emergency quick fix, only after approval
$SCRIPT_DIR/06_quick_fix.sh --truncate-file /path/to/verified.log --execute
$SCRIPT_DIR/06_quick_fix.sh --restart-service service-name --execute
Step 7. Post-check
$SCRIPT_DIR/07_postcheck.sh
FLOW

if [[ "$EXECUTE" == "true" ]]; then
  warning "--execute was supplied to the runbook. Destructive actions are still not run automatically."
fi

section "Running Read-Only Incident Checks"
# Each step runs in an "||" list so a non-zero status is reported without
# triggering errexit.
"$SCRIPT_DIR/01_disk_overview.sh" || warning "Disk overview reported critical usage"
"$SCRIPT_DIR/02_find_big_files.sh" --path "$SEARCH_PATH" --top "$TOP_N" || warning "Large-file scan reported an issue"
"$SCRIPT_DIR/03_deleted_open_files.sh" || warning "Deleted-open-file check reported an issue"
"$SCRIPT_DIR/04_top_dirs.sh" --path "$SEARCH_PATH" --depth "$DEPTH" || warning "Top-directory scan reported an issue"
# Invoked without --execute, so this step only lists candidates and suggestions.
"$SCRIPT_DIR/05_log_cleanup.sh" || warning "Log cleanup suggestion step reported an issue"

section "Next Manual Decision"
ok "Review findings, identify owner and retention requirements, then run a targeted cleanup script with --execute only if approved."