#!/usr/bin/env bash
#
# incident_triage_report.sh - run the selected read-only incident check
# scripts that live next to this file and assemble their results into a
# single Markdown triage report.

# Strict mode: stop on unhandled command failures, on expansion of unset
# variables, and when any stage of a pipeline fails.
set -o errexit
set -o nounset
set -o pipefail

# Command-line inputs. Everything starts empty except the log window,
# which defaults to the last hour.
incident_type=""
service_name=""
host_name=""
port=""
target_pid=""
match_string=""
output_file=""
since_value="1 hour ago"

# Absolute directory holding this script; check scripts are resolved
# relative to it. `--` protects against paths starting with '-', and
# cd's stdout is discarded because cd prints the target directory when it
# is resolved through CDPATH, which would otherwise end up in script_dir.
script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null && pwd)"

#######################################
# Print the command-line help text.
# Arguments:
#   None
# Outputs:
#   Writes the help text to stdout.
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat << 'USAGE'
Usage: incident_triage_report.sh --type TYPE [options]

Run selected read-only incident checks and produce a Markdown triage report.

Incident types:
  cpu
  memory
  service
  network
  auth
  cert
  filesystem
  jvm
  all

Options:
  --type TYPE               Incident type to collect
  --service SERVICE_NAME    systemd service name for service checks
  --host HOSTNAME_OR_FQDN   host for DNS, network, or certificate checks
  --port PORT               TCP or TLS port for host checks
  --pid PID                 JVM process ID
  --match PROCESS_MATCH     JVM process match string
  --output FILE             write Markdown report to FILE
  --since VALUE             time window for log-based checks
  --help                    show this help

Examples:
  ./incident_triage_report.sh --type cpu
  ./incident_triage_report.sh --type service --service nginx --since "30 minutes ago"
  ./incident_triage_report.sh --type network --host app.example.com --port 443
  ./incident_triage_report.sh --type all --service nginx --host app.example.com --port 443 --output triage.md
USAGE
}

#######################################
# Test whether a value consists solely of decimal digits.
# Arguments:
#   $1 - value to test (a missing argument is treated as empty)
# Returns:
#   0 if the value is one or more digits, 1 otherwise.
#######################################
is_number() {
  # ${1-} keeps a call without arguments from aborting under nounset.
  [[ "${1-}" =~ ^[0-9]+$ ]]
}

#######################################
# Test whether a value is a supported incident type.
# Arguments:
#   $1 - candidate incident type (a missing argument is treated as empty)
# Returns:
#   0 for cpu, memory, service, network, auth, cert, filesystem, jvm or
#   all; 1 for anything else.
#######################################
valid_type() {
  case "${1-}" in
    cpu | memory | service | network | auth | cert | filesystem | jvm | all)
      return 0
      ;;
    *)
      return 1
      ;;
  esac
}

#######################################
# Abort when an option that takes a value is the last argument.
# Arguments:
#   $1 - option name, used in the error message
#   $2 - number of arguments left, counting the option itself
# Outputs:
#   Writes a CRITICAL line to stdout on failure.
# Returns:
#   Exits the shell with status 2 when no value follows the option.
#######################################
_require_option_value() {
  if (($2 < 2)); then
    printf 'CRITICAL: %s requires a value\n' "$1"
    exit 2
  fi
}

#######################################
# Parse command-line options into the script's global input variables.
# Globals:
#   incident_type, service_name, host_name, port, target_pid,
#   match_string, output_file, since_value (written)
# Arguments:
#   The script's command-line arguments.
# Outputs:
#   Help text for --help/-h; a CRITICAL line (followed by the help text
#   for unknown options) on stdout for invalid input.
# Returns:
#   Exits 0 after --help/-h, exits 2 on an unknown option or a missing
#   option value.
#######################################
parse_args() {
  while (($# > 0)); do
    case "$1" in
      --type)
        _require_option_value "$1" "$#"
        incident_type="$2"
        shift 2
        ;;
      --service)
        _require_option_value "$1" "$#"
        service_name="$2"
        shift 2
        ;;
      --host)
        _require_option_value "$1" "$#"
        host_name="$2"
        shift 2
        ;;
      --port)
        _require_option_value "$1" "$#"
        port="$2"
        shift 2
        ;;
      --pid)
        _require_option_value "$1" "$#"
        target_pid="$2"
        shift 2
        ;;
      --match)
        _require_option_value "$1" "$#"
        match_string="$2"
        shift 2
        ;;
      --output)
        _require_option_value "$1" "$#"
        output_file="$2"
        shift 2
        ;;
      --since)
        _require_option_value "$1" "$#"
        since_value="$2"
        shift 2
        ;;
      --help | -h)
        usage
        exit 0
        ;;
      *)
        printf 'CRITICAL: unknown option: %s\n' "$1"
        usage
        exit 2
        ;;
    esac
  done
}

parse_args "$@"

# --- Input validation: every failure prints a CRITICAL line and exits 2 ---

# --type is mandatory and must name a supported incident type.
if [[ -z "$incident_type" ]]; then
  printf 'CRITICAL: --type is required\n'
  usage
  exit 2
fi
if ! valid_type "$incident_type"; then
  printf 'CRITICAL: unsupported incident type: %s\n' "$incident_type"
  usage
  exit 2
fi

# --port, when given, must be digits only and a usable TCP port number.
if [[ -n "$port" ]]; then
  if ! is_number "$port"; then
    printf 'CRITICAL: --port must be numeric\n'
    exit 2
  fi
  # The length guard keeps oversized digit strings out of shell
  # arithmetic; 10# forces base 10 so a leading zero is not read as octal.
  if ((${#port} > 5)) || ((10#$port < 1 || 10#$port > 65535)); then
    printf 'CRITICAL: --port must be between 1 and 65535\n'
    exit 2
  fi
fi

# --pid, when given, must be digits only.
if [[ -n "$target_pid" ]] && ! is_number "$target_pid"; then
  printf 'CRITICAL: --pid must be numeric\n'
  exit 2
fi

# The JVM check is given either a PID or a match string, never both.
if [[ -n "$target_pid" && -n "$match_string" ]]; then
  printf 'CRITICAL: use either --pid or --match for JVM checks, not both\n'
  exit 2
fi

# Private scratch directory for captured check output and the report
# draft; removed when the script exits. ${tmp_dir:?} makes the cleanup
# refuse to run rm with an empty path, and -- stops option parsing.
tmp_dir="$(mktemp -d)"
trap 'rm -rf -- "${tmp_dir:?}"' EXIT

report_file="${tmp_dir}/report.md"

# Parallel arrays: index N across all of them describes the N-th check.
check_labels=()     # human-readable check title
check_names=()      # check script file name
check_commands=()   # rendered command line, or "not run"
check_statuses=()   # OK / WARNING / INVALID / CRITICAL / ERROR / SKIPPED
check_exit_codes=() # script exit code, or "-" when skipped
check_summaries=()  # first line of output, or the skip reason
check_outputs=()    # path of the file holding the captured output

#######################################
# Translate a check script exit code into a status word.
# Arguments:
#   $1 - exit code
# Outputs:
#   Writes OK (0), WARNING (1), INVALID (2), CRITICAL (3) or ERROR
#   (anything else) to stdout, without a trailing newline.
#######################################
status_from_exit() {
  local word
  case "${1-}" in
    0) word='OK' ;;
    1) word='WARNING' ;;
    2) word='INVALID' ;;
    3) word='CRITICAL' ;;
    *) word='ERROR' ;;
  esac
  printf '%s' "$word"
}

#######################################
# Render a command and its arguments as one shell-quoted string.
# Arguments:
#   $@ - command words
# Outputs:
#   Writes the words, each quoted with printf %q and separated by single
#   spaces, to stdout without a trailing newline. Nothing is written
#   when there are no words.
#######################################
render_command() {
  local rendered=""
  local word
  local quoted
  for word in "$@"; do
    printf -v quoted '%q' "$word"
    # Add the separator only between words. Stripping trailing
    # whitespace afterwards would also eat the escaped space of a final
    # argument that itself ends in whitespace.
    rendered+="${rendered:+ }${quoted}"
  done
  printf '%s' "$rendered"
}

#######################################
# Record a check that was not run.
# Globals:
#   tmp_dir (read); check_labels, check_names, check_commands,
#   check_statuses, check_exit_codes, check_summaries, check_outputs
#   (appended)
# Arguments:
#   $1 - human-readable check label
#   $2 - check script name
#   $3 - reason the check was skipped
# Outputs:
#   Writes "SKIPPED: <reason>" to a new file under tmp_dir, so a skipped
#   check has an evidence file just like an executed one.
#######################################
append_skipped_check() {
  local label="$1"
  local name="$2"
  local reason="$3"
  # The file name is numbered by the check's position in the arrays.
  local output_path="${tmp_dir}/check_${#check_labels[@]}.txt"

  printf 'SKIPPED: %s\n' "$reason" > "$output_path"

  check_labels+=("$label")
  check_names+=("$name")
  check_commands+=("not run")
  check_statuses+=("SKIPPED")
  check_exit_codes+=("-")
  check_summaries+=("$reason")
  check_outputs+=("$output_path")
}

#######################################
# Run one check script and record its result.
# Globals:
#   script_dir, tmp_dir (read); check_labels, check_names,
#   check_commands, check_statuses, check_exit_codes, check_summaries,
#   check_outputs (appended)
# Arguments:
#   $1 - human-readable check label
#   $2 - check script file name, resolved relative to script_dir
#   $@ - remaining arguments are passed to the script unchanged
# Outputs:
#   The script's stdout and stderr are captured together in a file under
#   tmp_dir. A missing or non-executable script is recorded as SKIPPED
#   instead of being run.
#######################################
run_check() {
  local label="$1"
  local script_name="$2"
  shift 2

  local script_path="${script_dir}/${script_name}"

  if [[ ! -e "$script_path" ]]; then
    append_skipped_check "$label" "$script_name" "missing script: $script_name"
    return
  fi
  if [[ ! -x "$script_path" ]]; then
    append_skipped_check "$label" "$script_name" "script is not executable: $script_name"
    return
  fi

  local output_path="${tmp_dir}/check_${#check_labels[@]}.txt"
  local command_text
  command_text="$(render_command "$script_path" "$@")"

  # A non-zero exit is a finding to report, not a reason to abort:
  # `|| exit_code=$?` captures it without toggling errexit off and on.
  local exit_code=0
  "$script_path" "$@" > "$output_path" 2>&1 || exit_code=$?

  local status
  status="$(status_from_exit "$exit_code")"

  # The first output line is the summary. read returns non-zero on an
  # empty file or a final line without a newline, hence `|| true`.
  local summary=""
  IFS= read -r summary < "$output_path" || true
  if [[ -z "$summary" ]]; then
    summary="no output captured"
  fi

  check_labels+=("$label")
  check_names+=("$script_name")
  check_commands+=("$command_text")
  check_statuses+=("$status")
  check_exit_codes+=("$exit_code")
  check_summaries+=("$summary")
  check_outputs+=("$output_path")
}

# Run the CPU saturation check; it takes no arguments.
run_cpu_checks() {
  run_check "CPU saturation" "check_high_cpu.sh"
}

# Run the memory/OOM check, passing the --since window (global since_value).
run_memory_checks() {
  run_check "Memory and OOM" "check_high_memory_oom.sh" --since "$since_value"
}

#######################################
# Run the service restart-loop check.
# Globals:
#   service_name, since_value (read)
# Outputs:
#   Records the check as SKIPPED when no --service value was supplied.
#######################################
run_service_checks() {
  local label="Service restart loop"
  local script="check_service_restart_loop.sh"

  if [[ -z "$service_name" ]]; then
    append_skipped_check "$label" "$script" "requires --service SERVICE_NAME"
    return
  fi
  run_check "$label" "$script" --service "$service_name" --since "$since_value"
}

#######################################
# Run the DNS and connectivity check against the requested host.
# Globals:
#   host_name, port (read)
# Outputs:
#   Records the check as SKIPPED when no --host value was supplied.
#   --port is forwarded only when one was given.
#######################################
run_network_checks() {
  local label="DNS and connectivity"
  local script="check_dns_connectivity.sh"

  if [[ -z "$host_name" ]]; then
    append_skipped_check "$label" "$script" "requires --host HOSTNAME_OR_FQDN"
    return
  fi

  # Built only after the guard, so the array never carries an empty host.
  local args=(--host "$host_name")
  if [[ -n "$port" ]]; then
    args+=(--port "$port")
  fi
  run_check "$label" "$script" "${args[@]}"
}

# Run the failed SSH login check, passing the --since window (global since_value).
run_auth_checks() {
  run_check "Failed SSH logins" "check_failed_ssh_logins.sh" --since "$since_value"
}

#######################################
# Run the certificate expiry check against the requested host.
# Globals:
#   host_name, port (read)
# Outputs:
#   Records the check as SKIPPED when no --host value was supplied.
#   --port is forwarded only when one was given.
#######################################
run_cert_checks() {
  local label="Certificate expiry"
  local script="check_certificate_expiry.sh"

  if [[ -z "$host_name" ]]; then
    append_skipped_check "$label" "$script" "requires --host HOSTNAME_OR_FQDN"
    return
  fi

  # Built only after the guard, so the array never carries an empty host.
  local args=(--host "$host_name")
  if [[ -n "$port" ]]; then
    args+=(--port "$port")
  fi
  run_check "$label" "$script" "${args[@]}"
}

# Run both filesystem checks: read-only mounts first, then inode usage.
run_filesystem_checks() {
  run_check "Read-only filesystems" "check_filesystem_readonly.sh"
  run_check "Inode usage" "check_inode_usage.sh"
}

#######################################
# Run the JVM threads and heap check.
# Globals:
#   target_pid, match_string (read)
# Outputs:
#   Forwards --pid when a PID was given, otherwise --match when a match
#   string was given, otherwise no selector at all.
#######################################
run_jvm_checks() {
  local args=()
  if [[ -n "$target_pid" ]]; then
    args+=(--pid "$target_pid")
  elif [[ -n "$match_string" ]]; then
    args+=(--match "$match_string")
  fi

  # With nounset, bash releases before 4.4 treat "${args[@]}" on an empty
  # array as an unbound variable; the ${args[@]+...} guard expands to
  # nothing in that case and to the quoted elements otherwise.
  run_check "JVM threads and heap" "check_jvm_threads_heap.sh" ${args[@]+"${args[@]}"}
}

#######################################
# Run the check group(s) belonging to an incident type.
# Arguments:
#   $1 - incident type; "all" runs every group in a fixed order
# Outputs:
#   Whatever the selected run_*_checks functions record. A type that
#   matches no arm runs nothing.
#######################################
run_selected_checks() {
  case "$1" in
    cpu) run_cpu_checks ;;
    memory) run_memory_checks ;;
    service) run_service_checks ;;
    network) run_network_checks ;;
    auth) run_auth_checks ;;
    cert) run_cert_checks ;;
    filesystem) run_filesystem_checks ;;
    jvm) run_jvm_checks ;;
    all)
      run_cpu_checks
      run_memory_checks
      run_service_checks
      run_network_checks
      run_auth_checks
      run_cert_checks
      run_filesystem_checks
      run_jvm_checks
      ;;
  esac
}

run_selected_checks "$incident_type"

# Report header metadata. The timestamp is UTC (date -u) with a Z suffix.
generated_at="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"

# Best effort: these only decorate the report, so a failing or silent
# tool yields "unknown" instead of aborting or leaving the field blank.
local_hostname="$(hostname 2> /dev/null || true)"
local_hostname="${local_hostname:-unknown}"
current_user="$(id -un 2> /dev/null || true)"
current_user="${current_user:-unknown}"

#######################################
# Write the complete Markdown triage report.
# Globals:
#   generated_at, local_hostname, current_user, incident_type,
#   service_name, host_name, port, target_pid, match_string, since_value,
#   check_labels, check_names, check_commands, check_statuses,
#   check_exit_codes, check_summaries, check_outputs (read)
# Arguments:
#   None
# Outputs:
#   Writes the report to stdout: header, executed-checks table, summary,
#   raw evidence per check, and the fixed handover/escalation guidance.
#######################################
render_report() {
  local index

  printf '# L2 Incident Triage Report\n\n'
  printf -- '- Generated: %s\n' "$generated_at"
  printf -- '- Local hostname: %s\n' "$local_hostname"
  printf -- '- Current user: %s\n' "$current_user"
  printf -- '- Incident type: %s\n' "$incident_type"
  # Optional inputs that were not supplied are shown as "not provided".
  printf -- '- Service: %s\n' "${service_name:-not provided}"
  printf -- '- Host: %s\n' "${host_name:-not provided}"
  printf -- '- Port: %s\n' "${port:-not provided}"
  printf -- '- PID: %s\n' "${target_pid:-not provided}"
  printf -- '- Process match: %s\n' "${match_string:-not provided}"
  printf -- '- Since: %s\n\n' "$since_value"

  printf '## Executed Checks\n\n'
  printf '| Check | Script | Status | Exit | Command |\n'
  printf '| --- | --- | --- | --- | --- |\n'
  for index in "${!check_labels[@]}"; do
    printf '| %s | `%s` | %s | %s | `%s` |\n' \
      "${check_labels[$index]}" \
      "${check_names[$index]}" \
      "${check_statuses[$index]}" \
      "${check_exit_codes[$index]}" \
      "${check_commands[$index]}"
  done
  printf '\n'

  printf '## Summary\n\n'
  for index in "${!check_labels[@]}"; do
    printf -- '- %s: %s\n' "${check_labels[$index]}" "${check_summaries[$index]}"
  done
  printf '\n'

  printf '## Raw Evidence\n\n'
  for index in "${!check_labels[@]}"; do
    printf '### %s\n\n' "${check_labels[$index]}"
    printf 'Script: `%s`\n\n' "${check_names[$index]}"
    printf 'Command: `%s`\n\n' "${check_commands[$index]}"
    printf 'Status: %s, exit: %s\n\n' "${check_statuses[$index]}" "${check_exit_codes[$index]}"
    printf '```text\n'
    cat "${check_outputs[$index]}"
    # The leading newline closes the fence on its own line even when the
    # captured output does not end with one.
    printf '\n```\n\n'
  done

  # printf repeats its format for every extra argument: one line each.
  printf '## L2 Handover Checklist\n\n'
  printf -- '- [ ] %s\n' \
    'Business impact confirmed' \
    'Affected host/service identified' \
    'Monitoring alert attached' \
    'Recent changes checked' \
    'Logs attached' \
    'Service owner identified' \
    'Escalation target identified'
  printf '\n'

  printf '## Escalation Notes\n\n'
  printf -- '- %s\n' \
    'Escalate when impact is active, spreading, customer-facing, or outside L2 access.' \
    'Include the alert, timeline, commands run, and the raw evidence above.' \
    'Call out skipped checks and missing inputs so the next responder does not repeat the same gap.' \
    'Do not restart, kill, remount, or rotate anything unless the incident owner approves the action.'
  printf '\n'

  printf '## Recommended Next Steps\n\n'
  printf -- '- %s\n' \
    'Confirm the symptom against monitoring and user reports.' \
    'Compare this point-in-time evidence with recent deploys, config changes, and host events.' \
    'Attach this report to the incident ticket before handoff.' \
    'If escalation is needed, include exact hostnames, service names, timestamps, and observed impact.'
}

render_report > "$report_file"

# Deliver the report: copy it to --output when one was given and confirm
# on stdout, otherwise print the report itself to stdout. `--` keeps a
# path that starts with '-' from being parsed as an option.
if [[ -n "$output_file" ]]; then
  cp -- "$report_file" "$output_file"
  printf 'OK: wrote L2 incident triage report to %s\n' "$output_file"
else
  cat -- "$report_file"
fi