This commit is contained in:
@@ -0,0 +1,385 @@
|
||||
#!/usr/bin/env bash
#
# Runs the selected read-only incident checks and assembles their results
# into a single Markdown triage report.

# Strict mode: stop on unhandled command failures, on expansion of unset
# variables, and when any stage of a pipeline fails.
set -euo pipefail

# Command-line inputs. An empty value means the option was not supplied;
# since_value is the only one that starts with a default.
incident_type=""
service_name=""
host_name=""
port=""
target_pid=""
match_string=""
output_file=""
since_value="1 hour ago"

# Absolute path of the directory that holds this script.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
# Print the command-line help (synopsis, incident types, options, examples)
# to standard output, one entry of help_lines per output line.
usage() {
  local -a help_lines=(
    'Usage: incident_triage_report.sh --type TYPE [options]'
    ''
    'Run selected read-only incident checks and produce a Markdown triage report.'
    ''
    'Incident types:'
    'cpu'
    'memory'
    'service'
    'network'
    'auth'
    'cert'
    'filesystem'
    'jvm'
    'all'
    ''
    'Options:'
    '--type TYPE Incident type to collect'
    '--service SERVICE_NAME systemd service name for service checks'
    '--host HOSTNAME_OR_FQDN host for DNS, network, or certificate checks'
    '--port PORT TCP or TLS port for host checks'
    '--pid PID JVM process ID'
    '--match PROCESS_MATCH JVM process match string'
    '--output FILE write Markdown report to FILE'
    '--since VALUE time window for log-based checks'
    '--help show this help'
    ''
    'Examples:'
    './incident_triage_report.sh --type cpu'
    './incident_triage_report.sh --type service --service nginx --since "30 minutes ago"'
    './incident_triage_report.sh --type network --host app.example.com --port 443'
    './incident_triage_report.sh --type all --service nginx --host app.example.com --port 443 --output triage.md'
  )

  printf '%s\n' "${help_lines[@]}"
}
|
||||
|
||||
# Succeed (status 0) when $1 is made up solely of one or more digits 0-9;
# fail (status 1) for the empty string or anything containing another character.
is_number() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
    *) return 0 ;;
  esac
}
|
||||
|
||||
# Succeed (status 0) when $1 is exactly one of the supported incident types,
# otherwise fail (status 1).
valid_type() {
  local known_type
  for known_type in cpu memory service network auth cert filesystem jvm all; do
    if [[ "$1" == "$known_type" ]]; then
      return 0
    fi
  done
  return 1
}
|
||||
|
||||
# Parse the command line. Every option except --help/-h consumes exactly one
# value. A missing value or an unrecognised option prints a CRITICAL line and
# ends the run with exit status 2 (an unknown option also prints the usage).
while [[ $# -gt 0 ]]; do
  case "$1" in
    -h | --help)
      usage
      exit 0
      ;;
    --type)
      if (($# < 2)); then
        printf 'CRITICAL: --type requires a value\n'
        exit 2
      fi
      incident_type="$2"
      shift 2
      ;;
    --service)
      if (($# < 2)); then
        printf 'CRITICAL: --service requires a value\n'
        exit 2
      fi
      service_name="$2"
      shift 2
      ;;
    --host)
      if (($# < 2)); then
        printf 'CRITICAL: --host requires a value\n'
        exit 2
      fi
      host_name="$2"
      shift 2
      ;;
    --port)
      if (($# < 2)); then
        printf 'CRITICAL: --port requires a value\n'
        exit 2
      fi
      port="$2"
      shift 2
      ;;
    --pid)
      if (($# < 2)); then
        printf 'CRITICAL: --pid requires a value\n'
        exit 2
      fi
      target_pid="$2"
      shift 2
      ;;
    --match)
      if (($# < 2)); then
        printf 'CRITICAL: --match requires a value\n'
        exit 2
      fi
      match_string="$2"
      shift 2
      ;;
    --output)
      if (($# < 2)); then
        printf 'CRITICAL: --output requires a value\n'
        exit 2
      fi
      output_file="$2"
      shift 2
      ;;
    --since)
      if (($# < 2)); then
        printf 'CRITICAL: --since requires a value\n'
        exit 2
      fi
      since_value="$2"
      shift 2
      ;;
    *)
      printf 'CRITICAL: unknown option: %s\n' "$1"
      usage
      exit 2
      ;;
  esac
done
|
||||
|
||||
# Validate the parsed options. Each failure prints a CRITICAL line and exits
# with status 2; problems with --type additionally print the usage text.

# --type is mandatory ...
if [[ "$incident_type" == "" ]]; then
  printf 'CRITICAL: --type is required\n'
  usage
  exit 2
fi

# ... and must name a supported incident type.
valid_type "$incident_type" || {
  printf 'CRITICAL: unsupported incident type: %s\n' "$incident_type"
  usage
  exit 2
}

# --port and --pid are optional, but must be all digits when given.
if [[ "$port" != "" ]]; then
  is_number "$port" || {
    printf 'CRITICAL: --port must be numeric\n'
    exit 2
  }
fi

if [[ "$target_pid" != "" ]]; then
  is_number "$target_pid" || {
    printf 'CRITICAL: --pid must be numeric\n'
    exit 2
  }
fi

# The JVM target is selected by PID or by match string, never both.
if [[ "$target_pid" != "" && "$match_string" != "" ]]; then
  printf 'CRITICAL: use either --pid or --match for JVM checks, not both\n'
  exit 2
fi
|
||||
|
||||
# Scratch directory for the per-check output files and the assembled report;
# it is removed when the script exits.
tmp_dir="$(mktemp -d)"
remove_tmp_dir() {
  rm -rf "$tmp_dir"
}
trap remove_tmp_dir EXIT

report_file="${tmp_dir}/report.md"

# Result store: one array per recorded attribute of a check. All start empty.
declare -a check_labels=() check_names=() check_commands=() check_statuses=()
declare -a check_exit_codes=() check_summaries=() check_outputs=()
|
||||
|
||||
# Translate a check's exit code ($1) into a status word printed without a
# trailing newline: 0 OK, 1 WARNING, 2 INVALID, 3 CRITICAL, anything else ERROR.
status_from_exit() {
  local -a status_words=(OK WARNING INVALID CRITICAL)

  if [[ "$1" =~ ^[0-3]$ ]]; then
    printf '%s' "${status_words[$1]}"
  else
    printf 'ERROR'
  fi
}
|
||||
|
||||
# Print the given words as one shell-quoted command line: each argument is
# rendered with printf %q, arguments are separated by single spaces, and no
# trailing space or newline is emitted. Prints nothing when called without
# arguments.
render_command() {
  local rendered=""

  # With zero arguments printf would still apply the format once and emit
  # "'' ", so only format when there is something to render.
  if (($# > 0)); then
    printf -v rendered '%q ' "$@"
    # Drop exactly the one separator space added after the last argument.
    # Stripping all trailing whitespace would also eat the escaped space of an
    # argument that itself ends in a space (turning "a\ " into "a\").
    rendered="${rendered% }"
  fi

  printf '%s' "$rendered"
}
|
||||
|
||||
# Record a check that was not executed.
#   $1 - human-readable check label
#   $2 - name of the check script
#   $3 - reason the check was skipped
# Writes "SKIPPED: <reason>" to a new check_<N>.txt file under tmp_dir (N is
# the number of checks recorded so far) and appends one entry to each of the
# check_* arrays.
append_skipped_check() {
  local check_label="$1" check_script="$2" skip_reason="$3"
  local slot="${#check_labels[@]}"
  local evidence_file="${tmp_dir}/check_${slot}.txt"

  printf 'SKIPPED: %s\n' "$skip_reason" > "$evidence_file"

  check_outputs+=("$evidence_file")
  check_summaries+=("$skip_reason")
  check_exit_codes+=("-")
  check_statuses+=("SKIPPED")
  check_commands+=("not run")
  check_names+=("$check_script")
  check_labels+=("$check_label")
}
|
||||
|
||||
# Run one check script and record its outcome.
#   $1    - human-readable check label
#   $2    - file name of the check script, resolved relative to script_dir
#   $3... - arguments passed to the check script unchanged
# The script's stdout and stderr are captured together in a check_<N>.txt file
# under tmp_dir. One entry is appended to each check_* array: a SKIPPED record
# (via append_skipped_check) when the script is missing or not executable,
# otherwise the rendered command line, the exit code, the status mapped from
# that exit code, and a one-line summary.
run_check() {
  local label="$1"
  local script_name="$2"
  shift 2

  local script_path="${script_dir}/${script_name}"
  local output_path="${tmp_dir}/check_${#check_labels[@]}.txt"
  local command_text
  local status
  local exit_code=0
  local line=""
  local summary=""

  if [[ ! -e "$script_path" ]]; then
    append_skipped_check "$label" "$script_name" "missing script: $script_name"
    return
  fi
  if [[ ! -x "$script_path" ]]; then
    append_skipped_check "$label" "$script_name" "script is not executable: $script_name"
    return
  fi

  # A non-zero exit is a finding, not a failure of this script. Capturing it
  # with "||" keeps errexit from firing without switching the option off and
  # back on, so the caller's shell options are left exactly as they were.
  "$script_path" "$@" > "$output_path" 2>&1 || exit_code=$?

  command_text="$(render_command "$script_path" "$@")"
  status="$(status_from_exit "$exit_code")"

  # Summary is the first line holding something other than whitespace, so a
  # leading blank line does not hide real output behind "no output captured".
  # The "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  if [[ -r "$output_path" ]]; then
    while IFS= read -r line || [[ -n "$line" ]]; do
      if [[ -n "${line//[[:space:]]/}" ]]; then
        summary="$line"
        break
      fi
    done < "$output_path"
  fi
  if [[ -z "$summary" ]]; then
    summary="no output captured"
  fi

  check_labels+=("$label")
  check_names+=("$script_name")
  check_commands+=("$command_text")
  check_statuses+=("$status")
  check_exit_codes+=("$exit_code")
  check_summaries+=("$summary")
  check_outputs+=("$output_path")
}
|
||||
|
||||
# CPU incident group: hands the CPU saturation check to run_check with no
# extra arguments.
run_cpu_checks() {
  local label="CPU saturation"
  local script="check_high_cpu.sh"

  run_check "$label" "$script"
}
|
||||
|
||||
# Memory incident group: hands the memory/OOM check to run_check, passing the
# configured time window as --since.
run_memory_checks() {
  local label="Memory and OOM"
  local script="check_high_memory_oom.sh"

  run_check "$label" "$script" --since "$since_value"
}
|
||||
|
||||
# Service incident group: hands the restart-loop check to run_check for the
# service named by --service, or records the check as skipped when no service
# name was provided.
run_service_checks() {
  local label="Service restart loop"
  local script="check_service_restart_loop.sh"

  if [[ -n "$service_name" ]]; then
    run_check "$label" "$script" --service "$service_name" --since "$since_value"
  else
    append_skipped_check "$label" "$script" "requires --service SERVICE_NAME"
  fi
}
|
||||
|
||||
# Network incident group: hands the DNS/connectivity check to run_check for
# --host, adding --port only when a port was given. Without a host the check
# is recorded as skipped.
run_network_checks() {
  local label="DNS and connectivity"
  local script="check_dns_connectivity.sh"

  if [[ -z "$host_name" ]]; then
    append_skipped_check "$label" "$script" "requires --host HOSTNAME_OR_FQDN"
    return
  fi

  if [[ -n "$port" ]]; then
    run_check "$label" "$script" --host "$host_name" --port "$port"
  else
    run_check "$label" "$script" --host "$host_name"
  fi
}
|
||||
|
||||
# Auth incident group: hands the failed-SSH-login check to run_check, passing
# the configured time window as --since.
run_auth_checks() {
  local label="Failed SSH logins"
  local script="check_failed_ssh_logins.sh"

  run_check "$label" "$script" --since "$since_value"
}
|
||||
|
||||
# Certificate incident group: hands the certificate-expiry check to run_check
# for --host, adding --port only when a port was given. Without a host the
# check is recorded as skipped.
run_cert_checks() {
  local label="Certificate expiry"
  local script="check_certificate_expiry.sh"

  if [[ -z "$host_name" ]]; then
    append_skipped_check "$label" "$script" "requires --host HOSTNAME_OR_FQDN"
    return
  fi

  if [[ -n "$port" ]]; then
    run_check "$label" "$script" --host "$host_name" --port "$port"
  else
    run_check "$label" "$script" --host "$host_name"
  fi
}
|
||||
|
||||
# Filesystem incident group: hands the read-only-filesystem check and then the
# inode-usage check to run_check, neither with extra arguments.
run_filesystem_checks() {
  local spec

  # Each entry is "<label>:<script>"; neither label contains a colon.
  for spec in \
    "Read-only filesystems:check_filesystem_readonly.sh" \
    "Inode usage:check_inode_usage.sh"; do
    run_check "${spec%%:*}" "${spec#*:}"
  done
}
|
||||
|
||||
# JVM incident group: hands the JVM threads/heap check to run_check, targeting
# the process by --pid when a PID was given, else by --match when a match
# string was given, else with no target arguments at all.
#
# The three calls are spelled out instead of expanding an argument array that
# may be empty: under "set -o nounset", Bash releases older than 4.4 treat
# "${args[@]}" on an empty array as an unbound variable and abort.
run_jvm_checks() {
  local label="JVM threads and heap"
  local script="check_jvm_threads_heap.sh"

  if [[ -n "$target_pid" ]]; then
    run_check "$label" "$script" --pid "$target_pid"
  elif [[ -n "$match_string" ]]; then
    run_check "$label" "$script" --match "$match_string"
  else
    run_check "$label" "$script"
  fi
}
|
||||
|
||||
# Run the check group selected by --type. "all" runs every group in a fixed
# order; each group is implemented by the function run_<group>_checks.
case "$incident_type" in
  all)
    for check_group in cpu memory service network auth cert filesystem jvm; do
      "run_${check_group}_checks"
    done
    ;;
  cpu | memory | service | network | auth | cert | filesystem | jvm)
    "run_${incident_type}_checks"
    ;;
esac
|
||||
|
||||
# Report metadata: UTC timestamp in ISO 8601 form, plus the local host name
# and user name, each falling back to "unknown" when the lookup command fails.
generated_at="$(date -u '+%FT%TZ')"
local_hostname="$(hostname 2>/dev/null || printf '%s' 'unknown')"
current_user="$(id -un 2>/dev/null || printf '%s' 'unknown')"
|
||||
|
||||
# Assemble the Markdown report in report_file: run metadata, a table of the
# recorded checks, a one-line summary per check, the captured output of every
# check in a fenced block, and the fixed handover, escalation and next-step
# guidance. Check data is read from the check_* arrays by position.
{
  # Unquoted heredoc: the ${...} references are expanded, the rest is literal.
  cat <<EOF
# L2 Incident Triage Report

- Generated: ${generated_at}
- Local hostname: ${local_hostname}
- Current user: ${current_user}
- Incident type: ${incident_type}
- Service: ${service_name:-not provided}
- Host: ${host_name:-not provided}
- Port: ${port:-not provided}
- PID: ${target_pid:-not provided}
- Process match: ${match_string:-not provided}
- Since: ${since_value}

## Executed Checks

| Check | Script | Status | Exit | Command |
| --- | --- | --- | --- | --- |
EOF

  check_count="${#check_labels[@]}"

  for ((row = 0; row < check_count; row++)); do
    printf '| %s | `%s` | %s | %s | `%s` |\n' \
      "${check_labels[row]}" \
      "${check_names[row]}" \
      "${check_statuses[row]}" \
      "${check_exit_codes[row]}" \
      "${check_commands[row]}"
  done
  printf '\n'

  printf '## Summary\n\n'
  for ((row = 0; row < check_count; row++)); do
    printf -- '- %s: %s\n' "${check_labels[row]}" "${check_summaries[row]}"
  done
  printf '\n'

  printf '## Raw Evidence\n\n'
  for ((row = 0; row < check_count; row++)); do
    printf '### %s\n\n' "${check_labels[row]}"
    printf 'Script: `%s`\n\n' "${check_names[row]}"
    printf 'Command: `%s`\n\n' "${check_commands[row]}"
    printf 'Status: %s, exit: %s\n\n' "${check_statuses[row]}" "${check_exit_codes[row]}"
    printf '```text\n'
    cat "${check_outputs[row]}"
    # The leading newline puts the closing fence on its own line even when the
    # captured output does not end with one.
    printf '\n```\n\n'
  done

  # Quoted heredoc: fixed guidance text, emitted verbatim.
  cat <<'EOF'
## L2 Handover Checklist

- [ ] Business impact confirmed
- [ ] Affected host/service identified
- [ ] Monitoring alert attached
- [ ] Recent changes checked
- [ ] Logs attached
- [ ] Service owner identified
- [ ] Escalation target identified

## Escalation Notes

- Escalate when impact is active, spreading, customer-facing, or outside L2 access.
- Include the alert, timeline, commands run, and the raw evidence above.
- Call out skipped checks and missing inputs so the next responder does not repeat the same gap.
- Do not restart, kill, remount, or rotate anything unless the incident owner approves the action.

## Recommended Next Steps

- Confirm the symptom against monitoring and user reports.
- Compare this point-in-time evidence with recent deploys, config changes, and host events.
- Attach this report to the incident ticket before handoff.
- If escalation is needed, include exact hostnames, service names, timestamps, and observed impact.
EOF
} > "$report_file"
|
||||
|
||||
# Deliver the finished report: copy it to the --output path and print a
# one-line confirmation, or print the report itself to stdout when no output
# path was given.
if [[ -z "$output_file" ]]; then
  cat "$report_file"
else
  # "--" ends option parsing, so an output path that begins with "-" is
  # treated as a file name instead of being parsed as a cp option.
  cp -- "$report_file" "$output_file"
  printf 'OK: wrote L2 incident triage report to %s\n' "$output_file"
fi
|
||||
Reference in New Issue
Block a user