revert 6475f76787
lint / shell-yaml-ansible (push) Failing after 17s

revert Add L2 incident triage report wrapper
This commit is contained in:
2026-05-14 21:16:57 +02:00
parent f2c5e43c0a
commit e03865b453
8 changed files with 3 additions and 537 deletions
@@ -16,7 +16,6 @@ They favor standard tools found on RHEL-like and Debian/Ubuntu systems. Optional
- `check_filesystem_readonly.sh` - read-only filesystem detection.
- `check_inode_usage.sh` - inode pressure and top affected mount points.
- `check_jvm_threads_heap.sh` - lightweight JVM process, heap, and thread diagnostics.
- `incident_triage_report.sh` - wrapper that runs selected checks and writes a single Markdown L2 handover report.
## Usage Examples
@@ -52,21 +51,8 @@ They favor standard tools found on RHEL-like and Debian/Ubuntu systems. Optional
./check_jvm_threads_heap.sh
./check_jvm_threads_heap.sh --pid 1234
./check_jvm_threads_heap.sh --match app-name
./incident_triage_report.sh --type cpu
./incident_triage_report.sh --type service --service nginx --since "30 minutes ago"
./incident_triage_report.sh --type network --host app.example.com --port 443
./incident_triage_report.sh --type all --service nginx --host app.example.com --port 443 --output triage.md
```
## L2 Triage Report Wrapper
`incident_triage_report.sh` collects selected incident checks into one Markdown report. It is useful for L2 mentoring, repeatable triage, and ticket evidence because it keeps the command list, point-in-time output, handover checklist, escalation notes, and recommended next steps in one place.
Supported report types are `cpu`, `memory`, `service`, `network`, `auth`, `cert`, `filesystem`, `jvm`, and `all`.
The wrapper is read-only apart from writing the requested `--output` file. It does not require root, and it safely skips a check when the underlying script is missing or not executable, or when required context such as `--service` or `--host` was not provided.
## Exit Codes
- `0` - OK.
@@ -120,5 +106,3 @@ Sanitized examples are available in [examples](./examples/):
- `filesystem-readonly.sample.txt`
- `inode-usage.sample.txt`
- `jvm-threads-heap.sample.txt`
A sanitized report sample is available at [../../../examples/incident-triage/l2-incident-triage-report.sample.md](../../../examples/incident-triage/l2-incident-triage-report.sample.md).
@@ -92,7 +92,7 @@ load_15m="unavailable"
load_per_cpu_pct=0
if [[ -r /proc/loadavg ]]; then
read -r load_1m load_5m load_15m _ < /proc/loadavg
load_per_cpu_pct="$(awk -v load_avg="$load_1m" -v cpus="$cpu_count" 'BEGIN { printf "%d", (load_avg / cpus) * 100 }')"
load_per_cpu_pct="$(awk -v load="$load_1m" -v cpus="$cpu_count" 'BEGIN { printf "%d", (load / cpus) * 100 }')"
elif command -v uptime >/dev/null 2>&1; then
load_line="$(uptime 2>/dev/null || true)"
load_1m="$(printf '%s\n' "$load_line" | sed -n 's/.*load average[s]*: *\([^,]*\).*/\1/p')"
@@ -1,385 +0,0 @@
#!/usr/bin/env bash
# Strict mode: stop on unhandled command failures, on use of unset variables,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Command-line settings. Every value starts empty except the log window,
# which defaults to the last hour.
since_value="1 hour ago"
output_file=""
match_string=""
target_pid=""
port=""
host_name=""
service_name=""
incident_type=""

# Absolute path of the directory that holds this script.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
script_dir="$(cd "$script_dir" && pwd)"
# Print the command-line help (synopsis, incident types, options, examples)
# to stdout, one line per printf argument.
usage() {
  printf '%s\n' \
    'Usage: incident_triage_report.sh --type TYPE [options]' \
    'Run selected read-only incident checks and produce a Markdown triage report.' \
    'Incident types:' \
    'cpu' \
    'memory' \
    'service' \
    'network' \
    'auth' \
    'cert' \
    'filesystem' \
    'jvm' \
    'all' \
    'Options:' \
    '--type TYPE Incident type to collect' \
    '--service SERVICE_NAME systemd service name for service checks' \
    '--host HOSTNAME_OR_FQDN host for DNS, network, or certificate checks' \
    '--port PORT TCP or TLS port for host checks' \
    '--pid PID JVM process ID' \
    '--match PROCESS_MATCH JVM process match string' \
    '--output FILE write Markdown report to FILE' \
    '--since VALUE time window for log-based checks' \
    '--help show this help' \
    'Examples:' \
    './incident_triage_report.sh --type cpu' \
    './incident_triage_report.sh --type service --service nginx --since "30 minutes ago"' \
    './incident_triage_report.sh --type network --host app.example.com --port 443' \
    './incident_triage_report.sh --type all --service nginx --host app.example.com --port 443 --output triage.md'
}
# Succeed when $1 is non-empty and consists only of the digits 0-9.
is_number() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
    *) return 0 ;;
  esac
}
# Succeed when $1 exactly matches one of the supported incident types.
valid_type() {
  local candidate
  for candidate in cpu memory service network auth cert filesystem jvm all; do
    if [[ "$1" == "$candidate" ]]; then
      return 0
    fi
  done
  return 1
}
# Parse command-line options. Every value-taking option shares one arity
# check; a missing value or an unknown option prints a CRITICAL line and
# exits with status 2, while --help/-h prints the usage text and exits 0.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --help|-h)
      usage
      exit 0
      ;;
    --type|--service|--host|--port|--pid|--match|--output|--since)
      if [[ $# -lt 2 ]]; then
        printf 'CRITICAL: %s requires a value\n' "$1"
        exit 2
      fi
      # Store the value in the variable that belongs to this option.
      case "$1" in
        --type) incident_type="$2" ;;
        --service) service_name="$2" ;;
        --host) host_name="$2" ;;
        --port) port="$2" ;;
        --pid) target_pid="$2" ;;
        --match) match_string="$2" ;;
        --output) output_file="$2" ;;
        --since) since_value="$2" ;;
      esac
      shift 2
      ;;
    *)
      printf 'CRITICAL: unknown option: %s\n' "$1"
      usage
      exit 2
      ;;
  esac
done
# Validate the parsed options before any check runs. Each failure prints a
# CRITICAL line and exits with status 2; the two --type failures also print
# the usage text.

# An incident type is mandatory...
[[ -n "$incident_type" ]] || {
  printf 'CRITICAL: --type is required\n'
  usage
  exit 2
}

# ...and must be one of the supported values.
valid_type "$incident_type" || {
  printf 'CRITICAL: unsupported incident type: %s\n' "$incident_type"
  usage
  exit 2
}

# --port and --pid are optional, but must be all digits when supplied.
if [[ -n "$port" ]]; then
  is_number "$port" || {
    printf 'CRITICAL: --port must be numeric\n'
    exit 2
  }
fi

if [[ -n "$target_pid" ]]; then
  is_number "$target_pid" || {
    printf 'CRITICAL: --pid must be numeric\n'
    exit 2
  }
fi

# --pid and --match are mutually exclusive ways to select the JVM process.
if [[ -n "$target_pid" ]] && [[ -n "$match_string" ]]; then
  printf 'CRITICAL: use either --pid or --match for JVM checks, not both\n'
  exit 2
fi
# Private scratch directory for the report and per-check output files; it is
# removed again when the script exits.
tmp_dir=$(mktemp -d)
trap 'rm -rf "$tmp_dir"' EXIT
report_file="${tmp_dir}/report.md"

# Parallel result arrays: entry N of each array describes the N-th recorded
# check (label, script name, rendered command, status, exit code, summary,
# and the file holding its captured output).
declare -a check_labels=()
declare -a check_names=()
declare -a check_commands=()
declare -a check_statuses=()
declare -a check_exit_codes=()
declare -a check_summaries=()
declare -a check_outputs=()
# Translate a check's exit code ($1) into the status word shown in the report:
# 0 OK, 1 WARNING, 2 INVALID, 3 CRITICAL, anything else ERROR.
# The word is printed without a trailing newline.
status_from_exit() {
  local -a status_names=(OK WARNING INVALID CRITICAL)
  local code="$1"
  if [[ "$code" =~ ^[0-3]$ ]]; then
    printf '%s' "${status_names[$code]}"
  else
    printf 'ERROR'
  fi
}
# Render a command and its arguments as one shell-quoted, space-separated
# line for display in the report.
# Arguments: the command followed by its arguments.
# Outputs:   the quoted words joined by single spaces, no trailing newline;
#            nothing when called without arguments.
render_command() {
  local rendered=""
  local separator=""
  local quoted
  local item
  for item in "$@"; do
    printf -v quoted '%q' "$item"
    # Join words explicitly instead of stripping trailing whitespace later:
    # a blanket strip would also eat the space that %q escaped at the end of
    # a final argument, leaving a dangling backslash in the rendered command.
    rendered+="${separator}${quoted}"
    separator=" "
  done
  printf '%s' "$rendered"
}
# Record a check that was not executed.
# Arguments: $1 - human-readable check label
#            $2 - script name shown in the report
#            $3 - reason the check was skipped
# Writes "SKIPPED: <reason>" to a new per-check file under tmp_dir and appends
# one entry to each of the parallel check_* arrays.
append_skipped_check() {
  local label="$1" name="$2" reason="$3"
  # The next free slot number doubles as the evidence file's suffix.
  local slot="${#check_labels[@]}"
  local evidence_file="${tmp_dir}/check_${slot}.txt"

  printf 'SKIPPED: %s\n' "$reason" > "$evidence_file"

  check_outputs+=("$evidence_file")
  check_summaries+=("$reason")
  check_exit_codes+=("-")
  check_statuses+=("SKIPPED")
  check_commands+=("not run")
  check_names+=("$name")
  check_labels+=("$label")
}
# Run one check script from script_dir and record its result.
# Arguments: $1 - human-readable check label
#            $2 - script file name inside script_dir
#            remaining arguments are passed to the script unchanged
# A script that does not exist or is not executable is recorded as skipped.
# Otherwise stdout and stderr are captured to a per-check file under tmp_dir,
# and the exit code, derived status word, and first output line (as the
# summary) are appended to the parallel check_* arrays.
run_check() {
  local label="$1"
  local script_name="$2"
  shift 2

  local script_path="${script_dir}/${script_name}"
  local skip_reason=""
  if [[ ! -e "$script_path" ]]; then
    skip_reason="missing script: $script_name"
  elif [[ ! -x "$script_path" ]]; then
    skip_reason="script is not executable: $script_name"
  fi
  if [[ -n "$skip_reason" ]]; then
    append_skipped_check "$label" "$script_name" "$skip_reason"
    return
  fi

  local evidence_file="${tmp_dir}/check_${#check_labels[@]}.txt"
  local command_text exit_code status summary
  command_text="$(render_command "$script_path" "$@")"

  # A non-zero exit from a check is a finding, not a failure of this wrapper,
  # so errexit is suspended while the script runs.
  set +e
  "$script_path" "$@" > "$evidence_file" 2>&1
  exit_code=$?
  set -e

  status="$(status_from_exit "$exit_code")"
  summary="$(sed -n '1p' "$evidence_file")"
  summary="${summary:-no output captured}"

  check_outputs+=("$evidence_file")
  check_summaries+=("$summary")
  check_exit_codes+=("$exit_code")
  check_statuses+=("$status")
  check_commands+=("$command_text")
  check_names+=("$script_name")
  check_labels+=("$label")
}
# CPU group: run check_high_cpu.sh with no extra arguments.
run_cpu_checks() {
  local label="CPU saturation"
  local script="check_high_cpu.sh"
  run_check "$label" "$script"
}
# Memory group: run check_high_memory_oom.sh, passing the --since window.
run_memory_checks() {
  local -a check_args=(--since "$since_value")
  run_check "Memory and OOM" "check_high_memory_oom.sh" "${check_args[@]}"
}
# Service group: run check_service_restart_loop.sh for the service given via
# --service; without a service name the check is recorded as skipped.
run_service_checks() {
  local label="Service restart loop"
  local script="check_service_restart_loop.sh"
  if [[ -n "$service_name" ]]; then
    run_check "$label" "$script" --service "$service_name" --since "$since_value"
  else
    append_skipped_check "$label" "$script" "requires --service SERVICE_NAME"
  fi
}
# Network group: run check_dns_connectivity.sh against the --host target,
# adding --port when one was supplied. Without a host the check is recorded
# as skipped.
run_network_checks() {
  local label="DNS and connectivity"
  local script="check_dns_connectivity.sh"

  if [[ -z "$host_name" ]]; then
    append_skipped_check "$label" "$script" "requires --host HOSTNAME_OR_FQDN"
    return
  fi

  local -a check_args=(--host "$host_name")
  [[ -z "$port" ]] || check_args+=(--port "$port")
  run_check "$label" "$script" "${check_args[@]}"
}
# Auth group: run check_failed_ssh_logins.sh, passing the --since window.
run_auth_checks() {
  local -a check_args=(--since "$since_value")
  run_check "Failed SSH logins" "check_failed_ssh_logins.sh" "${check_args[@]}"
}
# Certificate group: run check_certificate_expiry.sh against the --host
# target, adding --port when one was supplied. Without a host the check is
# recorded as skipped.
run_cert_checks() {
  local label="Certificate expiry"
  local script="check_certificate_expiry.sh"

  if [[ -z "$host_name" ]]; then
    append_skipped_check "$label" "$script" "requires --host HOSTNAME_OR_FQDN"
    return
  fi

  local -a check_args=(--host "$host_name")
  [[ -z "$port" ]] || check_args+=(--port "$port")
  run_check "$label" "$script" "${check_args[@]}"
}
# Filesystem group: run the read-only filesystem check, then the inode usage
# check. Neither takes extra arguments.
run_filesystem_checks() {
  local label
  label="Read-only filesystems"
  run_check "$label" "check_filesystem_readonly.sh"
  label="Inode usage"
  run_check "$label" "check_inode_usage.sh"
}
# JVM group: run check_jvm_threads_heap.sh, selecting the process by --pid
# when one was given, otherwise by --match when one was given, otherwise with
# no selector arguments at all.
run_jvm_checks() {
  local args=()
  if [[ -n "$target_pid" ]]; then
    args+=(--pid "$target_pid")
  elif [[ -n "$match_string" ]]; then
    args+=(--match "$match_string")
  fi
  # With nounset enabled, Bash releases before 4.4 treat "${args[@]}" on an
  # empty array as an unbound variable and abort. The ${var+...} guard expands
  # to nothing in that case and to the quoted elements otherwise.
  run_check "JVM threads and heap" "check_jvm_threads_heap.sh" ${args[@]+"${args[@]}"}
}
# Run the check group(s) selected by --type. Groups are visited in a fixed
# order; "all" runs every group, any other type runs only the group of the
# same name.
for check_group in cpu memory service network auth cert filesystem jvm; do
  if [[ "$incident_type" == "all" || "$incident_type" == "$check_group" ]]; then
    "run_${check_group}_checks"
  fi
done
# Drop the loop variable so it does not linger in the script's global scope.
unset check_group
# Report metadata: UTC timestamp plus local hostname and user name. The last
# two fall back to "unknown" when the lookup command fails.
generated_at="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
local_hostname="$(hostname 2>/dev/null || printf 'unknown')"
current_user="$(id -un 2>/dev/null || printf 'unknown')"
# Everything printed inside this group is redirected into report_file.
{
# Header: title and the inputs the report was generated with. Options that
# were not supplied are shown as "not provided".
printf '# L2 Incident Triage Report\n\n'
printf -- '- Generated: %s\n' "$generated_at"
printf -- '- Local hostname: %s\n' "$local_hostname"
printf -- '- Current user: %s\n' "$current_user"
printf -- '- Incident type: %s\n' "$incident_type"
printf -- '- Service: %s\n' "${service_name:-not provided}"
printf -- '- Host: %s\n' "${host_name:-not provided}"
printf -- '- Port: %s\n' "${port:-not provided}"
printf -- '- PID: %s\n' "${target_pid:-not provided}"
printf -- '- Process match: %s\n' "${match_string:-not provided}"
printf -- '- Since: %s\n\n' "$since_value"
# Table with one row per recorded check (executed or skipped).
printf '## Executed Checks\n\n'
printf '| Check | Script | Status | Exit | Command |\n'
printf '| --- | --- | --- | --- | --- |\n'
for index in "${!check_labels[@]}"; do
printf "| %s | \`%s\` | %s | %s | \`%s\` |\n" \
"${check_labels[$index]}" \
"${check_names[$index]}" \
"${check_statuses[$index]}" \
"${check_exit_codes[$index]}" \
"${check_commands[$index]}"
done
printf '\n'
# One bullet per check carrying its summary (first output line, or the skip
# reason for checks that were not run).
printf '## Summary\n\n'
for index in "${!check_labels[@]}"; do
printf -- '- %s: %s\n' "${check_labels[$index]}" "${check_summaries[$index]}"
done
printf '\n'
# Full captured output of every check, each inside its own fenced text block.
printf '## Raw Evidence\n\n'
for index in "${!check_labels[@]}"; do
printf '### %s\n\n' "${check_labels[$index]}"
printf "Script: \`%s\`\n\n" "${check_names[$index]}"
printf "Command: \`%s\`\n\n" "${check_commands[$index]}"
printf 'Status: %s, exit: %s\n\n' "${check_statuses[$index]}" "${check_exit_codes[$index]}"
printf '```text\n'
cat "${check_outputs[$index]}"
printf '\n```\n\n'
done
# Static guidance sections: unchecked handover checklist, escalation notes,
# and recommended next steps.
printf '## L2 Handover Checklist\n\n'
printf -- '- [ ] Business impact confirmed\n'
printf -- '- [ ] Affected host/service identified\n'
printf -- '- [ ] Monitoring alert attached\n'
printf -- '- [ ] Recent changes checked\n'
printf -- '- [ ] Logs attached\n'
printf -- '- [ ] Service owner identified\n'
printf -- '- [ ] Escalation target identified\n\n'
printf '## Escalation Notes\n\n'
printf -- '- Escalate when impact is active, spreading, customer-facing, or outside L2 access.\n'
printf -- '- Include the alert, timeline, commands run, and the raw evidence above.\n'
printf -- '- Call out skipped checks and missing inputs so the next responder does not repeat the same gap.\n'
printf -- '- Do not restart, kill, remount, or rotate anything unless the incident owner approves the action.\n\n'
printf '## Recommended Next Steps\n\n'
printf -- '- Confirm the symptom against monitoring and user reports.\n'
printf -- '- Compare this point-in-time evidence with recent deploys, config changes, and host events.\n'
printf -- '- Attach this report to the incident ticket before handoff.\n'
printf -- '- If escalation is needed, include exact hostnames, service names, timestamps, and observed impact.\n'
} > "$report_file"
# Deliver the finished report: copy it to the --output path and print a
# one-line confirmation when a path was given, otherwise write the report to
# stdout.
if [[ -n "$output_file" ]]; then
  # "--" ends option parsing, so an output name that begins with "-" is
  # treated as a file operand instead of being parsed as a cp option.
  cp -- "$report_file" "$output_file"
  printf 'OK: wrote L2 incident triage report to %s\n' "$output_file"
else
  cat "$report_file"
fi