#!/usr/bin/env bash
#
# incident_triage_report.sh
#
# Runs the read-only check scripts that live next to this file for one
# incident type (or all of them) and renders the results as a Markdown
# triage report. The report is printed to stdout, or copied to the path
# given with --output.
#
# Exit status: 0 after --help or after the report has been produced,
# 2 when the command line is invalid. Check failures are recorded in the
# report; they do not change this script's exit status.

set -o errexit
set -o nounset
set -o pipefail

incident_type=""
service_name=""
host_name=""
port=""
target_pid=""
match_string=""
output_file=""
since_value="1 hour ago"

# Directory holding this script; the check scripts are looked up here.
# CDPATH is cleared for the cd so an inherited value cannot make cd print a
# path or resolve the directory somewhere else.
script_dir="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

#######################################
# Print the command-line help text.
# Outputs: help text on stdout
#######################################
usage() {
  cat <<'USAGE'
Usage:
  incident_triage_report.sh --type TYPE [options]

Run selected read-only incident checks and produce a Markdown triage report.

Incident types:
  cpu
  memory
  service
  network
  auth
  cert
  filesystem
  jvm
  all

Options:
  --type TYPE                 Incident type to collect
  --service SERVICE_NAME      systemd service name for service checks
  --host HOSTNAME_OR_FQDN     host for DNS, network, or certificate checks
  --port PORT                 TCP or TLS port for host checks
  --pid PID                   JVM process ID
  --match PROCESS_MATCH       JVM process match string
  --output FILE               write Markdown report to FILE
  --since VALUE               time window for log-based checks
  --help                      show this help

Examples:
  ./incident_triage_report.sh --type cpu
  ./incident_triage_report.sh --type service --service nginx --since "30 minutes ago"
  ./incident_triage_report.sh --type network --host app.example.com --port 443
  ./incident_triage_report.sh --type all --service nginx --host app.example.com --port 443 --output triage.md
USAGE
}

#######################################
# Test whether a value consists only of decimal digits.
# Arguments: $1 - value to test
# Returns:   0 when $1 is one or more digits, non-zero otherwise
#######################################
is_number() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

#######################################
# Test whether a value is a supported incident type.
# Arguments: $1 - incident type
# Returns:   0 when supported, 1 otherwise
#######################################
valid_type() {
  case "$1" in
    cpu | memory | service | network | auth | cert | filesystem | jvm | all)
      return 0
      ;;
    *)
      return 1
      ;;
  esac
}

#######################################
# Abort with exit status 2 when an option is not followed by a value.
# Arguments: $1 - option name, used in the message
#            $2 - number of command-line words left, option included
# Outputs:   a CRITICAL line on stdout when the value is missing
#######################################
require_value() {
  local option="$1"
  local remaining="$2"

  if ((remaining < 2)); then
    printf 'CRITICAL: %s requires a value\n' "$option"
    exit 2
  fi
}

while (($# > 0)); do
  case "$1" in
    --type)
      require_value "$1" "$#"
      incident_type="$2"
      shift 2
      ;;
    --service)
      require_value "$1" "$#"
      service_name="$2"
      shift 2
      ;;
    --host)
      require_value "$1" "$#"
      host_name="$2"
      shift 2
      ;;
    --port)
      require_value "$1" "$#"
      port="$2"
      shift 2
      ;;
    --pid)
      require_value "$1" "$#"
      target_pid="$2"
      shift 2
      ;;
    --match)
      require_value "$1" "$#"
      match_string="$2"
      shift 2
      ;;
    --output)
      require_value "$1" "$#"
      output_file="$2"
      shift 2
      ;;
    --since)
      require_value "$1" "$#"
      since_value="$2"
      shift 2
      ;;
    --help | -h)
      usage
      exit 0
      ;;
    *)
      printf 'CRITICAL: unknown option: %s\n' "$1"
      usage
      exit 2
      ;;
  esac
done

if [[ -z "$incident_type" ]]; then
  printf 'CRITICAL: --type is required\n'
  usage
  exit 2
fi

if ! valid_type "$incident_type"; then
  printf 'CRITICAL: unsupported incident type: %s\n' "$incident_type"
  usage
  exit 2
fi

if [[ -n "$port" ]] && ! is_number "$port"; then
  printf 'CRITICAL: --port must be numeric\n'
  exit 2
fi

if [[ -n "$target_pid" ]] && ! is_number "$target_pid"; then
  printf 'CRITICAL: --pid must be numeric\n'
  exit 2
fi

if [[ -n "$target_pid" && -n "$match_string" ]]; then
  printf 'CRITICAL: use either --pid or --match for JVM checks, not both\n'
  exit 2
fi

# Scratch directory for the per-check output files and the report; removed
# on every exit path.
tmp_dir="$(mktemp -d)"
trap 'rm -rf -- "$tmp_dir"' EXIT
report_file="$tmp_dir/report.md"

# Parallel arrays: index N in each array describes the N-th recorded check.
check_labels=()
check_names=()
check_commands=()
check_statuses=()
check_exit_codes=()
check_summaries=()
check_outputs=()

#######################################
# Translate a check script's exit code into a status word.
# Arguments: $1 - exit code
# Outputs:   OK (0), WARNING (1), INVALID (2), CRITICAL (3) or ERROR (any
#            other value) on stdout, without a trailing newline
#######################################
status_from_exit() {
  case "$1" in
    0) printf 'OK' ;;
    1) printf 'WARNING' ;;
    2) printf 'INVALID' ;;
    3) printf 'CRITICAL' ;;
    *) printf 'ERROR' ;;
  esac
}

#######################################
# Render a command line as shell-quoted words separated by single spaces.
# Arguments: the command and its arguments
# Outputs:   the quoted command line on stdout, without a trailing newline
#######################################
render_command() {
  local rendered=""
  local quoted
  local item

  for item in "$@"; do
    printf -v quoted '%q' "$item"
    # Join with a separator instead of trimming afterwards: trimming would
    # also eat the escaped blank of an argument that ends in whitespace.
    rendered+="${rendered:+ }${quoted}"
  done

  printf '%s' "$rendered"
}

#######################################
# Record a check that was not executed.
# Globals:   tmp_dir (read), check_* arrays (appended)
# Arguments: $1 - check label
#            $2 - check script name
#            $3 - reason the check was skipped
#######################################
append_skipped_check() {
  local label="$1"
  local name="$2"
  local reason="$3"
  local output_path="$tmp_dir/check_${#check_labels[@]}.txt"

  printf 'SKIPPED: %s\n' "$reason" > "$output_path"

  check_labels+=("$label")
  check_names+=("$name")
  check_commands+=("not run")
  check_statuses+=("SKIPPED")
  check_exit_codes+=("-")
  check_summaries+=("$reason")
  check_outputs+=("$output_path")
}

#######################################
# Run one check script and record its status, exit code and output.
# A missing or non-executable script is recorded as SKIPPED instead.
# Globals:   script_dir, tmp_dir (read), check_* arrays (appended)
# Arguments: $1 - check label
#            $2 - script file name, resolved relative to script_dir
#            remaining arguments are passed to the script unchanged
#######################################
run_check() {
  local label="$1"
  local script_name="$2"
  shift 2

  local script_path="${script_dir}/${script_name}"
  local output_path="$tmp_dir/check_${#check_labels[@]}.txt"
  local command_text
  local exit_code=0
  local status
  local summary=""

  if [[ ! -e "$script_path" ]]; then
    append_skipped_check "$label" "$script_name" "missing script: $script_name"
    return
  fi

  if [[ ! -x "$script_path" ]]; then
    append_skipped_check "$label" "$script_name" "script is not executable: $script_name"
    return
  fi

  command_text="$(render_command "$script_path" "$@")"

  # A non-zero exit is a finding to report, not a reason to abort, so the
  # status is captured without tripping errexit.
  "$script_path" "$@" > "$output_path" 2>&1 || exit_code=$?

  status="$(status_from_exit "$exit_code")"

  # The first output line is the summary. read returns non-zero on empty
  # output or a missing final newline, which is not an error here.
  IFS= read -r summary < "$output_path" || true
  if [[ -z "$summary" ]]; then
    summary="no output captured"
  fi

  check_labels+=("$label")
  check_names+=("$script_name")
  check_commands+=("$command_text")
  check_statuses+=("$status")
  check_exit_codes+=("$exit_code")
  check_summaries+=("$summary")
  check_outputs+=("$output_path")
}

# Each run_*_checks function below queues the checks for one incident type.
# Checks that need an input the user did not supply are recorded as skipped.

run_cpu_checks() {
  run_check "CPU saturation" "check_high_cpu.sh"
}

run_memory_checks() {
  run_check "Memory and OOM" "check_high_memory_oom.sh" --since "$since_value"
}

run_service_checks() {
  if [[ -z "$service_name" ]]; then
    append_skipped_check "Service restart loop" "check_service_restart_loop.sh" "requires --service SERVICE_NAME"
    return
  fi

  run_check "Service restart loop" "check_service_restart_loop.sh" --service "$service_name" --since "$since_value"
}

run_network_checks() {
  if [[ -z "$host_name" ]]; then
    append_skipped_check "DNS and connectivity" "check_dns_connectivity.sh" "requires --host HOSTNAME_OR_FQDN"
    return
  fi

  local args=(--host "$host_name")
  if [[ -n "$port" ]]; then
    args+=(--port "$port")
  fi

  run_check "DNS and connectivity" "check_dns_connectivity.sh" "${args[@]}"
}

run_auth_checks() {
  run_check "Failed SSH logins" "check_failed_ssh_logins.sh" --since "$since_value"
}

run_cert_checks() {
  if [[ -z "$host_name" ]]; then
    append_skipped_check "Certificate expiry" "check_certificate_expiry.sh" "requires --host HOSTNAME_OR_FQDN"
    return
  fi

  local args=(--host "$host_name")
  if [[ -n "$port" ]]; then
    args+=(--port "$port")
  fi

  run_check "Certificate expiry" "check_certificate_expiry.sh" "${args[@]}"
}

run_filesystem_checks() {
  run_check "Read-only filesystems" "check_filesystem_readonly.sh"
  run_check "Inode usage" "check_inode_usage.sh"
}

run_jvm_checks() {
  local args=()

  if [[ -n "$target_pid" ]]; then
    args+=(--pid "$target_pid")
  elif [[ -n "$match_string" ]]; then
    args+=(--match "$match_string")
  fi

  # args may be empty; Bash older than 4.4 reports "unbound variable" under
  # nounset for a plain "${args[@]}" on an empty array, so guard it.
  run_check "JVM threads and heap" "check_jvm_threads_heap.sh" ${args[@]+"${args[@]}"}
}

case "$incident_type" in
  cpu)
    run_cpu_checks
    ;;
  memory)
    run_memory_checks
    ;;
  service)
    run_service_checks
    ;;
  network)
    run_network_checks
    ;;
  auth)
    run_auth_checks
    ;;
  cert)
    run_cert_checks
    ;;
  filesystem)
    run_filesystem_checks
    ;;
  jvm)
    run_jvm_checks
    ;;
  all)
    run_cpu_checks
    run_memory_checks
    run_service_checks
    run_network_checks
    run_auth_checks
    run_cert_checks
    run_filesystem_checks
    run_jvm_checks
    ;;
esac

generated_at="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
local_hostname="$(hostname 2>/dev/null || printf 'unknown')"
current_user="$(id -un 2>/dev/null || printf 'unknown')"

# Render the whole report into the scratch file first, then publish it.
{
  printf '# L2 Incident Triage Report\n\n'
  printf -- '- Generated: %s\n' "$generated_at"
  printf -- '- Local hostname: %s\n' "$local_hostname"
  printf -- '- Current user: %s\n' "$current_user"
  printf -- '- Incident type: %s\n' "$incident_type"
  printf -- '- Service: %s\n' "${service_name:-not provided}"
  printf -- '- Host: %s\n' "${host_name:-not provided}"
  printf -- '- Port: %s\n' "${port:-not provided}"
  printf -- '- PID: %s\n' "${target_pid:-not provided}"
  printf -- '- Process match: %s\n' "${match_string:-not provided}"
  printf -- '- Since: %s\n\n' "$since_value"

  printf '## Executed Checks\n\n'
  printf '| Check | Script | Status | Exit | Command |\n'
  printf '| --- | --- | --- | --- | --- |\n'
  for index in "${!check_labels[@]}"; do
    printf "| %s | \`%s\` | %s | %s | \`%s\` |\n" \
      "${check_labels[$index]}" \
      "${check_names[$index]}" \
      "${check_statuses[$index]}" \
      "${check_exit_codes[$index]}" \
      "${check_commands[$index]}"
  done
  printf '\n'

  printf '## Summary\n\n'
  for index in "${!check_labels[@]}"; do
    printf -- '- %s: %s\n' "${check_labels[$index]}" "${check_summaries[$index]}"
  done
  printf '\n'

  printf '## Raw Evidence\n\n'
  for index in "${!check_labels[@]}"; do
    printf '### %s\n\n' "${check_labels[$index]}"
    printf "Script: \`%s\`\n\n" "${check_names[$index]}"
    printf "Command: \`%s\`\n\n" "${check_commands[$index]}"
    printf 'Status: %s, exit: %s\n\n' "${check_statuses[$index]}" "${check_exit_codes[$index]}"
    printf '```text\n'
    cat "${check_outputs[$index]}"
    printf '\n```\n\n'
  done

  printf '## L2 Handover Checklist\n\n'
  printf -- '- [ ] Business impact confirmed\n'
  printf -- '- [ ] Affected host/service identified\n'
  printf -- '- [ ] Monitoring alert attached\n'
  printf -- '- [ ] Recent changes checked\n'
  printf -- '- [ ] Logs attached\n'
  printf -- '- [ ] Service owner identified\n'
  printf -- '- [ ] Escalation target identified\n\n'

  printf '## Escalation Notes\n\n'
  printf -- '- Escalate when impact is active, spreading, customer-facing, or outside L2 access.\n'
  printf -- '- Include the alert, timeline, commands run, and the raw evidence above.\n'
  printf -- '- Call out skipped checks and missing inputs so the next responder does not repeat the same gap.\n'
  printf -- '- Do not restart, kill, remount, or rotate anything unless the incident owner approves the action.\n\n'

  printf '## Recommended Next Steps\n\n'
  printf -- '- Confirm the symptom against monitoring and user reports.\n'
  printf -- '- Compare this point-in-time evidence with recent deploys, config changes, and host events.\n'
  printf -- '- Attach this report to the incident ticket before handoff.\n'
  printf -- '- If escalation is needed, include exact hostnames, service names, timestamps, and observed impact.\n'
} > "$report_file"

if [[ -n "$output_file" ]]; then
  # "--" keeps an output path that starts with "-" from being read as an option.
  cp -- "$report_file" "$output_file"
  printf 'OK: wrote L2 incident triage report to %s\n' "$output_file"
else
  cat "$report_file"
fi