portfolio/infra-run/scripts/bash/incident-checks/incident_triage_report.sh

#!/usr/bin/env bash
# Strict mode: stop on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Option values filled in by the command-line parser below. An empty string
# means the option was not supplied.
incident_type=""   # --type
service_name=""    # --service
host_name=""       # --host
port=""            # --port
target_pid=""      # --pid
match_string=""    # --match
output_file=""     # --output

# Default time window passed to the log-based checks (--since).
since_value="1 hour ago"

# Absolute directory containing this script; check scripts are looked up here.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Print the command-line help text to stdout.
# Takes no arguments. The text is emitted verbatim (quoted here-document, so
# nothing inside it is expanded).
usage() {
  cat <<'HELP_TEXT'
Usage: incident_triage_report.sh --type TYPE [options]

Run selected read-only incident checks and produce a Markdown triage report.

Incident types:
  cpu
  memory
  service
  network
  auth
  cert
  filesystem
  jvm
  all

Options:
  --type TYPE                 Incident type to collect
  --service SERVICE_NAME      systemd service name for service checks
  --host HOSTNAME_OR_FQDN     host for DNS, network, or certificate checks
  --port PORT                 TCP or TLS port for host checks
  --pid PID                   JVM process ID
  --match PROCESS_MATCH       JVM process match string
  --output FILE               write Markdown report to FILE
  --since VALUE               time window for log-based checks
  --help                      show this help

Examples:
  ./incident_triage_report.sh --type cpu
  ./incident_triage_report.sh --type service --service nginx --since "30 minutes ago"
  ./incident_triage_report.sh --type network --host app.example.com --port 443
  ./incident_triage_report.sh --type all --service nginx --host app.example.com --port 443 --output triage.md
HELP_TEXT
}

# Test whether a value is an unsigned decimal integer.
# Arguments: $1 - string to test
# Returns:   0 when $1 is non-empty and contains only the digits 0-9,
#            1 otherwise
is_number() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
    *) return 0 ;;
  esac
}

# Test whether a string names a supported incident type.
# Arguments: $1 - candidate incident type
# Returns:   0 when $1 is exactly one of the supported names, 1 otherwise
valid_type() {
  local supported
  for supported in cpu memory service network auth cert filesystem jvm all; do
    if [[ "$1" == "$supported" ]]; then
      return 0
    fi
  done
  return 1
}

# Parse the command line one option at a time. Every value-taking option
# consumes two words; a missing value or an unrecognised option prints a
# CRITICAL line and exits with status 2. --help / -h prints usage and exits 0.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -h|--help)
      usage
      exit 0
      ;;
    --type)
      if [[ $# -lt 2 ]]; then
        printf 'CRITICAL: --type requires a value\n'
        exit 2
      fi
      incident_type="$2"
      shift 2
      ;;
    --service)
      if [[ $# -lt 2 ]]; then
        printf 'CRITICAL: --service requires a value\n'
        exit 2
      fi
      service_name="$2"
      shift 2
      ;;
    --host)
      if [[ $# -lt 2 ]]; then
        printf 'CRITICAL: --host requires a value\n'
        exit 2
      fi
      host_name="$2"
      shift 2
      ;;
    --port)
      if [[ $# -lt 2 ]]; then
        printf 'CRITICAL: --port requires a value\n'
        exit 2
      fi
      port="$2"
      shift 2
      ;;
    --pid)
      if [[ $# -lt 2 ]]; then
        printf 'CRITICAL: --pid requires a value\n'
        exit 2
      fi
      target_pid="$2"
      shift 2
      ;;
    --match)
      if [[ $# -lt 2 ]]; then
        printf 'CRITICAL: --match requires a value\n'
        exit 2
      fi
      match_string="$2"
      shift 2
      ;;
    --output)
      if [[ $# -lt 2 ]]; then
        printf 'CRITICAL: --output requires a value\n'
        exit 2
      fi
      output_file="$2"
      shift 2
      ;;
    --since)
      if [[ $# -lt 2 ]]; then
        printf 'CRITICAL: --since requires a value\n'
        exit 2
      fi
      since_value="$2"
      shift 2
      ;;
    *)
      printf 'CRITICAL: unknown option: %s\n' "$1"
      usage
      exit 2
      ;;
  esac
done

# Validate the parsed options before any check is run. Each failure prints a
# CRITICAL line on stdout and exits with status 2.

# --type is mandatory and must be one of the supported incident types.
if [[ -z "$incident_type" ]]; then
  printf 'CRITICAL: --type is required\n'
  usage
  exit 2
fi
if ! valid_type "$incident_type"; then
  printf 'CRITICAL: unsupported incident type: %s\n' "$incident_type"
  usage
  exit 2
fi

# --port, when given, must be digits only and fall inside 1-65535.
if [[ -n "$port" ]]; then
  if ! is_number "$port"; then
    printf 'CRITICAL: --port must be numeric\n'
    exit 2
  fi
  # Strip leading zeros so the arithmetic comparison never reads the value as
  # octal; the digit-count guard rejects oversized input before it could wrap
  # around in shell arithmetic. An all-zero port leaves an empty string.
  port_digits="${port#"${port%%[!0]*}"}"
  if [[ -z "$port_digits" ]] || (( ${#port_digits} > 5 || port_digits > 65535 )); then
    printf 'CRITICAL: --port must be between 1 and 65535\n'
    exit 2
  fi
fi

# --pid, when given, must be digits only.
if [[ -n "$target_pid" ]] && ! is_number "$target_pid"; then
  printf 'CRITICAL: --pid must be numeric\n'
  exit 2
fi

# The JVM target is selected by PID or by match string, never both.
if [[ -n "$target_pid" && -n "$match_string" ]]; then
  printf 'CRITICAL: use either --pid or --match for JVM checks, not both\n'
  exit 2
fi

# Scratch directory holding each check's captured output and the assembled
# report. The EXIT trap deletes it when the script exits.
tmp_dir="$(mktemp -d)"
remove_tmp_dir() {
  rm -rf "$tmp_dir"
}
trap remove_tmp_dir EXIT

# The report is built here first, then copied or printed at the end.
report_file="${tmp_dir}/report.md"

# Parallel arrays describing every check, one element per check at the same
# index in each array:
#   check_labels      human-readable check title
#   check_names       check script file name
#   check_commands    rendered command line, or "not run"
#   check_statuses    status word (OK, WARNING, ..., SKIPPED)
#   check_exit_codes  exit status of the check, or "-" when skipped
#   check_summaries   one-line summary
#   check_outputs     path of the file holding the captured output
declare -a check_labels=() check_names=() check_commands=() check_statuses=()
declare -a check_exit_codes=() check_summaries=() check_outputs=()

# Translate a check's exit status into the status word shown in the report.
# Arguments: $1 - exit status as a string
# Outputs:   OK (0), WARNING (1), INVALID (2), CRITICAL (3), or ERROR for
#            anything else, written to stdout without a trailing newline
status_from_exit() {
  local code="$1"
  local word="ERROR"

  if [[ "$code" == "0" ]]; then
    word="OK"
  elif [[ "$code" == "1" ]]; then
    word="WARNING"
  elif [[ "$code" == "2" ]]; then
    word="INVALID"
  elif [[ "$code" == "3" ]]; then
    word="CRITICAL"
  fi

  printf '%s' "$word"
}

# Render a command line as a single shell-quoted string for the report.
# Arguments: the command and its arguments, one word each
# Outputs:   every word quoted with printf %q, joined by single spaces, with
#            no trailing space or newline; nothing when called without words
#
# The words are joined directly rather than emitting a trailing separator and
# stripping it afterwards: stripping trailing whitespace would also remove the
# escaped space of a final word that itself ends in whitespace (for example
# 'b ' quotes to 'b\ ' and would be cut down to 'b\').
render_command() {
  local item
  local quoted
  local rendered=""

  for item in "$@"; do
    printf -v quoted '%q' "$item"
    # Prefix a separator only once something has already been rendered.
    rendered+="${rendered:+ }${quoted}"
  done

  printf '%s' "$rendered"
}

# Record a check that was not run.
# Globals:   tmp_dir (read); check_labels, check_names, check_commands,
#            check_statuses, check_exit_codes, check_summaries,
#            check_outputs (appended)
# Arguments: $1 - check label
#            $2 - check script name
#            $3 - reason the check was skipped
# Outputs:   writes "SKIPPED: <reason>" to this check's output file in tmp_dir
append_skipped_check() {
  local check_label="$1" check_name="$2" skip_reason="$3"
  # The next free slot in the result arrays also numbers the output file.
  local slot="${#check_labels[@]}"
  local evidence_file="${tmp_dir}/check_${slot}.txt"

  printf 'SKIPPED: %s\n' "$skip_reason" > "$evidence_file"

  check_outputs+=("$evidence_file")
  check_summaries+=("$skip_reason")
  check_exit_codes+=("-")
  check_statuses+=("SKIPPED")
  check_commands+=("not run")
  check_names+=("$check_name")
  check_labels+=("$check_label")
}

# Run one check script and record its result.
# Globals:   script_dir, tmp_dir (read); check_labels, check_names,
#            check_commands, check_statuses, check_exit_codes,
#            check_summaries, check_outputs (appended)
# Arguments: $1  - check label shown in the report
#            $2  - check script file name, resolved inside script_dir
#            $3+ - arguments passed through to the check script
# Outputs:   the script's stdout and stderr are captured in a per-check file
#            under tmp_dir
#
# A script that is missing or not executable is recorded as SKIPPED instead
# of being run. The summary is the first line of the captured output, or
# "no output captured" when that line is empty.
run_check() {
  local label="$1"
  local script_name="$2"
  shift 2

  local script_path="${script_dir}/${script_name}"
  local output_path="$tmp_dir/check_${#check_labels[@]}.txt"
  local command_text
  local exit_code=0
  local status
  local summary

  if [[ ! -e "$script_path" ]]; then
    append_skipped_check "$label" "$script_name" "missing script: $script_name"
    return
  fi
  if [[ ! -x "$script_path" ]]; then
    append_skipped_check "$label" "$script_name" "script is not executable: $script_name"
    return
  fi

  # Rendered only for checks that actually run; skipped checks record
  # "not run" instead.
  command_text="$(render_command "$script_path" "$@")"

  # A non-zero status is a result to report, not a reason to abort. Capturing
  # it with "||" keeps errexit from firing without toggling the shell option,
  # so the caller's errexit setting is left exactly as it was.
  "$script_path" "$@" > "$output_path" 2>&1 || exit_code=$?

  status="$(status_from_exit "$exit_code")"
  summary="$(sed -n '1p' "$output_path")"
  if [[ -z "$summary" ]]; then
    summary="no output captured"
  fi

  check_labels+=("$label")
  check_names+=("$script_name")
  check_commands+=("$command_text")
  check_statuses+=("$status")
  check_exit_codes+=("$exit_code")
  check_summaries+=("$summary")
  check_outputs+=("$output_path")
}

# CPU incidents: run the CPU saturation check with no extra arguments.
run_cpu_checks() {
  local label="CPU saturation"
  local script="check_high_cpu.sh"
  run_check "$label" "$script"
}

# Memory incidents: run the memory/OOM check over the configured time window.
# Globals: since_value (read)
run_memory_checks() {
  local -a window=(--since "$since_value")
  run_check "Memory and OOM" "check_high_memory_oom.sh" "${window[@]}"
}

# Service incidents: run the restart-loop check for the requested service, or
# record it as skipped when no --service value was given.
# Globals: service_name, since_value (read)
run_service_checks() {
  local label="Service restart loop"
  local script="check_service_restart_loop.sh"

  if [[ -n "$service_name" ]]; then
    run_check "$label" "$script" --service "$service_name" --since "$since_value"
  else
    append_skipped_check "$label" "$script" "requires --service SERVICE_NAME"
  fi
}

# Network incidents: run the DNS/connectivity check against the requested
# host, adding --port when one was given. Recorded as skipped without --host.
# Globals: host_name, port (read)
run_network_checks() {
  local label="DNS and connectivity"
  local script="check_dns_connectivity.sh"

  if [[ -z "$host_name" ]]; then
    append_skipped_check "$label" "$script" "requires --host HOSTNAME_OR_FQDN"
    return
  fi

  local -a check_args=(--host "$host_name")
  [[ -z "$port" ]] || check_args+=(--port "$port")

  run_check "$label" "$script" "${check_args[@]}"
}

# Auth incidents: run the failed-SSH-login check over the configured time
# window.
# Globals: since_value (read)
run_auth_checks() {
  local -a window=(--since "$since_value")
  run_check "Failed SSH logins" "check_failed_ssh_logins.sh" "${window[@]}"
}

# Certificate incidents: run the certificate-expiry check against the
# requested host, adding --port when one was given. Recorded as skipped
# without --host.
# Globals: host_name, port (read)
run_cert_checks() {
  local label="Certificate expiry"
  local script="check_certificate_expiry.sh"
  local -a check_args

  if [[ -z "$host_name" ]]; then
    append_skipped_check "$label" "$script" "requires --host HOSTNAME_OR_FQDN"
    return
  fi

  if [[ -z "$port" ]]; then
    check_args=(--host "$host_name")
  else
    check_args=(--host "$host_name" --port "$port")
  fi

  run_check "$label" "$script" "${check_args[@]}"
}

# Filesystem incidents: run the read-only filesystem check, then the inode
# usage check. Neither takes extra arguments.
run_filesystem_checks() {
  local readonly_script="check_filesystem_readonly.sh"
  local inode_script="check_inode_usage.sh"

  run_check "Read-only filesystems" "$readonly_script"
  run_check "Inode usage" "$inode_script"
}

# JVM incidents: run the thread/heap check, targeting the JVM by --pid when a
# PID was given, otherwise by --match when a match string was given, and with
# no target argument when neither was supplied.
# Globals: target_pid, match_string (read)
run_jvm_checks() {
  local args=()
  if [[ -n "$target_pid" ]]; then
    args+=(--pid "$target_pid")
  elif [[ -n "$match_string" ]]; then
    args+=(--match "$match_string")
  fi
  # args may be empty here. Under "set -o nounset", bash releases before 4.4
  # treat "${args[@]}" on an empty array as an unbound variable and abort; the
  # ${var+...} guard expands to nothing in that case and to the quoted
  # elements otherwise.
  run_check "JVM threads and heap" "check_jvm_threads_heap.sh" ${args[@]+"${args[@]}"}
}

# Run the check group(s) selected by --type. "all" runs every group in a fixed
# order; any other supported type runs only its own run_<type>_checks group.
if [[ "$incident_type" == "all" ]]; then
  for check_group in cpu memory service network auth cert filesystem jvm; do
    "run_${check_group}_checks"
  done
else
  case "$incident_type" in
    cpu|memory|service|network|auth|cert|filesystem|jvm)
      "run_${incident_type}_checks"
      ;;
  esac
fi

# Report metadata. The hostname and user lookups fall back to "unknown" when
# the command fails.
generated_at="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
local_hostname="$(hostname 2>/dev/null || printf 'unknown')"
current_user="$(id -un 2>/dev/null || printf 'unknown')"

# Assemble the Markdown report into the temporary report file: run
# parameters, a table of executed checks, one-line summaries, the raw output
# of every check, and the fixed handover / escalation / next-step sections.
{
  # Title, run parameters, and the header rows of the checks table. Options
  # that were not supplied are shown as "not provided".
  cat <<REPORT_HEADER
# L2 Incident Triage Report

- Generated: ${generated_at}
- Local hostname: ${local_hostname}
- Current user: ${current_user}
- Incident type: ${incident_type}
- Service: ${service_name:-not provided}
- Host: ${host_name:-not provided}
- Port: ${port:-not provided}
- PID: ${target_pid:-not provided}
- Process match: ${match_string:-not provided}
- Since: ${since_value}

## Executed Checks

| Check | Script | Status | Exit | Command |
| --- | --- | --- | --- | --- |
REPORT_HEADER

  # One table row per recorded check.
  for index in "${!check_labels[@]}"; do
    printf '| %s | `%s` | %s | %s | `%s` |\n' \
      "${check_labels[$index]}" \
      "${check_names[$index]}" \
      "${check_statuses[$index]}" \
      "${check_exit_codes[$index]}" \
      "${check_commands[$index]}"
  done

  printf '\n## Summary\n\n'
  for index in "${!check_labels[@]}"; do
    printf -- '- %s: %s\n' "${check_labels[$index]}" "${check_summaries[$index]}"
  done

  printf '\n## Raw Evidence\n\n'
  for index in "${!check_labels[@]}"; do
    # Per-check heading and details, then the captured output in a fenced
    # text block.
    printf '### %s\n\nScript: `%s`\n\nCommand: `%s`\n\nStatus: %s, exit: %s\n\n```text\n' \
      "${check_labels[$index]}" \
      "${check_names[$index]}" \
      "${check_commands[$index]}" \
      "${check_statuses[$index]}" \
      "${check_exit_codes[$index]}"
    cat "${check_outputs[$index]}"
    printf '\n```\n\n'
  done

  # Fixed guidance sections, emitted verbatim.
  cat <<'REPORT_FOOTER'
## L2 Handover Checklist

- [ ] Business impact confirmed
- [ ] Affected host/service identified
- [ ] Monitoring alert attached
- [ ] Recent changes checked
- [ ] Logs attached
- [ ] Service owner identified
- [ ] Escalation target identified

## Escalation Notes

- Escalate when impact is active, spreading, customer-facing, or outside L2 access.
- Include the alert, timeline, commands run, and the raw evidence above.
- Call out skipped checks and missing inputs so the next responder does not repeat the same gap.
- Do not restart, kill, remount, or rotate anything unless the incident owner approves the action.

## Recommended Next Steps

- Confirm the symptom against monitoring and user reports.
- Compare this point-in-time evidence with recent deploys, config changes, and host events.
- Attach this report to the incident ticket before handoff.
- If escalation is needed, include exact hostnames, service names, timestamps, and observed impact.
REPORT_FOOTER
} > "$report_file"

# Deliver the report: copy it to the --output path when one was given and
# confirm with an OK line, otherwise print the report itself to stdout.
if [[ -n "$output_file" ]]; then
  # "--" ends option parsing, so an output name that begins with "-" is
  # treated as a file operand rather than as an option to cp.
  cp -- "$report_file" "$output_file"
  printf 'OK: wrote L2 incident triage report to %s\n' "$output_file"
else
  cat "$report_file"
fi