From 9fb291f834b9d4049b12af78c1b87aac0e3edc9d Mon Sep 17 00:00:00 2001
From: Mateusz Suski
Date: Tue, 5 May 2026 21:26:02 +0000
Subject: [PATCH] Add initial Linux operations Bash toolkit with network diagnostics

---
 infra-run/scripts/bash/README.md              |  52 +++++
 infra-run/scripts/bash/disk_check.sh          |  65 ++++++
 infra-run/scripts/bash/healthcheck.sh         |  87 ++++++++
 .../scripts/bash/network_troubleshoot.sh      | 177 ++++++++++++++++++
 infra-run/scripts/bash/service_check.sh       |  79 +++++++
 infra-run/scripts/bash/system_report.sh       |  97 +++++++++
 6 files changed, 557 insertions(+)
 create mode 100644 infra-run/scripts/bash/README.md
 create mode 100755 infra-run/scripts/bash/disk_check.sh
 create mode 100755 infra-run/scripts/bash/healthcheck.sh
 create mode 100755 infra-run/scripts/bash/network_troubleshoot.sh
 create mode 100755 infra-run/scripts/bash/service_check.sh
 create mode 100755 infra-run/scripts/bash/system_report.sh

diff --git a/infra-run/scripts/bash/README.md b/infra-run/scripts/bash/README.md
new file mode 100644
index 0000000..1c73f2a
--- /dev/null
+++ b/infra-run/scripts/bash/README.md
@@ -0,0 +1,52 @@
+# Linux Operations Bash Toolkit
+
+Small, practical Bash scripts for Linux operations checks and incident triage. The scripts are sanitized examples inspired by production Linux operations work and avoid destructive actions or root-only assumptions.
+
+## Scripts
+
+- `healthcheck.sh` - general host health overview.
+- `disk_check.sh` - filesystem usage threshold check.
+- `service_check.sh` - critical service status check.
+- `system_report.sh` - writes a timestamped, uniquely named system report to `/tmp`.
+- `network_troubleshoot.sh` - local and optional remote network diagnostics.
+
+## Usage
+
+```bash
+./healthcheck.sh
+./disk_check.sh
+./disk_check.sh 90
+./service_check.sh
+./service_check.sh sshd nginx zabbix-agent
+./system_report.sh
+./network_troubleshoot.sh
+./network_troubleshoot.sh google.com
+```
+
+## Exit Codes
+
+`disk_check.sh`:
+
+- `0` - all filesystems are below the threshold.
+- `1` - one or more filesystems are at or above the threshold.
+- `2` - invalid threshold input.
+- `3` - `df` returned no filesystem data.
+
+`service_check.sh`:
+
+- `0` - all checked services are active.
+- `1` - at least one service is inactive, failed, missing, or cannot be checked.
+
+`network_troubleshoot.sh`:
+
+- `0` - no obvious local, DNS, or connectivity issue detected.
+- `1` - DNS, interface, gateway, or target connectivity problems detected.
+
+`healthcheck.sh` and `system_report.sh` are informational. They print warnings for missing tools where possible.
+
+## Notes
+
+- Requires Bash.
+- Designed for RHEL, Oracle Linux, and Ubuntu style systems.
+- Handles missing tools such as `ss`, `traceroute`, `nc`, and `journalctl` gracefully.
+- Does not require root and does not make system changes.
diff --git a/infra-run/scripts/bash/disk_check.sh b/infra-run/scripts/bash/disk_check.sh
new file mode 100755
index 0000000..fcfa0c1
--- /dev/null
+++ b/infra-run/scripts/bash/disk_check.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+#
+# disk_check.sh - compare filesystem usage against a percentage threshold.
+#
+# Usage: disk_check.sh [THRESHOLD]
+#   THRESHOLD  integer from 1 to 100 (default: 80)
+#
+# Exit codes:
+#   0  all filesystems are below the threshold
+#   1  one or more filesystems are at or above the threshold
+#   2  invalid threshold input
+#   3  df returned no filesystem data
+set -o errexit
+set -o nounset
+set -o pipefail
+
+requested="${1:-80}"
+
+# Report the rejected argument and exit with status 2.
+invalid_threshold() {
+  printf 'CRITICAL: invalid threshold "%s"; provide an integer from 1 to 100\n' "$requested" >&2
+  exit 2
+}
+
+[[ "$requested" =~ ^[0-9]+$ ]] || invalid_threshold
+
+# Force base 10: a zero-padded value such as "08" or "090" would otherwise be
+# parsed as octal by shell arithmetic.
+threshold=$(( 10#$requested ))
+(( threshold >= 1 && threshold <= 100 )) || invalid_threshold
+
+status=0
+rows=0
+warning_threshold=$(( threshold > 5 ? threshold - 5 : threshold ))
+
+# mountpoint is the last field so that read assigns it the remainder of the
+# line, which keeps mount points containing spaces intact.
+while read -r filesystem _ _ avail use_percent mountpoint; do
+  rows=$(( rows + 1 ))
+  usage="${use_percent%\%}"
+
+  # df can print "-" instead of a percentage when usage is not available.
+  if [[ ! "$usage" =~ ^[0-9]+$ ]]; then
+    printf 'WARNING: %s mounted on %s does not report usage\n' "$filesystem" "$mountpoint"
+    continue
+  fi
+
+  if (( usage >= threshold )); then
+    printf 'CRITICAL: %s mounted on %s is %s used; threshold is %s%% (%s free)\n' "$filesystem" "$mountpoint" "$use_percent" "$threshold" "$avail"
+    status=1
+  elif (( usage >= warning_threshold )); then
+    printf 'WARNING: %s mounted on %s is %s used; threshold is %s%%\n' "$filesystem" "$mountpoint" "$use_percent" "$threshold"
+  else
+    printf 'OK: %s mounted on %s is %s used\n' "$filesystem" "$mountpoint" "$use_percent"
+  fi
+done < <(df -P -x tmpfs -x devtmpfs | awk 'NR > 1')
+
+# The process substitution hides df's exit status, so an empty result is the
+# only sign that df failed or listed nothing.
+if (( rows == 0 )); then
+  printf 'CRITICAL: df returned no filesystem data\n' >&2
+  exit 3
+fi
+
+exit "$status"
diff --git a/infra-run/scripts/bash/healthcheck.sh b/infra-run/scripts/bash/healthcheck.sh
new file mode 100755
index 0000000..0da350c
--- /dev/null
+++ b/infra-run/scripts/bash/healthcheck.sh
@@ -0,0 +1,87 @@
+#!/usr/bin/env bash
+#
+# healthcheck.sh - print a read-only overview of host health.
+#
+# Informational only: a missing or failing tool produces a WARNING line
+# instead of stopping the run.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+# Print a section heading.
+section() {
+  printf '\n== %s ==\n' "$1"
+}
+
+# Run a command, or print a WARNING when it is missing or fails.
+#   $1     description used in the failure warning
+#   $2...  command and its arguments
+run_or_warn() {
+  local description="$1"
+  shift
+
+  if command -v "$1" >/dev/null 2>&1; then
+    "$@" || printf 'WARNING: %s command failed\n' "$description"
+  else
+    printf 'WARNING: %s command not available\n' "$1"
+  fi
+}
+
+# Print the ps header plus the top ten processes for a sort key.
+#   $1  sort key passed to ps --sort, e.g. "-%cpu"
+top_processes() {
+  local sort_key="$1"
+
+  if command -v ps >/dev/null 2>&1; then
+    # awk reads ps output to the end. head would exit after 11 lines, and on a
+    # busy host ps could then die from SIGPIPE, which pipefail and errexit
+    # would turn into an aborted health check.
+    ps -eo pid,ppid,comm,%cpu,%mem --sort="$sort_key" | awk 'NR <= 11' \
+      || printf 'WARNING: process listing failed\n'
+  else
+    printf 'WARNING: ps command not available\n'
+  fi
+}
+
+section "Host"
+run_or_warn "hostname" hostname
+run_or_warn "uptime" uptime
+
+section "OS"
+if [[ -r /etc/os-release ]]; then
+  # Source in a subshell so the file cannot overwrite this script's variables.
+  (
+    # shellcheck source=/dev/null
+    . /etc/os-release
+    printf '%s\n' "${PRETTY_NAME:-Unknown Linux}"
+  ) || printf 'WARNING: /etc/os-release could not be parsed\n'
+else
+  printf 'WARNING: /etc/os-release not readable\n'
+fi
+run_or_warn "kernel release" uname -r
+
+section "CPU Load"
+if [[ -r /proc/loadavg ]]; then
+  awk '{print "1m="$1, "5m="$2, "15m="$3}' /proc/loadavg
+else
+  run_or_warn "uptime" uptime
+fi
+
+section "Memory"
+run_or_warn "memory usage" free -h
+
+section "Disk"
+run_or_warn "disk usage" df -h -x tmpfs -x devtmpfs
+
+section "Failed systemd Services"
+if command -v systemctl >/dev/null 2>&1; then
+  systemctl --failed --no-pager || true
+else
+  printf 'WARNING: systemctl command not available\n'
+fi
+
+section "Top CPU Processes"
+top_processes "-%cpu"
+
+section "Top Memory Processes"
+top_processes "-%mem"
diff --git a/infra-run/scripts/bash/network_troubleshoot.sh b/infra-run/scripts/bash/network_troubleshoot.sh
new file mode 100755
index 0000000..91bd277
--- /dev/null
+++ b/infra-run/scripts/bash/network_troubleshoot.sh
@@ -0,0 +1,177 @@
+#!/usr/bin/env bash
+#
+# network_troubleshoot.sh - local and optional remote network diagnostics.
+#
+# Usage: network_troubleshoot.sh [TARGET]
+#   TARGET  optional host name or address for the DNS and connectivity checks
+#
+# Exit codes:
+#   0  no CRITICAL finding was recorded
+#   1  at least one CRITICAL finding was recorded
+set -o errexit
+set -o nounset
+set -o pipefail
+
+target="${1:-}"
+status=0
+warnings=()
+criticals=()
+
+# Print a section heading.
+section() {
+  printf '\n[%s]\n' "$1"
+}
+
+# Record and print a warning; the exit status is not affected.
+warn() {
+  warnings+=("$1")
+  printf 'WARNING: %s\n' "$1"
+}
+
+# Record and print a critical finding and set the exit status to 1.
+critical() {
+  criticals+=("$1")
+  status=1
+  printf 'CRITICAL: %s\n' "$1"
+}
+
+# Succeed when the named command is available.
+have() {
+  command -v "$1" >/dev/null 2>&1
+}
+
+# Run a command when its tool exists; warn when it is missing or fails.
+#   $1     tool name to look up
+#   $2...  command and its arguments
+run_if_available() {
+  local command_name="$1"
+  shift
+
+  if have "$command_name"; then
+    "$@" || warn "$command_name command failed"
+  else
+    warn "$command_name command not available"
+  fi
+}
+
+section "LOCAL NETWORK"
+if have ip; then
+  ip addr || warn "ip addr command failed"
+  printf '\nRouting table:\n'
+  ip route || warn "ip route command failed"
+  printf '\nDefault gateway:\n'
+  # Query once and test the captured text, so the check does not depend on a
+  # second ip call or on a grep -q pipeline under pipefail.
+  if ! default_route="$(ip route show default)"; then
+    critical "default gateway not found"
+  else
+    [[ -z "$default_route" ]] || printf '%s\n' "$default_route"
+    if ! grep -q '^default ' <<<"$default_route"; then
+      critical "default gateway not configured"
+    fi
+  fi
+else
+  warn "ip command not available"
+fi
+
+section "INTERFACES"
+active_interfaces=0
+if have ip; then
+  # Capture the listing once. Counting from a second, unguarded ip call inside
+  # a command substitution would abort the script under errexit and pipefail
+  # when ip does not support -br.
+  if link_states="$(ip -br link)"; then
+    printf '%s\n' "$link_states"
+    active_interfaces="$(awk '$2 == "UP" && $1 != "lo" {count++} END {print count+0}' <<<"$link_states")"
+    if (( active_interfaces == 0 )); then
+      critical "no active non-loopback interface detected"
+    else
+      printf 'OK: %s active non-loopback interface(s) detected\n' "$active_interfaces"
+    fi
+  else
+    warn "interface state query failed"
+  fi
+else
+  warn "cannot inspect interface state without ip command"
+fi
+
+section "DNS"
+if [[ -r /etc/resolv.conf ]]; then
+  cat /etc/resolv.conf
+else
+  warn "/etc/resolv.conf not readable"
+fi
+
+dns_target="${target:-localhost}"
+if have getent; then
+  # Resolve once; an unguarded second lookup could abort the run under errexit.
+  if resolved="$(getent hosts "$dns_target" 2>/dev/null)"; then
+    printf 'OK: DNS resolution succeeded for %s\n' "$dns_target"
+    printf '%s\n' "$resolved"
+  else
+    critical "DNS resolution failed for ${dns_target}"
+  fi
+elif have nslookup; then
+  if nslookup "$dns_target"; then
+    printf 'OK: DNS resolution succeeded for %s\n' "$dns_target"
+  else
+    critical "DNS resolution failed for ${dns_target}"
+  fi
+else
+  warn "no DNS lookup tool available"
+fi
+
+section "CONNECTIVITY"
+if [[ -n "$target" ]]; then
+  if have ping; then
+    if ping -c 3 -W 2 "$target"; then
+      printf 'OK: ping succeeded for %s\n' "$target"
+    else
+      critical "ping failed for ${target}"
+    fi
+  else
+    warn "ping command not available"
+  fi
+
+  run_if_available traceroute traceroute "$target"
+
+  if have nc; then
+    if nc -vz -w 3 "$target" 443; then
+      printf 'OK: TCP 443 reachable on %s\n' "$target"
+    else
+      critical "TCP 443 connectivity failed for ${target}"
+    fi
+  elif have curl; then
+    if curl --head --silent --show-error --connect-timeout 5 "https://${target}" >/dev/null; then
+      printf 'OK: HTTPS connectivity succeeded for %s\n' "$target"
+    else
+      critical "HTTPS connectivity failed for ${target}"
+    fi
+  else
+    warn "no TCP connectivity test tool available (nc or curl)"
+  fi
+else
+  printf 'OK: no target provided; skipped remote connectivity checks\n'
+fi
+
+section "PORTS"
+if have ss; then
+  ss -tuln || warn "ss command failed"
+else
+  warn "ss command not available"
+fi
+
+section "SUMMARY"
+if (( ${#criticals[@]} > 0 )); then
+  printf 'CRITICAL: %s issue(s) detected\n' "${#criticals[@]}"
+fi
+
+if (( ${#warnings[@]} > 0 )); then
+  printf 'WARNING: %s warning(s) detected\n' "${#warnings[@]}"
+fi
+
+if (( status == 0 )); then
+  printf 'OK: no obvious DNS or connectivity problems detected\n'
+fi
+
+exit "$status"
diff --git a/infra-run/scripts/bash/service_check.sh b/infra-run/scripts/bash/service_check.sh
new file mode 100755
index 0000000..e8d5491
--- /dev/null
+++ b/infra-run/scripts/bash/service_check.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+#
+# service_check.sh - check that systemd services are active.
+#
+# Usage: service_check.sh [SERVICE...]
+#   SERVICE  service name, with or without the ".service" suffix. With no
+#            arguments the SSH and cron services are checked.
+#
+# Exit codes:
+#   0  all checked services are active
+#   1  a service is missing or not active, or systemctl is not available
+set -o errexit
+set -o nounset
+set -o pipefail
+
+services=("$@")
+
+# Succeed when systemd lists a unit file for the service.
+#   $1  service name, with or without the ".service" suffix
+service_exists() {
+  local unit="${1%.service}.service"
+
+  # awk compares the name as a literal string and reads all of its input, so
+  # "." is not a regex wildcard and systemctl is never cut off by SIGPIPE.
+  systemctl list-unit-files "$unit" --no-legend 2>/dev/null \
+    | awk -v unit="$unit" '$1 == unit { found = 1 } END { exit !found }'
+}
+
+# Print the name of the cron service: "cron" or "crond".
+pick_default_scheduler() {
+  if service_exists cron; then
+    printf 'cron'
+  elif service_exists crond; then
+    printf 'crond'
+  else
+    printf 'cron'
+  fi
+}
+
+# Print the name of the SSH service: "sshd" or "ssh".
+pick_default_ssh() {
+  if service_exists sshd; then
+    printf 'sshd'
+  elif service_exists ssh; then
+    printf 'ssh'
+  else
+    printf 'sshd'
+  fi
+}
+
+if ! command -v systemctl >/dev/null 2>&1; then
+  printf 'CRITICAL: systemctl command not available; cannot check services\n' >&2
+  exit 1
+fi
+
+if (( ${#services[@]} == 0 )); then
+  services=("$(pick_default_ssh)" "$(pick_default_scheduler)")
+fi
+
+status=0
+
+for service in "${services[@]}"; do
+  if ! service_exists "$service"; then
+    printf 'CRITICAL: %s service not found\n' "$service"
+    status=1
+    continue
+  fi
+
+  # One query supplies both the verdict (exit status) and the state name, so
+  # the reported state is always the one that was tested.
+  if state="$(systemctl is-active "$service" 2>/dev/null)"; then
+    printf 'OK: %s is active\n' "$service"
+  else
+    printf 'CRITICAL: %s is %s\n' "$service" "${state:-unknown}"
+    status=1
+  fi
+done
+
+exit "$status"
diff --git a/infra-run/scripts/bash/system_report.sh b/infra-run/scripts/bash/system_report.sh
new file mode 100755
index 0000000..9f8d77b
--- /dev/null
+++ b/infra-run/scripts/bash/system_report.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+#
+# system_report.sh - write a point-in-time system report to a file in /tmp.
+#
+# Informational only: a missing or failing tool produces a WARNING line in the
+# report instead of stopping the run. The report path is printed at the end.
+set -o errexit
+set -o nounset
+set -o pipefail
+
+host="$(hostname)"
+timestamp="$(date '+%Y-%m-%d_%H%M%S')"
+# mktemp creates the file itself under an unpredictable name, so the redirect
+# below cannot be steered through a file or symlink planted in world-writable
+# /tmp. A suffix after the X run is a GNU mktemp feature.
+report="$(mktemp "/tmp/system_report_${host}_${timestamp}_XXXXXX.txt")"
+
+# Print a section heading.
+section() {
+  printf '\n== %s ==\n' "$1"
+}
+
+# Run a command, or print a WARNING when it is missing or fails.
+#   $1     description used in the failure warning
+#   $2...  command and its arguments
+run_or_warn() {
+  local description="$1"
+  shift
+
+  if command -v "$1" >/dev/null 2>&1; then
+    "$@" || printf 'WARNING: %s command failed\n' "$description"
+  else
+    printf 'WARNING: %s command not available\n' "$1"
+  fi
+}
+
+{
+  section "Host"
+  printf '%s\n' "$host"
+
+  section "Date"
+  date
+
+  section "Uptime"
+  run_or_warn "uptime" uptime
+
+  section "OS"
+  if [[ -r /etc/os-release ]]; then
+    # Source in a subshell so the file cannot overwrite this script's variables.
+    (
+      # shellcheck source=/dev/null
+      . /etc/os-release
+      printf '%s\n' "${PRETTY_NAME:-Unknown Linux}"
+    ) || printf 'WARNING: /etc/os-release could not be parsed\n'
+  else
+    printf 'WARNING: /etc/os-release not readable\n'
+  fi
+
+  section "Kernel"
+  run_or_warn "kernel release" uname -r
+
+  section "CPU Load"
+  if [[ -r /proc/loadavg ]]; then
+    awk '{print "1m="$1, "5m="$2, "15m="$3}' /proc/loadavg
+  else
+    run_or_warn "uptime" uptime
+  fi
+
+  section "Memory"
+  run_or_warn "memory usage" free -h
+
+  section "Disk"
+  run_or_warn "disk usage" df -h -x tmpfs -x devtmpfs
+
+  section "Failed systemd Services"
+  if command -v systemctl >/dev/null 2>&1; then
+    systemctl --failed --no-pager || true
+  else
+    printf 'WARNING: systemctl command not available\n'
+  fi
+
+  section "Listening Ports"
+  if command -v ss >/dev/null 2>&1; then
+    ss -tuln || printf 'WARNING: ss command failed\n'
+  else
+    printf 'WARNING: ss command not available\n'
+  fi
+
+  section "Recent Kernel Messages"
+  if command -v journalctl >/dev/null 2>&1; then
+    journalctl -k -n 50 --no-pager || printf 'WARNING: journalctl kernel log query failed\n'
+  else
+    printf 'WARNING: journalctl command not available\n'
+  fi
+} > "$report"
+
+printf 'System report written to: %s\n' "$report"