Add AI lab maintenance toolkit
lint / shell-yaml-ansible (push) Failing after 17s

This commit is contained in:
Mateusz Suski
2026-06-06 00:10:44 +00:00
parent 1843796e92
commit 8cb92de06f
21 changed files with 1031 additions and 0 deletions
+66
View File
@@ -0,0 +1,66 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
LOG_FILE="/var/log/ailab-apt-cleanup.log"
execute=false
non_interactive=false
usage() {
printf 'Usage: %s [--execute [--non-interactive]]\n' "$(basename "$0")"
}
while (($# > 0)); do
case "$1" in
--execute) execute=true ;;
--non-interactive) non_interactive=true ;;
-h|--help) usage; exit 0 ;;
*) printf 'CRITICAL: unknown argument: %s\n' "$1" >&2; usage >&2; exit 2 ;;
esac
shift
done
if [[ "$non_interactive" == true && "$execute" != true ]]; then
printf 'CRITICAL: --non-interactive requires --execute\n' >&2
exit 2
fi
if ((EUID != 0)); then
printf 'CRITICAL: this script must run as root\n' >&2
exit 2
fi
if ! command -v apt >/dev/null 2>&1; then
printf 'CRITICAL: apt is required\n' >&2
exit 2
fi
exec > >(tee -a "$LOG_FILE") 2>&1
printf '\n[%s] APT cleanup\n' "$(date --iso-8601=seconds)"
if [[ "$execute" != true ]]; then
printf 'INFO: dry-run mode; apt update, autoremove, autoclean, and needrestart are not executed\n'
printf 'INFO: simulated autoremove follows\n'
LC_ALL=C apt -s autoremove --purge
printf 'INFO: rerun with --execute and confirm to apply changes\n'
exit 0
fi
if [[ "$non_interactive" != true ]]; then
printf 'WARNING: this will update APT metadata and remove packages marked as automatically installed and unused.\n'
printf 'Type EXECUTE to continue: '
read -r confirmation
if [[ "$confirmation" != "EXECUTE" ]]; then
printf 'CRITICAL: confirmation failed; no changes made\n'
exit 2
fi
fi
apt update
apt autoremove --purge -y
apt autoclean -y
if command -v needrestart >/dev/null 2>&1; then
needrestart -b || true
else
printf 'WARNING: needrestart is not installed\n'
fi
printf 'OK: APT cleanup completed\n'
@@ -0,0 +1,90 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
LOG_FILE="/var/log/ailab-config-backup.log"
BACKUP_DIR="/srv/backups/ailab-config"
RETENTION_DAYS=30
execute=false
non_interactive=false
usage() {
printf 'Usage: %s [--execute [--non-interactive]]\n' "$(basename "$0")"
}
while (($# > 0)); do
case "$1" in
--execute) execute=true ;;
--non-interactive) non_interactive=true ;;
-h|--help) usage; exit 0 ;;
*) printf 'CRITICAL: unknown argument: %s\n' "$1" >&2; usage >&2; exit 2 ;;
esac
shift
done
if [[ "$non_interactive" == true && "$execute" != true ]]; then
printf 'CRITICAL: --non-interactive requires --execute\n' >&2
exit 2
fi
if ((EUID != 0)); then
printf 'CRITICAL: this script must run as root\n' >&2
exit 2
fi
for command_name in tar gzip find; do
if ! command -v "$command_name" >/dev/null 2>&1; then
printf 'CRITICAL: required command is missing: %s\n' "$command_name" >&2
exit 2
fi
done
exec > >(tee -a "$LOG_FILE") 2>&1
timestamp="$(date '+%Y%m%d-%H%M%S')"
archive="$BACKUP_DIR/ailab-config-$timestamp.tar.gz"
candidate_paths=(
/etc
/root/.bashrc
/root/.bashrc.d
/opt/ailab-maintenance
/var/lib/libvirt/qemu
)
source_paths=()
printf '\n[%s] Configuration backup\n' "$(date --iso-8601=seconds)"
for path in "${candidate_paths[@]}"; do
if [[ -e "$path" ]]; then
source_paths+=("${path#/}")
printf 'OK: include %s\n' "$path"
else
printf 'INFO: optional path is absent: %s\n' "$path"
fi
done
if ((${#source_paths[@]} == 0)); then
printf 'CRITICAL: no backup source paths are present\n'
exit 1
fi
printf 'Backup destination: %s\n' "$archive"
printf 'Retention: matching archives older than %d days\n' "$RETENTION_DAYS"
printf 'Configuration beneath /etc includes libvirt, Docker, and systemd when present\n'
printf 'Excluded by policy: Docker data, application data, model data, and VM disk images\n'
if [[ "$execute" != true ]]; then
printf 'INFO: dry-run mode; no archive or directory was created and no retention deletion ran\n'
exit 0
fi
if [[ "$non_interactive" != true ]]; then
printf 'Type EXECUTE to create the archive and apply retention: '
read -r confirmation
if [[ "$confirmation" != "EXECUTE" ]]; then
printf 'CRITICAL: confirmation failed; no changes made\n'
exit 2
fi
fi
install -d -m 0750 "$BACKUP_DIR"
tar --create --gzip --file "$archive" --ignore-failed-read --directory / -- "${source_paths[@]}"
find "$BACKUP_DIR" -maxdepth 1 -type f -name 'ailab-config-*.tar.gz' -mtime "+$RETENTION_DAYS" -print -delete
printf 'OK: configuration backup created: %s\n' "$archive"
+38
View File
@@ -0,0 +1,38 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
LOG_FILE="/var/log/ailab-disk-watch.log"
threshold="${AILAB_DISK_THRESHOLD:-85}"
if ((EUID != 0)); then
printf 'CRITICAL: this script must run as root to write %s\n' "$LOG_FILE" >&2
exit 2
fi
if [[ ! "$threshold" =~ ^[0-9]+$ ]] || ((threshold < 1 || threshold > 100)); then
printf 'CRITICAL: AILAB_DISK_THRESHOLD must be an integer from 1 to 100\n' >&2
exit 2
fi
exec > >(tee -a "$LOG_FILE") 2>&1
printf '\n[%s] Disk usage check; threshold=%s%%\n' "$(date --iso-8601=seconds)" "$threshold"
status=0
while read -r filesystem _blocks _used available use_percent mountpoint; do
usage="${use_percent%\%}"
if [[ ! "$usage" =~ ^[0-9]+$ ]]; then
printf 'WARNING: unable to parse usage for %s mounted on %s\n' "$filesystem" "$mountpoint"
status=1
elif ((usage >= threshold)); then
printf 'WARNING: %s mounted on %s is %s used; threshold=%s%%; available=%s KB\n' \
"$filesystem" "$mountpoint" "$use_percent" "$threshold" "$available"
status=1
else
printf 'OK: %s mounted on %s is %s used\n' "$filesystem" "$mountpoint" "$use_percent"
fi
done < <(df -P -x tmpfs -x devtmpfs | awk 'NR > 1 {print $1, $2, $3, $4, $5, $6}')
exit "$status"
@@ -0,0 +1,70 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
LOG_FILE="/var/log/ailab-docker-cleanup.log"
execute=false
non_interactive=false
usage() {
printf 'Usage: %s [--execute [--non-interactive]]\n' "$(basename "$0")"
}
while (($# > 0)); do
case "$1" in
--execute) execute=true ;;
--non-interactive) non_interactive=true ;;
-h|--help) usage; exit 0 ;;
*) printf 'CRITICAL: unknown argument: %s\n' "$1" >&2; usage >&2; exit 2 ;;
esac
shift
done
if [[ "$non_interactive" == true && "$execute" != true ]]; then
printf 'CRITICAL: --non-interactive requires --execute\n' >&2
exit 2
fi
if ((EUID != 0)); then
printf 'CRITICAL: this script must run as root\n' >&2
exit 2
fi
exec > >(tee -a "$LOG_FILE") 2>&1
printf '\n[%s] Docker cleanup\n' "$(date --iso-8601=seconds)"
if ! command -v docker >/dev/null 2>&1; then
printf 'INFO: Docker is not installed; nothing to do\n'
exit 0
fi
if command -v systemctl >/dev/null 2>&1 && ! systemctl is-active --quiet docker; then
printf 'INFO: docker.service is inactive; nothing to do\n'
exit 0
fi
printf '\nDocker disk usage before cleanup:\n'
docker system df
if [[ "$execute" != true ]]; then
printf 'INFO: dry-run mode; would run docker system prune -af\n'
printf 'INFO: dry-run mode; would run docker builder prune -af --filter until=168h\n'
printf 'INFO: Docker volumes are never included in this cleanup\n'
exit 0
fi
if [[ "$non_interactive" != true ]]; then
printf 'WARNING: this removes unused containers, networks, images, and old build cache, but not volumes.\n'
printf 'Type EXECUTE to continue: '
read -r confirmation
if [[ "$confirmation" != "EXECUTE" ]]; then
printf 'CRITICAL: confirmation failed; no changes made\n'
exit 2
fi
fi
docker system prune -af
docker builder prune -af --filter "until=168h"
printf '\nDocker disk usage after cleanup:\n'
docker system df
printf 'OK: Docker cleanup completed; volumes were not pruned\n'
+111
View File
@@ -0,0 +1,111 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
section() {
printf '\n== %s ==\n' "$1"
}
run_optional() {
local description="$1"
shift
if "$@"; then
return 0
fi
printf 'WARNING: %s failed\n' "$description"
return 0
}
section "Host identity"
if command -v hostnamectl >/dev/null 2>&1; then
run_optional "hostnamectl" hostnamectl
else
run_optional "hostname" hostname
fi
run_optional "kernel information" uname -a
run_optional "uptime" uptime
section "Memory"
if command -v free >/dev/null 2>&1; then
run_optional "memory report" free -h
else
printf 'WARNING: free is not available\n'
fi
section "Filesystems"
if command -v df >/dev/null 2>&1; then
run_optional "filesystem report" df -hT
printf '\nKey mountpoints present:\n'
for mountpoint in / /boot /var /srv /opt /home; do
if findmnt -rn --target "$mountpoint" >/dev/null 2>&1; then
run_optional "filesystem report for $mountpoint" df -hT "$mountpoint"
fi
done
else
printf 'WARNING: df is not available\n'
fi
section "Journal usage"
if command -v journalctl >/dev/null 2>&1; then
run_optional "journal disk usage" journalctl --disk-usage
else
printf 'WARNING: journalctl is not available\n'
fi
section "Docker"
if command -v docker >/dev/null 2>&1; then
if command -v systemctl >/dev/null 2>&1; then
run_optional "Docker service state" systemctl is-active docker
fi
run_optional "Docker container list" docker ps --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}\t{{.Ports}}'
run_optional "Docker disk usage" docker system df
else
printf 'INFO: Docker is not installed\n'
fi
section "Libvirt"
if command -v virsh >/dev/null 2>&1; then
if command -v systemctl >/dev/null 2>&1; then
run_optional "libvirtd service state" systemctl is-active libvirtd
fi
run_optional "libvirt guest list" virsh list --all
else
printf 'INFO: virsh is not installed\n'
fi
section "NVIDIA"
if command -v nvidia-smi >/dev/null 2>&1; then
run_optional "NVIDIA status" nvidia-smi
else
printf 'INFO: nvidia-smi is not installed\n'
fi
section "Failed systemd units"
if command -v systemctl >/dev/null 2>&1; then
run_optional "failed systemd unit report" systemctl --failed --no-pager
else
printf 'WARNING: systemctl is not available\n'
fi
section "SMART quick health"
if command -v smartctl >/dev/null 2>&1; then
shopt -s nullglob
devices=(/dev/sd? /dev/nvme?n?)
shopt -u nullglob
if ((${#devices[@]} == 0)); then
printf 'INFO: no matching SATA/SCSI or NVMe devices found\n'
else
for device in "${devices[@]}"; do
printf '\n-- %s --\n' "$device"
run_optional "SMART health check for $device" smartctl -H "$device"
done
fi
else
printf 'INFO: smartctl is not installed\n'
fi
exit 0
@@ -0,0 +1,117 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
# APT autoremove respects package dependencies and kernel protection rules. That
# is safer than name-based purging on HWE hosts using NVIDIA, DKMS, or VFIO.
LOG_FILE="/var/log/ailab-kernel-cleanup.log"
execute=false
non_interactive=false
usage() {
printf 'Usage: %s [--execute [--non-interactive]]\n' "$(basename "$0")"
}
kernel_packages() {
dpkg-query -W -f='${db:Status-Abbrev} ${binary:Package}\n' \
'linux-image*' 'linux-headers*' 'linux-modules*' 2>/dev/null \
| awk '$1 ~ /^ii/ {print $2}' \
| sort -u || true
}
versioned_kernel_images() {
dpkg-query -W -f='${db:Status-Abbrev} ${binary:Package}\n' 'linux-image-[0-9]*' 2>/dev/null \
| awk '$1 ~ /^ii/ {sub(/:.*/, "", $2); print $2}' \
| sort -u || true
}
while (($# > 0)); do
case "$1" in
--execute) execute=true ;;
--non-interactive) non_interactive=true ;;
-h|--help) usage; exit 0 ;;
*) printf 'CRITICAL: unknown argument: %s\n' "$1" >&2; usage >&2; exit 2 ;;
esac
shift
done
if [[ "$non_interactive" == true && "$execute" != true ]]; then
printf 'CRITICAL: --non-interactive requires --execute\n' >&2
exit 2
fi
if ((EUID != 0)); then
printf 'CRITICAL: this script must run as root\n' >&2
exit 2
fi
for command_name in apt dpkg-query uname; do
if ! command -v "$command_name" >/dev/null 2>&1; then
printf 'CRITICAL: required command is missing: %s\n' "$command_name" >&2
exit 2
fi
done
exec > >(tee -a "$LOG_FILE") 2>&1
printf '\n[%s] Kernel cleanup\n' "$(date --iso-8601=seconds)"
printf 'Running kernel: %s\n' "$(uname -r)"
printf '\nInstalled kernel-related packages before cleanup:\n'
kernel_packages
simulation="$(LC_ALL=C apt -s autoremove --purge)"
printf '\nAPT autoremove simulation:\n%s\n' "$simulation"
mapfile -t installed_images < <(versioned_kernel_images)
mapfile -t removed_images < <(
awk '$1 == "Remv" && $2 ~ /^linux-image-[0-9]/ {sub(/:.*/, "", $2); print $2}' <<<"$simulation" | sort -u
)
remaining_images=0
for image in "${installed_images[@]}"; do
remove_image=false
for removed in "${removed_images[@]}"; do
if [[ "$image" == "$removed" ]]; then
remove_image=true
break
fi
done
if [[ "$remove_image" != true ]]; then
remaining_images=$((remaining_images + 1))
fi
done
printf 'Kernel image safety check: installed=%d simulated-removals=%d remaining=%d\n' \
"${#installed_images[@]}" "${#removed_images[@]}" "$remaining_images"
if ((${#installed_images[@]} < 2 || remaining_images < 2)); then
printf 'CRITICAL: cleanup would not leave at least two versioned kernel images; refusing execution\n'
exit 1
fi
if [[ "$execute" != true ]]; then
printf 'INFO: dry-run mode; no packages were removed\n'
printf 'INFO: rerun with --execute and confirm to apply the simulated cleanup\n'
exit 0
fi
if [[ "$non_interactive" != true ]]; then
printf 'WARNING: APT will remove the packages shown in the simulation above.\n'
printf 'Type EXECUTE to continue: '
read -r confirmation
if [[ "$confirmation" != "EXECUTE" ]]; then
printf 'CRITICAL: confirmation failed; no changes made\n'
exit 2
fi
fi
apt autoremove --purge -y
apt autoclean -y
if command -v update-grub >/dev/null 2>&1; then
update-grub || true
else
printf 'WARNING: update-grub is not installed\n'
fi
printf '\nInstalled kernel-related packages after cleanup:\n'
kernel_packages
printf 'OK: kernel cleanup completed with APT-managed package selection\n'
+42
View File
@@ -0,0 +1,42 @@
#!/usr/bin/env bash
set -o errexit
set -o nounset
set -o pipefail
section() {
printf '\n== %s ==\n' "$1"
}
if ! command -v virsh >/dev/null 2>&1; then
printf 'INFO: virsh is not installed; VM audit skipped\n'
exit 0
fi
section "Virtual machines"
virsh list --all || printf 'WARNING: unable to list virtual machines\n'
section "Storage pools"
virsh pool-list --all || printf 'WARNING: unable to list storage pools\n'
mapfile -t pools < <(virsh pool-list --all --name 2>/dev/null | sed '/^[[:space:]]*$/d' || true)
for pool in "${pools[@]}"; do
section "Volumes in pool: $pool"
virsh vol-list "$pool" || printf 'WARNING: unable to list volumes in pool %s\n' "$pool"
done
section "Possible VM disk and installation images"
search_roots=()
for path in /var/lib/libvirt /srv /opt; do
[[ -d "$path" ]] && search_roots+=("$path")
done
if ((${#search_roots[@]} == 0)); then
printf 'INFO: no configured search roots are present\n'
else
find "${search_roots[@]}" -xdev -type f \
\( -iname '*.qcow2' -o -iname '*.raw' -o -iname '*.iso' \) \
-printf '%12s bytes %p\n' 2>/dev/null \
| sort -nr || true
fi
printf '\nINFO: audit complete; no files or libvirt resources were modified\n'