""" Disk Usage Data Collector Collects disk usage statistics including directory sizes, file system usage, and largest files information. """ import json import logging import subprocess from typing import Dict, Any, List from pathlib import Path logger = logging.getLogger(__name__) class DiskUsageCollector: """Collector for disk usage statistics.""" def __init__(self): self.max_depth = 3 self.exclude_paths = [ "/proc", "/sys", "/dev", "/run", "/tmp", "/var/log" ] def collect_disk_usage(self, system: str) -> Dict[str, Any]: """Collect disk usage information from target system.""" logger.info(f"Collecting disk usage data from {system}") try: # Collect filesystem usage filesystem_usage = self.collect_filesystem_usage(system) # Collect directory sizes directory_sizes = self.collect_directory_sizes(system) # Collect largest files largest_files = self.collect_largest_files(system) return { "filesystem_usage": filesystem_usage, "directory_sizes": directory_sizes, "largest_files": largest_files, "timestamp": self.get_timestamp(system) } except Exception as e: logger.error(f"Failed to collect disk usage from {system}: {e}") raise def collect_filesystem_usage(self, system: str) -> List[Dict[str, Any]]: """Collect filesystem usage statistics.""" usage_stats = [] try: # Run df command result = subprocess.run( ["ssh", system, "df -h --output=source,fstype,size,used,avail,pcent,target"], capture_output=True, text=True, timeout=30 ) if result.returncode != 0: raise RuntimeError(f"df command failed: {result.stderr}") # Parse output lines = result.stdout.strip().split('\n') if len(lines) < 2: return usage_stats for line in lines[1:]: # Skip header parts = line.split() if len(parts) >= 7: usage_stat = { "filesystem": parts[0], "type": parts[1], "size": parts[2], "used": parts[3], "available": parts[4], "use_percent": parts[5], "mountpoint": parts[6] } usage_stats.append(usage_stat) except subprocess.TimeoutExpired: logger.error(f"Timeout collecting filesystem usage from {system}") raise except Exception as e: logger.error(f"Failed to collect filesystem usage from {system}: {e}") raise return usage_stats def collect_directory_sizes(self, system: str) -> List[Dict[str, Any]]: """Collect sizes of top-level directories.""" directory_sizes = [] try: # Get top-level directories dirs_to_check = ["/", "/home", "/var", "/usr", "/opt", "/etc"] for directory in dirs_to_check: if directory in self.exclude_paths: continue try: # Run du command for directory size result = subprocess.run( ["ssh", system, f"du -sh {directory} 2>/dev/null"], capture_output=True, text=True, timeout=60 ) if result.returncode == 0: size, path = result.stdout.strip().split('\t', 1) directory_sizes.append({ "path": path, "size": size }) except subprocess.TimeoutExpired: logger.warning(f"Timeout getting size for {directory} on {system}") continue except Exception as e: logger.warning(f"Failed to get size for {directory} on {system}: {e}") continue except Exception as e: logger.error(f"Failed to collect directory sizes from {system}: {e}") raise return directory_sizes def collect_largest_files(self, system: str) -> List[Dict[str, Any]]: """Collect information about largest files in the system.""" largest_files = [] try: # Find largest files (excluding certain paths) exclude_expr = " ".join(f"-not -path '{path}/*'" for path in self.exclude_paths) cmd = f"find / {exclude_expr} -type f -exec ls -lh {{}} \\; 2>/dev/null | sort -k5 -hr | head -20" result = subprocess.run( ["ssh", system, cmd], capture_output=True, text=True, timeout=120 ) if result.returncode == 0: for line in result.stdout.strip().split('\n'): if not line.strip(): continue parts = line.split() if len(parts) >= 9: file_info = { "permissions": parts[0], "links": parts[1], "owner": parts[2], "group": parts[3], "size": parts[4], "month": parts[5], "day": parts[6], "time": parts[7], "path": " ".join(parts[8:]) } largest_files.append(file_info) except subprocess.TimeoutExpired: logger.error(f"Timeout collecting largest files from {system}") raise except Exception as e: logger.error(f"Failed to collect largest files from {system}: {e}") raise return largest_files def get_timestamp(self, system: str) -> str: """Get current timestamp from target system.""" try: result = subprocess.run( ["ssh", system, "date -Iseconds"], capture_output=True, text=True, timeout=10 ) if result.returncode == 0: return result.stdout.strip() else: return "unknown" except Exception: return "unknown" def collect(system: str) -> Dict[str, Any]: """Main collection function for disk usage data.""" collector = DiskUsageCollector() return collector.collect_disk_usage(system)