Add more Healthchecks to BigBearCasaOS Healthcheck (#18)

* 🔧 feat(healthcheck): Add process resource check

Adds a new check to the healthcheck script that
displays the top 5 CPU and memory consuming
processes, as well as checks for any zombie
processes. This provides more detailed
information about the system's resource
utilization.

* feat: Implement comprehensive system health checks

This commit introduces a comprehensive system health check module to the CasaOS
healthcheck script. The new checks include:

- Network interface status and error/drop monitoring
- Network latency testing to common targets
- Filesystem health checks, including inode usage and mount point write access
- Time synchronization status, including NTP sync
- Log rotation configuration and large log file detection

These additional checks provide a more thorough assessment of the overall system
health, helping to identify potential issues that could impact the CasaOS
application. The goal is to proactively detect and report on common system
problems, allowing users to address them before they become critical.

* feat(casaos-healthcheck): Update healthcheck version to 3.4

The changes update the version of the BigBearCasaOS Healthcheck from 3.3 to 3.4. This
update is necessary to reflect the latest improvements and bug fixes in the
healthcheck script.
This commit is contained in:
Christopher
2024-11-15 13:17:24 -06:00
committed by GitHub
parent 33f8153a27
commit 43e996bfda

View File

@@ -381,6 +381,325 @@ check_dmesg_errors() {
fi
}
#######################################
# Show the top CPU / memory consumers and report zombie processes.
# Globals:   CHECK_MARK, CROSS_MARK (read)
# Arguments: none
# Outputs:   two "command  percent" tables (5 rows each) and one zombie
#            status line, all on stdout
#######################################
check_process_resources() {
  print_header "Process Resource Check"

  echo "Top 5 CPU consuming processes:"
  # NR == 1 is the ps header; rows 2-6 are the five heaviest processes.
  ps aux --sort=-%cpu \
    | awk 'NR > 1 && NR <= 6 {printf "%-20s %5s%%\n", $11, $3}'

  echo -e "\nTop 5 Memory consuming processes:"
  ps aux --sort=-%mem \
    | awk 'NR > 1 && NR <= 6 {printf "%-20s %5s%%\n", $11, $4}'

  # Count only processes whose STAT column starts with Z. Grepping the full
  # `ps aux` listing for the word "Z" also matches command lines (including
  # the grep itself), which produced false zombie reports.
  local zombie_count
  zombie_count=$(ps -eo stat= | grep -c '^Z' || true)
  zombie_count=${zombie_count:-0}

  if [ "$zombie_count" -gt 0 ]; then
    print_color "0;31" "${CROSS_MARK} Found $zombie_count zombie processes"
  else
    print_color "0;32" "${CHECK_MARK} No zombie processes found"
  fi
}
#######################################
# Report link state, speed and error/drop counters for every non-loopback
# network interface.
# Globals:   CHECK_MARK, CROSS_MARK, WARNING_MARK (read)
# Arguments: none
# Outputs:   one status line per interface, plus an optional speed line and
#            an optional warning when error/drop counters are non-zero
#######################################
check_network_interfaces() {
  print_header "Network Interface Check"

  local interface info state speed errors drops

  while IFS= read -r interface; do
    # Skip loopback
    if [[ -z "$interface" || "$interface" == "lo" ]]; then
      continue
    fi

    # One `ip -s` call provides both the link state and the counters.
    info=$(ip -s link show dev "$interface" 2>/dev/null)

    # The first line contains "... state <STATE> ...".
    state=$(awk 'NR == 1 {
        for (i = 1; i < NF; i++) if ($i == "state") { print $(i + 1); exit }
      }' <<<"$info")

    # Counter values sit on the line AFTER each "RX:" / "TX:" header, in the
    # order: bytes packets errors dropped ... Matching /errors/ directly
    # only hits the header row itself, and the header says "dropped", not
    # "drops". RX and TX are summed.
    errors=0
    drops=0
    read -r errors drops < <(awk '
      /^ *(RX|TX):/ { if ((getline) > 0) { err += $3; drop += $4 } }
      END { print err + 0, drop + 0 }' <<<"$info") || true

    # ethtool is optional; virtual interfaces usually report no speed.
    speed=$(ethtool "$interface" 2>/dev/null | awk '/Speed:/ {print $2}')

    if [[ "$state" == "UP" ]]; then
      print_color "0;32" "${CHECK_MARK} Interface $interface is UP"
      if [[ -n "$speed" ]]; then
        echo "Speed: $speed"
      fi
    else
      print_color "0;31" "${CROSS_MARK} Interface $interface is DOWN"
    fi

    if [[ "$errors" != "0" || "$drops" != "0" ]]; then
      print_color "0;33" "${WARNING_MARK} $interface has $errors errors and $drops drops"
    fi
  # `ip -o link` prints veth/vlan names as "name@parent"; only "name" is a
  # valid device argument for the follow-up calls, so strip the suffix.
  done < <(ip -o link show | awk -F': ' '{ sub(/@.*/, "", $2); print $2 }')
}
#######################################
# Ping a fixed set of well-known targets and classify the average
# round-trip time.
# Globals:   CHECK_MARK, CROSS_MARK (read)
# Arguments: none
# Outputs:   one line per target: good latency, high latency (> 100 ms),
#            or no response
#######################################
check_network_latency() {
  print_header "Network Latency Check"

  local -r threshold_ms=100
  local targets=("8.8.8.8" "1.1.1.1" "google.com")
  local target ping_result

  for target in "${targets[@]}"; do
    # Select the "min/avg/max" summary line explicitly instead of assuming
    # it is the last line: when every packet is lost there is no such line,
    # and the packet-loss summary would be misparsed as a latency of 0.
    # -W 2 bounds the wait per reply so dead targets do not stall the run.
    ping_result=$(ping -c 3 -W 2 "$target" 2>/dev/null | awk '
      /min\/avg\/max/ {
        n = split($0, halves, "=")
        split(halves[n], rtt, "/")
        gsub(/ /, "", rtt[2])
        print rtt[2]
        exit
      }')

    if [[ ! "$ping_result" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
      print_color "0;31" "${CROSS_MARK} No response from $target"
      continue
    fi

    # Values are passed with -v rather than spliced into the awk program.
    if awk -v v="$ping_result" -v t="$threshold_ms" 'BEGIN { exit !(v > t) }'; then
      print_color "0;31" "${CROSS_MARK} High latency to $target: ${ping_result}ms"
    else
      print_color "0;32" "${CHECK_MARK} Good latency to $target: ${ping_result}ms"
    fi
  done
}
#######################################
# Check inode usage of every filesystem and write access on ext4, xfs,
# btrfs and zfs mount points.
# Globals:   CHECK_MARK, CROSS_MARK (read)
# Arguments: none
# Outputs:   one line per filesystem with numeric inode usage (> 80% is
#            flagged) and one line per probed mount point
#######################################
check_filesystem_health() {
  print_header "File System Health Check"

  # Check inode usage
  local fs inode_usage
  while read -r fs _ _ _ inode_usage _; do
    inode_usage=${inode_usage%\%}
    # Skip if inode usage is not a number (e.g. "-" on pseudo filesystems)
    [[ "$inode_usage" =~ ^[0-9]+$ ]] || continue

    if [ "$inode_usage" -gt 80 ]; then
      print_color "0;31" "${CROSS_MARK} High inode usage on $fs: $inode_usage%"
    else
      print_color "0;32" "${CHECK_MARK} Inode usage on $fs: $inode_usage%"
    fi
  done < <(df -i | tail -n +2)

  # Check mount points
  # `mount` prints "<device> on <mount point> type <fstype> (<options>)".
  # The fstype field is compared exactly, so a device or path that merely
  # contains e.g. "zfs" is not probed, and mount points with spaces survive.
  local -r mount_re='^.+ on (.+) type ([^[:space:]]+)'
  local line mount_point fstype probe
  while IFS= read -r line; do
    [[ "$line" =~ $mount_re ]] || continue
    mount_point=${BASH_REMATCH[1]}
    fstype=${BASH_REMATCH[2]}

    case "$fstype" in
      ext4 | xfs | btrfs | zfs) ;;
      *) continue ;;
    esac

    # Probe with a uniquely named temp file: a fixed name such as
    # ".test_write" would be deleted afterwards even if it already existed
    # and belonged to the user.
    if probe=$(mktemp "${mount_point%/}/.healthcheck_write.XXXXXX" 2>/dev/null); then
      rm -f -- "$probe"
      print_color "0;32" "${CHECK_MARK} Mount point $mount_point is writable"
    else
      print_color "0;31" "${CROSS_MARK} Mount point $mount_point is not writable"
    fi
  done < <(mount)
}
#######################################
# Report whether the system clock is synchronized.
# Uses timedatectl when available, otherwise falls back to listing ntpq
# peers.
# Globals:   CHECK_MARK, CROSS_MARK, WARNING_MARK (read)
# Arguments: none
# Outputs:   sync verdict and the raw timedatectl status, or ntpq peers,
#            or a warning when neither tool exists
#######################################
check_time_sync() {
  print_header "Time Synchronization Check"

  if command -v timedatectl &>/dev/null; then
    local time_status
    time_status=$(timedatectl status --no-pager 2>/dev/null)

    # Older systemd labels the field "NTP synchronized", newer releases
    # "System clock synchronized". Accept both, otherwise current systems
    # are always reported as unsynchronized.
    if grep -Eiq '(NTP|System clock) synchronized:[[:space:]]*yes' <<<"$time_status"; then
      print_color "0;32" "${CHECK_MARK} NTP is synchronized"
    else
      print_color "0;31" "${CROSS_MARK} NTP is not synchronized"
    fi

    echo "System time status:"
    echo "$time_status"
  elif command -v ntpq &>/dev/null; then
    local ntp_peers
    ntp_peers=$(ntpq -p)
    echo "NTP peers status:"
    echo "$ntp_peers"
  else
    print_color "0;33" "${WARNING_MARK} No time synchronization service found"
  fi
}
#######################################
# Flag oversized log files and check that a CasaOS logrotate config exists.
# Globals:   CHECK_MARK, CROSS_MARK, WARNING_MARK (read)
# Arguments: [dir...] - directories to scan; defaults to /var/log and
#            /var/log/casaos when none are given
# Outputs:   one line per *.log / *.gz regular file larger than 100MB
#            (each file at most once), then the logrotate config verdict
#######################################
check_log_rotation() {
  print_header "Log Rotation Check"

  local -a log_dirs=("$@")
  if [ "${#log_dirs[@]}" -eq 0 ]; then
    log_dirs=("/var/log" "/var/log/casaos")
  fi
  local -r max_log_size=$((100 * 1024 * 1024)) # 100MB

  # Scanned directories may be nested (/var/log/casaos lives under
  # /var/log); remember reported files so each one is listed only once.
  local -A reported=()
  local dir log size pretty

  for dir in "${log_dirs[@]}"; do
    [ -d "$dir" ] || continue
    echo "Checking logs in $dir:"

    # The parentheses make -type f apply to both name patterns. find does
    # the size filtering itself, and NUL-delimited output keeps unusual
    # file names intact.
    while IFS= read -r -d '' log; do
      if [[ -n "${reported["$log"]:-}" ]]; then
        continue
      fi
      reported["$log"]=1

      # GNU/busybox stat first; BSD syntax kept as a fallback.
      size=$(stat -c%s -- "$log" 2>/dev/null || stat -f%z -- "$log" 2>/dev/null) || continue
      pretty=$(numfmt --to=iec-i --suffix=B "$size" 2>/dev/null) || pretty="${size}B"
      print_color "0;31" "${CROSS_MARK} Large log file: $log ($pretty)"
    done < <(find "$dir" -type f \( -name "*.log" -o -name "*.gz" \) \
      -size "+${max_log_size}c" -print0 2>/dev/null)
  done

  if [ -f "/etc/logrotate.d/casaos" ]; then
    print_color "0;32" "${CHECK_MARK} CasaOS log rotation configured"
  else
    print_color "0;33" "${WARNING_MARK} No CasaOS log rotation configuration found"
  fi
}
#######################################
# Basic security audit: sshd root/password login settings, failed SSH
# password attempts, and open network ports.
# Globals:   CHECK_MARK, CROSS_MARK, WARNING_MARK (read)
# Arguments: none
# Outputs:   sshd verdicts (when /etc/ssh/sshd_config exists), an optional
#            failed-login warning, and the socket listing
#######################################
check_security_audit() {
  print_header "Security Audit Check"

  # Check SSH configuration
  local -r sshd_config="/etc/ssh/sshd_config"
  if [ -f "$sshd_config" ]; then
    local root_login password_auth

    # Read the option VALUE (second field) of the first matching keyword.
    # Matching "no" anywhere in the raw line breaks on trailing comments,
    # and sshd keywords are case-insensitive.
    # NOTE(review): drop-in files (sshd_config.d) and Match blocks are not
    # evaluated here - confirm whether that matters for this check.
    root_login=$(awk 'tolower($1) == "permitrootlogin" { print tolower($2); exit }' \
      "$sshd_config" 2>/dev/null)
    password_auth=$(awk 'tolower($1) == "passwordauthentication" { print tolower($2); exit }' \
      "$sshd_config" 2>/dev/null)

    if [[ "$root_login" == "no" ]]; then
      print_color "0;32" "${CHECK_MARK} Root SSH login disabled"
    else
      print_color "0;31" "${CROSS_MARK} Root SSH login enabled"
    fi

    if [[ "$password_auth" == "no" ]]; then
      print_color "0;32" "${CHECK_MARK} SSH password authentication disabled"
    else
      print_color "0;31" "${CROSS_MARK} SSH password authentication enabled"
    fi
  fi

  # Check failed login attempts
  if [ -r "/var/log/auth.log" ]; then
    local failed_attempts
    failed_attempts=$(grep -c "Failed password" /var/log/auth.log || true)
    if [ "${failed_attempts:-0}" -gt 0 ]; then
      print_color "0;33" "${WARNING_MARK} Found $failed_attempts failed login attempts"
    fi
  fi

  # Check listening ports
  # Prefer ss (iproute2); netstat (net-tools) is often not installed. The
  # listing is not filtered on "LISTEN" because UDP sockets never carry
  # that state and would all be hidden.
  echo "Open ports:"
  if command -v ss &>/dev/null; then
    ss -tuln
  elif command -v netstat &>/dev/null; then
    netstat -tuln
  else
    print_color "0;33" "${WARNING_MARK} Neither ss nor netstat is available"
  fi
}
#######################################
# Report swap usage, swappiness and kernel memory pressure (PSI).
# Globals:   CHECK_MARK, CROSS_MARK, WARNING_MARK (read)
# Arguments: $1 - PSI file to read (optional, default /proc/pressure/memory)
# Outputs:   swap verdict, swappiness value and, when the PSI file exists,
#            its raw content plus a high/normal pressure verdict
#######################################
check_memory_pressure() {
  print_header "Memory Pressure Check"

  local psi_file="${1:-/proc/pressure/memory}"

  # Check swap usage and configuration
  local swap_total swap_used swappiness swap_percent
  read -r swap_total swap_used < <(free -m | awk '/^Swap:/ {print $2, $3}') || true
  swap_total=${swap_total:-0}
  swap_used=${swap_used:-0}
  swappiness=$(cat /proc/sys/vm/swappiness 2>/dev/null)

  echo "Swap Configuration:"
  if [ "$swap_total" -eq 0 ]; then
    print_color "0;33" "${WARNING_MARK} No swap space configured"
  else
    swap_percent=$((swap_used * 100 / swap_total))
    if [ "$swap_percent" -gt 80 ]; then
      print_color "0;31" "${CROSS_MARK} High swap usage: ${swap_percent}%"
    else
      print_color "0;32" "${CHECK_MARK} Swap usage: ${swap_percent}%"
    fi
  fi
  echo "Swappiness value: $swappiness"

  # Check memory pressure stats if available
  if [ -f "$psi_file" ]; then
    echo -e "\nMemory Pressure Statistics:"
    local pressure avg10
    pressure=$(<"$psi_file")
    echo "$pressure"

    # The file has a "some" line and a "full" line, each with its own
    # avg10. Only the "some" line is used: reading both and joining them
    # turns "6" and "0" into 60 and raises a false alarm. The value is
    # truncated to an integer for the shell comparison.
    avg10=$(awk '$1 == "some" {
        for (i = 2; i <= NF; i++) {
          if ($i ~ /^avg10=/) { sub(/^avg10=/, "", $i); print int($i); exit }
        }
      }' <<<"$pressure")

    if [ "${avg10:-0}" -gt 50 ]; then
      print_color "0;31" "${CROSS_MARK} High memory pressure detected"
    else
      print_color "0;32" "${CHECK_MARK} Normal memory pressure"
    fi
  fi
}
#######################################
# Print status, health, restart count and resource usage for every Docker
# container, including stopped ones.
# Globals:   CHECK_MARK, CROSS_MARK, WARNING_MARK (read)
# Arguments: none
# Outputs:   a block of lines per container, terminated by "---"; a single
#            warning when the docker CLI is missing
#######################################
check_docker_containers_health() {
  print_header "Docker Container Health Check"

  if ! command -v docker &>/dev/null; then
    print_color "0;33" "${WARNING_MARK} Docker not installed"
    return
  fi

  # One inspect and one stats call per container. Each `docker stats
  # --no-stream` blocks while it samples, so asking for CPU and memory
  # separately doubled the runtime of this check.
  local inspect_fmt='{{.State.Status}}|{{if .State.Health}}{{.State.Health.Status}}{{else}}no health check{{end}}|{{.RestartCount}}'
  local stats_fmt='{{.CPUPerc}}|{{.MemPerc}}'
  local container status health restarts cpu mem

  # Get all containers including stopped ones. The list is read on fd 3 so
  # the docker commands inside the loop cannot consume it via stdin.
  while IFS= read -r -u 3 container; do
    [[ -n "$container" ]] || continue
    echo "Container: $container"

    # Check container status
    IFS='|' read -r status health restarts \
      < <(docker inspect --format="$inspect_fmt" "$container") || true
    # Get resource usage
    IFS='|' read -r cpu mem \
      < <(docker stats --no-stream --format "$stats_fmt" "$container") || true

    case "$status" in
      "running")
        print_color "0;32" "${CHECK_MARK} Status: Running"
        ;;
      "exited")
        print_color "0;31" "${CROSS_MARK} Status: Stopped"
        ;;
      *)
        print_color "0;33" "${WARNING_MARK} Status: $status"
        ;;
    esac

    echo "Health: $health"
    echo "Restart Count: $restarts"
    echo "CPU Usage: $cpu"
    echo "Memory Usage: $mem"
    echo "---"
  done 3< <(docker ps -a --format "{{.Names}}")
}
#######################################
# Print the open-file and user-process limits of the current shell and warn
# when they are low.
# Globals:   WARNING_MARK (read)
# Arguments: none
# Outputs:   both limits, plus a warning for a file descriptor limit below
#            65535 and/or a process limit below 4096
#######################################
check_system_limits() {
  print_header "System Resource Limits Check"

  local max_files max_processes
  max_files=$(ulimit -n)
  max_processes=$(ulimit -u)

  echo "File descriptor limit: $max_files"
  echo "Max user processes: $max_processes"

  # ulimit may print "unlimited", which is not an integer and would make a
  # numeric test fail with an error. Only numeric limits can be "low".
  if [[ "$max_files" =~ ^[0-9]+$ ]] && [ "$max_files" -lt 65535 ]; then
    print_color "0;33" "${WARNING_MARK} Low file descriptor limit"
  fi

  if [[ "$max_processes" =~ ^[0-9]+$ ]] && [ "$max_processes" -lt 4096 ]; then
    print_color "0;33" "${WARNING_MARK} Low process limit"
  fi
}
#######################################
# Print a summary of critical issues: service log errors, root disk usage,
# memory usage and Docker availability.
# Globals:   ERROR_FOUND (read; "true" means service log errors were seen),
#            CHECK_MARK (read)
# Arguments: none
# Outputs:   a list of critical issues and warnings, or a single
#            "no significant issues" line
#######################################
generate_health_report() {
  print_header "Health Check Summary Report"

  # Initialize arrays for different severity levels
  local -a critical_issues=()
  # NOTE(review): nothing in this function appends to `warnings`; it is
  # kept so the display section below stays in place.
  local -a warnings=()
  local disk_usage mem_used mem_total issue warning

  # Collect issues from previous checks
  # Compared as a string: running the variable as a command succeeds when
  # it is unset or empty, which reported log errors that never happened.
  if [[ "${ERROR_FOUND:-false}" == "true" ]]; then
    critical_issues+=("Service log errors detected")
  fi

  # Add disk space issues (-P keeps each filesystem on a single line)
  disk_usage=$(df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
  if [[ "$disk_usage" =~ ^[0-9]+$ ]] && [ "$disk_usage" -ge 80 ]; then
    critical_issues+=("High disk usage: ${disk_usage}%")
  fi

  # Add memory pressure issues
  read -r mem_total mem_used < <(free -m | awk '/^Mem:/ { print $2, $3 }') || true
  # Skip the ratio when free produced nothing usable, so there is no
  # division by zero.
  if [[ "$mem_total" =~ ^[0-9]+$ && "$mem_used" =~ ^[0-9]+$ ]] \
    && [ "$mem_total" -gt 0 ] \
    && [ $((mem_used * 100 / mem_total)) -gt 80 ]; then
    critical_issues+=("High memory usage")
  fi

  # Add Docker status issues
  if ! docker info &>/dev/null; then
    critical_issues+=("Docker service is not running")
  fi

  # Display report
  if [ "${#critical_issues[@]}" -gt 0 ]; then
    print_color "0;31" "Critical Issues Found:"
    for issue in "${critical_issues[@]}"; do
      echo "- $issue"
    done
  fi

  if [ "${#warnings[@]}" -gt 0 ]; then
    print_color "0;33" "Warnings:"
    for warning in "${warnings[@]}"; do
      echo "- $warning"
    done
  fi

  if [ "${#critical_issues[@]}" -eq 0 ] && [ "${#warnings[@]}" -eq 0 ]; then
    print_color "0;32" "${CHECK_MARK} No significant issues found"
  fi
}
# Main script flow
check_root_privileges
@@ -405,7 +724,7 @@ elif [[ "$1" == "real_test" ]]; then
else
# Normal script execution
# Display Welcome
print_header "BigBearCasaOS Healthcheck V3.3"
print_header "BigBearCasaOS Healthcheck V3.4"
echo "Here are some links:"
echo "https://community.bigbeartechworld.com"
echo "https://github.com/BigBearTechWorld"
@@ -479,6 +798,18 @@ else
check_system_temperature
check_system_updates
check_dmesg_errors
check_process_resources
check_network_interfaces
check_network_latency
check_filesystem_health
check_time_sync
check_log_rotation
check_security_audit
check_memory_pressure
check_system_limits
check_docker_containers_health
generate_health_report
print_header "Health Check Complete"
fi