update-casaos-healthcheck-to-3.0 (#8)

* 🎨 feat(healthcheck): add new health checks for Docker and storage

This commit introduces two new health checks to the BigBearCasaOS
Healthcheck script:

1. `check_docker_status()`: This function checks if Docker is installed and
   running on the system. It provides clear feedback based on the Docker
   status.

2. `check_storage_health()`: This function checks the health of all storage
   devices (disks) connected to the system using the `smartctl` tool. It
   reports the health status of each disk, indicating if it has passed the
   SMART health check or if there may be potential issues.

These new health checks provide a more comprehensive overview of the system's
overall health and help identify any potential issues related to Docker or
storage-related problems.

* 📄 feat(casaos-healthcheck): Enhance DNS resolution and add system health checks

This change enhances the DNS resolution checks in the casaos-healthcheck script and adds new system health checks for disk space, CPU load, memory usage, and system temperature.

The key changes include:

- Improve DNS resolution by trying multiple DNS servers and retrying on failure
- Add a check for basic network connectivity before attempting DNS lookups
- Implement a check for disk space usage, with a configurable threshold
- Implement a check for CPU load, with a configurable threshold based on the number of cores
- Implement a check for memory usage, with a configurable threshold based on total memory
- Implement a check for system temperature, using the `sensors` command or `/sys/class/thermal` when available
- Add a check for available system updates using `apt-get`

These new checks provide a more comprehensive health monitoring solution for the CasaOS system.

*  feat: Bump healthcheck version to 3.0

Bumped the version of the BigBearCasaOS Healthcheck tool from 2.3 to 3.0.
This change reflects the latest updates and improvements made to the
healthcheck script.
This commit is contained in:
Christopher
2024-09-27 09:56:57 -05:00
committed by GitHub
parent 3618385fe9
commit 66283166f1

View File

@@ -140,23 +140,152 @@ check_dns_resolution() {
"registry.hub.docker.com"
"gcr.io"
"azurecr.io"
"ghcr.io",
"ghcr.io"
"registry.gitlab.com"
)
local retries=3
local timeout=5
local dns_servers=("8.8.8.8" "1.1.1.1" "9.9.9.9" "208.67.222.222")
print_header "DNS Resolution Check:"
# Verify basic network connectivity
if ! ping -c 1 8.8.8.8 &>/dev/null; then
print_color "0;31" "${CROSS_MARK} Network connectivity issue: Unable to reach 8.8.8.8"
return
fi
for registry in "${registries[@]}"; do
if nslookup "$registry" &>/dev/null; then
print_color "0;32" "${CHECK_MARK} DNS resolution for $registry is successful"
else
local success=false
for dns_server in "${dns_servers[@]}"; do
for ((i=1; i<=retries; i++)); do
if timeout $timeout nslookup "$registry" "$dns_server" &>/dev/null; then
print_color "0;32" "${CHECK_MARK} DNS resolution for $registry is successful using DNS server $dns_server"
success=true
break 2
else
echo "Attempt $i of $retries for $registry using DNS server $dns_server failed"
fi
done
done
if [ "$success" = false ]; then
print_color "0;31" "${CROSS_MARK} DNS resolution for $registry failed"
echo "Debugging info for $registry:"
nslookup "$registry" 2>&1 | tee /dev/stderr
nslookup "$registry" 2>&1
# Alternative check with dig for more detailed output
if command -v dig &> /dev/null; then
echo "Dig output:"
dig +short "$registry" @8.8.8.8
fi
fi
done
}
# Function to check Docker status
check_docker_status() {
print_header "Docker Status Check"
if command -v docker &> /dev/null; then
if docker info &> /dev/null; then
print_color "0;32" "${CHECK_MARK} Docker is running"
else
print_color "0;31" "${CROSS_MARK} Docker is installed but not running"
fi
else
print_color "0;31" "${CROSS_MARK} Docker is not installed"
fi
}
# Function to check storage health
check_storage_health() {
print_header "Storage Health Check"
local disks=$(lsblk -ndo NAME,TYPE | awk '$2=="disk"{print $1}')
for disk in $disks; do
if smartctl -H /dev/$disk &> /dev/null; then
local health=$(smartctl -H /dev/$disk | grep "SMART overall-health")
if [[ $health == *"PASSED"* ]]; then
print_color "0;32" "${CHECK_MARK} /dev/$disk is healthy"
else
print_color "0;31" "${CROSS_MARK} /dev/$disk may have issues"
fi
else
print_color "0;33" "${WARNING_MARK} Unable to check health of /dev/$disk"
fi
done
}
# Function to check disk space
check_disk_space() {
print_header "Disk Space Check"
local threshold=80
local usage=$(df / | grep / | awk '{ print $5 }' | sed 's/%//g')
if [ "$usage" -ge "$threshold" ]; then
print_color "0;31" "${CROSS_MARK} Disk usage is at ${usage}%, which is above the threshold of ${threshold}%"
else
print_color "0;32" "${CHECK_MARK} Disk usage is at ${usage}%, which is below the threshold of ${threshold}%"
fi
}
# Function to check CPU load
check_cpu_load() {
print_header "CPU Load Check"
local load=$(uptime | awk -F'load average:' '{ print $2 }' | cut -d, -f1 | xargs)
local cores=$(nproc)
local threshold=$(awk "BEGIN {print $cores * 0.7}")
if (( $(awk "BEGIN {print ($load > $threshold) ? 1 : 0}") )); then
print_color "0;31" "${CROSS_MARK} CPU load is high: ${load} (Threshold: ${threshold})"
else
print_color "0;32" "${CHECK_MARK} CPU load is acceptable: ${load} (Threshold: ${threshold})"
fi
}
# Function to check memory usage
check_memory_usage() {
print_header "Memory Usage Check"
local total_mem=$(free -m | awk '/^Mem:/ { print $2 }')
local used_mem=$(free -m | awk '/^Mem:/ { print $3 }')
local threshold=$(awk "BEGIN {print int($total_mem * 0.8)}")
if (( $(awk "BEGIN {print ($used_mem > $threshold) ? 1 : 0}") )); then
print_color "0;31" "${CROSS_MARK} Memory usage is high: ${used_mem}MB used of ${total_mem}MB (Threshold: ${threshold}MB)"
else
print_color "0;32" "${CHECK_MARK} Memory usage is acceptable: ${used_mem}MB used of ${total_mem}MB (Threshold: ${threshold}MB)"
fi
}
# Function to check system temperature
check_system_temperature() {
print_header "System Temperature Check"
if command -v sensors &> /dev/null; then
sensors | grep -E 'Core|temp' | while read -r line; do
local temp=$(echo "$line" | awk '{print $2}' | sed 's/+//' | sed 's/°C//')
if (( $(awk "BEGIN {print ($temp > 80) ? 1 : 0}") )); then
print_color "0;31" "${CROSS_MARK} High temperature detected: ${line}"
else
print_color "0;32" "${CHECK_MARK} Temperature is normal: ${line}"
fi
done
elif [ -f "/sys/class/thermal/thermal_zone0/temp" ]; then
local temp=$(awk '{print $1/1000}' /sys/class/thermal/thermal_zone0/temp)
if (( $(awk "BEGIN {print ($temp > 80) ? 1 : 0}") )); then
print_color "0;31" "${CROSS_MARK} High temperature detected: ${temp}°C"
else
print_color "0;32" "${CHECK_MARK} Temperature is normal: ${temp}°C"
fi
else
print_color "0;33" "${WARNING_MARK} Unable to check system temperature. sensors command not found and /sys/class/thermal not available."
fi
}
# Function to check for system updates
check_system_updates() {
print_header "System Update Check"
if command -v apt-get &> /dev/null; then
local updates=$(apt-get -s upgrade | grep -P '^\d+ upgraded')
print_color "0;32" "${CHECK_MARK} System updates available: ${updates}"
else
print_color "0;33" "${WARNING_MARK} apt-get command not found, skipping system update check"
fi
}
# Main execution
if [[ "$1" == "simulated_test" ]]; then
@@ -174,7 +303,7 @@ elif [[ "$1" == "real_test" ]]; then
else
# Normal script execution
# Display Welcome
print_header "BigBearCasaOS Healthcheck V2.2"
print_header "BigBearCasaOS Healthcheck V3.0"
echo "Here are some links:"
echo "https://community.bigbeartechworld.com"
echo "https://github.com/BigBearTechWorld"
@@ -235,4 +364,15 @@ else
done <<< "$SERVICES"
check_service_logs "real"
# New health checks
check_docker_status
check_storage_health
check_disk_space
check_cpu_load
check_memory_usage
check_system_temperature
check_system_updates
print_header "Health Check Complete"
fi