I thought I'd share a script I set up to do daily monitoring of my ZFS pools. I run it with the User Scripts plug-in on a daily schedule. The script examines the output of zpool status and generates a message that is sent as a notification to the web GUI. All the configuration is in a block at the head of the script.

#!/bin/bash
# -------------------- CONFIGURATION --------------------
# Pool selection: "true" finds every pool automatically; "false" restricts
# the check to the pools named in POOLS below.
AUTO_DISCOVER=true

# Pools to check when AUTO_DISCOVER is "false".
# Bash array of quoted names separated by spaces, e.g. ("hdd_main" "nvme_cache")
POOLS=('hdd_main' 'nvme_cache')

# Notifications
# NOTIFY_SUCCESS: "true" also reports healthy pools; "false" reports only
# errors and warnings.
NOTIFY_SUCCESS=true
# ERROR_THRESHOLD: notify when READ/WRITE/CHECKSUM errors exceed this value.
ERROR_THRESHOLD=0
# SLOW_IO_THRESHOLD: notify when the slow I/O count exceeds this value (if
# supported). NOTE(review): nothing in the visible script reads this setting.
SLOW_IO_THRESHOLD=10

# DEBUG: "true" prints the detailed parsing output.
DEBUG=false
# -------------------------------------------------------
# Function to get all vdev error counts recursively
#######################################
# Sum the READ/WRITE/CHECKSUM error counters of one vdev and of every vdev
# nested below it (through its "vdevs" objects) in a single jq pass.
#
# Child vdevs are walked inside jq, so names containing spaces, quotes or
# glob characters never pass through shell word-splitting or get pasted into
# the jq program text.
#
# Counter values may be JSON numbers, numeric strings, or abbreviated strings
# such as "1.50K" (suffixes K/M/G/T/P/E are taken as powers of 1024; the
# result is then only as precise as the abbreviation). Missing or
# unrecognised values count as 0.
#
# Arguments:
#   $1 - JSON text to query (the script passes the `zpool status -j` output)
#   $2 - pool name; accepted for call compatibility, not used
#   $3 - jq path of the vdev to start from,
#        e.g. .pools["tank"].vdevs["tank"]
# Outputs:
#   "<read>:<write>:<checksum>" on stdout, always three integers.
# Returns:
#   0 on success; 1 when jq fails or produces no output, in which case
#   "0:0:0" is printed on stdout and a warning on stderr.
#######################################
get_vdev_errors() {
  local json="$1"
  local vdev_path="$3"
  local filter result

  # $vdev_path is a jq path expression supplied by the caller, so it is
  # spliced into the program; everything else is fixed text.
  filter='
    # Convert one counter value to an integer.
    def count:
      if type == "number" then floor
      elif type == "string" then
        (capture("^(?<num>[0-9]+(?:[.][0-9]+)?)(?<unit>[KMGTPE]?)$") // null) as $m
        | if $m == null then 0
          else
            (($m.num | tonumber)
              * ({"": 1,
                  "K": 1024,
                  "M": 1048576,
                  "G": 1073741824,
                  "T": 1099511627776,
                  "P": 1125899906842624,
                  "E": 1152921504606846976} | .[$m.unit // ""]))
            | floor
          end
      else 0
      end;

    # Emit this vdev followed by all of its descendants.
    def subtree: ., ((.vdevs? // {}) | objects | .[] | subtree);

    [ '"$vdev_path"' | subtree ] as $nodes
    | [ ($nodes | map(.read_errors? // 0 | count) | add),
        ($nodes | map(.write_errors? // 0 | count) | add),
        ($nodes | map(.checksum_errors? // 0 | count) | add) ]
    | map(tostring)
    | join(":")
  '

  # Declaration and assignment are separate so that a jq failure is visible.
  if ! result=$(jq -r "$filter" <<<"$json") || [[ -z "$result" ]]; then
    printf 'get_vdev_errors: could not read error counters at %s\n' \
      "$vdev_path" >&2
    echo "0:0:0"
    return 1
  fi

  echo "$result"
}
# Decide which pools to check: keep the configured POOLS array as-is, or
# replace it with one entry per line printed by `zpool list -H -o name`.
if [[ "$AUTO_DISCOVER" != "true" ]]; then
  echo "Using manual pool list: ${POOLS[*]}"
else
  readarray -t POOLS < <(zpool list -H -o name)
  echo "Auto-discovered pools: ${POOLS[*]}"
fi
# Fetch the status of all pools as JSON with a single zpool call.
# Plain `zpool status -j` worked on the author's PC, but zpool's JSON output
# turned out to be erroneous elsewhere, so every backslash is doubled before
# the text is handed to jq.
JSON=$(zpool status -j | sed 's/\\/\\\\/g')

# In debug mode, show the JSON pretty-printed by jq.
if [[ "$DEBUG" == "true" ]]; then
  echo "JSON Output:"
  jq . <<<"$JSON"
fi
# Check each pool
# For every name in POOLS: read the pool state from $JSON, total its vdev
# error counters via get_vdev_errors, choose a subject/description/icon, send
# a web GUI notification (always for problems, for healthy pools only when
# NOTIFY_SUCCESS is "true"), and echo the description to stdout.
for POOL in "${POOLS[@]}"; do
  echo "========================================="
  echo "Checking pool: $POOL"

  # Parse JSON for pool information
  POOL_STATE=$(echo "$JSON" | jq -r --arg POOL "$POOL" '.pools[$POOL].state // "UNKNOWN"')

  # An empty result means jq printed nothing (for example $JSON is empty or
  # is not valid JSON); handle it like a missing pool rather than letting it
  # reach the state analysis with a blank state.
  if [[ -z "$POOL_STATE" || "$POOL_STATE" == "null" || "$POOL_STATE" == "UNKNOWN" ]]; then
    echo "ERROR: Pool $POOL not found in status output"
    /usr/local/emhttp/webGui/scripts/notify -e "ZFS POOL STATUS" -s "Pool Not Found" -d "Pool $POOL not found - check pool name" -i alert
    continue
  fi

  # Get total error counts for the pool ("read:write:checksum")
  POOL_ERRORS=$(get_vdev_errors "$JSON" "$POOL" ".pools[\"$POOL\"].vdevs[\"$POOL\"]")
  IFS=: read -r TOTAL_READ_ERRORS TOTAL_WRITE_ERRORS TOTAL_CHECKSUM_ERRORS <<<"$POOL_ERRORS"

  # The numeric tests below treat an empty or malformed counter as "no
  # errors", which would report a pool as healthy when the counters simply
  # could not be read. Detect that case and surface it as a warning.
  COUNTERS_OK="true"
  for COUNTER in "$TOTAL_READ_ERRORS" "$TOTAL_WRITE_ERRORS" "$TOTAL_CHECKSUM_ERRORS"; do
    if [[ ! "$COUNTER" =~ ^[0-9]+$ ]]; then
      COUNTERS_OK="false"
    fi
  done
  if [[ "$COUNTERS_OK" != "true" ]]; then
    echo "WARNING: could not parse error counters for $POOL: '$POOL_ERRORS'" >&2
    TOTAL_READ_ERRORS=0
    TOTAL_WRITE_ERRORS=0
    TOTAL_CHECKSUM_ERRORS=0
  fi

  # Check for scan information (scrub/resilver in progress)
  SCAN_STATE=$(echo "$JSON" | jq -r --arg POOL "$POOL" '.pools[$POOL].scan_stats.state // "none"')
  SCAN_FUNCTION=$(echo "$JSON" | jq -r --arg POOL "$POOL" '.pools[$POOL].scan_stats.function // "none"')

  # Determine notification level and message
  DESCRIPTION=""
  SUBJECT=""
  ICON="normal"

  if [[ "$DEBUG" == "true" ]]; then
    echo " State: $POOL_STATE"
    echo " Read Errors: $TOTAL_READ_ERRORS"
    echo " Write Errors: $TOTAL_WRITE_ERRORS"
    echo " Checksum Errors: $TOTAL_CHECKSUM_ERRORS"
    echo " Scan State: $SCAN_STATE"
    echo " Scan Function: $SCAN_FUNCTION"
  fi

  # Shared pieces of the messages below.
  ERROR_SUMMARY="READ=$TOTAL_READ_ERRORS, WRITE=$TOTAL_WRITE_ERRORS, CHECKSUM=$TOTAL_CHECKSUM_ERRORS"
  ANY_ERRORS="false"
  if (( TOTAL_READ_ERRORS > 0 || TOTAL_WRITE_ERRORS > 0 || TOTAL_CHECKSUM_ERRORS > 0 )); then
    ANY_ERRORS="true"
  fi

  # Analyze pool health
  case "$POOL_STATE" in
    "ONLINE")
      # Only the ONLINE case honours ERROR_THRESHOLD; the other states
      # mention any non-zero counter.
      if (( TOTAL_READ_ERRORS > ERROR_THRESHOLD || TOTAL_WRITE_ERRORS > ERROR_THRESHOLD || TOTAL_CHECKSUM_ERRORS > ERROR_THRESHOLD )); then
        SUBJECT="$POOL Errors Detected"
        DESCRIPTION="Pool <b>$POOL</b> is ONLINE but has errors: $ERROR_SUMMARY"
        ICON="warning"
      else
        SUBJECT="$POOL Healthy"
        DESCRIPTION="Pool <b>$POOL</b> is ONLINE and healthy"
        ICON="normal"
      fi
      ;;
    "DEGRADED")
      SUBJECT="$POOL Degraded"
      DESCRIPTION="Pool <b>$POOL</b> is DEGRADED"
      if [[ "$ANY_ERRORS" == "true" ]]; then
        DESCRIPTION+=" with errors: $ERROR_SUMMARY"
      fi
      DESCRIPTION+=" - May be rebuilding or need attention"
      ICON="warning"
      ;;
    "FAULTED" | "UNAVAIL")
      SUBJECT="$POOL CRITICAL"
      DESCRIPTION="Pool <b>$POOL</b> is $POOL_STATE - IMMEDIATE ATTENTION REQUIRED"
      if [[ "$ANY_ERRORS" == "true" ]]; then
        DESCRIPTION+=" Errors: $ERROR_SUMMARY"
      fi
      ICON="alert"
      ;;
    "OFFLINE")
      SUBJECT="$POOL Offline"
      DESCRIPTION="Pool <b>$POOL</b> is OFFLINE"
      ICON="warning"
      ;;
    *)
      SUBJECT="$POOL Unknown State"
      DESCRIPTION="Pool <b>$POOL</b> has unknown state: $POOL_STATE"
      ICON="warning"
      ;;
  esac

  # Never present a pool as healthy when its counters could not be read.
  if [[ "$COUNTERS_OK" != "true" ]]; then
    DESCRIPTION+=" (error counters could not be read)"
    if [[ "$ICON" == "normal" ]]; then
      SUBJECT="$POOL Error Counts Unavailable"
      ICON="warning"
    fi
  fi

  # Add scan information if relevant
  if [[ "$SCAN_STATE" == "SCANNING" ]]; then
    DESCRIPTION+=" ($SCAN_FUNCTION in progress)"
  elif [[ "$SCAN_STATE" == "FINISHED" && "$SCAN_FUNCTION" == "SCRUB" ]]; then
    SCAN_ERRORS=$(echo "$JSON" | jq -r --arg POOL "$POOL" '.pools[$POOL].scan_stats.errors // "0"')
    if [[ "$SCAN_ERRORS" != "0" ]]; then
      DESCRIPTION+=" (Last scrub found $SCAN_ERRORS errors)"
      # A scrub that found errors upgrades an otherwise normal report.
      if [[ "$ICON" == "normal" ]]; then
        ICON="warning"
      fi
    fi
  fi

  # Send notification based on settings
  if [[ "$ICON" != "normal" || "$NOTIFY_SUCCESS" == "true" ]]; then
    /usr/local/emhttp/webGui/scripts/notify -e "ZFS POOL STATUS" -s "$SUBJECT" -d "$DESCRIPTION" -i "$ICON"
  fi
  echo "$DESCRIPTION"

  # Display detailed vdev states if there are issues
  if [[ "$ICON" != "normal" ]]; then
    echo "Detailed vdev information:"
    zpool status "$POOL"
  fi
  echo ""
done

echo "========================================="
echo "Pool monitoring complete."