mdadm/raid-check

#!/bin/bash
#
# This script reads it's configuration from /etc/sysconfig/raid-check
# Please use that file to enable/disable this script or to set the
# type of check you wish performed.

# We might be on a kernel with no raid support at all, exit if so
[ -f /proc/mdstat ] || exit 0

# and exit if we haven't been set up properly
[ -f /etc/sysconfig/raid-check ] || exit 0
. /etc/sysconfig/raid-check

[ "$ENABLED" != "yes" ] && exit 0

case "$CHECK" in
    check) ;;
    repair) ;;
    *) exit 0;;
esac

active_list=`grep "^md.*: active" /proc/mdstat | cut -f 1 -d ' '`
[ -z "$active_list" ] && exit 0

declare -A check
dev_list=""
check_list=""
for dev in $active_list; do
    echo $SKIP_DEVS | grep -w $dev >/dev/null 2>&1 && continue
    if [ -f /sys/block/$dev/md/sync_action ]; then
	# Only perform the checks on idle, healthy arrays, but delay
	# actually writing the check field until the next loop so we
	# don't switch currently idle arrays to active, which happens
	# when two or more arrays are on the same physical disk
	array_state=`cat /sys/block/$dev/md/array_state`
	sync_action=`cat /sys/block/$dev/md/sync_action`
	if [ "$array_state" = clean -a "$sync_action" = idle ]; then
	    ck=""
	    echo $REPAIR_DEVS | grep -w $dev >/dev/null 2>&1 && ck="repair"
	    echo $CHECK_DEVS | grep -w $dev >/dev/null 2>&1 && ck="check"
	    [ -z "$ck" ] && ck=$CHECK
	    dev_list="$dev_list $dev"
	    check[$dev]=$ck
	    [ "$ck" = "check" ] && check_list="$check_list $dev"
	fi
    fi
done
[ -z "$dev_list" ] && exit 0

for dev in $dev_list; do
    echo "${check[$dev]}" > /sys/block/$dev/md/sync_action
done
[ -z "$check_list" ] && exit 0

checking=1
while [ $checking -ne 0 ]
do
	sleep 60
	checking=0
	for dev in $check_list; do
	sync_action=`cat /sys/block/$dev/md/sync_action`
		if [ "$sync_action" != "idle" ]; then
			checking=1
		fi
	done
done
for dev in $check_list; do
	mismatch_cnt=`cat /sys/block/$dev/md/mismatch_cnt`
	# Due to the fact that raid1 writes in the kernel are unbuffered,
	# a raid1 array can have non-0 mismatch counts even when the
	# array is healthy.  These non-0 counts will only exist in
	# transient data areas where they don't pose a problem.  However,
	# since we can't tell the difference between a non-0 count that
	# is just in transient data or a non-0 count that signifies a
	# real problem, simply don't check the mismatch_cnt on raid1
	# devices as it's providing far too many false positives.  But by
	# leaving the raid1 device in the check list and performing the
	# check, we still catch and correct any bad sectors there might
	# be in the device.
	raid_lvl=`cat /sys/block/$dev/md/level`
	if [ "$mismatch_cnt" -ne 0 -a "$raid_lvl" != "raid1" ]; then
		echo "WARNING: mismatch_cnt is not 0 on /dev/$dev"
	fi
done