#!/bin/bash
#
# Start a "check" or "repair" pass on the active md (software RAID) arrays
# and, for the arrays that were checked, report a non-zero mismatch count.
#
# This script reads its configuration from /etc/sysconfig/raid-check
# Please use that file to enable/disable this script or to set the
# type of check you wish performed.
#
# Variables consumed from the sourced configuration file:
#   ENABLED        must be "yes", otherwise the script exits immediately
#   CHECK          default action: "check" or "repair" (anything else exits)
#   NICE           "high", "low" or "idle": priority given to the resync thread
#   SKIP_DEVS      md devices that are never touched
#   REPAIR_DEVS    md devices that get "repair" regardless of CHECK
#   CHECK_DEVS     md devices that get "check" (takes precedence over REPAIR_DEVS)
#   MAXCONCURRENT  if set, upper bound on the number of arrays busy at once

# We might be on a kernel with no raid support at all, exit if so
[[ -f /proc/mdstat ]] || exit 0

# and exit if we haven't been set up properly
[[ -f /etc/sysconfig/raid-check ]] || exit 0
# shellcheck source=/dev/null
. /etc/sysconfig/raid-check

#######################################
# Wait until no more than arg1 arrays in arg2 list are busy.
# Polls every 60 seconds.
# Arguments:
#   $1 - maximum number of arrays allowed to be non-idle
#   $2 - whitespace-separated list of md device names (e.g. "md0 md1")
# Returns:
#   0 once the number of busy arrays is <= $1
#######################################
waitbusy() {
  local max_busy=$1
  local dev_list=$2
  local busy dev action_file sync_action

  while true; do
    busy=0
    # Word splitting of the list is intentional.
    for dev in $dev_list; do
      action_file="/sys/block/$dev/md/sync_action"
      # An array that disappeared (e.g. was stopped) while we were waiting
      # cannot be busy; counting it as busy would make this loop spin forever.
      [[ -r "$action_file" ]] || continue
      sync_action=$(<"$action_file")
      if [[ "$sync_action" != "idle" ]]; then
        busy=$((busy + 1))
      fi
    done
    (( busy <= max_busy )) && break
    sleep 60
  done
}

#######################################
# Test whether a device name appears as a whole word in a list.
# Arguments:
#   $1 - list of device names (may be empty)
#   $2 - device name to look for
# Returns:
#   0 if found, non-zero otherwise
#######################################
listed_in() {
  grep -qw -- "$2" <<<"$1" 2>/dev/null
}

[[ "$ENABLED" == "yes" ]] || exit 0

case "$CHECK" in
  check | repair) ;;
  *) exit 0 ;;
esac

# Translate the NICE setting into option lists for renice(1) and ionice(1).
# An empty list means "leave that priority alone".
renice_args=()
ionice_args=()
case "$NICE" in
  high)
    renice_args=(-n -5)
    ;;
  low)
    renice_args=(-n 5)
    ionice_args=(-c2 -n7)
    ;;
  idle)
    renice_args=(-n 15)
    ionice_args=(-c3)
    ;;
  *) ;;
esac

# Names of all arrays /proc/mdstat reports as active (first field of the line).
active_list=$(awk '/^md.*: active/ { print $1 }' /proc/mdstat)
[[ -n "$active_list" ]] || exit 0

declare -A check   # per-device action to write: "check" or "repair"
dev_list=""        # every array we are going to start an action on
check_list=""      # subset of dev_list whose action is "check"
for dev in $active_list; do
  listed_in "$SKIP_DEVS" "$dev" && continue

  md_dir="/sys/block/$dev/md"
  [[ -f "$md_dir/sync_action" ]] || continue

  # Only perform the checks on idle, healthy arrays, but delay
  # actually writing the check field until the next loop so we
  # don't switch currently idle arrays to active, which happens
  # when two or more arrays are on the same physical disk
  array_state=$(<"$md_dir/array_state")
  case "$array_state" in
    clean | active) ;;
    *) continue ;;
  esac
  sync_action=$(<"$md_dir/sync_action")
  [[ "$sync_action" == "idle" ]] || continue

  # Per-device overrides first (CHECK_DEVS wins over REPAIR_DEVS),
  # then fall back to the global default.
  ck=""
  listed_in "$REPAIR_DEVS" "$dev" && ck="repair"
  listed_in "$CHECK_DEVS" "$dev" && ck="check"
  [[ -n "$ck" ]] || ck=$CHECK

  dev_list="$dev_list $dev"
  check[$dev]=$ck
  if [[ "$ck" == "check" ]]; then
    check_list="$check_list $dev"
  fi
done
[[ -n "$dev_list" ]] || exit 0

for dev in $dev_list; do
  # Only run $MAXCONCURRENT checks at a time
  if [[ -n "$MAXCONCURRENT" ]]; then
    waitbusy $((MAXCONCURRENT - 1)) "$dev_list"
  fi
  printf '%s\n' "${check[$dev]}" > "/sys/block/$dev/md/sync_action"

  # Look for the "[<dev>_resync]" entry in the ps listing so its priority
  # can be adjusted: poll up to 10 times, 6 seconds apart.
  resync_pid=""
  tries=10
  while (( tries > 0 )) && [[ -z "$resync_pid" ]]; do
    sleep 6
    tries=$((tries - 1))
    resync_pid=$(ps -ef | awk -v mddev="$dev" '
      BEGIN { pattern = "^\\[" mddev "_resync]$" }
      $8 ~ pattern { print $2 }')
  done
  [[ -n "$resync_pid" ]] || continue

  # $resync_pid is left unquoted on purpose so that, should ps ever report
  # more than one match, each PID is passed as its own argument.
  if (( ${#renice_args[@]} > 0 )); then
    # shellcheck disable=SC2086
    renice "${renice_args[@]}" -p $resync_pid >/dev/null 2>&1
  fi
  if (( ${#ionice_args[@]} > 0 )); then
    # shellcheck disable=SC2086
    ionice "${ionice_args[@]}" -p $resync_pid >/dev/null 2>&1
  fi
done
[[ -n "$check_list" ]] || exit 0

# Let every "check" pass finish before looking at the results.
waitbusy 0 "$check_list"

for dev in $check_list; do
  mismatch_cnt=$(<"/sys/block/$dev/md/mismatch_cnt")
  # Due to the fact that raid1/10 writes in the kernel are unbuffered,
  # a raid1 array can have non-0 mismatch counts even when the
  # array is healthy.  These non-0 counts will only exist in
  # transient data areas where they don't pose a problem.  However,
  # since we can't tell the difference between a non-0 count that
  # is just in transient data or a non-0 count that signifies a
  # real problem, simply don't check the mismatch_cnt on raid1
  # devices as it's providing far too many false positives.  But by
  # leaving the raid1 device in the check list and performing the
  # check, we still catch and correct any bad sectors there might
  # be in the device.
  raid_lvl=$(<"/sys/block/$dev/md/level")
  case "$raid_lvl" in
    raid1 | raid10) continue ;;
  esac

  # The counter may be empty if the sysfs file could not be read; skip it
  # rather than feeding a non-number to an arithmetic comparison.
  [[ "$mismatch_cnt" =~ ^[0-9]+$ ]] || continue
  # 10# forces base 10 so a value with leading zeros is not parsed as octal.
  if (( 10#$mismatch_cnt != 0 )); then
    echo "WARNING: mismatch_cnt is not 0 on /dev/$dev"
  fi
done