Thank you for this very complete answer.
- Assuming I really have to implement some sort of inactive slave-link
check.
- Assuming it is acceptable to remove the inactive slave from the bond
for the duration of the check.
Could you help me check my script? It works well for me, but as I'm about
to deploy it in production I'd rather have it double-checked by
you guys.
(Note: I'm not reliable when it comes to (among other things) routing
and network related topics)
Thx.
Olivier
---------------------------------------------------------------------------------------
#!/bin/bash
# Check all nics enslaved in a bond.
# This is a way to check that all nics (including inactive ones) are
# working properly.
#
# Authors:
# OA: Olivier Arsac
# History:
# 19/04/2007: OA scratch
# 31/06/2007: OA better handling of "free" IPs used during test
# TODO:
# remove all TODOs from the script
#set -x
# try to be robust -> exit if a variable is not set (probably something
# went wrong)
set -o nounset
# Run the clean handler when the script is interrupted. The handler name
# is only looked up when a signal arrives, so it may be defined later in
# the file.
trap clean INT TERM
PATH=/exploit/local/sbin:/exploit/local/bin:/usr/kerberos/sbin:/usr/kerberos/bin:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/exploit/unix/prod/bin
# Program name without its directory part (shown by usage).
com=${0##*/}
# Arguments exactly as given on the command line, joined in one string.
fullcom="$*"
# Print the command-line help on stdout.
# Globals: com (read) - program name shown in the synopsis and examples.
usage() {
  cat <<EOF
Usage: $com [-q] [-i ip] [-t target] [bond]
 Check all nics enslaved to a bond.
 This is a way to check that all nics (including inactive ones) are
 working properly. You should check that periodically to avoid nasty
 surprises when your active nic stops working and you have to fall back
 to your (unchecked) slave one.
 exit 0 if all is OK (or if no bond is present).
 -q: quiet (no verbose message for human operator).
 -i: ip to use during check of inactive slaves.
 -t: target ip to ping during checks.
eg: $com
 check all nics from all bonds.
eg: $com -q bond0
 check silently all nics from bond0.
EOF
}
# ---- command-line parsing ----
# quiet:  set to 1 by -q (echoq then prints nothing)
# ip:     address given with -i (left empty when the option is absent)
# target: address given with -t (left empty when the option is absent)
quiet=0
ip=""
target=""
while getopts "qi:t:" option; do
  case "$option" in
    q) quiet=1 ;;
    i) ip=$OPTARG ;;
    t) target=$OPTARG ;;
    *) usage; exit 1 ;;
  esac
done
# drop what has been parsed by getopts
shift $((OPTIND - 1))
# get args: remaining words are bond names; without any, every entry of
# /proc/net/bonding is used.
if [ "$#" -ne 0 ]; then
  bonds="$*"
  for bond in "$@"; do
    if [ ! -f "/proc/net/bonding/$bond" ]; then
      # Written straight to stderr: the echoe helper is defined further
      # down the file and does not exist yet when this line runs.
      echo "Error: $bond is not a valid bond." >&2
      exit 6
    fi
  done
else
  bonds=$(ls /proc/net/bonding/ 2>/dev/null)
fi
# Extended regex matching a MAC address: six colon-separated hex bytes.
re_mac="([0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}"
# Dotted-quad IPv4 address (1 or more digits per field). Built only from
# bracket expressions and '*' so it means the same thing to plain grep
# (basic regex) and to grep -E (extended regex).
re_ip="[0-9][0-9]*[.][0-9][0-9]*[.][0-9][0-9]*[.][0-9][0-9]*"
# Print the arguments on stderr (arguments are handed to echo unchanged).
# The existing stderr descriptor is duplicated rather than re-opening
# /dev/stderr: re-opening truncates a regular file that stderr has been
# redirected to, so each message would wipe the previous ones from a log.
echoe() {
  echo "$@" >&2
}
# Echo the arguments on stdout unless quiet mode is on.
# Globals: quiet (read) - output is produced only when it equals 0.
# Arguments are handed to echo unchanged, so echo options such as -n work.
echoq() {
  if ! [ "$quiet" -eq 0 ]; then
    return 0
  fi
  echo "$@"
}
# Find a "free" mac inside a bond: one of the permanent hardware addresses
# of its slaves (as listed in /proc/net/bonding/<bond>) that is not the
# address ifconfig currently reports for the bond itself.
# Arguments:
#   $1 - bond name
#   $2 - nic about to be tested (accepted for the callers' sake; not used)
# Globals:
#   re_mac (read)            - regex used to extract mac addresses
#   get_free_mac_ret (write) - the mac found, or "" when every slave mac
#                              equals the bond mac
# When several macs qualify, the last one listed wins.
get_free_mac_ret=""
get_free_mac() {
  local bond=$1
  local free_mac=""
  local macs bond_mac mac
  macs=$(grep "Permanent HW addr:" "/proc/net/bonding/$bond" \
    | grep -E -o "$re_mac" | tr 'a-z' 'A-Z')
  # Accept both the "HWaddr" and the "ether" spelling of the ifconfig
  # hardware-address line, and upper-case the result so the comparison
  # below is done on the same footing as the slave macs.
  bond_mac=$(ifconfig "$bond" | grep -E 'HWaddr|ether' \
    | grep -E -o "$re_mac" | head -n 1 | tr 'a-z' 'A-Z')
  for mac in $macs; do
    if [ "$mac" != "$bond_mac" ]; then
      free_mac=$mac
    fi
  done
  get_free_mac_ret=$free_mac
}
# Ping a target to test IP connectivity, optionally through a given nic.
# Arguments:
#   $1 - address to ping (3 probes)
#   $2 - optional interface handed to ping -I; without it ping is left to
#        choose the interface and the error message says "default"
#   $3 - optional ip in use on that nic; only shown in the error message
# Globals:
#   host (read)           - host name shown in the error message
#   check_nic_ret (write) - 0 when the ping succeeded, 1 otherwise
# Outputs: " [OK]" or " [ERROR]" through echoq, and on failure one error
#          line on stderr.
check_nic_ret=0
check_nic() {
  local target=$1
  # Always defined, so the failure message below cannot trip nounset
  # when the function is called with the target only.
  local nic=${2-default}
  local ma=""
  local -a ping_args=(-n -c 3)
  if [ $# -ge 2 ]; then
    ping_args+=(-I "$2")
  fi
  if ping "${ping_args[@]}" "$target" >/dev/null 2>&1; then
    echoq " [OK]"
    check_nic_ret=0
    return 0
  fi
  if [ $# -ge 3 ]; then
    ma="(using $3 as ip)"
  fi
  echoq " [ERROR]"
  echo "$nic interface on $host is not working properly! $ma" >&2
  check_nic_ret=1
}
# arping a target using a specified nic and source ip (3 requests, output
# discarded).
# NOTE(review): the script does not say why this precedes the ping test;
# presumably it announces the temporary ip/mac to the network - confirm.
# Arguments:
#   $1 - address to arping
#   $2 - interface to send from
#   $3 - source ip to use
# Returns: the exit status of arping.
exercise_nic_arp() {
  # Locals, so the script-level target and nic variables are left alone.
  local target=$1
  local nic=$2
  local src_ip=$3
  arping -c 3 -s "$src_ip" -I "$nic" "$target" >/dev/null 2>&1
}
# Restore a properly configured bond if someone interrupts the script,
# then exit with status 2.
# Globals:
#   clean_bond / clean_nic (read) - bond and nic to re-enslave; when either
#                                   is empty there is nothing to restore
# Outputs: a notice through echoq, and an error line on stderr when the
#          nic could not be re-enslaved.
clean_bond=""
clean_nic=""
clean() {
  echoq
  echoq "Script interrupted, restoring bond."
  if [ -n "$clean_bond" ] && [ -n "$clean_nic" ]; then
    # A failed restore leaves the bond without this slave, so it must not
    # pass silently.
    if ! ifenslave "$clean_bond" "$clean_nic"; then
      echo "Error: unable to re-enslave $clean_nic to $clean_bond." >&2
    fi
  fi
  exit 2
}
# Short host name: shown in error messages and used below to build the
# names looked up for the temporary test addresses.
host=$(hostname -s)
# NOTE(review): table is not referenced anywhere else in the script.
table=200
if [ ! -d /proc/net/bonding ]; then
  echoe "Warning: Module bonding not loaded. Obviously no bond to check."
  # trying to check a bond on a server where none is present is probably
  # not really an error -> exit 0 with a warning message
  exit 0
fi
if [ -z "$target" ]; then
  # no target given as parameter -> auto-detect
  # get the default gateway as a ping target (only the first matching
  # route is kept, so several gateways cannot yield a multi-word value)
  target=$(route -n | awk '/UG/ { print $2; exit }')
  if [ -z "$target" ]; then
    echoe "Error: Unable to auto-detect the target to use during test (use -t?)."
    exit 3
  fi
fi
if [ -z "$ip" ]; then
  # no ip given as parameter -> auto-detect by resolving the names
  # <host>-bond-t1 and <host>-bond-t2 (first address of each answer)
  ip_b1=$(host "${host}-bond-t1" | grep -o "$re_ip" | head -n 1)
  ip_b2=$(host "${host}-bond-t2" | grep -o "$re_ip" | head -n 1)
  if [ -z "$ip_b1" ] && [ -z "$ip_b2" ]; then
    echoe "Error: Unable to auto-detect an ip to use during test (use -i?)."
    exit 4
  fi
  # Only ip_b1 is read by the rest of the script, so fall back on the
  # second address when the first name did not resolve.
  if [ -z "$ip_b1" ]; then
    ip_b1=$ip_b2
  fi
fi
# ---- main loop ----
# For every bond: ping the target without forcing an interface, test each
# inactive slave on its own (released from the bond, given a free mac and
# a temporary ip, pinged, then re-enslaved), and finally ping through the
# bond interface.
# Exit status: 0 when everything answered, 5 when a bond has no inactive
# slave, 10 + number of failures otherwise.
error_nb=0
for bond in $bonds; do
  bond=$(basename "$bond")
  echoq "checking bond $bond"
  # cut keeps the blank that follows the colon; strip it so the name can
  # be compared as a quoted string even when no slave is active.
  active=$(grep "Active Slave" "/proc/net/bonding/$bond" | cut -d':' -f2 | tr -d ' ')
  echoq -n " active slave : $active"
  check_nic "$target"
  error_nb=$((error_nb + check_nic_ret))
  slaves=$(grep "Slave Interface:" "/proc/net/bonding/$bond" | cut -d':' -f2)
  slave_nb=0
  # $slaves is left unquoted on purpose: word splitting yields one nic
  # name per iteration and drops the blanks left by cut.
  for slave in $slaves; do
    if [ "$slave" != "$active" ]; then
      # this nic is enslaved but not active. we want to check if it is
      # ready to work (no cable or VPN trouble that will bite us only when
      # the active slave will change)
      echoq -n " inactive slave : $slave"
      # search for a free mac in this bond (ie a real phy MAC that is
      # not the one used by the bond)
      get_free_mac "$bond" "$slave"
      free_mac=$get_free_mac_ret
      # store the bond/nic we are going to un-enslave (to be able to
      # re-enslave it in case of interrupt)
      clean_bond=$bond
      clean_nic=$slave
      if [ -z "$ip" ]; then
        ip="$ip_b1" # TODO: use a clever way to match slave and free ip
      fi
      # free this nic from the bond; if that fails the nic is still part
      # of the bond, so it must not be reconfigured
      if ! ifenslave -d "$bond" "$slave"; then
        echoq " [ERROR]"
        echoe "Error: unable to release $slave from $bond, check skipped."
        clean_bond=""; clean_nic=""
        error_nb=$((error_nb + 1))
        slave_nb=$((slave_nb + 1))
        continue
      fi
      # set it up with a "free" mac
      if [ -n "$free_mac" ]; then
        ifconfig "$slave" hw ether "$free_mac"
      else
        echoe "Warning: no free mac found in $bond, $slave keeps its current one."
      fi
      # set it up with a temp IP
      ifconfig "$slave" "$ip" netmask 255.255.255.255
      # it seems we need a small temporisation here or the rest may fail
      sleep 2
      exercise_nic_arp "$target" "$slave" "$ip"
      check_nic "$target" "$slave" "$ip"
      error_nb=$((error_nb + check_nic_ret))
      # clean this temporary ip/route
      ifconfig "$slave" down
      # re-enslave this nic to the bond
      if ! ifenslave "$bond" "$slave"; then
        echoe "Error: unable to re-enslave $slave to $bond."
        error_nb=$((error_nb + 1))
      fi
      clean_bond=""; clean_nic=""
      slave_nb=$((slave_nb + 1))
    fi
  done
  echoq -n " bond : $bond"
  check_nic "$target" "$bond"
  error_nb=$((error_nb + check_nic_ret))
  if [ "$slave_nb" -eq 0 ]; then
    echoe "Error: No inactive slave in $bond."
    exit 5
  fi
done
if [ "$error_nb" -ne 0 ]; then
  exit $((10 + error_nb))
fi
exit 0
---------------------------------------------------------------------------------------
_______________________________________________
LARTC mailing list
LARTC@xxxxxxxxxxxxxxx
http://mailman.ds9a.nl/cgi-bin/mailman/listinfo/lartc