314598340 发表于 2019-1-13 15:04:59

nagios 监控硬盘RAID

背景:线上服务器曾因硬盘故障,导致服务中断、数据丢失;
  
目的:确保在第一时间发现硬盘状态异常;
  
方案:使用Nagios 自定义脚本来监控硬盘状态;
  

注意:下面的脚本仅适用于已安装 hpacucli(HP)或 MegaCli(Dell/IBM)的服务器;
#!/bin/bash
#Marc.wang 2014/06/17

# Append the sbin/bin directories to the search path used by the rest of the
# script.
export PATH="$PATH:/usr/sbin/:/sbin/:/usr/bin/"
# First field printed by `hostname -I`; used as the host field of every
# result line sent to Nagios.
Get_localhost_Hostname=$(hostname -I | awk '{print $1}')
# Host handed to send_nsca -H.
Nagios="nagios.org"
# First word after the colon on the first dmidecode line containing "Vendor"
# (the vendor dispatch below matches values such as "HP", "Dell" or "IBM").
SERVER_TYPE=$(/usr/sbin/dmidecode | grep "Vendor" | awk -F: 'NR==1 {print $2}' | awk '{print $1}')


# Forward one passive check result to Nagios via send_nsca.
# Globals:  Nagios (read) - host passed to send_nsca -H
# Inputs:   stdin - result line whose fields are separated by ";" (the
#           delimiter declared with -d), as written by the callers:
#           host;service;state;message
# Returns:  exit status of send_nsca
Send_nsca_ssl_message () {
  /usr/local/nagios/bin/send_nsca -H "${Nagios}" -d ";" -c /usr/local/nagios/etc/send_nsca.cfg
}

# Check whether an hpacucli package is registered in the RPM database.
# Outputs:  exit status of the grep stage on stdout - "0" when some installed
#           package name contains "hpacucli", non-zero otherwise.
HP_DISK_STATUS_COMMAND() {
  rpm -qa | grep hpacucli > /dev/null 2>&1
  echo $?
}
# Check whether a MegaCli package is registered in the RPM database.
# Outputs:  exit status of the grep stage on stdout - "0" when some installed
#           package name contains "MegaCli", non-zero otherwise.
DELL_IBM_DISK_STATUS_COMMAND() {
  local megacli_rc=0
  rpm -qa | grep MegaCli &> /dev/null || megacli_rc=$?
  echo "${megacli_rc}"
}

# Guard: count process-table entries that mention hpacucli (excluding the grep
# helpers themselves). Any hit is reported as critical and the script stops
# before starting another hpacucli query.
# NOTE(review): presumably this protects against a hung earlier hpacucli run - confirm.
bug_test=$(ps ax | grep hpacucli | grep -v grep | wc -l)
[[ "${bug_test}" == "0" ]] || {
  echo "$Get_localhost_Hostname;check_raid;2; hpacucli command run not data." | Send_nsca_ssl_message
  exit 2
}

# Count HP physical drives whose status is anything other than OK.
# Globals:  HPACUCLI_BIN (read, optional) - hpacucli executable to run;
#           defaults to /usr/sbin/hpacucli
# Outputs:  number of non-OK "Status:" lines on stdout (0 = every drive OK)
CHECK_RAID_STATUS_HP () {
  # -A 4 keeps the four lines following each "physicaldrive" header, which is
  # where the drive's "Status:" line is expected - confirm against the output
  # of the installed hpacucli version. Spaces are stripped so that
  # "Status: OK" can be matched as the single token "Status:OK".
  "${HPACUCLI_BIN:-/usr/sbin/hpacucli}" ctrl all show config detail \
    | grep physicaldrive -A 4 \
    | sed 's/ //g' \
    | grep "Status:" \
    | grep -v "Status:OK" \
    | wc -l
}


# Vendor dispatch: select the RAID CLI that matches the vendor string in
# SERVER_TYPE, evaluate the array, and push one passive check_raid result
# (state 0 = OK, 2 = critical) through Send_nsca_ssl_message. Every path ends
# the script with the reported state as its exit code.
case "$SERVER_TYPE" in
HP|hp|Hp|Hewlett-Packard)
  # Prints the exit status of a plain hpacucli query, used to tell
  # "the tool itself failed" apart from "the tool reported bad drives".
  TEST_HP_COMMAND () {
    hpacucli ctrl all show config detail >> /dev/null 2>&1
    echo $?
  }
  HP_RPM=$(HP_DISK_STATUS_COMMAND)
  sleep 3

  if [[ "${HP_RPM}" != "0" ]]; then
    # hpacucli package is not installed.
    echo "$Get_localhost_Hostname;check_raid;2; ${SERVER_TYPE} command hpacucli Not Found" | Send_nsca_ssl_message
    exit 2
  fi

  HP_RAID_STATUS_NUMBER=$(CHECK_RAID_STATUS_HP)
  sleep 3
  TEST_HP=$(TEST_HP_COMMAND)
  if [[ "$HP_RAID_STATUS_NUMBER" == "0" ]] && [[ "$TEST_HP" == "0" ]]; then
    echo "$Get_localhost_Hostname;check_raid;0;Check_Raid_status:OK" | Send_nsca_ssl_message
    exit 0
  elif [[ "${TEST_HP}" != "0" ]]; then
    # The query itself failed, so the drive count cannot be trusted.
    echo "$Get_localhost_Hostname;check_raid;2;Check_Raid_status: run command hpacucli Error" | Send_nsca_ssl_message
    exit 2
  else
    # Query succeeded and at least one drive is not OK.
    echo "$Get_localhost_Hostname;check_raid;2;Check_Raid_status:Critical" | Send_nsca_ssl_message
    exit 2
  fi
  ;;
DELL|Dell|DEll|DeLL|dell|IBM|ibm|Ibm|IBm)
  # Prefer the fixed MegaCli64 install path when it exists; otherwise rely on
  # a MegaCli found through PATH.
  if [[ -f "/opt/MegaRAID/MegaCli/MegaCli64" ]]; then
    # Prints the sum of all "Media Error Count" and "Other Error Count" values.
    CHECK_RAID_STATUS_IBM_DELL () {
      /opt/MegaRAID/MegaCli/MegaCli64 -LdPdInfo -a0 \
        | grep -E "(Media Error Count:|Other Error Count:)" \
        | awk -F: '{sum1 += $2} END {print sum1}'
    }
    # Prints the exit status of a plain MegaCli64 query.
    TEST_DELL_COMMAND () {
      /opt/MegaRAID/MegaCli/MegaCli64 -LdPdInfo -a0 >> /dev/null
      echo $?
    }
  else
    # Prints the sum of all "Media Error Count" and "Other Error Count" values.
    CHECK_RAID_STATUS_IBM_DELL () {
      MegaCli -LdPdInfo -a0 \
        | grep -E "(Media Error Count:|Other Error Count:)" \
        | awk -F: '{sum1 += $2} END {print sum1}'
    }
    # Prints the exit status of a plain MegaCli query.
    TEST_DELL_COMMAND () {
      MegaCli -LdPdInfo -a0 >> /dev/null
      echo $?
    }
  fi

  IBM_DELL_RPM=$(DELL_IBM_DISK_STATUS_COMMAND)
  if [[ "${IBM_DELL_RPM}" == "0" ]]; then
    TEST_OTHER_COMMAND=$(TEST_DELL_COMMAND)
    DELL_IBM_STATUS_NUMBER=$(CHECK_RAID_STATUS_IBM_DELL)
    if [[ -z "$DELL_IBM_STATUS_NUMBER" ]]; then
      # awk printed nothing: no error-count lines were produced at all.
      echo "$Get_localhost_Hostname;check_raid;2;Check_Raid_status:MegaCli CommandNot Found!" | Send_nsca_ssl_message
      exit 2
    elif [[ "$DELL_IBM_STATUS_NUMBER" -gt "2000" ]]; then
      echo "$Get_localhost_Hostname;check_raid;2;Check_Raid_status:Critical" | Send_nsca_ssl_message
      exit 2
    elif [[ "$DELL_IBM_STATUS_NUMBER" -le "2000" ]] && [[ "$TEST_OTHER_COMMAND" == "0" ]]; then
      # -le (not -lt) so that a sum of exactly 2000 is still reported.
      echo "$Get_localhost_Hostname;check_raid;0;Check_Raid_status:OK" | Send_nsca_ssl_message
      exit 0
    else
      # Low error count but the query itself failed: report instead of
      # staying silent, mirroring the HP branch.
      echo "$Get_localhost_Hostname;check_raid;2;Check_Raid_status: run command MegaCli Error" | Send_nsca_ssl_message
      exit 2
    fi
  else
    # MegaCli package is not installed: report it, mirroring the HP branch.
    echo "$Get_localhost_Hostname;check_raid;2; ${SERVER_TYPE} command MegaCli Not Found" | Send_nsca_ssl_message
    exit 2
  fi
  ;;
*)
  echo "$Get_localhost_Hostname;check_raid;2;This machine is not IBM DELL or HP!" | Send_nsca_ssl_message
  # Exit with the reported state, like every other critical path.
  exit 2
  ;;
esac




页: [1]
查看完整版本: nagios 监控硬盘RAID