zidong 发表于 2013-3-14 09:29:59

Nagios检测一些记序

检测命令篇:
文中内容包括:叙述nagios从发现主机到web界面显示出状态再到邮件报警的整个过程,方便以后进行排错,还有如何编写特定应用的特定检测程序。
煮酒品茶:文章需要改进的有如何做触发报警的条件,警告等。服务和主机配置文件中的“check_command         check-host-alive”是什么意思呢?

# pwd
/usr/local/nagios/etc/objects
# cat commands.cfg #发现这么一项:

define command{
      command_name    check-host-alive
      command_line    $USER1$/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 5
      }


看看命令行:$USER1$/check_ping -H $HOSTADDRESS$ -w 3000.0,80% -c 5000.0,100% -p 5
# pwd
/usr/local/nagios/libexec
# ./check_ping -H 192.168.100.85 -w 3000.0,80% -c 5000.0,100% -p 5
PING OK - Packet loss = 0%, RTA = 0.05 ms|rta=0.055000ms;3000.000000;5000.000000;0.000000 pl=0%;80;100;0
# Web界面 Status Information 里面是不是出现了PING OK - Packet loss = 0%, RTA = 0.05 ms这样的东东。
# -h 使用方法都出来了,由此我们可以看出:ping 192.168.100.85,共发送五个包;RTA 达到 3000ms 或丢包率达到 80% 时警告,RTA 达到 5000ms 或丢包率达到 100% 时直接报警。目前 RTA 为 0.055ms,那就很清晰了。

# ./check_ping -h
Use ping to check connection statistics for a remote host.

Usage:check_ping -H <host_address> -w <wrta>,<wpl>% -c <crta>,<cpl>%
[-p packets] [-t timeout] [-4|-6]

Options:
-h, --help
    Print detailed help screen
-V, --version
    Print version information
-4, --use-ipv4
    Use IPv4 connection
-6, --use-ipv6
    Use IPv6 connection
-H, --hostname=HOST
    host to ping
-w, --warning=THRESHOLD
    warning threshold pair
-c, --critical=THRESHOLD
    critical threshold pair
-p, --packets=INTEGER
    number of ICMP ECHO packets to send (Default: 5)
-L, --link
    show HTML in the plugin output (obsoleted by urlize)
-t, --timeout=INTEGER
    Seconds before connection times out (default: 10)



# 到底有多少个这样定义的命令呢?

# cat commands.cfg |grep command_name
      command_name      notify-host-by-email
      command_name      notify-service-by-email
      command_name    check-host-alive
      command_name    check_local_disk
      command_name    check_local_load
      command_name    check_local_procs
      command_name    check_local_users
      command_name      check_local_swap
      command_name      check_local_mrtgtraf
      command_name    check_ftp
      command_name    check_hpjd
      command_name    check_snmp
      command_name    check_http
      command_name      check_ssh
      command_name      check_dhcp
      command_name    check_ping
      command_name    check_pop
      command_name    check_imap
      command_name    check_smtp
      command_name      check_tcp
      command_name      check_udp
      command_name      check_nt
      command_name      process-host-perfdata
      command_name      process-service-perfdata



#挑一条notify-host-by-email,可以清楚的看到发送邮件的过程。

define command{
      command_name    notify-host-by-email
      command_line    /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\n\nDate/Time: $LONGDATETIME$\n" | /bin/mail -s "** $NOTIFICATIONTYPE$ Host Alert: $HOSTNAME$ is $HOSTSTATE$ **" $CONTACTEMAIL$
      }


#打散开来看,是不是可以定制邮件发送格式了?

/usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n
Host: $HOSTNAME$\nState: $HOSTSTATE$\n
Address: $HOSTADDRESS$\n
Info: $HOSTOUTPUT$\n\n
Date/Time: $LONGDATETIME$\n
" | /bin/mail -s "** $NOTIFICATIONTYPE$ Host Alert: $HOSTNAME$ is $HOSTSTATE$ **" $CONTACTEMAIL$



#到手的邮件是这样子的。               

主 题:      ** RECOVERY Host Alert: rsync-89 is UP **   [新窗口打开]
时 间:         2013-03-13 22:57 (星期三)
发件人:      nagios<nagios@phx2-ss-5-lb.cnet.com>   [添加联系人][邮件往来][拒收]
收件人:         我<zwhset@163.com>               
**** Nagios *****
Notification Type: RECOVERY
Host: rsync-89
State: UP
Address: 192.168.100.89
Info: PING OK - Packet loss = 0%, RTA = 0.32 ms
Date/Time: Wed Mar 13 22:57:44 CST 2013



#那我们加一个监控服务,看看全程如何工作的。查看端口22是否保持连接。check_tcp,我们先看看用法。

# ./check_tcp -h
Usage:check_tcp -H host -p port [-w <warning time>] [-c <critical time>] [-s <send string>]
[-e <expect string>] [-q <quit string>][-m <maximum bytes>] [-d <delay>]
[-t <timeout seconds>] [-r <refuse state>] [-M <mismatch state>] [-v] [-4|-6] [-j]
[-D <days to cert expiry>] [-S <use SSL>] [-E]

Options:
-h, --help
    Print detailed help screen
-V, --version
    Print version information
-H, --hostname=ADDRESS
    Host name, IP Address, or unix socket (must be an absolute path)
-p, --port=INTEGER
    Port number (default: none)
-4, --use-ipv4
    Use IPv4 connection
-6, --use-ipv6
    Use IPv6 connection
-E, --escape
    Can use \n, \r, \t or \ in send or quit string. Must come before send or quit option
    Default: nothing added to send, \r\n added to end of quit
-s, --send=STRING
    String to send to the server
-e, --expect=STRING
    String to expect in server response (may be repeated)
-A, --all
    All expect strings need to occur in server response. Default is any
-q, --quit=STRING
    String to send server to initiate a clean close of the connection
-r, --refuse=ok|warn|crit
    Accept TCP refusals with states ok, warn, crit (default: crit)
-M, --mismatch=ok|warn|crit
    Accept expected string mismatches with states ok, warn, crit (default: warn)
-j, --jail
    Hide output from TCP socket
-m, --maxbytes=INTEGER
    Close connection once more than this number of bytes are received
-d, --delay=INTEGER
    Seconds to wait between sending string and polling for response
-w, --warning=DOUBLE
    Response time to result in warning status (seconds)
-c, --critical=DOUBLE
    Response time to result in critical status (seconds)
-t, --timeout=INTEGER
    Seconds before connection times out (default: 10)
-v, --verbose
    Show details for command-line debugging (Nagios may truncate output)



#这似乎是具体方法,让我们看看command里定义的。

define command{
      command_name    check_tcp
      command_line    $USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$ $ARG2$
      }


#对照上表-H 主机地址,-p 端口 接受参数1 2
#一开始我找不到定义的文件在哪,$USER1$是路径,也就是/usr/local/nagios/libexec,后面三个($HOSTADDRESS$、$ARG1$、$ARG2$)也一样是宏。那么在服务里可构造 check_tcp!22,命令名与$ARG1$、$ARG2$之间用!号隔开。$USER1$的定义在文件:
# cat /usr/local/nagios/etc/resource.cfg |grep USER1

# Nagios supports up to 32 $USERx$ macros ($USER1$ through $USER32$)
# Sets $USER1$ to be the path to the plugins
$USER1$=/usr/local/nagios/libexec



我们添加服务

# vim services.cfg
define service {
      host_name      rsync-89
      service_description   check_tcp 80
      check_period          24x7
      max_check_attempts    4
      normal_check_interval 3
      retry_check_interval  2
      contact_groups      ktm
      notification_interval   10
      notification_period   24x7
      notification_options    w,u,c,r
      check_command         check_tcp!80
      }


#验证下配置并平滑重启程序。
# /usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg
# kill -Hup 8670
#运行成功,那我们想自由自在的构造一些检测,如何实现?做个实验.
#check_ping 主要输出这些:PING OK - Packet loss = 0%, RTA = 0.06 ms|rta=0.061000ms;3000.000000;5000.000000;0.000000 pl=0%;80;90;0

# cat /test/passwd |wc -l
25


#假设有用户则显示用户数,无用户为空则报警。如何设计?
# touch /test/passwda
# cat check_user

#check_user_nagioscwtea#blog: cwtea.blog.
cu=`cat /test/passwda |wc -l`

if [ $cu -ne 0 ]; then
      echo "User OK - User is running (UserNumber: ${cu})"
      else
      echo "User CRITICAL,"User is none""
fi


# ./check_user

User CRITICAL,User is none


# ./check_user

User OK - User is running (UserNumber: 25)

#添加一个定义check_user
# vim commands.cfg

#check user
define command{
      command_name    check_user
      command_line    $USER1$/check_user
      }


#添加一项服务
# vim services.cfg               

define service {
      host_name      rsync-89
      service_description   check_user
      check_period          24x7
      max_check_attempts    4
      normal_check_interval 3
      retry_check_interval  2
      contact_groups      ktm
      notification_interval   10
      notification_period   24x7
      notification_options    w,u,c,r
      check_command         check_user
      }


#kill -Hup 23377
#web界面看看,已经出现了。      

#我们把文件弄成空的。
# rm -rf /test/passwd
# touch /test/passwd
#状态显示是OK的,但是信息栏已经出现了我们想要的。
check_user      OK         03-14-2013 00:20:29         0d 0h 3m 25s         1/4         User CRITICAL,User is none
#Nagios是根据插件的退出状态码(0=OK,1=WARNING,2=CRITICAL,3=UNKNOWN)而不是输出的文字来判断状态的,脚本默认以0退出,所以状态仍是OK。我们加个返回状态码 exit 2
# cat check_user

#check_user_nagioscwtea#blog: cwtea.blog.
cu=`cat /test/passwd |wc -l`

if [ $cu -ne 0 ]; then
      echo "User OK - User is running (UserNumber: ${cu})"
      else
      echo "User CRITICAL,"User is none""
      exit 2
fi


#过会儿,再看已经down掉了。

#邮件报警等了好久才来

主 题:         ** PROBLEM Service Alert: rsync 89/check_user is CRITICAL **   [新窗口打开]
时 间:         2013-03-14 00:32 (星期四)
发件人:         nagios<nagios@phx2-ss-5-lb.cnet.com>   [添加联系人][邮件往来][拒收]
收件人:         我<zwhset@163.com>
***** Nagios *****
Notification Type: PROBLEM
Service: check_user
Host: rsync 89
Address: 192.168.100.89
State: CRITICAL
Date/Time: Thu Mar 14 00:32:39 CST 2013
Additional Info:
User CRITICAL,User is none





sonyet 发表于 2013-3-17 18:19:05

生我之前谁是我,生我之后我是谁?

linghaiyan 发表于 2013-5-16 09:31:06

俺从不写措字,但俺写通假字!

心心失意 发表于 2013-5-17 13:20:33

支持一下:lol

inushome 发表于 2013-5-18 17:24:09

走自己的路,让别人打车去吧。

olga 发表于 2013-5-19 21:56:55

昨天,系花对我笑了一下,乐得我晚上直数羊,一只羊,两只羊,三只羊……

我很黑! 发表于 2013-5-21 00:48:59

如果有一双眼睛陪我一同哭泣,就值得我为生命受苦。
页: [1]
查看完整版本: Nagios检测一些记序