Nagios短信分组报警
Nagios监控对于服务器运维来讲是非常好的工具,可以监控各种操作系统的服务器(windows、linux、aix等等),还可以对路由器、交换机、打印机等网络设备进行监控,再结合邮件、短信、MSN等报警方式,为系统管理人员节省了许多巡检时间,而且成效显著!在我的运维环境中,随着监控客户端的增多,报警的频率也随之提高,没日没夜地收短信,最终不得不把手机短信调成震动模式。在收到的报警短信中,大部分是windows客户端异常的短信,相对而言,在服务器领域linux系统比windows系统稳定可靠得多。于是想到取消集中报警的方式,将windows报警发送到windows sa的手机上,linux报警发到自己的手机上,也就是分组短信报警。下面就来介绍下配置过程!
一:编辑commands.cfg文件,添加fetion发送短信命令
[*]# vi /usr/local/nagios/etc/objects/commands.cfg
[*]define command{
[*]command_name notify-service-by-linux
[*]command_line /usr/local/fx/fetion --mobile=1383838438 --pwd=123 --to=1383838438 --msg-utf8="主机:IP地址$HOSTADDRESS$,服务器描述: $HOSTALIAS$/$SERVICEDESC$ 目前状态:$SERVICESTATE$ 信息摘要: $SERVICEOUTPUT$" --msg-type=1
[*]}
[*]
[*]define command{
[*]command_name notify-service-by-windows
[*]command_line /usr/local/fx/fetion --mobile=1383838438 --pwd=123 --to=1333333333 --msg-utf8="主机:IP地址$HOSTADDRESS$,服务器描述: $HOSTALIAS$/$SERVICEDESC$ 目前状态:$SERVICESTATE$ 信息摘要: $SERVICEOUTPUT$" --msg-type=1
[*]}
[*]
[*]define command{
[*]command_name notify-host-by-sms
[*]command_line /bin/echo null
[*]}
二:编辑contacts.cfg文件,分别定义linux服务器的告警信息通过notify-service-by-linux发送,windows服务器的告警信息通过notify-service-by-windows发送。在本例中因为所有的服务器均禁止ping,所以所有的主机告警信息都不发送(notify-host-by-sms命令只执行echo null)!
[*]# grep -v '^#' /usr/local/nagios/etc/objects/contacts.cfg
[*]define contact{
[*] contact_name linux
[*] use linux-contact
[*] alias linux
[*] service_notification_period 24x7
[*] host_notification_period 24x7
[*] service_notification_options w,u,c,r,f,s
[*] host_notification_options d,u,r,f,s
[*] service_notification_commands notify-service-by-linux
[*] host_notification_commands notify-host-by-sms
[*] }
[*]
[*]define contact{
[*] contact_name windows
[*] use windows-contact
[*] alias windows
[*] service_notification_period 24x7
[*] host_notification_period 24x7
[*] service_notification_options w,u,c,r,f,s
[*] host_notification_options d,u,r,f,s
[*] service_notification_commands notify-service-by-windows
[*] host_notification_commands notify-host-by-sms
[*] }
[*]
[*]define contactgroup{
[*] contactgroup_name linux
[*] alias linux
[*] members linux
[*] }
[*]
[*]define contactgroup{
[*] contactgroup_name windows
[*] alias windows
[*] members windows
[*] }
三:编辑templates.cfg文件,在模板文件中分别定义linux服务器和windows服务器的各监控属性和通知类型(最重要的参数是contact_groups,它决定告警发给哪个联系人组),最后在各监控客户端的配置文件中引用这些模板
[*]# vi /usr/local/nagios/etc/objects/templates.cfg
[*]define contact{
[*] name linux-contact
[*] service_notification_period 24x7
[*] host_notification_period 24x7
[*] service_notification_options w,u,c,r,f,s
[*] host_notification_options d,u,r,f,s
[*] service_notification_commands notify-service-by-linux
[*] host_notification_commands notify-host-by-sms
[*] register 0
[*] }
[*]
[*]define contact{
[*] name windows-contact
[*] service_notification_period 24x7
[*] host_notification_period 24x7
[*] service_notification_options w,u,c,r,f,s
[*] host_notification_options d,u,r,f,s
[*] service_notification_commands notify-service-by-windows
[*] host_notification_commands notify-host-by-sms
[*] register 0
[*] }
[*]
[*]define host{
[*] name generic-host
[*] notifications_enabled 1
[*] event_handler_enabled 1
[*] flap_detection_enabled 1
[*] failure_prediction_enabled 1
[*] process_perf_data 1
[*] retain_status_information 1
[*] retain_nonstatus_information 1
[*] notification_period 24x7
[*] register 0
[*] }
[*]
[*]define host{
[*] name linux-server
[*] use generic-host
[*] check_period 24x7
[*] check_interval 5
[*] retry_interval 1
[*] max_check_attempts 10
[*] check_command check-host-alive
[*] notification_period 24x7
[*] notification_interval 120
[*] notification_options d,u,r
[*] contact_groups linux
[*] register 0
[*] }
[*]
[*]define host{
[*] name windows-server
[*] use generic-host
[*] check_period 24x7
[*] check_interval 5
[*] retry_interval 1
[*] max_check_attempts 10
[*] check_command check-host-alive
[*] #notification_period 24x7
[*] notification_interval 30
[*] notification_options d,u,r
[*] contact_groups windows
[*] register 0
[*] }
[*]
[*]define service{
[*] name linux-service
[*] active_checks_enabled 1
[*] passive_checks_enabled 1
[*] parallelize_check 1
[*] obsess_over_service 1
[*] check_freshness 0
[*] notifications_enabled 1
[*] event_handler_enabled 1
[*] flap_detection_enabled 1
[*] failure_prediction_enabled 1
[*] process_perf_data 1
[*] action_url /nagios/pnp/index.php?host=$HOSTNAME$&srv=$SERVICEDESC$
[*] retain_status_information 1
[*] retain_nonstatus_information 1
[*] is_volatile 0
[*] check_period 24x7
[*] max_check_attempts 3
[*] normal_check_interval 1
[*] retry_check_interval 2
[*] contact_groups linux
[*] notification_options w,u,c,r
[*] notification_interval 1440
[*] notification_period 24x7
[*] register 0
[*] }
[*]
[*]define service{
[*] name windows-service
[*] active_checks_enabled 1
[*] passive_checks_enabled 1
[*] parallelize_check 1
[*] obsess_over_service 1
[*] check_freshness 0
[*] notifications_enabled 1
[*] event_handler_enabled 1
[*] flap_detection_enabled 1
[*] failure_prediction_enabled 1
[*] process_perf_data 1
[*] action_url /nagios/pnp/index.php?host=$HOSTNAME$&srv=$SERVICEDESC$
[*] retain_status_information 1
[*] retain_nonstatus_information 1
[*] is_volatile 0
[*] check_period 24x7
[*] max_check_attempts 3
[*] normal_check_interval 1
[*] retry_check_interval 2
[*] contact_groups windows
[*] notification_options w,u,c,r
[*] notification_interval 1440
[*] notification_period 24x7
[*] register 0
[*] }
四:linux客户端文件,host调用linux-server模板,service调用linux-service
[*]# grep -v '^#' /usr/local/nagios/etc/objects/client/10.0.1.11.cfg
[*]define host{
[*] use linux-server ; Name of host template to use
[*] ; This host definition will inherit all variables that are defined
[*] ; in (or inherited by) the linux-server host template definition.
[*] host_name 10.0.1.11
[*] alias 10.0.1.11
[*] address 10.0.1.11
[*] }
[*]
[*]define service{
[*] use linux-service ; Name of service template to use
[*] host_name 10.0.1.11
[*] service_description CPU Load
[*] check_command check_nrpe!check_load
[*] }
五:windows客户端文件,host调用windows-server模板,service调用windows-service
[*]# grep -v '^#' /usr/local/nagios/etc/objects/client/10.0.1.13.cfg
[*]define host{
[*] use windows-server ; Inherit default values from a template
[*] host_name 10.0.1.13 ; The name we're giving to this host
[*] alias 10.0.1.13 ; A longer name associated with the host
[*] address 10.0.1.13 ; IP address of the host
[*] }
[*]
[*]define service{
[*] use windows-service
[*] host_name 10.0.1.13
[*] service_description NSClient++ Version
[*] check_command check_nt!CLIENTVERSION
[*] notifications_enabled 1
[*] }
六:重新加载nagios配置(reload),并验证结果
[*]# service nagios reload
[*]Running configuration check...done.
[*]Reloading nagios configuration...done
http://blog.运维网.com/attachment/201303/135337967.jpg
http://blog.运维网.com/attachment/201303/135401915.jpg
后记:服务器报警信息固然重要,但如果报警信息泛滥,而应用或者资源又无法及时调整到位,运维人员难免会对报警短信产生厌烦情绪,进而可能忽略重要的报警信息,对服务器运维造成负面的影响,也会让领导认为监控系统可有可无。其实还是可以从技术角度上解决一些问题的,例如工作时间把报警发送到QQ邮箱、MSN上,非工作时间把报警短信发送到运维人员手机或者值班手机上,就是一种不错的办法。但监控只是一种手段,服务器问题还是早预见早解决为妙!
页:
[1]