Nagios监控SSD健康的脚本
生产环境中使用了SSD磁盘,使用smartctl -a /dev/sdb可以查看该磁盘的各项SMART属性值。当某项属性的Value或Worst等于或小于Thresh时,就需要注意了。下面上脚本:[*]#!/usr/bin/perl
[*]
[*]=head
[*]check SSD information
[*]
[*]usage: add "nagios ALL=(root) NOPASSWD: /usr/sbin/smartctl" to /etc/sudoers file;
[*]=cut
[*]
[*]use strict;
[*]use warnings;
[*]use Data::Dumper;
[*]use Getopt::Long;
[*]
# Script-wide state.
#   $result     - return value of GetOptions (false when the command line
#                 could not be parsed)
#   $device     - device name below /dev to inspect (--device, default 'sdb')
#   $h          - hash ref that collects everything gathered per device
#   $debug      - true when --debug was given, otherwise 0
#   $start_time - epoch seconds when the script started
#   $use_time   - slot for the elapsed run time, filled in before reporting
my ( $result, $device, $h, $debug );
my ( $start_time, $use_time ) = ( time, 0.00 );

# SMART attribute IDs this plugin evaluates, mapped to the label printed in
# the plugin output.  Attributes with any other ID are skipped.
my %ssd_attribute = (
    5   => "Reallocated_Sector_Ct",
    184 => "End_to_End_Error_Detection_Count",
    225 => "Raw_Read_Error_Rate",
    232 => "Available_Reserver_Space",
    233 => "Media_Wearout_Indicator",
    9   => "Power_On_Hours",
);

$result = GetOptions(
    "device=s" => \$device,
    "debug"    => \$debug,
);

# A mistyped option must not silently fall back to the default device and
# report on the wrong disk.  GetOptions has already written the details to
# STDERR; report UNKNOWN (exit code 3) to the monitoring system and stop.
if ( !$result ) {
    print "UNKNOWN - invalid command line options, usage: $0 [--device NAME] [--debug]\n";
    exit(3);
}

$device ||= 'sdb';
$debug  ||= 0;
[*]
# Run "smartctl -a" on the requested device and parse the report into
# $h->{$device}.  Keys written here:
#   output, perfdata, status - initialised to empty/0 for the reporting code
#   total_info               - raw smartctl output ("" if it was not run)
#   match                    - 1 when the report layout was recognised, else 0
# and, only when match is 1:
#   info_section, smart_test_result, healt_result, smart_error_log_version
#   healt->{ID}->{...}       - one entry per parsed attribute table row
$h->{$device}->{output}   = "";
$h->{$device}->{perfdata} = "";
$h->{$device}->{status}   = 0;

# $device comes from the command line and ends up inside a shell command, so
# smartctl is only started when the name looks like a plain device path:
# letters, digits, "_", ".", ":", "-" and "/", with no ".." in it.  Any other
# value leaves total_info empty, which the parse step below records as
# match = 0.
if ( $device =~ m{\A[A-Za-z0-9_][A-Za-z0-9_.:/-]*\z} && $device !~ m{\.\.} ) {
    my $report = `sudo /usr/sbin/smartctl -a /dev/$device 2>&1`;

    # Backticks yield undef when the command could not be started.
    $h->{$device}->{total_info} = defined $report ? $report : "";
}
else {
    print "invalid device name '$device', smartctl not executed\n" if $debug;
    $h->{$device}->{total_info} = "";
}

# Split the report into: information section, overall self-assessment result,
# vendor attribute table and error log version.  The captures are copied into
# lexicals immediately so that later pattern matches cannot disturb them.
if ( my @report_parts = $h->{$device}->{total_info} =~ m{===\s+START\s+OF\s+INFORMATION\s+SECTION\s+===(.*)===\s+START\s+OF\s+READ\s+SMART\s+DATA\s+SECTION\s+===\s+SMART\s+overall-health\s+self-assessment\s+test\s+result:\s+(\w+)[\d\D]+Vendor\s+Specific\s+SMART\s+Attributes\s+with\s+Thresholds([\d\D]+)SMART\s+Error\s+Log\s+Version:\s+(\d+)}is ) {
    my ( $info_section, $test_result, $attribute_table, $error_log_version ) = @report_parts;

    $h->{$device}->{info_section}      = $info_section;
    $h->{$device}->{smart_test_result} = $test_result;
    $h->{$device}->{healt_result}      = $attribute_table;

    print "************************************** get $device healt info sta **************************************\n" if $debug;
    foreach my $line ( split /\n/, $attribute_table ) {

        # An attribute row is a numeric ID followed by nine further
        # whitespace-separated columns; lines that do not fit are ignored.
        my @fields = $line =~ m{(\d+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)}i;
        next unless @fields;

        my ( $id, $name, $flag, $value, $worst, $thresh, $type, $updated, $when_failed, $raw_value ) = @fields;
        print "ID:$id\tATTRIBUTE_NAME:$name\tFLAG:$flag\tVALUE:$value\tWORST:$worst\tTHRESH:$thresh\tTYPE:$type\tUPDATED:$updated\tWHEN_FAILED:$when_failed\tRAW_VALUE:$raw_value\n" if $debug;

        my $attribute = $h->{$device}->{healt}->{$id} ||= {};
        $attribute->{id}             = $id;
        $attribute->{attribute_name} = $name;
        $attribute->{flag}           = $flag;
        $attribute->{value}          = $value;
        $attribute->{worst}          = $worst;
        $attribute->{thresh}         = $thresh;
        $attribute->{type}           = $type;
        $attribute->{updated}        = $updated;
        $attribute->{when_failed}    = $when_failed;
        $attribute->{raw_value}      = $raw_value;
    }
    print "************************************** get $device healt info end **************************************\n" if $debug;

    $h->{$device}->{smart_error_log_version} = $error_log_version;
    $h->{$device}->{match}                   = 1;
}
else {
    $h->{$device}->{match} = 0;
}
[*]
# Debug trace, printed only when $debug is true: show the smartctl command
# line and dump the complete $h structure.  The backticks in the first
# message sit inside a double-quoted string, so the command is only printed
# here, not executed again.
if ($debug) {
    print "runging..... `sudo /usr/sbin/smartctl -a /dev/$device 2>&1`\n";
    print "\n\n-------------------------------- Dumper \$h sta --------------------------------\n";
    print Dumper($h);
    print "-------------------------------- Dumper \$h end --------------------------------\n\n";
}
[*]
[*]$use_time = sprintf("%0.2f",time - $start_time);
[*]if ( (exists $h->{$device}->{match} && $h->{$device}->{match} == 0) ) {
[*] print "CRITICAL - smartctl get $device total info fail|status=1 time=$use_time\n";
[*] exit (2);
[*]} elsif ( ! exists $h->{$device}->{healt} ) {
[*] print "WARNING - smartctl get $device healt info fail|status=1 time=$use_time\n";
[*] exit (1);
[*]} else {
[*] while ( (my ($id,$id_hash)) = (each %{$h->{$device}->{healt}} ) ) {
[*] if ( !exists $ssd_attribute{$id}) {
[*] print "not exists \$ssd_attribute{\$id},now next\n" if $debug;
[*] next;
[*] }
[*] print "----------------------------- loop \$h->{\$device}->{healt} hash -----------------------------\n" if $debug;
[*] print $h->{$device}->{healt}->{$id}->{worst} . "\t" if $debug;
[*] print $h->{$device}->{healt}->{$id}->{value} . "\t" if $debug;
[*] print $h->{$device}->{healt}->{$id}->{thresh} . "\n" if $debug;
[*] if ($h->{$device}->{healt}->{$id}->{value} {$device}->{healt}->{$id}->{thresh}){
[*] $h->{$device}->{output} .= "CRITICAL - $device " if ($h->{$device}->{output} eq "");
[*] $h->{$device}->{output} .= "id:$id attribute_name:" . $ssd_attribute{$id} . " value:" . $h->{$device}->{healt}->{$id}->{value} . " ";
[*] $h->{$device}->{status} = 2;
[*] print 'value
页:
[1]