#!/usr/local/bin/perl
# NASpareCheck - NetApp Failure Reporting and Paging Tool
# Version 2.04: Dec 12th, 2002
# (c) Ben Rockwood - benr@cuddletech.com
# cuddletech - use unix or die(c).

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# A re-write of NACHECKFILERS in NetSNMP v5 PERL modules

## CHANGELOG:
# v2.00 -> Rewrite to use SNMP module
# v2.01 -> Returns most of the old 1.x functionality
# v2.02 -> Implements "page once" functions
# v2.03 -> Add multi host functionality and cleanup
# v2.04 -> Added fping test and added reporting output - various cleanups as well

## USAGE:
#   NASpareCheck -page   <filer1> <filer2> ...   check each filer, page on failure
#   NASpareCheck -report <filer1> <filer2> ...   print a status report per filer

use strict;
use warnings;

use Fcntl qw(O_WRONLY O_CREAT O_EXCL);
use POSIX ();
use SNMP '5.0.2.pre1';

our $VERSION = "v2.04";

### Configuration Section:
my $COMMUNITY  = "public";
my $MIN_SPARES = 2;                         # page when fewer spares than this remain
my $SENDMAIL   = "/usr/lib/sendmail";
my $ADMIN_ADDR = '1234@skytel.com,myaddress@cuddletech.com';
my $FPING      = "/usr/local/sbin/fping";   # reachability probe
my $STATUS_DIR = "/tmp";                    # where the "already paged" markers live

### Net-SNMP Setup #######################
#$ENV{'MIBFILES'}="NETWORK-APPLIANCE-MIB";  #<--- This uses the file of that name in the pwd
$ENV{'MIBS'} = "ALL";                       #<-- Magic Key for using system MIBS /usr/local/share/snmp
$SNMP::verbose          = 1;                #Debugging
$SNMP::use_enums        = 1;
$SNMP::use_sprint_value = 1;
##########################################

### Shared state ##########################
# The subroutines take no arguments; they communicate through these
# file-scoped variables, exactly as the original package globals did.
my $TARGET_HOST;    # filer currently being processed
my $MESSAGE;        # text of the page/mail about to be sent
my $STATUS_FILE;    # marker file name (relative to $STATUS_DIR) for the current alert
my $SESSION;        # SNMP::Session for $TARGET_HOST
my @DISK_STATUS;    # 0 total, 1 active, 2 reconstructing, 3 reconstructing parity,
                    # 4 verifying parity, 5 failed, 6 spare, 7 failed-disk message
my @FRU_STATUS;     # 0 failed fans, 1 failed power supplies
##########################################

$| = 1;             # keep our output ordered around the fping/sendmail children

################################
## Usage information on no-args.
my ($MODE, @HOSTS) = @ARGV;

unless (defined $MODE) {
    die(__usage() . "\nNA-SpareCheck $VERSION\n(c)2002 Ben Rockwood [benr\@cuddletech.com]\n");
}
################################

########################################
## Parse args and redirect to subroutine
if ($MODE ne "-page" && $MODE ne "-report") {
    die(__usage());
}

# A mode without any filer used to do nothing silently; say so instead.
die(__usage()) unless @HOSTS;

if ($MODE eq "-page") {
    for my $host (@HOSTS) {
        $TARGET_HOST = $host;
        print("DEBUG: checking $TARGET_HOST\n");

        if (__host_is_alive($TARGET_HOST)) {
            __check_filer();
        }
        else {
            # Unlike component failures, this is sent on every run.
            $MESSAGE = "$TARGET_HOST is DOWN. Host unreachable.\n";
            __send_mail();
        }
    }
}
else {
    for my $host (@HOSTS) {
        $TARGET_HOST = $host;
        __gen_filer_report();
    }
}
########################################

########################################
## SUBROUTINE: __usage
## Purpose: Build the one-line usage string shared by all usage errors.
sub __usage {
    return "Usage: $0 [-page | -report] <filer1> <filer2> ...\n";
}
########################################

########################################
## SUBROUTINE: __host_is_alive
## Purpose: Probe a host with fping and report whether it answered.
## Returns: 1 if fping exited 0 (host reachable), 0 otherwise (unreachable,
##          unresolvable, or fping could not be run).
## fping is run without a shell, so a host name taken from the command line
## cannot inject shell syntax; its exit status is used instead of testing
## whether it printed anything.
sub __host_is_alive {
    my ($host) = @_;

    my $pid = fork();
    unless (defined $pid) {
        warn("Unable to fork for $FPING: $!\n");
        return 0;
    }

    if ($pid == 0) {
        # Child: discard fping's chatter, then become fping.
        open(STDOUT, '>', '/dev/null') or POSIX::_exit(127);
        open(STDERR, '>', '/dev/null') or POSIX::_exit(127);
        {
            no warnings 'exec';
            exec { $FPING } $FPING, $host;
        }
        POSIX::_exit(127);    # exec failed
    }

    waitpid($pid, 0);
    return $? == 0 ? 1 : 0;
}
########################################

########################################
## SUBROUTINE: __status_file_name
## Purpose: Build the marker file name for $TARGET_HOST.
## Args:    optional component tag (e.g. "fan"); without it the name is the
##          historical disk marker "NACHECKFILERS.<host>".
## Characters that are not safe in a file name are replaced with "_" so a
## host argument can never point the marker outside $STATUS_DIR.
sub __status_file_name {
    my ($tag) = @_;

    (my $safe_host = $TARGET_HOST) =~ s/[^A-Za-z0-9._-]/_/g;

    my $name = "NACHECKFILERS.$safe_host";
    $name .= ".$tag" if defined $tag;
    return $name;
}
########################################

########################################
## SUBROUTINE: __check_filer
## Purpose: Grab status on FRUs from the Filer, fill @DISK_STATUS and
##          @FRU_STATUS, then hand over to __parse_errors.
## Returns: 1 when the disk counters were retrieved, 0 when the SNMP session
##          or query failed (a page-once alert is raised in that case).
sub __check_filer {
    # Never let a previous host's numbers leak into this one.
    @DISK_STATUS = ();
    @FRU_STATUS  = ();
    $STATUS_FILE = __status_file_name();

    # Open a session to the host
    $SESSION = SNMP::Session->new(
        DestHost  => $TARGET_HOST,
        Community => $COMMUNITY,
        Version   => 1,
    );
    unless (defined $SESSION) {
        $MESSAGE = "$TARGET_HOST is UP but the SNMP session could not be opened.\n";
        __gen_message();
        return 0;
    }

    # Get a VarList of Disk related failure counters
    my $disk_vlist = SNMP::VarList->new(
        ['diskTotalCount'],                   #0
        ['diskActiveCount'],                  #1
        ['diskReconstructingCount'],          #2
        ['diskReconstructingParityCount'],    #3
        ['diskVerifyingParityCount'],         #4
        ['diskFailedCount'],                  #5
        ['diskSpareCount'],                   #6
        ['diskFailedMessage'],                #7
    );

    # Pass the VarList to getnext building an array of the output
    @DISK_STATUS = $SESSION->getnext($disk_vlist);

    my $fru_vlist = SNMP::VarList->new(
        ['envFailedFanCount'],
        ['envFailedPowerSupplyCount'],
    );
    @FRU_STATUS = $SESSION->getnext($fru_vlist);

    # The disk tests below are meaningless without these four counters.
    if (grep { !defined $DISK_STATUS[$_] } 0, 1, 5, 6) {
        $MESSAGE = "$TARGET_HOST is UP but did not answer the SNMP disk query.\n";
        __gen_message();
        return 0;
    }

    #### DEBUGGING --> To test failure detection.
    #$DISK_STATUS[6] = "1";

    # Strip quotes off of the diskFailedMessage
    $DISK_STATUS[7] = '' unless defined $DISK_STATUS[7];
    $DISK_STATUS[7] =~ s/\"//g;

    # Now pass that information along (via the shared arrays) to __parse_errors
    __parse_errors();
    return 1;
}
##############################################

#############################################
## SUBROUTINE: __parse_errors
## Purpose: To detect failures from SNMP info and report accordingly.
##          Also, to make sure that failed components only page once, not
##          every time the script is run.
## The two arrays are: @FRU_STATUS (ps, fans, etc) and @DISK_STATUS (disks,
## reconstructs, etc).
sub __parse_errors {
    my ($total, $active, $failed, $spare) = @DISK_STATUS[0, 1, 5, 6];
    my $failed_msg = defined $DISK_STATUS[7] ? $DISK_STATUS[7] : '';

    $STATUS_FILE = __status_file_name();

    if ($active + $spare != $total) {
        ## if ActiveCount + SpareCount != TotalCount
        ## This is the Stealth Failure we sometimes see
        $MESSAGE = "$TARGET_HOST has $spare of $MIN_SPARES spares left- $failed_msg -1: $active $spare $total\n";
        __gen_message();
    }
    elsif ($active + $spare - $failed != $total) {
        ## if ActiveCount + SpareCount - FailedCount != TotalCount
        ## This is the CORRECT way for a disk to fail.
        ## (Reached only when Active + Spare == Total, i.e. FailedCount != 0.)
        $MESSAGE = "$TARGET_HOST has $spare of $MIN_SPARES spares left-2\n";
        __gen_message();
    }
    elsif ($spare < $MIN_SPARES) {
        ### If the standard checks don't catch it, do a brute force look.
        ### Only FEWER spares than the minimum is a problem: the old "!="
        ### test paged filers with extra spares and then never cleared
        ### their marker, hiding every later real failure.
        $MESSAGE = "$TARGET_HOST has $spare of $MIN_SPARES spares left-3\n";
        __gen_message();
    }
    else {
        ### If it passes all the above filters, it's fine... so unlink the file.
        if (-e "$STATUS_DIR/$STATUS_FILE") {
            unlink("$STATUS_DIR/$STATUS_FILE") or die("Couldn't unlink $STATUS_FILE!\n");
            $MESSAGE = "$TARGET_HOST is now fine. $spare spares avalible.\n";
            __send_mail();
        }
    }

    # Fans and power supplies get their own markers.  Sharing the disk marker
    # made a healthy-disk run delete it and the FRU check recreate it, so a
    # single failed fan paged "now fine" + "failed Fan" on every run.
    __track_fru($FRU_STATUS[0], "fan", "$TARGET_HOST has %s failed Fan\n", 0);
    __track_fru($FRU_STATUS[1], "psu", "$TARGET_HOST has %s failed Power Supply\n", 1);

    return;
}
#############################################

#############################################
## SUBROUTINE: __track_fru
## Purpose: Page once for a failed FRU class and re-arm when it recovers.
## Args:    $count    - failed-unit counter from SNMP (may be undef)
##          $tag      - marker file suffix for this FRU class
##          $format   - message format; %s is replaced with $count
##          $announce - true to also print the power supply status line
## An undefined counter leaves the marker untouched (we know nothing).
sub __track_fru {
    my ($count, $tag, $format, $announce) = @_;

    return unless defined $count;

    $STATUS_FILE = __status_file_name($tag);

    if ($count != 0) {
        $MESSAGE = sprintf($format, $count);
        print("Power Supply status is: $count\n") if $announce;
        __gen_message();
    }
    elsif (-e "$STATUS_DIR/$STATUS_FILE") {
        # Recovered: silently drop the marker so the next failure pages again.
        unlink("$STATUS_DIR/$STATUS_FILE")
            or warn("Couldn't unlink $STATUS_FILE: $!\n");
    }

    return;
}
#############################################

#############################################
## SUBROUTINE: __gen_message
## Purpose: Mark a holder file in $STATUS_DIR that will keep us from getting
##          repetitive pages, then send $MESSAGE if the marker is new.
## Reads $STATUS_FILE and $MESSAGE.  Dies if the marker cannot be created for
## any reason other than "it already exists".
sub __gen_message {
    my $path = "$STATUS_DIR/$STATUS_FILE";
    my $marker;

    # O_EXCL makes "does it exist?" and "create it" one atomic step and
    # refuses to write through a pre-planted symlink.
    unless (sysopen($marker, $path, O_WRONLY | O_CREAT | O_EXCL, 0644)) {
        if ($!{EEXIST}) {
            print("DEBUGGING: File $STATUS_FILE exsists - Not sending page!\n");
            return;
        }
        die("Unable to open file $STATUS_FILE: $!\n");
    }

    print {$marker} $MESSAGE;
    close($marker) or warn("Unable to close file $STATUS_FILE: $!\n");

    print("DEBUGGING: I've created the file $STATUS_FILE. Passing message: $MESSAGE\n");
    __send_mail();
    return;
}
#############################################

#############################################
## SUBROUTINE: __send_mail
## Purpose: Pipe $MESSAGE to sendmail for every address in $ADMIN_ADDR.
## NOTE: To send a message it must have a line return, and not contain any
##       ":"s, so colons are rewritten to "." first.
## Dies if sendmail cannot be started.
sub __send_mail {
    #print("DEBUGGING: I am sending the message: $MESSAGE");   #DEBUGGING

    unless (defined $MESSAGE && $MESSAGE) {
        print("Error: You are try to send a message without any text!\n");
        return;
    }

    $MESSAGE =~ s/:/\./g;

    # One argument per recipient, no shell involved.
    my @recipients = grep { length } split(/[\s,]+/, $ADMIN_ADDR);

    open(my $mail, '|-', $SENDMAIL, @recipients)
        or die "Cannot open $SENDMAIL: $!";
    print {$mail} $MESSAGE;
    close($mail) or warn("$SENDMAIL did not finish cleanly (status $?)\n");
    return;
}
##############################################

##############################################
## SUBROUTINE: __gen_filer_report
## Purpose: Print a human readable status block for $TARGET_HOST.
## NOTE: this goes through __check_filer, so the usual page-once alerts are
##       still raised (or cleared) while the report is generated.
sub __gen_filer_report {
    if (__host_is_alive($TARGET_HOST)) {
        print("Filer $TARGET_HOST is UP.\n");

        if (__check_filer()) {
            # Show missing optional counters as empty rather than warning.
            my @disk = map { defined $_ ? $_ : '' } @DISK_STATUS[0 .. 7];
            my @fru  = map { defined $_ ? $_ : '' } @FRU_STATUS[0 .. 1];

            print("Total Disks: $disk[0] | Active: $disk[1] | Reconstructing: $disk[2]\n");
            print("Spare Disks: $disk[6] | Failed: $disk[5] -> $disk[7]\n");
            print("Failed Fans: $fru[0] | Failed Power Supplies: $fru[1]\n");
        }
        else {
            print("SNMP query of $TARGET_HOST failed - no counters available.\n");
        }

        print("\n----------------------------------\n");
    }
    else {
        print("Filer $TARGET_HOST is DOWN!\n");
        print("\n----------------------------------\n");
    }
    return;
}

# ASCII-art banner from the original file (its line layout was lost):
#### # # ##### ##### # ###### ##### ###### #### # # # # # # # # # # # # # # # # # # # # # # # # # # ##### # ##### # ###### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #### #### ##### ##### ###### ###### # ###### #### # #