#!/usr/bin/perl
use strict;
use warnings;
# Copyright (C) 2008-2012 Glen Pitt-Pladdy
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
#
# See: https://www.pitt-pladdy.com/blog/_20110601-152728%2B0100%20Pinger%20improved%20%28with%20Cacti%29/
#
# version 20140830

# set command / arguments
$0 = __FILE__.' '.join(' ',@ARGV);

use POSIX qw(strftime :sys_wait_h);
use threads;

# Use fping to ping a large number of nodes, logging the data to a .csv file

my $PIDFILE = '/var/run/fping_logger.pid'; # path to the PID file
my $DAEMON = 1; # go into background
my $FPING = '/usr/bin/fping'; # if "unbuffer" is available, it will be automatically used
my $FPINGRESTART = 100; # how many pings before we start a fresh fping instance
my $THISNODE = `/bin/hostname`;
chomp $THISNODE;
my $MAIL = '/usr/bin/mail'; # normal mail command for sending emails
my $CONFIG = '/etc/fping_logger.pl'; # perl include to override defaults below
my $LOGDIR = '/var/local/fping_logger'; # where to keep the logs
my $HOTLOG = 1; # use hot filehandles (no buffering). On FLASH this may be set to 0 to reduce writes
# Monitoring periods
my $PINGTIME = 15000; # how often to ping in milliseconds
my $TIMEOUT = 1000; # how long to wait for a response
# Normal availability is based on a number of consecutive cycles in one
# state, which is useful when there is a hard cutoff of connectivity
my $AVAILTHRESH = 4; # how many consecutive pings / losses to decide up / down
# When the problem is high loss, then normal availability may never be
# able to make a clear decision (eg. alternate pings/losses) and leave
# you with an unusable connection which is still marked as up.
# Averaging allows the lossy situation to be taken into account.
# To disable average loss set $LOSSTHRESHDOWN and/or $LOSSTHRESHUP to zero, or
# for individual nodes set %NODELOSSTHRESHDOWN and/or %NODELOSSTHRESHUP to zero
my $LOSSAVFACTOR = 0.05; # integration factor for new cycle for measuring average loss
my $LOSSTHRESHDOWN = 0.8; # 20% loss means down, but must be > $LOSSAVFACTOR
my $LOSSTHRESHUP = 0.95; # 5% loss means we're probably usable again
my $LOSSLIMIT; # optional default limit on the loss average to help it recover quickly after losses
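# Worked example (comments only, not used by the code): the loss average is an
# exponentially weighted moving average updated once per ping cycle as
#   average = average * (1 - $LOSSAVFACTOR) + score * $LOSSAVFACTOR
# where score is 1 for a reply and 0 for a loss. With the defaults above, a
# node that was solidly up (average 1.0) and then loses every ping decays as
#   1.0 -> 0.95 -> 0.9025 -> 0.857 -> 0.815 -> 0.774
# so it drops below $LOSSTHRESHDOWN (0.8) and is declared down after about 5 cycles.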
# Statistics
my @AVERAGESEC = ( # what sets of averages do we keep stats for (seconds)
    300,   # 5 minute
    3600,  # 1 hour
    86400, # 1 day
);
my $CONTINUEAVERAGES = 1; # try and read previous averages on startup and continue with them
# History - how many files do we keep? (including current file)
my $PINGHIST = 30; # 30 days
my $AVERAGEHIST = 2; # 2 days - it's really just for graphing
# what nodes to watch
my %NODES; # nodes to ping in the format 'human name' => 'host or IP'
my %NODETHRESH; # thresholds for specific nodes
my %NODELOSSTHRESHDOWN; # thresholds for specific nodes
my %NODELOSSTHRESHUP; # thresholds for specific nodes
my %NODELOSSLIMIT; # limit the average on losses to help it recover quickly after losses
my %NODESUPRESSMAILIFDOWN; # if the given node is down, suppress email as it's an upstream fault
# what to do when nodes change state
# The output of this command will be included in the logs with the first
# column being "COMMAND", and the second being the node name, and the
# output of the command in the third column
my %UPDOWNCOMMAND;
# If we want reports to be emailed, then put the address to send them
# here - if none is supplied then no emails are sent for that node
my %UPDOWNEMAIL;
my $EMAILSUBJECT = "$0:"; # prefix used for report email subjects
my $NAGIOSREPORT; # reporting path for Nagios (external command)
my %NODENAGIOSREPORT; # report to Nagios - defaults to the node name if $NAGIOSREPORT is set
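# An illustrative /etc/fping_logger.pl (hypothetical names and addresses -
# adjust for your own network). The config is plain Perl eval'd below, so it
# simply assigns to the variables declared above, e.g.:
#
#   %NODES = (
#       'gateway'      => '192.168.1.1',
#       'upstream-dns' => '8.8.8.8',
#   );
#   %NODETHRESH = ( 'upstream-dns' => 8 ); # needs 8 consecutive losses/replies to change state
#   %NODESUPRESSMAILIFDOWN = ( 'upstream-dns' => 'gateway' ); # don't mail about dns while gateway is down
#   %UPDOWNCOMMAND = ( 'gateway' => '/usr/local/bin/gateway-updown.sh' ); # runs with action_* in its environment
#   %UPDOWNEMAIL = ( 'gateway' => 'admin@example.com' );
#   $NAGIOSREPORT = '/var/lib/nagios3/rw/nagios.cmd';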
# Read in the config
# Everything up to this point may be overridden by the config
# Now that we have the basics down, override them with the config
if ( -f $CONFIG ) {
    open my $inc, '<', $CONFIG or die "$0: FATAL - can't read config: $!\n";
    eval join '', <$inc>;
    close $inc;
}

# sort thresholds which haven't been set by the config
foreach my $node (keys %NODES) {
    if ( ! exists $NODETHRESH{$node} ) { $NODETHRESH{$node} = $AVAILTHRESH; }
    if ( ! exists $NODELOSSTHRESHDOWN{$node} ) { $NODELOSSTHRESHDOWN{$node} = $LOSSTHRESHDOWN; }
    if ( ! exists $NODELOSSTHRESHUP{$node} ) { $NODELOSSTHRESHUP{$node} = $LOSSTHRESHUP; }
    if ( ! exists $NODELOSSLIMIT{$node} ) {
        if ( defined $LOSSLIMIT ) {
            # if we have a default limit set then use it
            $NODELOSSLIMIT{$node} = $LOSSLIMIT;
        } else {
            # else we set symmetric limits round the hysteresis band
            $NODELOSSLIMIT{$node} = $LOSSTHRESHDOWN - $LOSSTHRESHUP;
        }
    }
    if ( exists $NODESUPRESSMAILIFDOWN{$node} ) {
        if ( ! exists $NODES{$NODESUPRESSMAILIFDOWN{$node}} ) {
            die "$0: FATAL - %NODESUPRESSMAILIFDOWN for \"$node\" references \"$NODESUPRESSMAILIFDOWN{$node}\" which is not a monitored node\n";
        }
    }
    if ( defined $NAGIOSREPORT ) {
        if ( ! exists $NODENAGIOSREPORT{$node} ) { $NODENAGIOSREPORT{$node} = $node; }
    } else {
        $NODENAGIOSREPORT{$node} = 0;
    }
}

# work out the average factor
my %AVERAGEFAC;
foreach (@AVERAGESEC) {
    $AVERAGEFAC{$_} = $PINGTIME / 1000 / $_;
}

# check executables we need
if ( ! -x $FPING ) {
    die "$0: FATAL - need \"$FPING\"... yeah, really, it's vital\n";
}
if ( %UPDOWNEMAIL and ! -x $MAIL ) {
    die "$0: FATAL - need \"$MAIL\" to send reports\n";
}

# sort a header
my $header = 'time,epoch,'.join ( ',', keys %NODES )."\n";
$header .= ',,"'.join ( '","', map { local $_ = $_; s/"/"""/g; $_ } values %NODES )."\"\n";

# sort fping command
my $fping = "$FPING -l -p $PINGTIME -t $TIMEOUT";
foreach my $node (keys %NODES) {
    my $host = $node;
    $host =~ s/\\/\\\\/g;
    $host =~ s/'/\\'/g;
    $fping .= " '$host'";
}
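# For reference, with the defaults above and the two-node example config,
# $fping ends up as something like (key order is not guaranteed):
#   /usr/bin/fping -l -p 15000 -t 1000 'gateway' 'upstream-dns'
# Note that the %NODES keys are what get passed to fping here; fping echoes
# them back in its output, which is how replies are matched to nodes below.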
# initialise time/date stamps
my $epoch = time;
my $timestamp = strftime '%d/%m/%Y %H:%M:%S', localtime ( $epoch );
my $datestamp = strftime '%Y%m%d', localtime ( $epoch );
my $lastdatestamp = strftime '%Y%m%d', localtime ( $epoch - 86400 ); # go back 1 day

# initialise all the node objects
my %nodes;
foreach (keys %NODES) {
    $nodes{$_} = Node->new ( $_, $NODES{$_} );
}

# see if we can read in previous averages to continue from where we last were
if ( $CONTINUEAVERAGES ) {
    foreach my $average (@AVERAGESEC) {
        my %prevaverages = readprevavg ( $average, $datestamp );
        my $readlast = 0;
        foreach (keys %NODES) {
            if ( exists $prevaverages{$_} ) {
                $nodes{$_}->setavg ( $average, $prevaverages{$_} );
            } else {
                $readlast = 1;
            }
        }
        if ( $readlast == 0 ) { next; } # got everything
        # some stuff still not found - look even further back
        my %prev2averages = readprevavg ( $average, $lastdatestamp );
        foreach (keys %NODES) {
            if ( ! exists $prevaverages{$_} and defined $prev2averages{$_} ) {
                $nodes{$_}->setavg ( $average, $prev2averages{$_} );
            }
        }
    }
}

# we're all ready to go - daemonise if needed
if ( $DAEMON ) {
    if ( -f $PIDFILE ) {
        die "$0: FATAL - pid file \"$PIDFILE\" already exists\n";
    }
    # daemonise
    use Proc::Daemon;
    my $ret = Proc::Daemon::Init ();
    if ( ! defined $ret ) {
        die "$0: FATAL - failed to fork()\n";
    } elsif ( $ret > 0 ) {
        # we are the parent - no need to go further
        exit 0;
    }
    # we are the child (daemon)
    # set signal handlers
    $SIG{'TERM'} = 'exitclean';
    $SIG{'QUIT'} = 'exitclean';
    $SIG{'INT'} = 'exitclean';
    $SIG{'HUP'} = 'exitclean';
    # get the PID sorted
    open my $pid, '>', $PIDFILE or die "$0: FATAL - can't write \"$PIDFILE\": $!\n";
    print $pid $$;
    close $pid;
}
$SIG{'CHLD'} = 'reapchild';

# we're ready to go
my $fh;        # fping filehandle
my $log;       # ping log filehandle
my %avgfh;     # average filehandles
my $lastcount; # the last count/cycle from fping
my @errors;    # error message accumulator
my @command;   # command output accumulator
while ( 1 ) {
    # check if we have a live fping running
    if ( ! defined $fh ) {
        open $fh, '-|', "$fping 2>&1" or die "$0:".__LINE__." FATAL - can't run \"$fping\": $!\n";
        $lastcount = 0; # don't trigger an output before the full set of data
    }
    # read until we see a new count
    while ( defined $fh and defined ( my $line = <$fh> ) ) {
        chomp $line;
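        # fping -l reply lines look something like:
        #   gateway : [42], 84 bytes, 0.52 ms (0.52 avg, 0% loss)
        # which is what the pattern below matches; anything else on the pipe
        # (fping runs with 2>&1, so warnings/errors too) is logged as an error.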
        if ( $line =~ /^([^\s]+)\s*:\s\[(\d+)\],\s\d+\sbytes,\s([\d\.]+)\sms\s/ ) {
            my ( $host, $cycle, $pingtime ) = ( $1, $2, $3 );
            if ( $cycle != $lastcount ) {
                if ( $cycle < $lastcount and $cycle != 0 ) {
                    # went back by one - out of order pings
                    my $errtime = time;
                    push @errors, strftime ( '%d/%m/%Y %H:%M:%S', localtime($errtime) ).",$errtime,\"Out of order pings ($cycle->$lastcount) for: $line\"";
                    push @errors, strftime ( '%d/%m/%Y %H:%M:%S', localtime($errtime) ).",$errtime,\"Restarting fping.....\"";
                    # stop fping - it's flaking out
                    close $fh;
                    undef $fh;
                    # skip this loop to restart fping
                    last;
                }
                # if the datestamp has changed then re-open the log files for the new day
                if ( $datestamp ne $lastdatestamp ) {
                    # open/reopen all the logfiles (implicitly closes old ones)
                    open $log, '>>', "$LOGDIR/pinglog-$datestamp.csv" or die "$0:".__LINE__." FATAL - can't write \"$LOGDIR/pinglog-$datestamp.csv\": $!\n";
                    my $origfh = select $log;
                    $| = $HOTLOG;
                    print $log $header;
                    foreach my $average (@AVERAGESEC) {
                        open my $afh, '>>', "$LOGDIR/pingavg$average-$datestamp.csv" or die "$0:".__LINE__." FATAL - can't write \"$LOGDIR/pingavg$average-$datestamp.csv\": $!\n";
                        select $afh;
                        $| = $HOTLOG;
                        print $afh $header;
                        $avgfh{$average} = $afh;
                    }
                    select $origfh; # back to normal
                    # also a good time for a cleanup
                    my @files = reverse glob "$LOGDIR/pinglog-*.csv";
                    if ( $#files >= $PINGHIST ) { unlink splice ( @files, $PINGHIST ); }
                    foreach (@AVERAGESEC) {
                        @files = reverse glob "$LOGDIR/pingavg$_-*.csv";
                        if ( $#files >= $AVERAGEHIST ) { unlink splice ( @files, $AVERAGEHIST ); }
                    }
                    # new datestamp in place
                    $lastdatestamp = $datestamp;
                }
                # work out node status
                foreach (keys %NODES) { $nodes{$_}->processstatus; }
                # log averages
                foreach my $average (@AVERAGESEC) {
                    my $averageline = "$timestamp,$epoch";
                    foreach my $node (keys %NODES) {
                        $averageline .= ','.$nodes{$node}->getavg($average);
                    }
                    my $afh = $avgfh{$average};
                    print $afh "$averageline\n";
                }
                # log cycle, and get up/down stats
                my $logline = "$timestamp,$epoch";
                foreach (keys %NODES) { $logline .= ','.$nodes{$_}->pingresult; }
                print $log "$logline\n";
                # print and clear errors for this cycle
                foreach (@errors) { print $log "#ERROR# $_\n"; }
                undef @errors;
                # re-join all the action threads, send mail etc.
                foreach (keys %NODES) {
                    my $suppressmail;
                    if ( exists $NODESUPRESSMAILIFDOWN{$_} ) {
                        # climb the dependency tree to see if anything is down
                        my $tmpnode = $_;
                        while ( exists $NODESUPRESSMAILIFDOWN{$tmpnode} ) {
                            if ( $nodes{$NODESUPRESSMAILIFDOWN{$tmpnode}}->isup < 0 ) {
                                # dependency down - set suppression on
                                $suppressmail = $NODESUPRESSMAILIFDOWN{$tmpnode};
                                last;
                            }
                            $tmpnode = $NODESUPRESSMAILIFDOWN{$tmpnode}; # next level
                        }
                    }
                    push @command, $nodes{$_}->processaction ( $suppressmail );
                    $nodes{$_}->clearresults;
                }
                # print and clear command output
                foreach (@command) {
                    s/"/"""/g;
                    print $log "#COMMAND#,\"$_\"\n";
                }
                undef @command;
                # start the next cycle
                $lastcount = $cycle;
            }
            if ( $cycle >= $FPINGRESTART and defined $fh ) { # "defined $fh" is for working around bug 78494 above TODO remove when fixed
                # stop fping - it's reached its limit
                close $fh;
                undef $fh;
            } else {
                # fping is still good - store the latest data
                $nodes{$host}->pingresult ( $pingtime );
            }
            # we are starting a new cycle, so get timestamps updated
            $epoch = time;
            my @ltime = localtime $epoch;
            $timestamp = strftime '%d/%m/%Y %H:%M:%S', @ltime;
            $datestamp = strftime '%Y%m%d', @ltime;
        } else {
            # some weird stuff happened - put it into errors for this cycle with the immediate time
            my $errtime = time;
            push @errors, strftime ( '%d/%m/%Y %H:%M:%S', localtime($errtime) ).",$errtime,\"$line\"";
        }
    }
}

sub readprevavg {
    my ( $average, $datestamp ) = @_;
    if ( ! -f "$LOGDIR/pingavg$average-$datestamp.csv" ) { return; }
    my $log;
    if ( ! open $log, '<', "$LOGDIR/pingavg$average-$datestamp.csv" ) {
        warn "$0:".__LINE__." WARNING - can't read \"$LOGDIR/pingavg$average-$datestamp.csv\": $!\n";
        return;
    }
    # we don't need to be efficient as starting up is a rare event
    # we can afford to read the whole file
    my $header;
    my $last;
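    # scan for the header line and the last data line; data lines look like
    #   01/09/2014 10:15:00,1409562900,0.98666,1,0.99333
    # (timestamp, epoch, then one average per node in header order)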
    while ( defined ( my $line = <$log> ) ) {
        if ( $line =~ /^\d{2}\/\d{2}\/\d{4} \d{2}:\d{2}:\d{2},\d+,[\d\.,]+\n$/ ) {
            $last = $line;
        } elsif ( $line =~ /^time,epoch,/ ) {
            $header = $line;
        }
    }
    close $log;
    # hopefully we got both a header and the last data
    if ( ! defined $header or ! defined $last ) {
        warn "$0:".__LINE__." WARNING - can't get header or last line in \"$LOGDIR/pingavg$average-$datestamp.csv\"\n";
        return;
    }
    chomp $header;
    my @header = split ',', $header;
    chomp $last;
    my @last = split ',', $last;
    # check that the lengths match
    if ( $#header != $#last ) {
        warn "$0:".__LINE__." WARNING - can't get header or last line in \"$LOGDIR/pingavg$average-$datestamp.csv\"\n";
        return;
    }
    # check it's newer than half the average time
    if ( $last[1] < $^T - $average / 2 ) {
        warn "$0:".__LINE__." WARNING - last line in \"$LOGDIR/pingavg$average-$datestamp.csv\" is older than half $average from $^T ($last[1] < ".($^T - $average / 2).")\n";
        return;
    }
    # populate a hash
    my %lastaverages;
    for ( my $i = 2; $i <= $#header; ++$i ) {
        $lastaverages{$header[$i]} = $last[$i];
    }
    return %lastaverages;
}

sub exitclean {
    # we got a signal - tidy up after ourselves
    if ( defined $fh ) { close $fh; }
    if ( defined $log ) { close $log; }
    foreach (values %avgfh) { close $_; }
    unlink $PIDFILE;
    exit 0;
}

sub reapchild {
    while ( waitpid( -1, &WNOHANG ) > 0 ) {}
    if ( defined $fh ) { close $fh; undef $fh; } # could have $fh still open
}

#####################################################################
# Class for each Node
#####################################################################
package Node;

sub new {
    my ( $class, $address, $name ) = @_;
    my $self = {};
    $self->{'address'} = $address;
    $self->{'name'} = $name;
    $self->{'losscount'} = 0;  # state due to consecutive losses - start off neutral
#   $self->{'lossaverage'} =   # store the average loss for determining up/down
    $self->{'lossavstate'} = 0; # state due to average loss - start off neutral
#   $self->{'losscause'} =     # cause of loss when down
#   $self->{'isup'} =          # node status
#   $self->{'suppressmail'} =  # mail suppression status (ie. dependencies down)
#   $self->{'lastmailstate'} = # status of the node about which last mail was sent
#   $self->{'messagestore'} =  # store for most recent mail message generated but not sent
    $self->{'updowntime'} = $epoch; # time of the last state change
#   $self->{'actionthread'} =  # thread object for command
    return bless $self, $class;
}

# add received stats - call with ping time
sub pingresult {
    my $self = shift;
    if ( defined $_[0] ) { $self->{'results'} = shift; }
    return (defined $self->{'results'})?$self->{'results'}:'NULL';
}

sub processstatus {
    my $self = shift;
    # update averages
    $self->{score} = $self->{results}?1:0;
    foreach my $average (@AVERAGESEC) {
        if ( ! exists $self->{averages}->{$average} ) {
            $self->{averages}->{$average} = $self->{score};
        }
        $self->{averages}->{$average} *= ( 1 - $AVERAGEFAC{$average} );
        $self->{averages}->{$average} += $self->{score} * $AVERAGEFAC{$average};
    }
    # check state
    my ( $changed, $cause ) = $self->_updowncheck ();
    # kick off a thread for commands
    if ( $changed ) {
        # Aw! This is horrible - work around Perl bug: http://rt.perl.org/rt3/Public/Bug/Display.html?id=78494
        # by closing the running fping we no longer have a pipe open TODO remove when fixed
        if ( defined $fh ) { close $fh; undef $fh; }
        # thread action to avoid multiple actions delaying each other
        $self->{actionthread} = threads->create ( '_updownaction', $self, $cause );
        $self->{updowntime} = $epoch; # update change time for next change
    }
}

sub clearresults {
    my $self = shift;
    undef $self->{'results'};
}
sub processaction {
    my ( $self, $suppressmail ) = @_;
    my $node = $self->{address};
    my @commandoutput;
    # join the thread
    if ( defined $self->{actionthread} ) {
        ( $self->{messagestore}, @commandoutput ) = @{$self->{actionthread}->join()};
        undef $self->{actionthread};
    }
    # send mail if needed
    if ( ! exists $self->{messagestore} or $self->{messagestore} eq '' ) {
        return @commandoutput; # no messages for this node
    }
    if ( $self->{lastmailstate}>0 and $self->{isup}<0 ) {
        # last mail sent/stored was up, but we are now down
        $self->{suppressmail} = $suppressmail; # send mail or at worst store what would have been sent
        # now decide what we do with this message
        if ( defined $self->{suppressmail} ) {
            # we must not mail - just log the suppression instead
            push @commandoutput, "EMAIL SUPPRESSED for $node ($NODES{$node}) going ".($self->{isup}>0?'UP':'DOWN')." due to $self->{suppressmail}";
        } else {
            # mail is not suppressed - send it
            push @commandoutput, @{$self->_sendreportmail};
        }
        $self->{lastmailstate} = $self->{isup};
    } elsif ( $self->{lastmailstate}<0 and exists $self->{suppressmail} and $self->{isup}<0 ) {
        # last mail stored (due to suppression) was down, and we are still down
        # the big question is if the suppression has been lifted and we should now send the down mail
        my $suppressmailold = $self->{suppressmail};
        $self->{suppressmail} = $suppressmail;
        # has it been lifted?
        if ( ! defined $self->{suppressmail} ) {
            # yup - we need to send that mail now
            # add in a notice about suppression if relevant
            my $messagepre = "IMPORTANT:\n";
            $messagepre .= "This message was previously suppressed due to $suppressmailold\n";
            $messagepre .= "Times of the outage are not fully known as a result\n";
            $messagepre .= "The message is now being sent as suppression has been lifted\n";
            $messagepre .= "(ie. the node(s) causing the suppression have recovered)\n";
            $messagepre .= "\n";
            # prepend it
            $self->{messagestore} = $messagepre.$self->{messagestore};
            # send queued (down) mail
            push @commandoutput, @{$self->_sendreportmail};
        }
    } elsif ( $self->{lastmailstate}<0 and $self->{isup}>0 ) {
        # last mail sent was down, but we are now up
        if ( defined $self->{suppressmail} ) {
            # not going to send mail - suppressed, but remove the suppression
            my $suppressmailold = $self->{suppressmail};
            undef $self->{suppressmail};
            # we must not mail - just log the suppression instead
            push @commandoutput, "EMAIL SUPPRESSED for $node ($NODES{$node}) going ".($self->{isup}>0?'UP':'DOWN')." due to $suppressmailold";
        } else {
            # we were not suppressed so need to send the up mail
            # send up mail
            push @commandoutput, @{$self->_sendreportmail};
        }
        $self->{lastmailstate} = $self->{isup};
    }
    return @commandoutput;
}

sub getavg {
    my ( $self, $average ) = @_;
    return $self->{'averages'}->{$average};
}

sub setavg {
    my ( $self, $average, $value ) = @_;
    $self->{'averages'}->{$average} = $value;
}

sub isup {
    my $self = shift;
    return $self->{isup};
}
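# State decision summary (descriptive only): a node is held in one of three
# 'losscause' states - '' (up), 'consecutive pings' (down after $NODETHRESH
# consecutive losses) or 'average loss' (down once the loss average falls
# below $NODELOSSTHRESHDOWN). It returns to '' once $NODETHRESH consecutive
# replies are seen, or the loss average climbs back above $NODELOSSTHRESHUP.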
sub _updowncheck {
    my $self = shift;
    my $node = $self->{address};
    # sort loss counters
    if ( $self->{'score'} ) {
        if ( $self->{'losscount'} < 0 ) { $self->{'losscount'} = 0; }
        ++$self->{'losscount'};
    } else {
        if ( $self->{'losscount'} > 0 ) { $self->{'losscount'} = 0; }
        --$self->{'losscount'};
    }
    # sort loss averages
    if ( $NODELOSSTHRESHUP{$node} > 0 and $NODELOSSTHRESHDOWN{$node} > 0 ) {
        if ( ! exists $self->{'lossaverage'} ) { $self->{'lossaverage'} = $self->{score}; }
        $self->{'lossaverage'} *= 1 - $LOSSAVFACTOR;
        $self->{'lossaverage'} += $self->{score} * $LOSSAVFACTOR;
        # work out current loss average state
        if ( $self->{'lossaverage'} > $NODELOSSTHRESHUP{$node} ) {
            $self->{'lossavstate'} = 1;
        } elsif ( $self->{'lossaverage'} < $NODELOSSTHRESHDOWN{$node} ) {
            $self->{'lossavstate'} = -1;
        }
    } else {
        $self->{'lossavstate'} = 0; # disabled - hold in neutral
    }
    # sort state
    my $state = 0; # overall state - start off undetermined
    if ( ! exists $self->{'losscause'} ) {
        # first run - need to initialise
        $state = $self->{'score'}?1:-1;
        $self->{'isup'} = $state;
        $self->{'lastmailstate'} = $state;
        $self->{'losscause'} = $self->{'score'}?'':'consecutive pings';
    } elsif ( $self->{'losscause'} eq '' ) {
        # we are up so we only need to worry about going down
        $state = 1;
        if ( $self->{'losscount'} <= -$NODETHRESH{$node} ) {
            $state = -1;
            $self->{'losscause'} = 'consecutive pings';
        } elsif ( $self->{'lossavstate'} == -1 ) {
            $state = -1;
            $self->{'losscause'} = 'average loss';
            # limit the loss average to help speed recovery if things improve
            if ( $self->{'lossaverage'} < $NODELOSSLIMIT{$node} ) {
                $self->{'lossaverage'} = $NODELOSSLIMIT{$node};
            }
        }
    } elsif ( $self->{'losscause'} eq 'consecutive pings' ) {
        # we are in a consecutive loss situation
        $state = -1;
        # we can return if the count is good again
        if ( $self->{'losscount'} >= $NODETHRESH{$node} ) {
            $state = 1;
            $self->{'losscause'} = '';
        } else {
            # Limit loss average to the verge of going down
            # This ensures that if connectivity returns but is flaky
            # that we immediately go into an average loss failure
            # This also prevents long periods of hard-loss from taking
            # the loss average way down so it takes a long time to recover
            if ( $NODELOSSTHRESHUP{$node} > 0 and $NODELOSSTHRESHDOWN{$node} > 0
                    and $self->{'lossaverage'} < $NODELOSSTHRESHDOWN{$node} ) {
                $self->{'lossaverage'} = $NODELOSSTHRESHDOWN{$node};
                # reset the loss average state to neutral to prevent
                # it coming back in a failure state
                $self->{'lossavstate'} = 0;
            }
        }
    } elsif ( $self->{'losscause'} eq 'average loss' ) {
        # we are in an average loss situation
        $state = -1;
        # we can return if the lossavstate has returned to up
        if ( $self->{'lossavstate'} == 1 ) {
            $state = 1;
            $self->{'losscause'} = '';
        }
        # we don't need to mess with the consecutive loss stuff
    }
    # Nagios extension - report state, losscause
    if ( $NODENAGIOSREPORT{$node} and open my $fh, '>', $NAGIOSREPORT ) {
        my $report = "[".time()."] PROCESS_SERVICE_CHECK_RESULT;$NODENAGIOSREPORT{$node};fping_logger Connectivity;";
        if ( $state == 1 ) {
            $report .= "0;PING OK - $self->{results}ms";
        } elsif ( $state == -1 ) {
            $report .= "2;PING CRITICAL - ";
            if ( $self->{losscause} eq 'consecutive pings' ) {
                $report .= "$self->{losscount} out of $NODETHRESH{$node} ($self->{losscause})";
            } else {
                $report .= "average $self->{lossaverage} (threshold: $NODELOSSTHRESHUP{$node}/$NODELOSSTHRESHDOWN{$node}) ($self->{losscause})";
            }
        } else {
            $report .= "1;PING WARNING - can't determine state";
        }
        print $fh "$report\n";
        close $fh;
    }
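    # For reference, the passive check written above takes the usual Nagios
    # external command form, e.g.:
    #   [1409562900] PROCESS_SERVICE_CHECK_RESULT;gateway;fping_logger Connectivity;0;PING OK - 0.52ms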
    # on with the show
    if ( $state == 0 ) { return; } # no point going further if we can't tell
    # see if we got a change
    if ( $state != $self->{'isup'} ) {
        # something happened!
        $self->{'isup'} = $state; # update the new state
        return ( 1, "\"$self->{losscause}\"" ); # stuff changed, and why
    } else {
        return 0; # nothing happened
    }
    # we never get here
}

sub _updownaction {
    my ( $self, $cause ) = @_;
    my $node = $self->{address};
    # run the command
    my @commandoutput;
    if ( $UPDOWNCOMMAND{$node} ) {
        # build environment
        $ENV{action_address} = $self->{address};
        $ENV{action_name} = $self->{name};
        $ENV{action_statetime} = $epoch - $self->{updowntime};
        $ENV{action_state} = ($self->{isup}>0?'UP':'DOWN');
        # we have a command to run
        if ( open my $com, '-|', "$UPDOWNCOMMAND{$node} 2>&1" ) {
            @commandoutput = <$com>;
            close $com;
            chomp @commandoutput;
            # add some useful info fore and aft
            unshift @commandoutput, ($self->{isup}>0?'UP':'DOWN').": Executing '$UPDOWNCOMMAND{$node}'";
            push @commandoutput, "Done '$UPDOWNCOMMAND{$node}'";
        } else {
            push @commandoutput, __FILE__.':'.__LINE__." FATAL - Executing '$UPDOWNCOMMAND{$node}' FAILED: $!";
        }
    }
    my $messagestore;
    if ( $UPDOWNEMAIL{$node} ) {
        # generate email message to send later when suppression is determined
        my $change = $self->{isup}>0?'UP':'DOWN';
        my $lastchange = $self->{isup}>0?'DOWN':'UP';
        # generate the message
        $messagestore = '';
        $messagestore .= "Monitoring on \"$THISNODE\" detected:\n";
        $messagestore .= "$node ($NODES{$node}) changed to $change status\n";
        $messagestore .= "at $timestamp\n";
        if ( $self->{isup} < 0 ) { $messagestore .= "due to: $cause\n"; }
        $messagestore .= "\n";
        $messagestore .= "packet count: $self->{losscount} (threshold: +/-$NODETHRESH{$node})\n";
        if ( $NODELOSSTHRESHUP{$node} > 0 and $NODELOSSTHRESHDOWN{$node} > 0 ) {
            $messagestore .= "packet average: $self->{lossaverage} (threshold: $NODELOSSTHRESHUP{$node}/$NODELOSSTHRESHDOWN{$node})\n";
        }
        $messagestore .= "\n";
        $messagestore .= "Previously $lastchange for";
        my $changetime = $epoch - $self->{updowntime};
        if ( $changetime >= 86400 ) {
            $messagestore .= "\"adding days ($changetime)\"\n";
            $messagestore .= " ".int($changetime/86400)." day(s)";
            $changetime = $changetime%86400;
            $messagestore .= "\"after ($changetime)\"\n";
        }
        $messagestore .= ::strftime ' %H:%M:%S', gmtime $changetime;
        $messagestore .= "\n";
        $messagestore .= "\n\n";
        if ( $#commandoutput != -1 ) {
            # put the command output in the mail
            $messagestore .= join "\n", @commandoutput;
            $messagestore .= "\n";
            $messagestore .= "\n\n";
        }
        if ( $#errors != -1 ) {
            # put current errors in the mail
            $messagestore .= "The following ERRORS are logged for this cycle:\n";
            $messagestore .= join "\n", @errors;
            $messagestore .= "\n";
            $messagestore .= "\n\n";
        }
    }
    return [$messagestore,@commandoutput]; # return info gathered to be logged
}

sub _sendreportmail {
    my $self = shift;
    my $node = $self->{address};
    my @commandoutput;
    my $change = $self->{isup}>0?'UP':'DOWN';
    if ( $EMAILSUBJECT !~ /\s$/ ) { $EMAILSUBJECT .= ': '; }
    if ( open my $mail, '|-', "$MAIL -s '$EMAILSUBJECT$node ($NODES{$node}) $change' '$UPDOWNEMAIL{$node}'" ) {
        print $mail $self->{messagestore};
        close $mail;
        delete $self->{messagestore};
    } else {
        # bad stuff happened trying to send mail - log this
        push @commandoutput, __FILE__.':'.__LINE__." FATAL - Executing mail command \"$MAIL -s '$EMAILSUBJECT$change $node ($NODES{$node})' '$UPDOWNEMAIL{$node}'\": $!";
    }
    return \@commandoutput; # return info gathered to be logged
}