diff --git a/nagios-plugins-ets-1.4/DEBIAN/control b/nagios-plugins-ets-1.4/DEBIAN/control
new file mode 100644
index 0000000..0a7e0e0
--- /dev/null
+++ b/nagios-plugins-ets-1.4/DEBIAN/control
@@ -0,0 +1,9 @@
+Package: nagios-plugins-ets
+Version: 1.4
+Section: utils
+Priority: optional
+Architecture: all
+Depends: python3-psutil, smartmontools
+Maintainer: Your Name
+Description: Nagios plugins for system monitoring
+ Collection of Nagios plugins for system monitoring, including memory check, smartmon, zpools, and service checks.
diff --git a/nagios-plugins-ets-1.4/DEBIAN/postinst b/nagios-plugins-ets-1.4/DEBIAN/postinst
new file mode 100755
index 0000000..a31137f
--- /dev/null
+++ b/nagios-plugins-ets-1.4/DEBIAN/postinst
@@ -0,0 +1,3 @@
+#!/bin/bash
+chmod 755 /usr/lib64/nagios/plugins/*
+
diff --git a/nagios-plugins-ets-1.4/check_mem.c b/nagios-plugins-ets-1.4/check_mem.c
new file mode 100644
index 0000000..2ff0b4b
--- /dev/null
+++ b/nagios-plugins-ets-1.4/check_mem.c
@@ -0,0 +1,162 @@
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define INVAL_SIZE UINT64_C(0xFFFFFFFFFFFFFFFF)
+
+/* Nagios exit status used when the memory state cannot be determined. */
+#define STATE_UNKNOWN 3
+
+/* Figures taken from /proc/meminfo, in kB. INVAL_SIZE marks "not set". */
+struct mem_stats
+{
+    uint64_t total_kB;
+    uint64_t avail_kB;
+    uint64_t free_kB;
+    uint64_t buffers_kB;
+    uint64_t cached_kB;
+    uint64_t used_kB;
+};
+
+/*
+ * If line starts with key, parse the decimal number following it into
+ * *value and return 1. Otherwise leave *value untouched and return 0.
+ */
+static int parse_field(const char *line, const char *key, uint64_t *value)
+{
+    size_t key_len = strlen(key);
+
+    if (strncmp(line, key, key_len) != 0)
+        return 0;
+
+    *value = strtoull(line + key_len, NULL, 10);
+    return 1;
+}
+
+/*
+ * Fill *mem_stats from /proc/meminfo.
+ *
+ * All fields start as INVAL_SIZE. If the file cannot be opened they keep
+ * that value, so callers must check total_kB before using the result.
+ * When no MemAvailable line is present, avail_kB is computed as the sum of
+ * whichever of MemFree, Buffers and Cached were found.
+ */
+static void dump_mem_stats(struct mem_stats *mem_stats)
+{
+    FILE *fp;
+    char line[1024];
+
+    mem_stats->total_kB = INVAL_SIZE;
+    mem_stats->avail_kB = INVAL_SIZE;
+    mem_stats->free_kB = INVAL_SIZE;
+    mem_stats->buffers_kB = INVAL_SIZE;
+    mem_stats->cached_kB = INVAL_SIZE;
+    mem_stats->used_kB = INVAL_SIZE;
+
+    fp = fopen("/proc/meminfo", "r");
+    if (fp == NULL)
+        return;
+
+    while (fgets(line, sizeof(line), fp))
+    {
+        if (parse_field(line, "MemTotal:", &mem_stats->total_kB))
+            continue;
+
+        if (parse_field(line, "MemAvailable:", &mem_stats->avail_kB))
+            continue;
+
+        /* The kernel spells this key "MemFree:"; the comparison is case-sensitive. */
+        if (parse_field(line, "MemFree:", &mem_stats->free_kB))
+            continue;
+
+        if (parse_field(line, "Buffers:", &mem_stats->buffers_kB))
+            continue;
+
+        if (parse_field(line, "Cached:", &mem_stats->cached_kB))
+            continue;
+    }
+
+    fclose(fp);
+
+    if (mem_stats->avail_kB == INVAL_SIZE)
+    {
+        mem_stats->avail_kB = 0;
+
+        if (mem_stats->free_kB != INVAL_SIZE)
+            mem_stats->avail_kB += mem_stats->free_kB;
+
+        if (mem_stats->buffers_kB != INVAL_SIZE)
+            mem_stats->avail_kB += mem_stats->buffers_kB;
+
+        if (mem_stats->cached_kB != INVAL_SIZE)
+            mem_stats->avail_kB += mem_stats->cached_kB;
+    }
+
+    /* Without a valid total there is nothing to subtract from. */
+    if (mem_stats->total_kB == INVAL_SIZE)
+        return;
+
+    /* Clamp so that avail > total cannot wrap the unsigned subtraction. */
+    mem_stats->used_kB = mem_stats->total_kB >= mem_stats->avail_kB
+        ? mem_stats->total_kB - mem_stats->avail_kB
+        : 0;
+}
+
+/*
+ * Usage: check_mem -w warn_threshold -c crit_threshold
+ *
+ * Thresholds are percentages of used memory. Only argv[2] and argv[4] are
+ * read; the flags themselves are not inspected. Returns the Nagios status:
+ * 0 OK, 1 Warning, 2 Critical, 3 Unknown (bad usage or unreadable meminfo).
+ */
+int main(int argc, char **argv)
+{
+    int result;
+    float warn_threshold;
+    float crit_threshold;
+    struct mem_stats stats;
+    float mem_pct;
+    const char *panic_str;
+
+    /* cmd line : $0 -w warn_threshold -c crit_threshold */
+    if (argc < 5)
+    {
+        printf("MEMORY UNKNOWN - usage: %s -w warn_threshold -c crit_threshold\n",
+               argc > 0 ? argv[0] : "check_mem");
+        return STATE_UNKNOWN;
+    }
+
+    warn_threshold = strtof(argv[2], NULL);
+    crit_threshold = strtof(argv[4], NULL);
+
+    dump_mem_stats(&stats);
+
+    /* Also guards the division below against a zero total. */
+    if (stats.total_kB == INVAL_SIZE || stats.total_kB == 0)
+    {
+        printf("MEMORY UNKNOWN - cannot read MemTotal from /proc/meminfo\n");
+        return STATE_UNKNOWN;
+    }
+
+    /* Integer division first, so the percentage is truncated to two decimals. */
+    mem_pct = 10000 * stats.used_kB / stats.total_kB / 100.f;
+
+    panic_str = "OK";
+    result = 0;
+    if (mem_pct >= crit_threshold)
+    {
+        panic_str = "Critical";
+        result = 2;
+    }
+    else if (mem_pct >= warn_threshold)
+    {
+        panic_str = "Warning";
+        result = 1;
+    }
+
+    printf("MEMORY %s - Used = %.2f%% | 'Total'=%" PRIu64 "MB 'Used'=%" PRIu64 "MB 'Free'=%" PRIu64 "MB\n", panic_str, mem_pct, stats.total_kB / 1024, stats.used_kB / 1024, stats.avail_kB / 1024);
+
+    return result;
+}
diff --git a/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_mem b/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_mem
new file mode 100755
index 0000000..c84caac
Binary files /dev/null and b/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_mem differ
diff --git
a/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_service b/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_service
new file mode 100755
index 0000000..f0b9db3
--- /dev/null
+++ b/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_service
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+#Author: Tino
+
+#VARIABLES NAGIOS
+OK=0
+WARNING=1
+CRITICAL=2
+UNKNOWN=3
+
+PROGNAME=$(basename "$0" .sh)
+VERSION="Version 1.1"
+
+# Print the plugin version string.
+print_version() {
+    echo "$VERSION"
+}
+
+# Print usage information, then exit with the Nagios UNKNOWN status.
+print_help() {
+    print_version
+    echo ""
+    echo "$PROGNAME is a Nagios plugin to check a specific service using systemctl."
+    echo ""
+    echo "$PROGNAME -s <service>"
+    exit $UNKNOWN
+}
+
+# Called without arguments: show usage (print_help exits with UNKNOWN).
+if test -z "$1"
+then
+    print_help
+fi
+
+while test -n "$1"; do
+    case "$1" in
+        --help|-h)
+            print_help
+            ;;
+        --version|-v)
+            print_version
+            exit $UNKNOWN
+            ;;
+        --service|-s)
+            SERVICE=$2
+            shift
+            ;;
+        *)
+            echo "Unknown argument: $1"
+            print_help
+            ;;
+    esac
+    shift
+done
+
+# "-s" without a value leaves SERVICE empty; there is nothing to query then.
+if test -z "$SERVICE"
+then
+    echo "UNKNOWN: no service name given"
+    exit $UNKNOWN
+fi
+
+# Quoted so the name always reaches systemctl as exactly one argument.
+if systemctl is-active "$SERVICE" >/dev/null 2>&1
+then
+    echo -e "OK: Service $SERVICE is running!"
+    exit $OK
+else
+    echo -e "CRITICAL: Service $SERVICE is not running!"
+    exit $CRITICAL
+fi
diff --git a/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_smartctl b/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_smartctl
new file mode 100755
index 0000000..dd343bc
--- /dev/null
+++ b/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_smartctl
@@ -0,0 +1,302 @@
+#!/usr/bin/python
+
+# -*- coding: iso8859-1 -*-
+#
+# $Id: version.py 133 2006-03-24 10:30:20Z fuller $
+#
+# check_smartmon
+# Copyright (C) 2006 daemogorgon.net
+# Copyright (C) 2010 Orcan Ogetbil (orcan at nbcs.rutgers.edu)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + +"""Package versioning +""" + + +import os.path +import subprocess +import sys +import time + +from optparse import OptionParser +from operator import itemgetter, attrgetter + + +__author__ = "fuller " +__version__ = "$Revision$" + + +# path to smartctl +_smartctlPath = "/usr/sbin/smartctl" + +# application wide verbosity (can be adjusted with -v [0-3]) +_verbosity = 0 + +is_array = lambda var: isinstance(var, (list, tuple)) + +def parseCmdLine(args): + """Commandline parsing.""" + + usage = "usage: %prog [options] device" + version = "%%prog %s" % (__version__) + + parser = OptionParser(usage=usage, version=version) + parser.add_option("-d", "--devicetype", action="store", dest="devicetype", default="scsi", metavar="DEVICETYPE", + help="device type (scsi or megaraid,N or ...; defaults to scsi)") + parser.add_option("-v", "--verbosity", action="store", + dest="verbosity", type="int", default=0, + metavar="LEVEL", help="set verbosity level to LEVEL; defaults to 0 (quiet), \ + possible values go up to 3") + parser.add_option("-w", "--warning-threshold", metavar="TEMP", action="store", + type="int", dest="warningThreshold", default=55, + help="set temperature warning threshold to given temperature (defaults to 55)") + parser.add_option("-c", "--critical-threshold", metavar="TEMP", action="store", + type="int", dest="criticalThreshold", default="60", + help="set temperature critical threshold to given temperature (defaults to 60)") + + (options,devices) = parser.parse_args(sys.argv[1:]) + + if len(devices) 
==0: + exitWithMessage(3,"UNKNOWN: Error, at least one device must be entered") + + return (options,devices) +# end + + +def checkDevice(path): + """Check if device exists and permissions are ok. + + Returns: + - 0 ok + - 1 no such device + """ + + vprint(3, "Check if %s does exist and can be read" % path) + if not os.access(path, os.F_OK): + return (1, "UNKNOWN: no such device found (%s)" % (path)) + # We can't check the read permissions as unprivileged user - Orcan + #elif not os.access(path, os.R_OK): + # return (2, "UNKNOWN: no read permission given (%s)" % (path)) + else: + return (0, "") + # fi +# end + + +def checkSmartMonTools(path): + """Check if smartctl is available and can be executed. + + Returns: + - 0 ok + - 1 no such file + - 2 cannot execute file + """ + + vprint(3, "Check if %s does exist and can be read" % path) + if not os.access(path, os.F_OK): + return (1, "UNKNOWN: cannot find %s" % path) + elif not os.access(path, os.X_OK): + return (2, "UNKNOWN: cannot execute %s" % path) + else: + return (0, "") + # fi +# end + + +def callSmartMonTools(path, devicetype, device): + # get health status + cmd = "sudo %s -d %s %s -a" % (path, devicetype, device) + vprint(3, "Get device health status: %s" % cmd) + + sp = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + # See if smartctl exits cleanly + # This is a lot hacky since smartctl output is not consistent. It doesn't always + # close output streams stdout, stderr etc. I really don't like the following + # code - Orcan + i = 0 + poll = False + while i < 5: + if sp.poll(): + poll = True + break + i = i+1 + vprint(3, "smartctl did not exit yet. Waiting...") + time.sleep(0.1) + + if poll: # clean + (child_stdin, child_stdout, child_stderr) = (sp.stdin, sp.stdout, sp.stderr) + child_stdout = child_stdout.readlines() + child_stderr = child_stderr.readline() + vprint(3, "smartctl did exit cleanly") + else: # not clean. 
let's gather what we have
+        vprint(3, "smartctl did not exit cleanly")
+        (child_stdout, child_stderr) = sp.communicate()
+
+    if len(child_stderr):
+        # three values, like every other return here: (status, message, output)
+        return (3, "UNKNOWN: call exits unexpectedly (%s)" % child_stderr, "")
+    StatusOutput = ""
+    faultline = ""
+    for line in child_stdout:
+        if line.find("INVALID ARGUMENT TO -d") > 0 or line.find("Unknown device type") > -1:
+            faultline = line
+            continue
+        if faultline != "":
+            return (3, faultline + line, "")
+        StatusOutput = StatusOutput + line
+    # done
+    return (0 ,"", StatusOutput)
+# end
+
+
+def parseOutput(Message):
+    """Parse smartctl output
+
+    Returns (health status, temperature).
+    """
+
+    # parse health status and temperature
+    healthStatus=""
+    temperature = None
+    lines = Message.split("\n")
+    for line in lines:
+        if line.find("INQUIRY failed") > -1:
+            exitWithMessage(3, "UNKNOWN: " + line)  # Nagios UNKNOWN is exit status 3
+        if line.startswith("SMART Health Status:") or line.startswith("SMART overall-health self-assessment"):
+            healthStatus = line.split()[-1]
+        if line.startswith("Temperature:"):
+            try:
+                temperature = int(line.split()[-2])
+            except:
+                temperature = -100
+            break
+
+    vprint(3, "Health status: %s" % healthStatus)
+    vprint(3, "Temperature: %s" %temperature)
+
+    return (healthStatus, temperature)
+# end
+
+
+def createReturnInfo(device, devicetype, healthStatus, temperature, warningThreshold,
+                     criticalThreshold):
+    """Create return information according to given thresholds."""
+    # this is absolutely critical!
+    if healthStatus != "PASSED" and healthStatus != "OK":
+        if healthStatus == "":
+            return (2, "CRITICAL: device %s of type %s did not pass a health status."
% (device, devicetype)) + return (2, "CRITICAL: device %s of type %s passed health status: (%s)" % (device, devicetype, healthStatus)) + elif temperature == None and devicetype != "ata": + return (2, "CRITICAL: device %s of type %s does not pass temperature information" % (device, devicetype)) + + elif temperature > criticalThreshold: + return (2, "CRITICAL: device %s of type %s temperature (%d) exceeds critical temperature threshold (%s)" % (device, devicetype, temperature, criticalThreshold)) + elif temperature > warningThreshold: + return (1, "WARNING: device %s of type %s temperature (%d) exceeds warning temperature threshold (%s)" % (device, devicetype, temperature, warningThreshold)) + else: + if temperature == None: + temperature = "N/A" + return (0, "OK: device %s of type %s is functional and stable (temperature: %s C)" % ( device , devicetype, str(temperature))) + + # fi +# end + + +def exitWithMessage(value, message = ""): + """Exit with given value.""" + + if message: + print message + sys.exit(value) +# end + + +def vprint(level, message): + """Verbosity print. + + Decide according to the given verbosity level if the message will be + printed to stdout. 
+    """
+
+    if level <= verbosity:
+        print message
+    # fi
+# end
+
+def allDeviceTypes(devicetype):
+    # "megaraid,0,1" expands to ["megaraid,0", "megaraid,1"]; a plain type is returned as a one-item list
+    dt_array = devicetype.split(",")
+    if len(dt_array) == 1:
+        return dt_array
+    dt_base = dt_array[0]
+    dt_all = []
+    for ext in dt_array[1:]:
+        dt_all.append(dt_base+","+ext)
+    return dt_all
+
+if __name__ == "__main__":
+    (options, devices) = parseCmdLine(sys.argv)
+    verbosity = options.verbosity
+
+    vprint(2, "Get device name(s)")
+    overallvalue = 0
+    devicetypes = allDeviceTypes(options.devicetype)
+    full_message = []
+
+    for device in devices:
+        for devicetype in devicetypes:
+            vprint(1, "Device: %s" % device)
+
+            # check if we can access 'path'
+            vprint(2, "Check device")
+            (value, message) = checkDevice(device)
+            if value != 0:
+                exitWithMessage(3, message)
+            # fi
+
+            # check if we have smartctl available
+            (value, message) = checkSmartMonTools(_smartctlPath)
+            if value != 0:
+                exitWithMessage(3, message)
+            # fi
+            vprint(1, "Path to smartctl: %s" % _smartctlPath)
+
+            # call smartctl and parse output
+            vprint(2, "Call smartctl")
+
+            (value, message, Output) = callSmartMonTools(_smartctlPath, devicetype, device)
+            if value != 0:
+                exitWithMessage(3, message)
+            vprint(2, "Parse smartctl output")
+            (healthStatus, temperature) = parseOutput(Output)
+            vprint(2, "Generate return information")
+            (value, message) = createReturnInfo(device, devicetype, healthStatus, temperature,
+                                                options.warningThreshold, options.criticalThreshold)
+            if value > overallvalue:
+                overallvalue = value
+            full_message.append((value,message))
+
+    full_message = sorted(full_message, key=itemgetter(1))
+    message_string = ""
+    for value,message in full_message:
+        message_string += message + " "
+
+
+    # exit program
+    exitWithMessage(overallvalue, message_string[:-1])
+
+# fi
diff --git a/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_smartmon.py b/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_smartmon.py
new file mode 100755
index 0000000..1754977
--- /dev/null
+++
b/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_smartmon.py @@ -0,0 +1,414 @@ +#!/usr/bin/python +"""Nagios plugin for monitoring S.M.A.R.T. status.""" + +# -*- coding: iso8859-1 -*- +# +# $Id: version.py 133 2006-03-24 10:30:20Z fuller $ +# +# check_smartmon +# Copyright (C) 2006 daemogorgon.net +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+# +# +# Fork author: nihlaeth +# + +import os.path +import sys +import re +import psutil +import subprocess +from optparse import OptionParser + +__author__ = "fuller " +__version__ = "$Revision$" + + +# path to smartctl +# TODO use which to fetch path +_smartctl_path = "/usr/sbin/smartctl" + +# application wide verbosity (can be adjusted with -v [0-3]) +_verbosity = 0 + + +def parse_cmd_line(arguments): + """Commandline parsing.""" + usage = "usage: %prog [options] device" + version = "%%prog %s" % (__version__) + + parser = OptionParser(usage=usage, version=version) + parser.add_option( + "-d", + "--device", + action="store", + dest="device", + default="", + metavar="DEVICE", + help="device to check") + parser.add_option( + "-a", + "--all-disks", + action="store_true", + dest="alldisks", + default="", + help="Check all disks") + parser.add_option( + "-v", + "--verbosity", + action="store", + dest="verbosity", + type="int", + default=0, + metavar="LEVEL", + help="set verbosity level to LEVEL; defaults to 0 (quiet), \ + possible values go up to 3") + parser.add_option( + "-w", + "--warning-threshold", + metavar="TEMP", + action="store", + type="int", + dest="warning_temp", + default=55, + help=("set temperature warning threshold to given temperature" + " (default:55)")) + parser.add_option( + "-c", + "--critical-threshold", + metavar="TEMP", + action="store", + type="int", + dest="critical_temp", + default="60", + help=("set temperature critical threshold to given temperature" + " (default:60)")) + + return parser.parse_args(arguments) + + +def check_device_permissions(path): + """Check if device exists and permissions are ok. 
+ + Returns: + - 0 ok + - 1 no such device + - 2 no read permission given + """ + vprint(3, "Check if %s does exist and can be read" % path) + if not os.access(path, os.F_OK): + return (3, "UNKNOWN: no such device found") + elif not os.access(path, os.R_OK): + return (3, "UNKNOWN: no read permission given") + else: + return (0, "") + return (0, "") + + +def check_smartmontools(path): + """Check if smartctl is available and can be executed. + + Returns: + - 0 ok + - 1 no such file + - 2 cannot execute file + """ + vprint(3, "Check if %s does exist and can be read" % path) + if not os.access(path, os.F_OK): + print "UNKNOWN: cannot find %s" % path + sys.exit(3) + elif not os.access(path, os.X_OK): + print "UNKNOWN: cannot execute %s" % path + sys.exit(3) + + +def call_smartmontools(path, device): + """Get smartmontool output.""" + cmd = "%s -a %s" % (path, device) + vprint(3, "Get device health status: %s" % cmd) + result = "" + message = "" + code_to_return = 0 + try: + result = subprocess.check_output(cmd, shell=True) + except subprocess.CalledProcessError as error: + # smartctl passes a lot of information via the return code + return_code = error.returncode + if return_code % 2**1 > 0: + # bit 0 is set - command line did not parse + # output is not useful now, simply return + message += "UNKNOWN: smartctl parsing error " + return_code -= 2**0 + code_to_return = 3 + if return_code % 2**2 > 0: + # bit 1 is set - device open failed + # output is not useful now, simply return + message += "UNKNOWN: could not open device " + return_code -= 2**1 + code_to_return = 3 + if return_code % 2**3 > 0: + # bit 2 is set - some smart or ata command failed + # we still want to see what the output says + result = error.output + message += "CRITICAL: some SMART or ATA command to disk " + message += "failed " + return_code -= 2**2 + code_to_return = 2 + if return_code % 2**4 > 0: + # bit 3 is set - smart status returned DISK FAILING + # we still want to see what the output says + 
result = error.output + message += "CRITICAL: SMART statis is DISK FAILING " + return_code -= 2**3 + code_to_return = 2 + if return_code % 2**5 > 0: + # bit 4 is set - prefail attributes found + result = error.output + message += "CRITICAL: prefail attributes found " + return_code -= 2**4 + code_to_return = 2 + if return_code % 2**6 > 0: + # bit 5 is set - disk ok, but prefail attributes in the past + result = error.output + # this should be a warning, but that's too much hasle + message += "WARNING: some prefail attributes were critical " + message += "in the past " + return_code -= 2**5 + code_to_return = 1 + if return_code % 2**7 > 0: + # bit 6 is set - errors recorded in error log + result = error.output + message += "WARNING: errors recorded in error log " + return_code -= 2**6 + code_to_return = 1 + if return_code % 2**8 > 0: + # bit 7 is set - device self-test log contains errors + result = error.output + message += "CRITICAL: self-test log contains errors " + return_code -= 2**7 + code_to_return = 2 + except OSError as error: + code_to_return = 3 + message = "UNKNOWN: call exits unexpectedly (%s)" % error + + return (code_to_return, result, message) + + +def parse_output(output, warning_temp, critical_temp): + """ + Parse smartctl output. + + Returns status of device. 
+ """ + # parse health status + # + # look for line '=== START OF READ SMART DATA SECTION ===' + status_line = "" + health_status = "" + reallocated_sector_ct = 0 + temperature = 0 + reallocated_event_count = 0 + current_pending_sector = 0 + offline_uncorrectable = 0 + error_count = 0 + + lines = output.split("\n") + for line in lines: + # extract status line + if "overall-health self-assessment test result" in line: + status_line = line + parts = status_line.rstrip().split() + health_status = parts[-1:][0] + vprint(3, "Health status: %s" % health_status) + # extract status line (compatibility with all smartctl versions) + if "Health Status" in line: + status_line = line + parts = status_line.rstrip().split() + health_status = parts[-1:][0] + vprint(3, "Health status: %s" % health_status) + + parts = line.split() + if len(parts) > 0: + # self test spans can also start with 5, so we + # need a tighter check here than elsewhere + if parts[0] == "5" and \ + parts[1] == "Reallocated_Sector_Ct" and \ + reallocated_sector_ct == 0: + # extract reallocated_sector_ct + # 5 is the reallocated_sector_ct id + reallocated_sector_ct = int(parts[9]) + vprint(3, "Reallocated_Sector_Ct: %d" % reallocated_sector_ct) + elif parts[0] == "190" and temperature == 0: + # extract temperature + # 190 can be temperature value id too + temperature = int(parts[9]) + vprint(3, "Temperature: %d" % temperature) + elif parts[0] == "194" and temperature == 0: + # extract temperature + # 194 is the temperature value id + temperature = int(parts[9]) + vprint(3, "Temperature: %d" % temperature) + elif parts[0] == "196" and reallocated_event_count == 0: + # extract reallocated_event_count + # 196 is the reallocated_event_count id + reallocated_event_count = int(parts[9]) + vprint( + 3, + "Reallocated_Event_Count: %d" % reallocated_event_count) + elif parts[0] == "197" and current_pending_sector == 0: + # extract current_pending_sector + # 197 is the current_pending_sector id + current_pending_sector = 
int(parts[9])
+                vprint(
+                    3,
+                    "Current_Pending_Sector: %d" % current_pending_sector)
+            elif parts[0] == "198" and offline_uncorrectable == 0:
+                # extract offline_uncorrectable
+                # 198 is the offline_uncorrectable id
+                offline_uncorrectable = int(parts[9])
+                vprint(
+                    3,
+                    "Offline_Uncorrectable: %d" % offline_uncorrectable)
+            elif "ATA Error Count" in line:
+                error_count = int(parts[3])
+                vprint(
+                    3,
+                    "ATA error count: %d" % error_count)
+            elif "No Errors Logged" in line:
+                error_count = 0
+                vprint(
+                    3,
+                    "ATA error count: 0")
+
+    # now create the return information for this device
+    return_status = 0
+    device_status = ""
+
+    # check if smartmon could read device
+    if health_status == "":
+        return (3, "UNKNOWN: could not parse output")
+
+    # check health status (a single test: health_status never changes below, so a loop would not end)
+    if health_status not in ["PASSED", "OK"]:
+        return_status = 2
+        device_status += "CRITICAL: device does not pass health status "
+
+    # check sectors
+    if reallocated_sector_ct > 0 or \
+            reallocated_event_count > 0 or \
+            current_pending_sector > 0 or \
+            offline_uncorrectable > 0:
+        return_status = 2
+        device_status += "CRITICAL: there is a problem with bad sectors "
+        device_status += "on the drive. "
+        device_status += "Reallocated_Sector_Ct:%d, " % reallocated_sector_ct
+        device_status += "Reallocated_Event_Count:%d, " % reallocated_event_count
+        device_status += "Current_Pending_Sector:%d, " % current_pending_sector
+        device_status += "Offline_Uncorrectable:%d " % offline_uncorrectable
+
+    # check temperature
+    if temperature > critical_temp:
+        return_status = 2
+        device_status += "CRITICAL: device temperature (%d)" % temperature
+        device_status += "exceeds critical temperature "
+        device_status += "threshold (%s) " % critical_temp
+    elif temperature > warning_temp:
+        # don't downgrade return status!
+ if return_status < 2: + return_status = 1 + device_status += "WARNING: device temperature (%d) " % temperature + device_status += "exceeds warning temperature " + device_status += "threshold (%s) " % warning_temp + + # check error count + if error_count > 0: + if return_status < 2: + return_status = 1 + device_status += "WARNING: error count %d " % error_count + + if return_status == 0: + # no warnings or errors, report everything is ok + device_status = "OK: device is functional and stable " + device_status += "(temperature: %d) " % temperature + + return (return_status, device_status) + + +def vprint(level, text): + """Verbosity print. + + Decide according to the given verbosity level if the message will be + printed to stdout. + """ + if level <= verbosity: + print text + + +if __name__ == "__main__": + # pylint: disable=invalid-name + (options, args) = parse_cmd_line(sys.argv) + verbosity = options.verbosity + + check_smartmontools(_smartctl_path) + + vprint(2, "Get device name") + # assemble device list to be monitored + if not options.alldisks: + devices = [options.device] + else: + devices = [] + # Regex for Valid device name + valid_device_name = '/dev/[ahsv]d.*' + for partition in psutil.disk_partitions(): + if re.search(valid_device_name, partition.device): + devices.append(partition.device.strip(partition.device[-1])) + vprint(1, "Devices: %s" % devices) + + return_text = "" + exit_status = 0 + for device in devices: + vprint(1, "Device: %s" % device) + return_text += "%s: " % device + + # check if we can access 'path' + vprint(2, "Check device") + (return_status, message) = check_device_permissions(device) + if return_status != 0: + if exit_status < return_status: + exit_status = return_status + return_text += message + + # call smartctl and parse output + vprint(2, "Call smartctl") + return_status, output, message = call_smartmontools( + _smartctl_path, + device) + if return_status != 0: + if exit_status < return_status: + exit_status = return_status 
+ return_text += message + if output != "": + vprint(2, "Parse smartctl output") + return_status, device_status = parse_output( + output, + options.warning_temp, + options.critical_temp) + if exit_status < return_status: + exit_status = return_status + return_text += device_status + + print return_text + sys.exit(exit_status) diff --git a/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_smartmon2.py b/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_smartmon2.py new file mode 100755 index 0000000..3140ce3 --- /dev/null +++ b/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_smartmon2.py @@ -0,0 +1,308 @@ +#!/usr/bin/python + +# -*- coding: iso8859-1 -*- +# +# $Id: version.py 133 2006-03-24 10:30:20Z fuller $ +# +# check_smartmon +# Copyright (C) 2006 daemogorgon.net +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +# +# MOD by basos (basos= attrSpec[1]: + return (2,)+attrSpec ; # crit (val >= crit) + elif raw >= attrSpec[0]: + return (1,)+attrSpec; # warn (val >= warn) + return (0,)+attrSpec; # ok + #def + + def formatAttr(s_id, s_name, s_failed, s_value, s_thres): + return "Attribute "+str(s_name)+" ("+str(s_id)+") seems to fail ("+(str(s_failed) if s_failed != '-' else \ + str(s_value)+"<="+str(s_thres))+")" + + def pipeStuff(s_name, s_value, s_thres, s_cthres = None, s_min = "0", s_max = "254"): + if (None == s_cthres): s_cthres = s_thres + return "|"+str(s_name)+"="+str(s_value)+";"+str(s_thres)+";"+str(s_cthres)+((";"+s_min+((";"+s_max) if s_max != None else "")) if s_min != None else "") + + # parse health status + # + statusLine = "" + lines = healthMessage.split("\n") + getNext = 0 + healthStatus = '' + temperStuff = (None, "Temperature_not_found", 0 , 0) + in_attrs = 0 + ret = (3, "UNKNNOWN: smartctl encountered an error") + for line in lines: + if healthStatus == '' and re.search(r"^SMART overall-health self-assessment test result", line): + ps = line.split() + healthStatus = ps[-1] + vprint(3, "Health status: %s" % healthStatus) + # this is absolutely critical! 
+ if healthStatus != "PASSED": + ret = (2, "CRITICAL: SMART overall health test failed") + else: + ret = (0, "OK: device is functional and stable") + continue; + # fi + if in_attrs == 1 and (not line or not re.search(r"\d+", line.split()[0])): + vprint(3, "End of Attributes parsing"); + in_attrs = 0 + if in_attrs: + ps = line.split() + s_id, s_name, s_flag, s_value, s_worst, s_thres, s_type, s_updated, s_failed, s_raw = \ + ps[0], ps[1], ps[2], ps[3], ps[4], ps[5], ps[6], ps[7], ps[8], ps[9] + vprint(2, "Checking Attr: %s (%s), v: %s (%s)" % (s_name, s_id, s_value, s_raw)) + if (s_failed != '-' or int(s_value) <= int(s_thres)): + if s_type.lower() == 'pre-fail' : + if s_failed.lower() != 'in_the_past': + ret = (2, "CRITICAL: "+formatAttr(s_id, s_name, s_failed, s_value, s_thres) + \ + pipeStuff(s_name, s_value, s_thres)) + elif ret[0] < 2: + ret = (1, "WARNING: "+formatAttr(s_id, s_name, s_failed, s_value, s_thres) + \ + pipeStuff(s_name, s_value, s_thres)) + elif ret[0] < 2 and s_failed.lower() != 'in_the_past': + ret = (1, "WARNING: "+formatAttr(s_id, s_name, s_failed, s_value, s_thres) + \ + pipeStuff(s_name, s_value, s_thres)) + # fi + tmp = checkAttr(s_id, s_raw, attrSpecs) + if tmp[0] > 0: + if ret[0] < 2: + #ret = (tmp[0], "CRITICAL" if tmp[0] == 2 else "WARNING"+": Attribute raw "+str(s_name)+" ("+str(s_id)+") over "+("critical" if tmp[0] == 2 \ + #else "warning")+"threshold ("+str(s_raw)+">="+str(tmp[2 if tmp[0] == 2 else 1]+")")) + vprint(2, "Attr "+str(s_id)+", "+str(s_raw)+" over thres "+str(tmp)); + ret = (tmp[0], ("CRITICAL" if tmp[0] == 2 else "WARNING")+": " + \ + formatAttr(s_id, s_name, "Raw_limits", s_raw, tmp[2 if tmp[0] == 2 else 1]) + \ + pipeStuff(s_name, s_raw, tmp[1], tmp[2], *(("0", "100") if len(tmp) == 4 and tmp[3] else ()))) + elif tmp[0] == 0 and len(tmp) == 4 and tmp[3]: + # found temperature + temperStuff = (s_name, s_raw) + tmp[1:3] + elif 
re.search(r"ID.?\s+ATTRIBUTE_NAME\s+FLAG\s+VALUE\s+WORST\s+THRESH\s+TYPE\s+UPDATED\s+WHEN_FAILED\s+RAW_VALUE", line): + vprint(3, "Start of Attributes parsing") + in_attrs = 1 + else: + vprint(3, "Ommiting smartctl row: %s" % line) + # ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE + # done + if ret[0] == 0 and temperStuff[0] != None: + temperStuff = temperStuff + ("0", "100") + ret = (0, ret[1] + pipeStuff(*temperStuff)) + + if ret[0] == 3: + vprint(3, "smartctl error, output: %s" % healthMessage); + return ret + ##(0, "OK: device is functional and stable (temperature: %d)" % temperature) +# end + + +def exitWithMessage(value, message): + """Exit with given value and status message.""" + + print message + sys.exit(value) +# end + + +def vprint(level, message): + """Verbosity print. + + Decide according to the given verbosity level if the message will be + printed to stdout. + """ + + if level <= verbosity: + print message + # fi +# end + + +if __name__ == "__main__": + (options, args) = parseCmdLine(sys.argv) + verbosity = options.verbosity + + vprint(2, "Get device name") + device = options.device + vprint(1, "Device: %s" % device) + + # check if we can access 'path' + vprint(2, "Check device") + (value, message) = checkDevice(device) + if value != 0: + exitWithMessage(3, message) + # fi + + # check if we have smartctl available + #(value, message) = checkSmartMonTools(options.smartctlPath) + #if value != 0: + # exitWithMessage(3, message) + # fi + vprint(1, "Path to smartctl: %s" % options.smartctlPath) + + # call smartctl and parse output + vprint(2, "Call smartctl") + (value, message, healthStatusOutput) = callSmartMonTools(options.smartctlPath, device, options.noCheckStandby, options.smartctlArgs) + if value != 0: + exitWithMessage(value, message) + + vprint(2, "Parse smartctl output and return info") + attrs = {} + if options.rawAttrs: + for at in options.rawAttrs: + attrs[at[0]] = at[1:3] + attrs[options.tempAttrId] = 
(options.warningThreshold, options.criticalThreshold, True) + (value, message) = createReturnInfo(healthStatusOutput, attrs) + + # exit program + exitWithMessage(value, message) + +# fi diff --git a/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_zpools.sh b/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_zpools.sh new file mode 100755 index 0000000..ad5861e --- /dev/null +++ b/nagios-plugins-ets-1.4/usr/lib64/nagios/plugins/check_zpools.sh @@ -0,0 +1,155 @@ +#!/usr/bin/env bash +######################################################################### +# Script: check_zpools.sh +# Purpose: Nagios plugin to monitor status of zfs pool +# Authors: Aldo Fabi First version (2006-09-01) +# Vitaliy Gladkevitch Forked (2013-02-04) +# Claudio Kuenzler Complete redo, perfdata, etc (2013-2023) +# Per von Zweigbergk Various fixes (2016-10-12) +# @waoki Trap zpool command errors (2022-03-01) +# @mrdsam Improvement (2022-05-24) +# Doc: http://www.claudiokuenzler.com/monitoring-plugins/check_zpools.php +# History: +# 2006-09-01 Original first version +# 2006-10-04 Updated (no change history known) +# 2013-02-04 Forked and released +# 2013-05-08 Make plugin work on different OS, pepp up plugin +# 2013-05-09 Bugfix in exit code handling +# 2013-05-10 Removed old exit vars (not used anymore) +# 2013-05-21 Added performance data (percentage used) +# 2013-07-11 Bugfix in zpool health check +# 2014-02-10 Bugfix in threshold comparison +# 2014-03-11 Allow plugin to run without enforced thresholds +# 2016-10-12 Fixed incorrect shell quoting and typos +# 2022-03-01 Merge PR #10, manually solve conflicts +# 2022-05-24 Removed need for 'awk', using bash-functions instead +# 2023-02-15 Bugfix in single pool CRITICAL output (issue #13) +######################################################################### +### Begin vars +STATE_OK=0 # define the exit code if status is OK +STATE_WARNING=1 # define the exit code if status is Warning +STATE_CRITICAL=2 # define the exit code if 
status is Critical +STATE_UNKNOWN=3 # define the exit code if status is Unknown +# Set path +PATH=$PATH:/usr/sbin:/sbin +export PATH +### End vars +######################################################################### +help="check_zpools.sh (c) 2006-2023 multiple authors\n +Usage: $0 -p (poolname|ALL) [-w warnpercent] [-c critpercent]\n +Example: $0 -p ALL -w 80 -c 90" +######################################################################### +# Check necessary commands are available +for cmd in zpool [ +do + if ! which "$cmd" 1>/dev/null + then + echo "UNKNOWN: ${cmd} does not exist, please check if command exists and PATH is correct" + exit ${STATE_UNKNOWN} + fi +done +######################################################################### +# Check for people who need help - we are nice ;-) +if [ "${1}" = "--help" ] || [ "${#}" = "0" ]; + then + echo -e "${help}"; + exit ${STATE_UNKNOWN}; +fi +######################################################################### +# Get user-given variables +while getopts "p:w:c:" Input; +do + case ${Input} in + p) pool=${OPTARG};; + w) warn=${OPTARG};; + c) crit=${OPTARG};; + *) echo -e "$help" + exit $STATE_UNKNOWN + ;; + esac +done +######################################################################### +# Did user obey to usage? +if [ -z "$pool" ]; then echo -e "$help"; exit ${STATE_UNKNOWN}; fi +######################################################################### +# Verify threshold sense +if [[ -n $warn ]] && [[ -z $crit ]]; then echo "Both warning and critical thresholds must be set"; exit $STATE_UNKNOWN; fi +if [[ -z $warn ]] && [[ -n $crit ]]; then echo "Both warning and critical thresholds must be set"; exit $STATE_UNKNOWN; fi +if [[ $warn -gt $crit ]]; then echo "Warning threshold cannot be greater than critical"; exit $STATE_UNKNOWN; fi +######################################################################### +# What needs to be checked? 
+## Check all pools +if [ "$pool" = "ALL" ] +then + POOLS=($(zpool list -Ho name)) + if [ $? -ne 0 ]; then + echo "UNKNOWN zpool query failed"; exit $STATE_UNKNOWN + fi + p=0 + for POOL in ${POOLS[*]} + do + CAPACITY=$(zpool list -Ho capacity "$POOL") + CAPACITY=${CAPACITY%\%} + HEALTH=$(zpool list -Ho health "$POOL") + if [ $? -ne 0 ]; then + echo "UNKNOWN zpool query failed"; exit $STATE_UNKNOWN + fi + # Check with thresholds + if [[ -n $warn ]] && [[ -n $crit ]] + then + if [[ $CAPACITY -ge $crit ]] + then error[${p}]="POOL $POOL usage is CRITICAL (${CAPACITY}%)"; fcrit=1 + elif [[ $CAPACITY -ge $warn && $CAPACITY -lt $crit ]] + then error[$p]="POOL $POOL usage is WARNING (${CAPACITY}%)" + elif [ "$HEALTH" != "ONLINE" ] + then error[${p}]="$POOL health is $HEALTH"; fcrit=1 + fi + # Check without thresholds + else + if [ "$HEALTH" != "ONLINE" ] + then error[${p}]="$POOL health is $HEALTH"; fcrit=1 + fi + fi + perfdata[$p]="$POOL=${CAPACITY}% " + let p++ + done + + if [[ ${#error[*]} -gt 0 ]] + then + if [[ $fcrit -eq 1 ]]; then exit_code=2; else exit_code=1; fi + echo "ZFS POOL ALARM: ${error[*]}|${perfdata[*]}"; exit ${exit_code} + else echo "ALL ZFS POOLS OK (${POOLS[*]})|${perfdata[*]}"; exit 0 + fi + +## Check single pool +else + CAPACITY=$(zpool list -Ho capacity "$pool" 2>&1 ) + CAPACITY=${CAPACITY%\%} + if [[ -n $(echo "${CAPACITY}" | egrep -q 'no such pool$') ]]; then + echo "zpool $pool does not exist"; exit $STATE_CRITICAL + fi + HEALTH=$(zpool list -Ho health "$pool") + if [ $? 
-ne 0 ]; then + echo "UNKNOWN zpool query failed"; exit $STATE_UNKNOWN + fi + + if [[ -n $warn ]] && [[ -n $crit ]] + then + # Check with thresholds + if [ "$HEALTH" != "ONLINE" ]; then echo "ZFS POOL $pool health is $HEALTH|$pool=${CAPACITY}%"; exit ${STATE_CRITICAL} + elif [[ $CAPACITY -ge $crit ]]; then echo "ZFS POOL $pool usage is CRITICAL (${CAPACITY}%)|$pool=${CAPACITY}%"; exit ${STATE_CRITICAL} + elif [[ $CAPACITY -ge $warn && $CAPACITY -lt $crit ]]; then echo "ZFS POOL $pool usage is WARNING (${CAPACITY}%)|$pool=${CAPACITY}%"; exit ${STATE_WARNING} + else echo "ALL ZFS POOLS OK ($pool)|$pool=${CAPACITY}%"; exit ${STATE_OK} + fi + else + # Check without thresholds + if [ "$HEALTH" != "ONLINE" ] + then echo "ZFS POOL $pool health is $HEALTH|$pool=${CAPACITY}%"; exit ${STATE_CRITICAL} + else echo "ALL ZFS POOLS OK ($pool)|$pool=${CAPACITY}%"; exit ${STATE_OK} + fi + fi + +fi + +echo "UNKNOWN - Should never reach this part" +exit ${STATE_UNKNOWN}