From eed91835a4aaddc5dce98f21089bd1b339a72930 Mon Sep 17 00:00:00 2001 From: Gardouille Date: Sun, 1 Dec 2024 16:50:27 +0100 Subject: [PATCH] New path for Xymon --- xymon/get.xymon.alert.vars.sh | 23 ++ xymon/plugins/client/ext/dcheck | 8 + xymon/plugins/client/ext/dres | 258 ++++++++++++++++ xymon/plugins/client/ext/dscan | 13 + xymon/plugins/client/ext/sge.sh | 242 +++++++++++++++ xymon/plugins/client/ext/smart | 202 +++++++++++++ xymon/plugins/client/ext/smartoverall | 300 +++++++++++++++++++ xymon/plugins/client/ext/test.int.compare.sh | 66 ++++ xymon/sample.messages/apt.alert | 28 ++ xymon/sample.messages/cpu.alert | 62 ++++ xymon/sample.messages/files.alert | 41 +++ xymon/sample.messages/libs.alert | 29 ++ xymon/sample.messages/memory.alert | 22 ++ xymon/sample.messages/ntpq.alert | 38 +++ xymon/sample.messages/ports.alert | 31 ++ xymon/sample.messages/procs.alert | 71 +++++ xymon/tar.client.logfiles.sh | 65 ++++ xymon/xymon.alert.sh | 59 ++++ xymon/xymon.apt.alert.sh | 62 ++++ xymon/xymon.files.alert.sh | 102 +++++++ xymon/xymon.libs.alert.sh | 159 ++++++++++ xymon/xymon.procs.alert.sh | 107 +++++++ 22 files changed, 1988 insertions(+) create mode 100755 xymon/get.xymon.alert.vars.sh create mode 100644 xymon/plugins/client/ext/dcheck create mode 100644 xymon/plugins/client/ext/dres create mode 100644 xymon/plugins/client/ext/dscan create mode 100755 xymon/plugins/client/ext/sge.sh create mode 100755 xymon/plugins/client/ext/smart create mode 100755 xymon/plugins/client/ext/smartoverall create mode 100755 xymon/plugins/client/ext/test.int.compare.sh create mode 100644 xymon/sample.messages/apt.alert create mode 100644 xymon/sample.messages/cpu.alert create mode 100644 xymon/sample.messages/files.alert create mode 100644 xymon/sample.messages/libs.alert create mode 100644 xymon/sample.messages/memory.alert create mode 100644 xymon/sample.messages/ntpq.alert create mode 100644 xymon/sample.messages/ports.alert create mode 100644 
xymon/sample.messages/procs.alert create mode 100755 xymon/tar.client.logfiles.sh create mode 100755 xymon/xymon.alert.sh create mode 100755 xymon/xymon.apt.alert.sh create mode 100755 xymon/xymon.files.alert.sh create mode 100755 xymon/xymon.libs.alert.sh create mode 100755 xymon/xymon.procs.alert.sh diff --git a/xymon/get.xymon.alert.vars.sh b/xymon/get.xymon.alert.vars.sh new file mode 100755 index 0000000..3b20918 --- /dev/null +++ b/xymon/get.xymon.alert.vars.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +log_file="/tmp/xymon.alert.${BBHOSTSVC}.vars" + +# Print variables for an alert on a specific host +rm -f -- "${log_file}" +touch "${log_file}" +printf '%b' "BBCOLORLEVEL=\"${BBCOLORLEVEL}\"\\n" >> "${log_file}" +printf '%b' "BBALPHAMSG=\"${BBALPHAMSG}\"\\n" >> "${log_file}" +printf '%b' "ACKCODE=\"${ACKCODE}\"\\n" >> "${log_file}" +printf '%b' "RCPT=\"${RCPT}\"\\n" >> "${log_file}" +printf '%b' "BBHOSTNAME=\"${BBHOSTNAME}\"\\n" >> "${log_file}" +printf '%b' "MACHIP=\"${MACHIP}\"\\n" >> "${log_file}" +printf '%b' "BBSVCNAME=\"${BBSVCNAME}\"\\n" >> "${log_file}" +printf '%b' "BBSVCNUM=\"${BBSVCNUM}\"\\n" >> "${log_file}" +printf '%b' "BBHOSTSVC=\"${BBHOSTSVC}\"\\n" >> "${log_file}" +printf '%b' "BBHOSTSVCCOMMAS=\"${BBHOSTSVCCOMMAS}\"\\n" >> "${log_file}" +printf '%b' "BBNUMERIC=\"${BBNUMERIC}\"\\n" >> "${log_file}" +printf '%b' "RECOVERED=\"${RECOVERED}\"\\n" >> "${log_file}" +printf '%b' "DOWNSECS=\"${DOWNSECS}\"\\n" >> "${log_file}" +printf '%b' "DOWNSECSMSG=\"${DOWNSECSMSG}\"\\n" >> "${log_file}" + +exit 0 diff --git a/xymon/plugins/client/ext/dcheck b/xymon/plugins/client/ext/dcheck new file mode 100644 index 0000000..78593b5 --- /dev/null +++ b/xymon/plugins/client/ext/dcheck @@ -0,0 +1,8 @@ +3&green /dev/sda auto +3&green /dev/sdb auto +3&green /dev/sdc auto +3&green /dev/sdd auto +3&green /dev/sde auto +4&clear /dev/sdf unsupported +3&green /dev/bus/0 megaraid,12 +3&green /dev/bus/0 megaraid,13 diff --git a/xymon/plugins/client/ext/dres 
b/xymon/plugins/client/ext/dres new file mode 100644 index 0000000..b4b490f --- /dev/null +++ b/xymon/plugins/client/ext/dres @@ -0,0 +1,258 @@ +&green /dev/sda auto + +SMART Health Status: OK +&clear Device is unknown or not complete in smartmontools database. Please take a look to the FAQ: +https://www.smartmontools.org/wiki/FAQ#SmartmontoolsDatabase +=== START OF INFORMATION SECTION === +Vendor: HGST +Product: HUS728T8TAL5200 +Revision: RS01 +Compliance: SPC-4 +User Capacity: 8,001,563,222,016 bytes [8.00 TB] +Logical block size: 512 bytes +Physical block size: 4096 bytes +Formatted with type 2 protection +LU is fully provisioned +Rotation Rate: 7200 rpm +Form Factor: 3.5 inches +Logical Unit id: 0x5000cca09976b8c4 +Serial number: VAJ392BL +Device type: disk +Transport protocol: SAS (SPL-3) +Local Time is: Fri Feb 28 15:07:11 2020 CET +SMART support is: Available - device has SMART capability. +SMART support is: Enabled +Temperature Warning: Disabled or Not Supported + +=== START OF READ SMART DATA SECTION === +SMART Self-test log +Num Test Status segment LifeTime LBA_first_err [SK ASC ASQ] + Description number (hours) +# 1 Background short Completed 96 5771 - [- - -] +# 2 Background short Completed 96 5747 - [- - -] +------------------------------------------------------------ + + +&green /dev/sdb auto + +SMART Health Status: OK +&clear Device is unknown or not complete in smartmontools database. 
Please take a look to the FAQ: +https://www.smartmontools.org/wiki/FAQ#SmartmontoolsDatabase +=== START OF INFORMATION SECTION === +Vendor: HGST +Product: HUS728T8TAL5200 +Revision: RS01 +Compliance: SPC-4 +User Capacity: 8,001,563,222,016 bytes [8.00 TB] +Logical block size: 512 bytes +Physical block size: 4096 bytes +Formatted with type 2 protection +LU is fully provisioned +Rotation Rate: 7200 rpm +Form Factor: 3.5 inches +Logical Unit id: 0x5000cca09975fc04 +Serial number: VAJ2WHPL +Device type: disk +Transport protocol: SAS (SPL-3) +Local Time is: Fri Feb 28 15:07:11 2020 CET +SMART support is: Available - device has SMART capability. +SMART support is: Enabled +Temperature Warning: Disabled or Not Supported + +=== START OF READ SMART DATA SECTION === +SMART Self-test log +Num Test Status segment LifeTime LBA_first_err [SK ASC ASQ] + Description number (hours) +# 1 Background short Completed 96 5607 - [- - -] +# 2 Background short Completed 96 5583 - [- - -] +------------------------------------------------------------ + + +&green /dev/sdc auto + +SMART Health Status: OK +&clear Device is unknown or not complete in smartmontools database. Please take a look to the FAQ: +https://www.smartmontools.org/wiki/FAQ#SmartmontoolsDatabase +=== START OF INFORMATION SECTION === +Vendor: HGST +Product: HUS728T8TAL5200 +Revision: RS01 +Compliance: SPC-4 +User Capacity: 8,001,563,222,016 bytes [8.00 TB] +Logical block size: 512 bytes +Physical block size: 4096 bytes +Formatted with type 2 protection +LU is fully provisioned +Rotation Rate: 7200 rpm +Form Factor: 3.5 inches +Logical Unit id: 0x5000cca099757c5c +Serial number: VAJ2M04L +Device type: disk +Transport protocol: SAS (SPL-3) +Local Time is: Fri Feb 28 15:07:11 2020 CET +SMART support is: Available - device has SMART capability. 
+SMART support is: Enabled +Temperature Warning: Disabled or Not Supported + +=== START OF READ SMART DATA SECTION === +SMART Self-test log +Num Test Status segment LifeTime LBA_first_err [SK ASC ASQ] + Description number (hours) +# 1 Background short Completed 96 5859 - [- - -] +# 2 Background short Completed 96 5835 - [- - -] +------------------------------------------------------------ + + +&green /dev/sdd auto + +SMART Health Status: OK +&clear Device is unknown or not complete in smartmontools database. Please take a look to the FAQ: +https://www.smartmontools.org/wiki/FAQ#SmartmontoolsDatabase +=== START OF INFORMATION SECTION === +Vendor: HGST +Product: HUS728T8TAL5200 +Revision: RS01 +Compliance: SPC-4 +User Capacity: 8,001,563,222,016 bytes [8.00 TB] +Logical block size: 512 bytes +Physical block size: 4096 bytes +Formatted with type 2 protection +LU is fully provisioned +Rotation Rate: 7200 rpm +Form Factor: 3.5 inches +Logical Unit id: 0x5000cca099765214 +Serial number: VAJ327BL +Device type: disk +Transport protocol: SAS (SPL-3) +Local Time is: Fri Feb 28 15:07:11 2020 CET +SMART support is: Available - device has SMART capability. +SMART support is: Enabled +Temperature Warning: Disabled or Not Supported + +=== START OF READ SMART DATA SECTION === +SMART Self-test log +Num Test Status segment LifeTime LBA_first_err [SK ASC ASQ] + Description number (hours) +# 1 Background short Completed 96 5599 - [- - -] +# 2 Background short Completed 96 5575 - [- - -] +------------------------------------------------------------ + + +&green /dev/sde auto + +SMART Health Status: OK +&clear Device is unknown or not complete in smartmontools database. 
Please take a look to the FAQ: +https://www.smartmontools.org/wiki/FAQ#SmartmontoolsDatabase +=== START OF INFORMATION SECTION === +Vendor: HGST +Product: HUS728T8TAL5200 +Revision: RS01 +Compliance: SPC-4 +User Capacity: 8,001,563,222,016 bytes [8.00 TB] +Logical block size: 512 bytes +Physical block size: 4096 bytes +Formatted with type 2 protection +LU is fully provisioned +Rotation Rate: 7200 rpm +Form Factor: 3.5 inches +Logical Unit id: 0x5000cca09976e460 +Serial number: VAJ3BZDL +Device type: disk +Transport protocol: SAS (SPL-3) +Local Time is: Fri Feb 28 15:07:11 2020 CET +SMART support is: Available - device has SMART capability. +SMART support is: Enabled +Temperature Warning: Disabled or Not Supported + +=== START OF READ SMART DATA SECTION === +SMART Self-test log +Num Test Status segment LifeTime LBA_first_err [SK ASC ASQ] + Description number (hours) +# 1 Background short Completed 96 5599 - [- - -] +# 2 Background short Completed 96 5575 - [- - -] +------------------------------------------------------------ + + +&clear /dev/sdf unsupported + +SMART Health Status can't be determine because of: +SMART support is: Unavailable - device lacks SMART capability. + +=== START OF INFORMATION SECTION === +Vendor: DELL +Product: PERC H730P Mini +Revision: 4.30 +User Capacity: 146,163,105,792 bytes [146 GB] +Logical block size: 512 bytes +Logical Unit id: 0x61866da06192eb00256e8c0a2d73f5b6 +Serial number: 00b6f5732d0a8c6e2500eb9261a06d86 +Device type: disk +Local Time is: Fri Feb 28 15:07:12 2020 CET +SMART support is: Unavailable - device lacks SMART capability. + + +------------------------------------------------------------ + + +&green /dev/bus/0 megaraid,12 + +SMART Health Status: OK +&clear Device is unknown or not complete in smartmontools database. 
Please take a look to the FAQ: +https://www.smartmontools.org/wiki/FAQ#SmartmontoolsDatabase +=== START OF INFORMATION SECTION === +Vendor: SEAGATE +Product: ST9146803SS +Revision: FS64 +User Capacity: 146,815,733,760 bytes [146 GB] +Logical block size: 512 bytes +Rotation Rate: 10000 rpm +Form Factor: 2.5 inches +Logical Unit id: 0x5000c5003ac7ef07 +Serial number: 6SD3HJV0 +Device type: disk +Transport protocol: SAS (SPL-3) +Local Time is: Fri Feb 28 15:07:13 2020 CET +SMART support is: Available - device has SMART capability. +SMART support is: Enabled +Temperature Warning: Disabled or Not Supported + +=== START OF READ SMART DATA SECTION === +SMART Self-test log +Num Test Status segment LifeTime LBA_first_err [SK ASC ASQ] + Description number (hours) +# 1 Background short Completed 16 62479 - [- - -] +# 2 Background short Completed 16 62455 - [- - -] +------------------------------------------------------------ + + +&green /dev/bus/0 megaraid,13 + +SMART Health Status: OK +&clear Device is unknown or not complete in smartmontools database. Please take a look to the FAQ: +https://www.smartmontools.org/wiki/FAQ#SmartmontoolsDatabase +=== START OF INFORMATION SECTION === +Vendor: SEAGATE +Product: ST9146803SS +Revision: FS64 +User Capacity: 146,815,733,760 bytes [146 GB] +Logical block size: 512 bytes +Rotation Rate: 10000 rpm +Form Factor: 2.5 inches +Logical Unit id: 0x5000c5003ac956db +Serial number: 6SD3HH6J +Device type: disk +Transport protocol: SAS (SPL-3) +Local Time is: Fri Feb 28 15:07:14 2020 CET +SMART support is: Available - device has SMART capability. 
+SMART support is: Enabled +Temperature Warning: Disabled or Not Supported + +=== START OF READ SMART DATA SECTION === +SMART Self-test log +Num Test Status segment LifeTime LBA_first_err [SK ASC ASQ] + Description number (hours) +# 1 Background short Completed 16 61374 - [- - -] +# 2 Background short Completed 16 61350 - [- - -] +------------------------------------------------------------ + + diff --git a/xymon/plugins/client/ext/dscan b/xymon/plugins/client/ext/dscan new file mode 100644 index 0000000..d7a76f8 --- /dev/null +++ b/xymon/plugins/client/ext/dscan @@ -0,0 +1,13 @@ +/dev/sda -d scsi # /dev/sda, SCSI device +/dev/sdb -d scsi # /dev/sdb, SCSI device +/dev/sdc -d scsi # /dev/sdc, SCSI device +/dev/sdd -d scsi # /dev/sdd, SCSI device +/dev/sde -d scsi # /dev/sde, SCSI device +/dev/sdf -d scsi # /dev/sdf, SCSI device +/dev/bus/0 -d megaraid,0 # /dev/bus/0 [megaraid_disk_00], SCSI device +/dev/bus/0 -d megaraid,1 # /dev/bus/0 [megaraid_disk_01], SCSI device +/dev/bus/0 -d megaraid,2 # /dev/bus/0 [megaraid_disk_02], SCSI device +/dev/bus/0 -d megaraid,3 # /dev/bus/0 [megaraid_disk_03], SCSI device +/dev/bus/0 -d megaraid,4 # /dev/bus/0 [megaraid_disk_04], SCSI device +/dev/bus/0 -d megaraid,12 # /dev/bus/0 [megaraid_disk_12], SCSI device +/dev/bus/0 -d megaraid,13 # /dev/bus/0 [megaraid_disk_13], SCSI device diff --git a/xymon/plugins/client/ext/sge.sh b/xymon/plugins/client/ext/sge.sh new file mode 100755 index 0000000..6c83b23 --- /dev/null +++ b/xymon/plugins/client/ext/sge.sh @@ -0,0 +1,242 @@ +#!/bin/sh +# +# SGE: Sun Grid Engine check - Xymon external script test +# +##### Purpose is to report back to a central server, all Sun +##### Grid Engine software faults. 
+##### +# +# version 0.4 +# +# BIG BROTHER / XXXXXXXXXXXXXXXX status +# +# Written by Butch Deal +# Daniel Gomez +# Jérémy Gardais +# +# v0.4 09/06/20 clean, correction,… for Xymon 4.3.28 +# v0.3e 10/14/08 cut down on the number of qhost runs +# v0.3d 03/31/06 added alarm/suspend state identification +# v0.3c 03/01/06 propagated yellow state upon UNAVAILABLE queue instances +# v0.3b 01/31/06 fixed yellow warning queue status for ambiguous config test +# v0.3a 01/31/06 added unknown queue status and ambiguous config test +# v0.3 01/26/06 fixed status reporting and optimized job status +# v0.2 08/03/05 flag disabled queues as clear +# v0.1 07/28/05 authored + +######################################## +# NOTE +# The version v0.4 has only been tested with Xymon (server and client) 4.2.x. +# +# The color status with respect to queue status is arbitrary and should be +# reviewed for your particular environment. +# +# Tested on : +# Solaris & Linux +# Linux only (for v0.4) +######################################## + +######################################## +# INSTALLATION +# step 1 - copy to Xymon client's ext dir +# step 2 - create a new clientlaunch.d/sge.cfg file +# step 3 - restart Xymon client +# +# NOTE - the TEST variable in the configuration section is the name used +# as the column header. 
+######################################## + +################################## +# CONFIGURE IT HERE +################################## +readonly PLUGIN_NAME=$(basename "${0}") + +readonly TEST="sge" +readonly PLUGIN_RESULT="${XYMONTMP}/${MACHINEDOTS}.${TEST}.plugin_result" +readonly PLUGIN_STATE="${XYMONTMP}/${MACHINEDOTS}.${TEST}.plugin_state" +true > "${PLUGIN_STATE}" + +readonly QSTAT=$(command -v qstat) +readonly QHOST=$(command -v qhost) +readonly QSELECT=$(command -v qselect) +export QSTAT QHOST QSELECT + +# define colours for graphics +# Comment these out if using older BB versions +CLEAR_PIC="&clear" +RED_PIC="&red" +YELLOW_PIC="&yellow" +GREEN_PIC="&green" +UNKNOWN_PIC="&purple" + +################################## +# Start of script +################################## + +get_header() +{ + echo "" + #echo "$1 ($2)
" + echo "$1
" + # If you do not want the header in a bigger font use line below instead + #echo "$1 ($2)" + # If you want the "Paul Luzzi" look uncomment this section and comment + # out the above sections: + #echo "


" + #echo "============== $1 ==============" + #echo "--- ($2) ---" + #echo "
" + #echo "
" +} +get_header_small() +{ + echo "" + #echo "$1 ($2)
" + echo "$1
" + # If you do not want the header in a bigger font use line below instead + # echo "$1 ($2)" + # If you want the "Paul Luzzi" look uncomment this section and comment + # out the above sections: + #echo "


" + #echo "============== $1 ==============" + #echo "--- ($2) ---" + #echo "
" + #echo "
" +} + + +get_footer() +{ + echo "" + # If you want the "Paul Luzzi" look uncomment this section and comment + # out the above sections: + #echo "
" +} + +##### +##### Get Status proc - used to get all responses +##### +get_status() +{ + + # Check defaults have been set + if [ "${QSTAT}" = "" ]; then + readonly QSTAT=$(command -v qstat) + echo "" + echo "$YELLOW_PIC QSTAT command is not defined in etc/bbsys.local - using default: ${QSTAT}" + fi + + if [ "${QHOST}" = "" ]; then + readonly QHOST=$(command -v qhost) + echo "" + echo "$YELLOW_PIC QHOST command is not defined in etc/bbsys.local - using default: ${QHOST}" + fi + + if [ "${QSELECT}" = "" ]; then + readonly QSELECT=$(command -v qselect) + echo "" + echo "$YELLOW_PIC QSELECT command is not defined in etc/bbsys.local - using default: ${QSELECT}" + fi + + ### + ### Check the jobs + ### + get_header "Jobs" "$QSTAT -l hostname=$MACHINEDOTS" + jobs=$(${QSTAT} -l hostname="${MACHINEDOTS}" -s r -u \*) + if [ -z "$jobs" ]; then + echo "No Running Jobs" + else + ${QSTAT} -l hostname="${MACHINEDOTS}" -s r -u \* + fi + get_footer + + ### + ### Check the host + ### + get_header "Host" "$METAHS -i" + ${QHOST} -h "${MACHINEDOTS}" | grep -v "global" + get_footer + + ### + ### Identify queue memberships + ### + #get_header "Queue Membership" "$QHOST -q" + #${QHOST} -h ${MACHINEDOTS} -q | tail +5 + #get_footer + + ### + ### Check queue instance states + ### + queueTriggered=false; + ${QHOST} -h "${MACHINEDOTS}" -q | tail +5 > "${PLUGIN_RESULT}.QSTATE" + while IFS= read -r _LINE; do + queue=$(printf -- '%s' "${_LINE}" | awk '{ print $1 }') + qstate=$(printf -- '%s' "${_LINE}" | awk '{ print $4 }') + + # Order determines more significant alert status + if [ "$(echo "${qstate}" | grep -c d)" != "0" ]; then + echo "4&clear $queue@$HOST is DISABLED" >> "${PLUGIN_STATE}" + queueMsg=$(echo "$queueMsg
$CLEAR_PIC $queue@$HOST is DISABLED
${_LINE}") + queueTriggered=true; + elif [ "$(echo "${qstate}" | grep -c E)" != "0" ]; then + echo "1&red $queue@$HOST is in ERROR!" >> "${PLUGIN_STATE}" + queueMsg=$(echo "$queueMsg
$RED_PIC $queue@$HOST is in ERROR!
${_LINE}") + queueTriggered=true; + elif [ "$(echo "${qstate}" | grep -c c)" != "0" ]; then + echo "2&yellow $queue@$HOST has an ambigious configuration!" >> "${PLUGIN_STATE}" + queueMsg=$(echo "$queueMsg
$YELLOW_PIC $queue@$HOST has an ambigious configuration!
${_LINE}") + queueTriggered=true; + elif [ "$(echo "${qstate}" | grep -c a)" != "0" ] || \ + [ "$(echo "${qstate}" | grep -c A)" != "0" ]; then + echo "2&yellow $queue@$HOST is in ALARM" >> "${PLUGIN_STATE}" + queueMsg=$(echo "$queueMsg
$YELLOW_PIC $queue@$HOST is in ALARM
${_LINE}") + elif [ "$(echo "${qstate}" | grep -c s)" != "0" ] || \ + [ "$(echo "${qstate}" | grep -c S)" != "0" ]; then + echo "2&yellow $queue@$HOST is SUSPENDED" >> "${PLUGIN_STATE}" + queueMsg=$(echo "$queueMsg
$YELLOW_PIC $queue@$HOST is SUSPENDED
${_LINE}") + elif [ "$(echo "${qstate}" | grep -c u)" != "0" ]; then + echo "2&yellow $queue@$HOST is UNAVAILABLE" >> "${PLUGIN_STATE}" + queueMsg=$(echo "$queueMsg
$YELLOW_PIC $queue@$HOST is UNAVAILABLE!
${_LINE}") + queueTriggered=true; + elif [ "$qstate" = "" ]; then + echo "3&green $queue@$HOST is OK" >> "${PLUGIN_STATE}" + queueMsg=$(echo "$queueMsg
$GREEN_PIC $queue@$HOST is OK
${_LINE}") + else + echo "5&purple $queue@$HOST is UNKNOWN" >> "${PLUGIN_STATE}" + queueMsg=$(echo "$queueMsg
$UNKNOWN_PIC $queue@$HOST is UNKNOWN
${_LINE}") + queueTriggered=true; + fi + done < "${PLUGIN_RESULT}.QSTATE" + + get_header "Queue Instance Status Report" + echo "$queueMsg" + get_footer + +##### +##### End of get_status proc +##### +} + +##### +##### Main body +##### +get_status > "${PLUGIN_RESULT}" + +# Set the global color according to the highest alert +COLOR=$(< "${PLUGIN_STATE}" awk '{print $1}' | sort | uniq | head -1 | cut -c3-) + + # NOW USE THE XYMON COMMAND TO SEND THE DATA ACROSS + $XYMON "${XYMSRV}" "status ${MACHINE}.${TEST} ${COLOR} $($DATE) $(cat ${PLUGIN_RESULT})" + #For testing only + # echo $XYMON "${XYMSRV}" "status ${MACHINE}.${TEST} ${COLOR} $($DATE) $(cat ${PLUGIN_RESULT})" > /tmp/sgetmp + +# Clean up our mess +# Checking for existence of each file since the whole test may be optional +# and may not actually run on every client +# +if [ -f "${PLUGIN_RESULT}" ]; then + rm -f -- "${PLUGIN_RESULT}" "${PLUGIN_STATE}" "${PLUGIN_RESULT}.QSTATE" +fi +############################################## +# end of script +############################################## diff --git a/xymon/plugins/client/ext/smart b/xymon/plugins/client/ext/smart new file mode 100755 index 0000000..1de176b --- /dev/null +++ b/xymon/plugins/client/ext/smart @@ -0,0 +1,202 @@ +#!/usr/bin/perl +# $Id: sensors 70 2011-11-25 09:21:18Z skazi $ +# Author: Jacek Tomasiak +# https://github.com/skazi0/xymon-plugins/blob/master/client/ext/smart + +use strict; +# add script's directory to module search path for Hobbit.pm on non-debian systems +use FindBin; +use lib $FindBin::Bin; + +use Hobbit; +use Data::Dumper; + +my $bb = new Hobbit('smart'); + +my $temp_disk_list = "$ENV{'XYMONTMP'}/$ENV{'MACHINEDOTS'}.smart.drivedb.list"; +my @disks = (); +my %olderr = {}; + +my $CACHETIME = 10; # minutes +my $CACHEFILE = "$ENV{'XYMONTMP'}/$ENV{'MACHINEDOTS'}.smart.cache"; + +&load_config("$ENV{'XYMONTMP'}/logfetch.$ENV{'MACHINEDOTS'}.cfg"); + +my @disks_stat = stat($temp_disk_list); +my $disks_mtime = scalar @disks_stat ? 
$disks_stat[9] : 0; +# +# Regenerate disks list if the file is too old (600 minutes) +if (time() - $disks_mtime > 600) +{ + unlink $temp_disk_list; +} + +if (-e $temp_disk_list) { + # Should use the existing file +} +else { + # Create a file with the list of disks + system("ls -1 /dev/sd* | grep -vE '[0-9]' > $temp_disk_list") == 0 + or die "system command to create $temp_disk_list failed: $?"; +} + +# fallback to disk detection if nothing defined in the config +unless (@disks) { + ## Put temp_disk_list content to disks array + open(my $fh, '<:encoding(UTF-8)', $temp_disk_list) + or die "Could not open file '$temp_disk_list' $!"; + while (my $row = <$fh>) { + chomp $row; + push(@disks, "$row"); + } +} + +my @stat = stat($CACHEFILE); +my $mtime = scalar @stat ? $stat[9] : 0; +# regenerate sensors cache if outdated +if (time() - $mtime > $CACHETIME * 60) +{ + open(OUT, ">$CACHEFILE") or die "cannot open $CACHEFILE"; + + foreach my $name (@disks) + { + print OUT ('=' x 20) . " $name " . ('=' x 20) . "\n"; + my @output = `sudo smartctl -AHi -l error -l selftest $name 2>&1` or die; + my $ncv = ''; + my $newerr = 1; + my $ponhours = undef; + my $lasttest = undef; + foreach my $line (@output) + { + # skip header + next if ($line =~ /smartctl|Copyright|Home page|===/); + + if ($line =~ /.*overall-health.*:\s*(.*)/) + { + my $lstatus = ($1 eq 'PASSED') ? 
'green' : 'red'; + print OUT "&$lstatus $line"; + } + elsif ($line =~ /^\s*(\d+)\s+(\S+)\s+\S+\s+(\d+)\s+(\d+)\s+(\d+)\s+\S+\s+\S+\s+(\S+)\s+(.*)$/) + { + my ($aname, $value, $worst, $thresh, $failure, $raw) = ($2, $3, $4, $5, $6, $7); + my $lstatus = 'green'; + if ($aname =~ /Current_Pending_Sector|Offline_Uncorrectable/ and int($raw) > 0) + { + $lstatus = 'yellow'; + } + elsif ($aname =~ /Power_On_Hours/) + { + $ponhours = $raw; + } + if ($failure =~ /FAIL/) { + $lstatus = 'red'; + } + + print OUT "&$lstatus $line"; + + $ncv .= "$name-$aname-value : $value\n"; + $ncv .= "$name-$aname-worst : $worst\n"; + $ncv .= "$name-$aname-thresh : $thresh\n"; + $ncv .= "$name-$aname-raw : $raw\n"; + } + elsif ($line =~ /^\s*No Errors Logged\s*$/) + { + $newerr = 0; + print OUT "&green $line"; + } + elsif ($line =~ /Error Count:\s*(\d+)/) + { + $newerr = $1 - $olderr{$name}; + my $lstatus = $newerr > 0 ? 'red' : 'green'; + print OUT "&$lstatus $line" + } + elsif ($line =~ /^\s*Error \d+ occurred/) + { + my $lstatus = $newerr > 0 ? 
'red' : 'green'; + print OUT "&$lstatus $line" + } + elsif ($line =~ /^\s*#\s*\d+\s+(Conveyance offline|Extended offline|Short offline|Extended captive)\s+(.*)\s+\d+%\s+(\d+)/) + { + my $status = $2; + my $lifetime = $3; + my $lstatus = 'red'; + $lasttest = $lifetime if (!defined($lasttest)); + $lstatus = 'yellow' if ($status =~ /Aborted by host|Interrupted \(host reset\)/); + $lstatus = 'green' if ($status =~ /Completed without error|Self-test routine in progress|Interrupted \(host reset\)/); + print OUT "&$lstatus $line"; + } + else + { + print OUT " $line"; + } + } + # test status footer + my $lasttestage = $ponhours % 65536 - $lasttest; + my $lasttestmsg = "$lasttestage hours ago"; + my $lasttestcolor = 'green'; + if (!defined($lasttest)) + { + $lasttestcolor = 'yellow'; + $lasttestmsg = 'no test performed'; + } + elsif ($lasttestage > 24 * 7) + { + $lasttestcolor = 'red'; + } + elsif ($lasttestage > 24 * 2) + { + $lasttestcolor = 'yellow'; + } + print OUT "&$lasttestcolor Last Self-test: $lasttestmsg\n"; + + # hidden output for ncv + print OUT "\n"; + } + + close OUT; +} + +# send cached content +{ + open IN, $CACHEFILE or die "cannot open $CACHEFILE"; + while (my $line = ) + { + if ($line =~ /^\s*&(\S+)/) + { + $bb->color_print($1, $line); + } + else + { + $bb->print($line); + } + } + close IN; +} + + +$bb->send; + +sub load_config +{ + my $path = shift; + + open C, "<$path" or return; +# print "loading config from $path\n"; + while (my $line = ) + { + next if ($line =~ /^\s*#/); + if ($line =~ /DISKS\s*=\s*['"](.*?)["']/) + { + @disks = split(/\s+/, $1); + } + if ($line =~ /SMARTOLDERROR\[([\w\/]+)\]\s+(\d+)/) + { + $olderr{$1} = $2; + } + if ($line =~ /SMARTCACHETIME=(\d+)/) + { + $CACHETIME = $1; + } + } + close C; +} diff --git a/xymon/plugins/client/ext/smartoverall b/xymon/plugins/client/ext/smartoverall new file mode 100755 index 0000000..680e7b6 --- /dev/null +++ b/xymon/plugins/client/ext/smartoverall @@ -0,0 +1,300 @@ +#!/bin/sh +# .. 
vim: foldmarker=[[[,]]]:foldmethod=marker + +# NOTE: Must be run as root, so you probably need to setup sudo for this. + +# This script is mostly intend to be used with Xymon and rather for devices unknown to the smartmontools base. +# Based on xymon.com's script : https://www.xymon.com/xymon-cgi/viewconf.sh?smart +# The script will scan all devices compatible with SMART and for each disk, it will : [[[ +# * try to guess the expected TYPE (even megaraid,…). +# * display health status. +# * set a "clear" state for incompatible device. +# * display last selftests. +# * set a "error" state if no selftest is recorded. +# * display basic informations. +# * recommend a more advanced SMART script if the disk is known of smartmontools's database (drivedb.h) or redirect to smartmontools's FAQ if not. +# ]]] +# Things the script CAN'T do : [[[ +# * ensure a recent selftest was run. +# * compare current value with vendor's one (for failure prediction or error). +# * give detail about errors. +# * Take a look to this more advance script for such features : https://github.com/skazi0/xymon-plugins/blob/master/client/ext/smart +# ]]] + +# Vars [[[ +debug="1" + +## Colors [[[ +c_redb='\033[1;31m' +c_magentab='\033[1;35m' +c_reset='\033[0m' +## ]]] + +plugin_name=$(basename "${0}") + +plugin_result="${XYMONTMP}/${MACHINEDOTS}.smartoverall.plugin_result" +plugin_state="${XYMONTMP}/${MACHINEDOTS}.smartoverall.plugin_state" +device_list="${XYMONTMP}/${MACHINEDOTS}.smartoverall.dscan" +## List of devices known from the smartmontools base and compatible with test logging +## This file might be used by a more advanced script such as skazi0's one +drivedb_list="${XYMONTMP}/${MACHINEDOTS}.smart.drivedb.list" + +# By default, don't empty files newer than 10hours (600 minutes) +default_mtime_minutes="600" + +xymon_username="xymon" +xymon_groupname="xymon" +# ]]] + +# Functions +## Create or empty a file if it's too old [[[ +## First argument (required): Absolut path to the file +## Second 
## argument (optional): maximum number of minutes since last modification
##
## Usage: regenerate_if_too_old FILE [MAX_MINUTES]
##   FILE         Path to the file to create or empty.
##   MAX_MINUTES  Maximum age (in minutes) allowed before FILE is emptied;
##                falls back to ${default_mtime_minutes} when omitted.
## Globals read:    debug (debug messages are printed when it equals 0),
##                  c_magentab, c_reset, default_mtime_minutes,
##                  xymon_username, xymon_groupname.
## Globals written: _file, _max_mtime_minutes, _current_timestamp,
##                  _file_mtime_timestamp, _file_mtime_seconds,
##                  _max_mtime_seconds (plain sh: no local variables).
## Returns: 0 when FILE is recent enough and left untouched; otherwise the
##          status of the final chown on the (re)created FILE.
## Exits the whole script with status 1 when called with 0 or more than
## 2 arguments.
regenerate_if_too_old() {
  ## Set variables according to the number of passed arguments [[[
  case $# in
    1 )
      _file="${1}"
      if [ "${debug}" -eq "0" ]; then
        printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : regenerate_if_too_old func − Use default_mtime_minutes value: ${default_mtime_minutes}."
      fi
      _max_mtime_minutes="${default_mtime_minutes}"
      ;;
    2 )
      _file="${1}"
      _max_mtime_minutes="${2}"
      ;;
    0 )
      if [ "${debug}" -eq "0" ]; then
        printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : regenerate_if_too_old func − Need at least 1 argument."
      fi
      exit 1
      ;;
    * )
      ## Refuse extra arguments instead of silently working on the _file
      ## and _max_mtime_minutes values left over from a previous call.
      if [ "${debug}" -eq "0" ]; then
        printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : regenerate_if_too_old func − Need at most 2 arguments."
      fi
      exit 1
      ;;
  esac
  ## ]]]

  _current_timestamp=$(date "+%s")

  ## A missing file has no mtime: use 0 so that it is always seen as too old
  ## (and gets created below) without stat printing an error on every run.
  if [ -e "${_file}" ]; then
    _file_mtime_timestamp=$(stat --format="%Y" -- "${_file}")
  else
    _file_mtime_timestamp="0"
  fi
  ## Same fallback if stat itself failed and returned nothing
  _file_mtime_timestamp="${_file_mtime_timestamp:-0}"

  ## Age of the file, in seconds
  _file_mtime_seconds=$(( _current_timestamp - _file_mtime_timestamp ))
  ## Maximum allowed age, in seconds
  _max_mtime_seconds=$(( _max_mtime_minutes * 60 ))

  ## Compare the age of the file with the maximum allowed
  if [ "${_file_mtime_seconds}" -gt "${_max_mtime_seconds}" ]; then
    if [ "${debug}" -eq "0" ]; then
      printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : regenerate_if_too_old func − Need to empty or create ${_file} last modification happened ${_file_mtime_seconds} seconds ago (maximum is ${_max_mtime_seconds})."
    fi
    true > "${_file}"
    chown -- "${xymon_username}":"${xymon_groupname}" "${_file}"
  else
    ## An "if" whose condition is false returns 0, so a fresh file no longer
    ## makes the function report a failure when debug is off.
    if [ "${debug}" -eq "0" ]; then
      printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : regenerate_if_too_old func − Don't need to empty ${_file} last modification happened ${_file_mtime_seconds} seconds ago (maximum is ${_max_mtime_seconds})."
    fi
  fi

}
## ]]]
## Test if a disk really supports SMART [[[
## Smartctl can give a health status even without full support
## of SMART for some types (eg. scsi or megaraid).
## Example : SMART support is: Unavailable - device lacks SMART capability.
is_disk_support_smart() {
  ## Check whether smartctl reports full SMART support for a device.
  ##   ${1}: device path given to smartctl (eg. /dev/sda)
  ##   ${2}: device type given to "smartctl -d" (eg. auto, scsi, megaraid,N)
  ## Globals read: XYMONTMP, MACHINEDOTS, debug, c_magentab, c_reset.
  ## Result: the global smart_support_msg is empty when every
  ## "SMART support is:" line contains "Enabled" or "Available"; otherwise it
  ## holds the last line that doesn't, or an error text when smartctl gave
  ## no such line at all.
  _disk="${1}"
  _type="${2}"

  ## One cache file per device and type
  _smarctl_support_result="${XYMONTMP}/${MACHINEDOTS}.smartoverall.support.$(basename "${_disk}").${_type}"

  smart_support_msg=""

  [ "${debug}" -eq "0" ] && printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : is_disk_support_smart func − check if SMART is supported on : ${_disk}."

  ## Create or empty previous file only if older than 24h (1440 minutes)
  regenerate_if_too_old "${_smarctl_support_result}" 1440

  ## Grep only "support" lines from disk's information, only if the file was emptied
  if test ! -s "${_smarctl_support_result}"; then
    smartctl -d "${_type}" -i -- "${_disk}" | grep -E "^SMART support is:" -- >> "${_smarctl_support_result}"
  fi

  ## If the file is not empty
  if test -s "${_smarctl_support_result}"; then
    ## Parse all "support" lines
    while IFS= read -r _LINE; do
      if ! printf -- '%s' "${_LINE}" | grep -q -E -- "(Enabled|Available)"
      then
        smart_support_msg="${_LINE}"
      fi
    done < "${_smarctl_support_result}"
  else
    smart_support_msg="smartctl was not able to open ${_disk} DEVICE with ${_type} TYPE."
  fi

  if [ -z "${smart_support_msg}" ]; then
    [ "${debug}" -eq "0" ] && printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : is_disk_support_smart func − SMART seems fully supported on : ${_disk} with ${_type} type."
  else
    [ "${debug}" -eq "0" ] && printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : is_disk_support_smart func − SMART is not fully supported on : ${_disk} with ${_type} type. See smartctl informations :\n${smart_support_msg}"
  fi

  ## Clean temp files
  ### As the Xymon's tmpdir is used to store log files, no need to delete them at
  ### the end of the script. They will be emptied, reused or regenerated (if older
  ### than the expected interval) at the next run.

}
## ]]]
## Test the type of disk with smartctl [[[
## Cause the scanned one might not be the one to use
##   ${1}: device path
##   ${2}: type reported by "smartctl --scan" for this device
## Result: the global TYPE holds the first type (scanned one, then "auto")
## for which is_disk_support_smart reports no problem, and SMART_SUPPORT_MSG
## is empty. If no type fits, TYPE is empty and SMART_SUPPORT_MSG holds the
## message of the last tested type.
choose_correct_type() {
  _disk="${1}"
  _scanned_type="${2}"
  _default_type="auto"

  TYPE=""
  SMART_SUPPORT_MSG=""

  for test_type in "${_scanned_type}" "${_default_type}"; do
    is_disk_support_smart "${_disk}" "${test_type}"

    ## If no message, the type is correct
    if [ -z "${smart_support_msg}" ]; then
      TYPE="${test_type}"
      SMART_SUPPORT_MSG=""
      return
    else
      SMART_SUPPORT_MSG="${smart_support_msg}"
    fi

  done

}
## ]]]

# Create or empty previous files
true > "${plugin_result}"
chown -- "${xymon_username}":"${xymon_groupname}" "${plugin_result}"
true > "${plugin_state}"
chown -- "${xymon_username}":"${xymon_groupname}" "${plugin_state}"
## Create or empty previous file only if older than 24h (1440 minutes)
regenerate_if_too_old "${device_list}" 1440
regenerate_if_too_old "${drivedb_list}" 1440

# Get the list of all available devices if the previous list was emptied
if test ! -s "${device_list}"; then
  smartctl --scan >> "${device_list}"
fi

# If the file is not empty
if test -s "${device_list}"; then
  ## Only the summary line of each device is appended to ${plugin_state}.
  ## The loop's stdout is deliberately NOT redirected as a whole, otherwise
  ## every debug message printed inside the loop (and by the functions it
  ## calls) would end up in the state file.
  while IFS= read -r LINE; do
    ## Get device path (1st field)
    DISK=$(printf '%s\n' "${LINE}" | cut -d" " -f1)
    ## Try to determine the best type (3rd field is the scanned type)
    SCANNED_TYPE=$(printf '%s\n' "${LINE}" | cut -d" " -f3)
    choose_correct_type "${DISK}" "${SCANNED_TYPE}"

    ## If no correct type was found for this device
    if [ -z "${TYPE}" ]; then
      [ "${debug}" -eq "0" ] && printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : SMART is not fully supported."
      ### Real newline between the two parts, so the result doesn't depend
      ### on echo interpreting "\n" later on
      DRES=$(printf '%s\n%s' "SMART Health Status can't be determine because of:" "${SMART_SUPPORT_MSG}")
      ### Bit 1 (value 2) is tested below to give the "clear" color
      DCODE="2"
      TYPE="unsupported"
      ### Still try to display information about unsupported device (eg. RAID controller,…)
      DID="unsupported-${DISK}"
      DINFO=$(smartctl -i -d "${SCANNED_TYPE}" "${DISK}" | grep -v -E "^smartctl|^Copyright|^$" || printf '%s' "Can't get informations due to no SMART support.")
      DDRIVEDB_MSG=""
      ### Reset both selftest variables, otherwise the message of the
      ### previous device would be displayed for this one
      DSELFTEST_MSG=""
      DSELFTEST=""
    else
      [ "${debug}" -eq "0" ] && printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : SMART seems fully supported, proceed normally."
      ### Get SMART Health Status and return code
      DRES=$(/usr/sbin/smartctl -H -d "${TYPE}" -n standby "${DISK}")
      DCODE=$?
      ### Get disk's serial number and information (smartctl is called once)
      _disk_info_raw=$(smartctl -i -d "${TYPE}" "${DISK}")
      DID=$(printf '%s\n' "${_disk_info_raw}" | awk '/.erial .umber:/ { print $NF }')
      DINFO=$(printf '%s\n' "${_disk_info_raw}" | grep -v -E "^smartctl|^Copyright|^$")

      ## If the model of the disk is known from smartmontools database
      if smartctl -d "${TYPE}" -P show "${DISK}" | grep -qi -- "drive found in"; then
        DDRIVEDB_MSG="&green Device is known in smartmontools database. You might consider using a more advanced plugin such as:
https://github.com/skazi0/xymon-plugins/blob/master/client/ext/smart"
      else
        DDRIVEDB_MSG="&clear Device is unknown or not complete in smartmontools database. Please take a look to the FAQ:
https://www.smartmontools.org/wiki/FAQ#SmartmontoolsDatabase"
      fi

      ### Get the selftest log once and reuse it for every check below
      _selftest_raw=$(smartctl -d "${TYPE}" -l selftest "${DISK}")
      DSELFTEST=$(printf '%s\n' "${_selftest_raw}" | grep -v -E -- "^smartctl|^Copyright|^$")
      ## If no selftest has been recorded
      if printf '%s\n' "${_selftest_raw}" | grep -qi -- "No self-tests"; then
        DSELFTEST_MSG="&red No self-tests recorded:"
        ### Replace smartctl's return code: bit 3 (value 8) gives the "red" color below
        DCODE="8"
      ## If the device doesn't support test logging
      elif printf '%s\n' "${_selftest_raw}" | grep -qEi -- "(does not support.*logging|Log not supported)"; then
        DSELFTEST_MSG="&clear Test logging are not supported:"
      else
        DSELFTEST_MSG=""
        ### If the device is also known from smartmontools database
        ### and not already present in the list of compatible disk
        if printf -- '%s' "${DDRIVEDB_MSG}" | grep -q -E -- "green" &&
          ! grep -q -- "${DISK}" "${drivedb_list}"
        then
          echo "${DISK}" >> "${drivedb_list}"
        fi
      fi
    fi

    ## Test health status
    DSTBY=$(( DCODE & 2 ))
    DFAIL=$(( DCODE & 8 ))
    DWARN=$(( DCODE & 32 ))

    ## According to health, give a weight to each color to easily get the page status
    ## (the leading digit is sorted later to pick the global color)
    if test "${DSTBY}" -ne 0; then
      COLOR="4&clear"
    elif test "${DFAIL}" -ne 0; then
      COLOR="1&red"
    elif test "${DWARN}" -ne 0; then
      COLOR="2&yellow"
    else
      COLOR="3&green"
    fi

    ## Avoid duplicate device
    ## An empty identifier can't be compared (an empty pattern matches every
    ## line), so such a device is always reported. The identifier is searched
    ## as a fixed string, not as a regular expression.
    if [ -z "${DID}" ] || ! grep -q -F -- "${DID}" "${plugin_result}"; then
      ## For summary
      echo "${COLOR} $DISK ${TYPE}" >> "${plugin_state}"

      ## For detailed information
      {
        echo "${COLOR} $DISK ${TYPE}" | cut -c2-
        echo ""
        echo "$DRES" | grep -v -E "^smartctl|^Copyright|^$|^==="
        echo "${DDRIVEDB_MSG}"
        echo "${DINFO}"
        echo "${DSELFTEST_MSG}"
        echo "${DSELFTEST}" | head -n12
        echo "------------------------------------------------------------"
        echo ""
        echo ""
      } >> "${plugin_result}"
    fi
  done < "${device_list}"

# If the file is empty
else
  echo "1&red Error while scanning devices with smartctl" >> "${plugin_state}"
fi

# Set the global color according to the highest alert
# (lowest leading digit first, then strip the digit and the "&")
COLOR=$(< "${plugin_state}" awk '{print $1}' | sort | uniq | head -1 | cut -c3-)

# Send information to Xymon server
$XYMON "${XYMSRV}" "status ${MACHINE}.${plugin_name} ${COLOR} SMART health check

$(< "${plugin_state}" cut -c2-)

==================== Detailed status ====================

$(cat "${plugin_result}")
"

## Clean temp files
### As the Xymon's tmpdir is used to store log files, no need to delete them at
### the end of the script. They will be emptied, reused or regenerated (if older
### than the expected interval) at the next run.
+ +exit 0 diff --git a/xymon/plugins/client/ext/test.int.compare.sh b/xymon/plugins/client/ext/test.int.compare.sh new file mode 100755 index 0000000..3e0403c --- /dev/null +++ b/xymon/plugins/client/ext/test.int.compare.sh @@ -0,0 +1,66 @@ +#!/bin/sh +# .. vim: foldmarker=[[[,]]]:foldmethod=marker + +# Vars [[[ +debug="0" + +## Colors [[[ +c_redb='\033[1;31m' +c_magentab='\033[1;35m' +c_reset='\033[0m' +## ]]] +default_mtime_minutes="600" + +drivedb_list="/tmp/test.css" + +DDRIVEDB_MSG="green" +#DDRIVEDB_MSG="red" +DISK="/dev/sda" +# ]]] +## Create or empty a file if it's too old [[[ +regenerate_if_too_old() { + ## Set variables according to the number of passed arguments + case $# in + 1 ) + _file="${1}" + [ "${debug}" -eq "0" ] && printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : regenerate_if_too_old func − Use default_mtime_minutes value: ${default_mtime_minutes}." + _max_mtime_minutes="${default_mtime_minutes}" + ;; + 2 ) + _file="${1}" + _max_mtime_minutes="${2}" + ;; + esac + + _current_timestamp=$(date "+%s") + _file_mtime_timestamp=$(stat --format="%Y" -- "${_file}") + + ## Substract last modification timestamp of the file to current timestamp + : $(( _file_mtime_seconds=_current_timestamp-_file_mtime_timestamp )) + ## Get maximum allowed mtime in seconds + : $(( _max_mtime_seconds=_max_mtime_minutes*60 )) + + ## Compare last modification mtime with the maximum allowed + if [ "${_file_mtime_seconds}" -gt "${_max_mtime_seconds}" ]; then + [ "${debug}" -eq "0" ] && printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : regenerate_if_too_old func − Need to empty or create ${_file} last modification happened ${_file_mtime_seconds} seconds ago (maximum is ${_max_mtime_seconds})." + true > "${_file}" + else + [ "${debug}" -eq "0" ] && printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : regenerate_if_too_old func − Don't need to empty ${_file} last modification happened ${_file_mtime_seconds} seconds ago (maximum is ${_max_mtime_seconds})." 
+ fi + +} +## ]]] + +regenerate_if_too_old /tmp/css_style.css +regenerate_if_too_old /tmp/font.css 60 +regenerate_if_too_old /tmp/user/1337/serverauth.qGdeK8OOzr 1440 +regenerate_if_too_old /tmp/test.css 600 + +if printf -- '%s' "${DDRIVEDB_MSG}" | grep -q -E -- "green" && + ! grep -q -- "${DISK}" "${drivedb_list}" +then + echo "${DISK}" >> "${drivedb_list}" + fi + + +exit 0 diff --git a/xymon/sample.messages/apt.alert b/xymon/sample.messages/apt.alert new file mode 100644 index 0000000..2b586f0 --- /dev/null +++ b/xymon/sample.messages/apt.alert @@ -0,0 +1,28 @@ +BBCOLORLEVEL="red" +BBALPHAMSG="HOST.DOMAIN.ORG:apt red [168321] +red Wed Aug 22 11:26:34 2018 - apt NOT ok +Debian GNU/Linux 9.5 (stretch) + +&red Security updates (4): apt-get install openssh-client openssh-server openssh-sftp-server ssh + openssh-client (1:7.4p1-10+deb9u3 1:7.4p1-10+deb9u4) + openssh-server (1:7.4p1-10+deb9u3 1:7.4p1-10+deb9u4) + openssh-sftp-server (1:7.4p1-10+deb9u3 1:7.4p1-10+deb9u4) + ssh (1:7.4p1-10+deb9u3 1:7.4p1-10+deb9u4) + +&red Last apt update: 3.0 day(s) ago + + +See http://localhost/xymon-cgi/svcstatus.sh?HOST=HOST.DOMAIN.ORG&SERVICE=apt +" +ACKCODE="168321" +RCPT="1234567890" +BBHOSTNAME="HOST.DOMAIN.ORG" +MACHIP="111222333444" +BBSVCNAME="apt" +BBSVCNUM="0" +BBHOSTSVC="HOST.DOMAIN.ORG.apt" +BBHOSTSVCCOMMAS="HOST,DOMAIN,ORG.apt" +BBNUMERIC="000111222333444168321" +RECOVERED="0" +DOWNSECS="36" +DOWNSECSMSG="" diff --git a/xymon/sample.messages/cpu.alert b/xymon/sample.messages/cpu.alert new file mode 100644 index 0000000..4eb1c79 --- /dev/null +++ b/xymon/sample.messages/cpu.alert @@ -0,0 +1,62 @@ +BBCOLORLEVEL="yellow" +BBALPHAMSG="HOST.DOMAIN.ORG:cpu yellow [520216] +yellow Tue Aug 21 11:43:22 CEST 2018 up: 00:00, 0 users, 29 procs, load=1.90 +&yellow Machine recently rebooted +System clock is 0 seconds off + + +top - 11:43:23 up 0 min, 0 users, load average: 1.49, 1.90, 3.03 +Tasks: 33 total, 2 running, 31 sleeping, 0 stopped, 0 zombie +%Cpu(s): 3.8 us, 5.9 sy, 0.0 ni, 
89.3 id, 0.8 wa, 0.0 hi, 0.1 si, 0.0 st +KiB Mem : 1048576 total, 897468 free, 33268 used, 117840 buff/cache +KiB Swap: 1048576 total, 1048576 free, 0 used. 1015308 avail Mem + + PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND + 263 xymon 20 0 64284 6264 5072 D 100.0 0.6 0:00.05 apt-cache + 1 root 20 0 69588 7680 5968 S 0.0 0.7 0:00.08 systemd + 43 root 20 0 93256 16952 16356 S 0.0 1.6 0:00.03 systemd-j+ + 89 root 20 0 22560 1052 0 S 0.0 0.1 0:00.00 dhclient + 121 root 20 0 29668 2804 2520 S 0.0 0.3 0:00.00 cron + 122 root 20 0 321836 2756 2348 S 0.0 0.3 0:00.00 rsyslogd + 124 root 20 0 171944 9704 8308 S 0.0 0.9 0:00.00 sssd + 130 message+ 20 0 59424 4036 3576 S 0.0 0.4 0:00.00 dbus-daem+ + 145 daemon 20 0 27968 2120 1916 S 0.0 0.2 0:00.00 atd + 147 root 20 0 71988 5648 4896 S 0.0 0.5 0:00.00 sshd + 164 xymon 20 0 4272 1488 1332 S 0.0 0.1 0:00.00 xymonlaun+ + 167 xymon 20 0 4292 712 640 S 0.0 0.1 0:00.00 xymonclie+ + 168 xymon 20 0 18300 5152 3940 S 0.0 0.5 0:00.00 apt + 176 root 20 0 201132 15804 10264 S 0.0 1.5 0:00.02 sssd_be + 177 root 20 0 17076 160 0 S 0.0 0.0 0:00.00 in.tftpd + 191 root 20 0 168252 33204 32060 S 0.0 3.2 0:00.02 sssd_nss + 192 root 20 0 147780 7452 6396 S 0.0 0.7 0:00.00 sssd_pam + 193 root 20 0 141356 7364 6344 S 0.0 0.7 0:00.00 sssd_auto+ + 198 xymon 20 0 4292 1360 1252 S 0.0 0.1 0:00.00 xymonclie+ + 211 arpwatch 20 0 31872 4792 4104 S 0.0 0.5 0:00.00 arpwatch + 214 root 20 0 54532 2768 2276 S 0.0 0.3 0:00.00 systemd-l+ + 223 root 20 0 69592 1716 4 S 0.0 0.2 0:00.00 (agetty) + 224 root 20 0 69592 1716 4 S 0.0 0.2 0:00.00 (agetty) + 251 root 20 0 361140 15748 6296 S 0.0 1.5 0:00.04 fail2ban-+ + 292 root 20 0 81152 6396 5636 S 0.0 0.6 0:00.00 postmulti + 298 root 20 0 4292 752 680 S 0.0 0.1 0:00.00 postfix-s+ + 306 xymon 20 0 38236 2984 2584 R 0.0 0.3 0:00.00 top + 345 root 20 0 4292 712 636 S 0.0 0.1 0:00.00 sh + 349 root 20 0 22536 1392 1164 D 0.0 0.1 0:00.00 nft + 351 root 20 0 4292 760 688 S 0.0 0.1 0:00.00 postfix-s+ + 353 root 20 
0 4292 96 0 S 0.0 0.0 0:00.00 postfix-s+ + 354 root 20 0 79236 4740 4064 R 0.0 0.5 0:00.00 postconf + 355 root 20 0 13216 1004 900 S 0.0 0.1 0:00.00 sed + +See http://localhost/xymon-cgi/svcstatus.sh?HOST=HOST.DOMAIN.ORG&SERVICE=cpu +" +ACKCODE="520216" +RCPT="1234567890" +BBHOSTNAME="HOST.DOMAIN.ORG" +MACHIP="111222333444" +BBSVCNAME="cpu" +BBSVCNUM="200" +BBHOSTSVC="HOST.DOMAIN.ORG.cpu" +BBHOSTSVCCOMMAS="HOST,DOMAIN,ORG.cpu" +BBNUMERIC="200111222333444520216" +RECOVERED="0" +DOWNSECS="0" +DOWNSECSMSG="" diff --git a/xymon/sample.messages/files.alert b/xymon/sample.messages/files.alert new file mode 100644 index 0000000..2a4ccd9 --- /dev/null +++ b/xymon/sample.messages/files.alert @@ -0,0 +1,41 @@ +BBCOLORLEVEL="yellow" +BBALPHAMSG="HOST.DOMAIN.ORG:files yellow [275849] +yellow Thu Oct 10 11:17:18 CEST 2019 - Files NOT ok + +&yellow /var/log/cron.log +File was modified 4642 seconds ago - should be <3800 + +&yellow /tmp/.github.TEST.upgrade +File exists + +&green /var/log/kern.log + +&green /var/log/messages + +&green /var/log/syslog + +&green /var/log + +&green /tmp/ + +&green /bin/su + +&green /usr/bin/sudo + +&green /var/log/installer + + +See http://localhost/xymon-cgi/svcstatus.sh?HOST=HOST.DOMAIN.ORG&SERVICE=files +" +ACKCODE="275849" +RCPT="1234567890" +BBHOSTNAME="HOST.DOMAIN.ORG" +MACHIP="037187001062" +BBSVCNAME="files" +BBSVCNUM="0" +BBHOSTSVC="HOST.DOMAIN.ORG.files" +BBHOSTSVCCOMMAS="HOST,DOMAIN.ORG.files" +BBNUMERIC="000037187001062275849" +RECOVERED="0" +DOWNSECS="603" +DOWNSECSMSG="" diff --git a/xymon/sample.messages/libs.alert b/xymon/sample.messages/libs.alert new file mode 100644 index 0000000..3c98eea --- /dev/null +++ b/xymon/sample.messages/libs.alert @@ -0,0 +1,29 @@ +BBCOLORLEVEL="yellow" +BBALPHAMSG="HOST.DOMAIN.ORG:libs yellow [0] +yellow Thu Aug 16 16:41:56 2018 - libs NOT ok +&yellow Machine should be rebooted. 
Running not the newest installed kernel: + + Running kernel=\"4.9.0-7-amd64, version #1 SMP Debian 4.9.110-3+deb9u1 (2018-08-03)\" + Newest installed kernel=\"4.9.0-7-amd64, version #1 SMP Debian 4.9.110-3+deb9u2 (2018-08-13)\" + +&yellow The following processes have libs linked that were upgraded: + +root: + /lib/systemd/systemd-udevd (546) +systemd-timesync: + /usr/lib/postfix/qmgr (52880) + + +See http://localhost/xymon-cgi/svcstatus.sh?HOST=HOST.DOMAIN.ORG&SERVICE=libs" +ACKCODE="0" +RCPT="1234567890" +BBHOSTNAME="HOST.DOMAIN.ORG" +MACHIP="111222333444" +BBSVCNAME="libs" +BBSVCNUM="0" +BBHOSTSVC="HOST.DOMAIN.ORG.libs" +BBHOSTSVCCOMMAS="HOST.DOMAIN.ORG.libs" +BBNUMERIC="0001112223334440" +RECOVERED="0" +DOWNSECS="36320" +DOWNSECSMSG="" diff --git a/xymon/sample.messages/memory.alert b/xymon/sample.messages/memory.alert new file mode 100644 index 0000000..953b69a --- /dev/null +++ b/xymon/sample.messages/memory.alert @@ -0,0 +1,22 @@ +BBCOLORLEVEL="red" +BBALPHAMSG="HOST.DOMAIN.ORG:memory red [251314] +red Tue Aug 21 11:42:42 CEST 2018 - Memory CRITICAL + Memory Used Total Percentage +&green Real/Physical 1017M 1536M 66% +&green Actual/Virtual 692M 1536M 45% +&red Swap/Page 1024M 1024M 100% + +See http://localhost/xymon-cgi/svcstatus.sh?HOST=HOST.DOMAIN.ORG&SERVICE=memory +" +ACKCODE="251314" +RCPT="1234567890" +BBHOSTNAME="HOST.DOMAIN.ORG" +MACHIP="111222333444" +BBSVCNAME="memory" +BBSVCNUM="0" +BBHOSTSVC="HOST.DOMAIN.ORG.memory" +BBHOSTSVCCOMMAS="HOST,DOMAIN,ORG.memory" +BBNUMERIC="000111222333444251314" +RECOVERED="0" +DOWNSECS="0" +DOWNSECSMSG="" diff --git a/xymon/sample.messages/ntpq.alert b/xymon/sample.messages/ntpq.alert new file mode 100644 index 0000000..ed8d9cb --- /dev/null +++ b/xymon/sample.messages/ntpq.alert @@ -0,0 +1,38 @@ +BBCOLORLEVEL="red" +BBALPHAMSG="HOST.DOMAIN.ORG:ntpq red [842850] +red Tue Aug 21 11:20:53 2018 - ntpq NOT ok +NTP peers: + + remote refid st t when poll reach delay offset jitter + 
============================================================================== +&clear WWW.XXX.YYY.ZZZ 213.251.53.11 3 u 18 64 1 0.542 -2.836 0.000 + +&red No system peer entry ("*") found + + remote refid st t when poll reach delay offset jitter + ============================================================================== +&green *WWW.XXX.YYY.ZZZ 51.15.178.157 3 u 243 1024 377 1.194 102.549 83.035 + +SyspeerDelay: 1.194 +SyspeerOffset: 102.549 +SyspeerJitter: 83.035 + +SyspeerOffset thresholds: +Warning: 100ms +Critical: 2000ms +&yellow SyspeerOffset > 100ms + +See http://localhost/xymon-cgi/svcstatus.sh?HOST=HOST.DOMAIN.ORG&SERVICE=ntpq +" +ACKCODE="842850" +RCPT="1234567890" +BBHOSTNAME="HOST.DOMAIN.ORG" +MACHIP="111222333444" +BBSVCNAME="ntpq" +BBSVCNUM="0" +BBHOSTSVC="HOST.DOMAIN.ORG.ntpq" +BBHOSTSVCCOMMAS="HOST,DOMAIN,ORG.ntpq" +BBNUMERIC="000111222333444842850" +RECOVERED="0" +DOWNSECS="1544" +DOWNSECSMSG="" diff --git a/xymon/sample.messages/ports.alert b/xymon/sample.messages/ports.alert new file mode 100644 index 0000000..61caeef --- /dev/null +++ b/xymon/sample.messages/ports.alert @@ -0,0 +1,31 @@ +BBCOLORLEVEL="yellow" +BBALPHAMSG="HOST.DOMAIN.ORG:ports yellow [40450] +yellow Tue Aug 21 11:43:22 CEST 2018 - Ports NOT ok +&yellow SMTP listener (found 0, req. between 1 and 2) +&green SSH logins (found 0, req. at most 10) +&green Bad listeners (found 0, req. 
at most 0) + +Active Internet connections (servers and established) +Proto Recv-Q Send-Q Local Address Foreign Address State +tcp 0 0 0.0.0.0:22 0.0.0.0:* LISTEN +tcp 0 0 111.222.333.444:45250 WWW.XXX.YYY.ZZZ:1984 TIME_WAIT +tcp 0 0 111.222.333.444:54522 444.333.222.111:389 ESTABLISHED +tcp 0 0 111.222.333.444:45244 WWW.XXX.YYY.ZZZ:1984 TIME_WAIT +tcp 0 0 111.222.333.444:45242 WWW.XXX.YYY.ZZZ:1984 TIME_WAIT +tcp6 0 0 :::22 :::* LISTEN +udp 0 0 0.0.0.0:68 0.0.0.0:* +udp6 0 0 :::69 :::* +See http://localhost/xymon-cgi/svcstatus.sh?HOST=HOST.DOMAIN.ORG&SERVICE=ports +" +ACKCODE="40450" +RCPT="1234567890" +BBHOSTNAME="HOST.DOMAIN.ORG" +MACHIP="111222333444" +BBSVCNAME="ports" +BBSVCNUM="0" +BBHOSTSVC="HOST.DOMAIN.ORG.ports" +BBHOSTSVCCOMMAS="HOST,DOMAIN,ORG.ports" +BBNUMERIC="00011122233344440450" +RECOVERED="0" +DOWNSECS="6" +DOWNSECSMSG="" diff --git a/xymon/sample.messages/procs.alert b/xymon/sample.messages/procs.alert new file mode 100644 index 0000000..186475d --- /dev/null +++ b/xymon/sample.messages/procs.alert @@ -0,0 +1,71 @@ +BBCOLORLEVEL="red" +BBALPHAMSG="HOST.DOMAIN.ORG:procs red [757744] +red Tue Aug 21 13:29:28 CEST 2018 - Processes NOT ok +&green systemd-journald (found 1, req. between 1 and 1) +&green systemd-logind (found 1, req. between 1 and 1) +&green CRON (found 1, req. between 1 and 999) +&green ATD (found 1, req. between 1 and 999) +&green MTA-Stretch (found 1, req. between 1 and 1) +&green SSHD (found 3, req. between 1 and 20) +&green SSSD (found 1, req. between 1 and 1) +&green Fail2Ban (found 1, req. between 1 and 1) +&red tftpd-hpa (found 0, req. between 1 and 1) +&yellow ARPwatch (found 0, req. between 1 and 1) +&red Jenkins (found 0, req. 
between 1 and 5) + + PID PPID USER STARTED S PRI %CPU TIME %MEM RSZ VSZ CMD + 1 0 root 11:43:19 S 19 0.0 00:00:00 0.7 7816 69724 /sbin/init + 43 1 root 11:43:20 S 19 0.0 00:00:00 2.4 25848 109744 /lib/systemd/systemd-journald + 89 1 root 11:43:20 S 19 0.0 00:00:00 0.1 1052 22560 /sbin/dhclient -4 -v -pf /run/dhclient.eth0.pid -lf /var/lib/dhcp/dhclient.eth0.leases -I -df /var/lib/dhcp/dhclient6.eth0.leases eth0 + 121 1 root 11:43:21 S 19 0.0 00:00:00 0.2 2804 29668 /usr/sbin/cron -f + 122 1 root 11:43:21 S 19 0.0 00:00:00 0.2 2756 321836 /usr/sbin/rsyslogd -n + 124 1 root 11:43:21 S 19 0.0 00:00:00 0.9 9704 171944 /usr/sbin/sssd -i -f + 176 124 root 11:43:21 S 19 0.0 00:00:00 1.7 18244 212360 \_ /usr/lib/x86_64-linux-gnu/sssd/sssd_be --domain ur1 --uid 0 --gid 0 --debug-to-files + 191 124 root 11:43:21 S 19 0.0 00:00:00 3.1 33204 168252 \_ /usr/lib/x86_64-linux-gnu/sssd/sssd_nss --uid 0 --gid 0 --debug-to-files + 192 124 root 11:43:21 S 19 0.0 00:00:00 0.8 8672 147912 \_ /usr/lib/x86_64-linux-gnu/sssd/sssd_pam --uid 0 --gid 0 --debug-to-files + 193 124 root 11:43:21 S 19 0.0 00:00:00 0.7 7364 141356 \_ /usr/lib/x86_64-linux-gnu/sssd/sssd_autofs --uid 0 --gid 0 --debug-to-files + 145 1 daemon 11:43:21 S 19 0.0 00:00:00 0.2 2120 27968 /usr/sbin/atd -f -l 9.6 -b 98 + 147 1 root 11:43:21 S 19 0.0 00:00:00 0.5 6180 71988 /usr/sbin/sshd -D + 4810 147 root 13:29:14 S 19 0.0 00:00:00 0.7 7468 134176 \_ sshd: USER [priv] + 5023 4810 USER 13:29:14 S 19 0.0 00:00:00 0.3 3952 134176 \_ sshd: USER@pts/2 + 5024 5023 USER 13:29:14 S 19 0.0 00:00:00 0.5 5608 59296 \_ zsh + 5057 5024 USER 13:29:15 S 19 0.0 00:00:00 0.2 2988 19336 \_ tmux + 223 1 root 11:43:21 S 19 0.0 00:00:00 0.1 2060 14316 /sbin/agetty -o -p -- \u --noclear --keep-baud tty1 115200,38400,9600 linux + 224 1 root 11:43:21 S 19 0.0 00:00:00 0.1 2056 14316 /sbin/agetty -o -p -- \u --noclear --keep-baud console 115200,38400,9600 linux + 251 1 root 11:43:22 S 19 0.0 00:00:01 1.5 15800 361140 /usr/bin/python3 
/usr/bin/fail2ban-server -s /var/run/fail2ban/fail2ban.sock -p /var/run/fail2ban/fail2ban.pid -x -b + 412 1 root 11:43:22 S 19 0.0 00:00:00 0.4 4244 83252 /usr/lib/postfix/sbin/master -w + 415 412 postfix 11:43:22 S 19 0.0 00:00:00 0.6 6628 95528 \_ qmgr -l -t unix -u + 4453 412 postfix 13:19:59 S 19 0.0 00:00:00 0.6 6532 95480 \_ showq -t unix -u -c + 4504 412 postfix 13:23:22 S 19 0.0 00:00:00 0.6 6524 95480 \_ pickup -l -t unix -u -c + 416 1 netdata 11:43:22 S 19 0.0 00:02:06 7.1 75324 224012 /usr/sbin/netdata -D + 432 416 netdata 11:43:23 R 19 0.0 00:00:16 0.2 2628 28108 \_ /usr/lib/x86_64-linux-gnu/netdata/plugins.d/apps.plugin 1 + 433 416 netdata 11:43:23 S 19 0.0 00:01:30 4.6 48576 121208 \_ /usr/bin/python /usr/lib/x86_64-linux-gnu/netdata/plugins.d/python.d.plugin 1 + 4643 416 netdata 13:25:26 S 19 0.0 00:00:00 0.2 2688 9700 \_ bash /usr/lib/x86_64-linux-gnu/netdata/plugins.d/tc-qos-helper.sh 1 + 660 1 xymon-s+ 11:43:27 S 19 0.0 00:00:00 0.6 6788 68928 /lib/systemd/systemd --user + 662 660 xymon-s+ 11:43:27 S 19 0.0 00:00:00 0.1 1904 123912 \_ (sd-pam) + 881 1 message+ 11:43:27 S 19 0.0 00:00:00 0.4 4384 67636 /usr/bin/dbus-daemon --system --address=systemd: --nofork --nopidfile --systemd-activation + 883 1 root 11:43:27 S 19 0.0 00:00:00 0.5 5552 73268 /lib/systemd/systemd-logind + 1377 1 USER 11:49:11 S 19 0.0 00:00:00 0.6 6716 68932 /lib/systemd/systemd --user + 1379 1377 USER 11:49:11 S 19 0.0 00:00:00 0.1 2040 132260 \_ (sd-pam) + 4768 1 xymon 13:28:48 S 19 0.0 00:00:00 0.0 756 4292 sh -c vmstat 300 2 1>/var/lib/xymon/tmp/xymon_vmstat.HOST.DOMAIN.ORG.4719 2>&1; mv /var/lib/xymon/tmp/xymon_vmstat.HOST.DOMAIN.ORG.4719 /var/lib/xymon/tmp/xymon_vmstat.HOST.DOMAIN.ORG + 4770 4768 xymon 13:28:48 S 19 0.0 00:00:00 0.1 1400 24900 \_ vmstat 300 2 + 5059 1 USER 13:29:15 S 19 0.0 00:00:00 0.3 3364 28044 tmux + 5067 5059 USER 13:29:15 S 19 0.0 00:00:00 0.6 6396 62052 \_ -zsh + 5187 1 xymon 13:29:27 S 19 0.0 00:00:00 0.1 1576 4272 
/usr/lib/xymon/client/bin/xymonlaunch --config=/etc/xymon/clientlaunch.cfg --log=/var/log/xymon/clientlaunch.log --pidfile=/var/run/xymon/clientlaunch.pid + 5191 5187 xymon 13:29:27 S 19 0.0 00:00:00 0.1 1572 4292 \_ /bin/sh /usr/lib/xymon/client/bin/xymonclient.sh + 5214 5191 xymon 13:29:27 S 19 0.0 00:00:00 0.1 1600 4292 | \_ /bin/sh /usr/lib/xymon/client/bin/xymonclient-linux.sh + 5256 5214 xymon 13:29:28 R 19 0.0 00:00:00 0.2 2804 44404 | \_ ps -Aww f -o pid,ppid,user,start,state,pri,pcpu,time:12,pmem,rsz:10,vsz:10,cmd + 5192 5187 xymon 13:29:27 S 19 0.0 00:00:00 0.4 5164 18300 \_ /usr/bin/perl -w /usr/lib/xymon/client/ext/apt + 5231 5192 xymon 13:29:28 R 19 0.0 00:00:00 3.9 41128 70068 \_ apt-cache policy acl adduser apt apt-listchanges apt-transport-https apt-utils aptitude aptitude-common arpwatch at base-files base-passwd bash bash-completion bind9-host binutils bsd-mailx bsdmainutils bsdutils bzip2 ca-certificates coreutils cpio cpp cpp-6 cracklib-runtime cron curl dash dbus dctrl-tools debconf debconf-i18n debian-archive-keyring debian-faq debian-goodies debianutils debsecan debsums dh-python diffutils dirmngr distro-info-data dmidecode dmsetup doc-debian dpkg e2fslibs:amd64 e2fsprogs ed etckeeper fail2ban file findutils fontconfig-config fonts-dejavu-core fonts-font-awesome fping gcc-6-base:amd64 gettext-base git git-man gnupg gnupg-agent gnutls-bin gpgv grep groff-base gzip hobbit-plugins hostname htop iftop ifupdown init init-system-helpers DOMAINoute2 iputils-ping isc-dhcp-client isc-dhcp-common kmod krb5-locales less libacl1:amd64 libapparmor1:amd64 libapt-inst2.0:amd64 libapt-pkg5.0:amd64 libasprintf0v5:amd64 libassuan0:amd64 libattr1:amd64 libaudit-common libaudit1:amd64 libavahi-client3:amd64 libavahi-common-data:amd64 libavahi-common3:amd64 libbasicobjects0:amd64 libbind9-140:amd64 libblkid1:amd64 libboost-filesystem1.62.0:amd64 libboost-iostreams1.62.0:amd64 libboost-system1.62.0:amd64 libbsd0:amd64 libbz2-1.0:amd64 libc-ares2:amd64 libc-bin 
libc-l10n libc6:amd64 libcap-ng0:amd64 libcap2-bin libcap2:amd64 libclass-isa-perl libcollection4:amd64 libcomerr2:amd64 libcrack2:amd64 libcryptsetup4:amd64 libcups2:amd64 libcurl3-gnutls:amd64 libcurl3:amd64 libcwidget3v5:amd64 libdb5.3:amd64 libdbus-1-3:amd64 libdebconfclient0:amd64 libdevmapper1.02.1:amd64 libdhash1:amd64 libdns-export162 libdns162:amd64 libdpkg-perl libdrm2:amd64 libedit2:amd64 libelf1:amd64 liberror-perl libes" +ACKCODE="757744" +RCPT="1234567890" +BBHOSTNAME="HOST.DOMAIN.ORG" +MACHIP="111222333444" +BBSVCNAME="procs" +BBSVCNUM="300" +BBHOSTSVC="HOST.DOMAIN.ORG.procs" +BBHOSTSVCCOMMAS="HOST,DOMAIN,ORG.procs" +BBNUMERIC="300111222333444757744" +RECOVERED="0" +DOWNSECS="0" +DOWNSECSMSG="" diff --git a/xymon/tar.client.logfiles.sh b/xymon/tar.client.logfiles.sh new file mode 100755 index 0000000..40ababc --- /dev/null +++ b/xymon/tar.client.logfiles.sh @@ -0,0 +1,65 @@ +#!/bin/sh + +# Purpose : +# Create an XZ archive of all files between 2 dates. +# Then remove these files + +# Call this script from an /var/lib/xymon/hostdata subdirectory or +# /var/lib/xymon/histlogs subdirectory. + +# Vars {{{ +## Enable (0) or disable (1) debug +debug=0 + +## Colors {{{ +c_redb='\033[1;31m' +c_magentab='\033[1;35m' +c_reset='\033[0m' +## }}} + +## Manage files of year +date_year="2019" + +## Compress files between these dates +date_start="${date_year}-01-01 00:00:01" +date_end="${date_year}-12-31 23:59:59" + +## Best XZ compression level +xz_compression_lvl="-9" +## Fatest XZ compression level +#xz_compression_lvl="-0" + +## Get current directory name +current_dir=${PWD##*/} + +## Count the number of files +match_files=$(find . 
-type f -newermt "${date_start}" -not -newermt "${date_end}" -not -iname "*.tar*" | wc -l) + +## Archive name +tar_file_name="${date_year}.${current_dir}${xz_compression_lvl}.tar.xz" + +# }}} + +# If archive already exists +if [ -s "${tar_file_name}" ]; then + printf "${c_redb}%-6b${c_reset}\n" "ERROR : ${tar_file_name} already exists (also ${match_files} files match the expected pattern). Please manage this directory manually or remove the archive or files then restart." + exit 1 +fi + +# If some files match +if [ ! ${match_files} -eq "0" ]; then + [ "${debug}" -eq "0" ] && printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : Create an archive for ${current_dir} files between ${date_start} and ${date_end} (${match_files} files) using XZ's compression level : ${xz_compression_lvl}." + + ## Get the list of files between the 2 dates and ignore tar files + find . -type f -newermt "${date_start}" -not -newermt "${date_end}" -not -iname "*.tar*" -print0 | tar cJf "${tar_file_name}" --null -T - + + ## Check previous return code and if the archive exists with size > 0 + if [ "${?}" -eq "0" ] && [ -s "${tar_file_name}" ]; then + [ "${debug}" -eq "0" ] && printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : ${tar_file_name} successfully created, the files can be deleted." + find . -type f -newermt "${date_start}" -not -newermt "${date_end}" -not -iname "*.tar*" -delete + fi +else + [ "${debug}" -eq "0" ] && printf "${c_magentab}%-6b${c_reset}\n" "DEBUG : Skip ${current_dir}, no files found between ${date_start} and ${date_end}." +fi + +exit 0 diff --git a/xymon/xymon.alert.sh b/xymon/xymon.alert.sh new file mode 100755 index 0000000..7014ca4 --- /dev/null +++ b/xymon/xymon.alert.sh @@ -0,0 +1,59 @@ +#!/bin/sh +# Purpose {{{ +## If Xymon server says that a probe is in error on a remote host, try to call the appropriate script. 
+## For debugging messages, you can check xymon's logs (/var/log/xymon/alert.log) + +## How-to use : {{{ +### Define an alert in Xymon configuration file (/etc/xymon/alerts.cfg) +#HOST=HOST.DOMAIN.ORG +# SCRIPT /PATH/TO/SCRIPT/xymon.alert.sh 1234567890 FORMAT=SCRIPT DURATION<20 + +## }}} +# }}} +# Vars {{{ +DEBUG=1 + +script_path="$(dirname -- ${0})" +script_apt="${script_path}/xymon.apt.alert.sh" +script_files="${script_path}/xymon.files.alert.sh" +script_libs="${script_path}/xymon.libs.alert.sh" +script_procs="${script_path}/xymon.procs.alert.sh" +# }}} + +[ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG : ${BBHOSTNAME} — ${BBSVCNAME} is in error." + +# Match probe name with the script {{{ +case "${BBSVCNAME}" in + 'apt' ) + script_to_run="${script_apt}" + ;; + 'files' ) + script_to_run="${script_files}" + ;; + 'libs' ) + script_to_run="${script_libs}" + ;; + 'procs' ) + script_to_run="${script_procs}" + ;; + # default + * ) + script_to_run="NOT.MANAGED" + ;; +esac +# }}} + +# Call the next script if managed {{{ +if [ "${script_to_run}" != "NOT.MANAGED" ]; then + # Export vars {{{ + [ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG : ${BBHOSTNAME} — Export vars for ${script_to_run}" + export BBALPHAMSG + export BBHOSTNAME + export BBSVCNAME + # }}} + [ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG : ${BBHOSTNAME} — Run ${script_to_run} script." + "${script_to_run}" +fi +# }}} + +exit 0 diff --git a/xymon/xymon.apt.alert.sh b/xymon/xymon.apt.alert.sh new file mode 100755 index 0000000..cface36 --- /dev/null +++ b/xymon/xymon.apt.alert.sh @@ -0,0 +1,62 @@ +#!/bin/sh +# Purpose {{{ +## If Xymon server says that the last apt update is too old, try to run a new one. +## 1. Create a ssh keyring for xymon user {{{ +# sudo mkdir -p -- /var/lib/xymon/.ssh/ +# sudo ssh-keygen -f /var/lib/xymon/.ssh/id_rsa -N '' -q +# sudo chown -R xymon:xymon /var/lib/xymon/.ssh/ +## }}} +## 2. 
Remote user {{{ +# Ensure to have the ${REMOTE_SSH_USER} available on remote hosts and allowed to connect with SSH. +# Restrict the SSH access to a single SSH key from the Xymon server IP (~${REMOTE_SSH_USER}/.ssh/authorized_keys) : +## from="IP.SRV.XYM.ON" ssh-rsa AAAAA… +# Allow sudo commands to restart services and run apt update (/etc/sudoers.d/xymon-ssh) : +## xymon-ssh ALL=(root:root) NOPASSWD: /usr/bin/apt update +## xymon-ssh ALL=(root:root) NOPASSWD: /bin/systemctl restart * +## }}} + +# }}} +# Vars {{{ +DEBUG=1 + +LOCAL_SSH_USER="xymon" +REMOTE_SSH_USER="xymon-ssh" + +temp_dir=$(mktemp -d -t xymon-apt-alert-XXXXXX.tmp) +debug_stdout="${temp_dir}/debug.stdout" +debug_stderr="${temp_dir}/debug.stderr" +# }}} + +# Create log files +touch "${debug_stdout}" "${debug_stderr}" + +# Manage only apt probe {{{ +if [ "${BBSVCNAME}" = "apt" ]; then + [ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG : ${BBHOSTNAME} — ${BBSVCNAME} error" >> "${debug_stdout}" +else + [ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG : ${BBHOSTNAME} — ${BBSVCNAME} probe is not managed." >> "${debug_stderr}" + [ "${DEBUG}" -eq "0" ] || rm -rf -- "${temp_dir}" + exit 0 +fi +# }}} + +# Check if repos need to be updated {{{ +if echo "${BBALPHAMSG}" | grep -qE "\\&(red|yellow) Last apt update.*ago$" ; then + [ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG : Test APT repos — APT repos need to be updated." 
>> "${debug_stdout}" + [ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG : Test APT repos — ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${REMOTE_SSH_USER}@${BBHOSTNAME} sudo apt update" >> "${debug_stdout}" + ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${REMOTE_SSH_USER}"@"${BBHOSTNAME}" "sudo apt update" >> "${debug_stdout}" 2>> "${debug_stderr}" + # Also restart xymon-client service {{{ + [ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG : Test APT repos — xymon-client also need to be restarted." >> "${debug_stdout}" + [ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG : Test APT repos — ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${REMOTE_SSH_USER}@${BBHOSTNAME} sudo systemctl restart xymon-client.service" >> "${debug_stdout}" + ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${REMOTE_SSH_USER}"@"${BBHOSTNAME}" "sudo systemctl restart xymon-client.service" >> "${debug_stdout}" 2>> "${debug_stderr}" + # }}} +fi +# }}} + +# Remove empty error file +[ -s "${debug_stderr}" ] || rm -f "${debug_stderr}" + +# Remove temp_dir if DEBUG is disable +[ "${DEBUG}" -eq "0" ] || rm -rf -- "${temp_dir}" + +exit 0 diff --git a/xymon/xymon.files.alert.sh b/xymon/xymon.files.alert.sh new file mode 100755 index 0000000..28ff4bb --- /dev/null +++ b/xymon/xymon.files.alert.sh @@ -0,0 +1,102 @@ +#!/bin/sh +# Purpose {{{ +## If Xymon server says that a file is in error on a remote host, try to restart the related service. +## 1. Create a ssh keyring for xymon user {{{ +# sudo mkdir -p -- /var/lib/xymon/.ssh/ +# sudo ssh-keygen -f /var/lib/xymon/.ssh/id_rsa -N '' -q +# sudo chown -R xymon:xymon /var/lib/xymon/.ssh/ +## }}} +## 2. Remote user {{{ +# Ensure to have the ${REMOTE_SSH_USER} available on remote hosts and allowed to connect with SSH. 
#!/bin/sh
# Purpose {{{
## If Xymon server says that a file is in error on a remote host, try to
## restart the related service.
## 1. Create a ssh keyring for xymon user {{{
# sudo mkdir -p -- /var/lib/xymon/.ssh/
# sudo ssh-keygen -f /var/lib/xymon/.ssh/id_rsa -N '' -q
# sudo chown -R xymon:xymon /var/lib/xymon/.ssh/
## }}}
## 2. Remote user {{{
# Ensure to have the ${REMOTE_SSH_USER} available on remote hosts and allowed
# to connect with SSH.
# Restrict the SSH access to a single SSH key from the Xymon server IP
# (~${REMOTE_SSH_USER}/.ssh/authorized_keys) :
## from="IP.SRV.XYM.ON" ssh-rsa AAAAA…
# Allow sudo commands to restart services (/etc/sudoers.d/xymon-ssh) :
## xymon-ssh ALL=(root:root) NOPASSWD: /bin/systemctl restart *
## }}}
## 3. Environment {{{
# The script does not set these variables itself, it reads them from its
# environment :
## BBSVCNAME  : name of the probe (only "files" is handled)
## BBHOSTNAME : host to connect to with SSH
## BBALPHAMSG : text of the alert, parsed to find the files in error
## }}}

# }}}
# Vars {{{
# DEBUG=0 enables the debug messages and keeps ${temp_dir} after the run.
# Any other integer disables the messages and removes ${temp_dir} on exit.
DEBUG=1

REMOTE_SSH_USER="xymon-ssh"

# Every later redirection targets a file inside ${temp_dir}, so there is
# nothing useful to do without it.
temp_dir=$(mktemp -d -t xymon-files-alert-XXXXXX.tmp) || exit 1
debug_stdout="${temp_dir}/debug.stdout"
debug_stderr="${temp_dir}/debug.stderr"
file_list="${temp_dir}/services.error.list"

# Space separated list of the services already restarted during this run
restarted_services=""
# }}}

# Functions {{{
## Succeed only if the debug mode is enabled (DEBUG=0).
is_debug_enabled() {
  [ "${DEBUG}" -eq "0" ]
}

## Append a debug message to a log file if the debug mode is enabled.
## $1 : text of the message
## $2 : (optional) destination file, default to ${debug_stdout}
debug_message() {
  is_debug_enabled || return 0
  # Use \033 and not \e : POSIX printf does not define the \e escape.
  printf '\033[1;35m%-6s\033[m\n' "DEBUG : ${1}" >> "${2:-${debug_stdout}}"
}

## Restart a systemd service on ${BBHOSTNAME} with SSH and sudo.
## Output of the remote command goes to ${debug_stdout}/${debug_stderr}.
## $1 : context used as a prefix for the debug message
## $2 : name of the service, without the ".service" suffix
remote_restart() {
  debug_message "${1} — ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${REMOTE_SSH_USER}@${BBHOSTNAME} sudo systemctl restart ${2}.service"
  # -n : ssh must not read stdin, which is the file list inside the loop.
  ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
    "${REMOTE_SSH_USER}@${BBHOSTNAME}" "sudo systemctl restart ${2}.service" \
    >> "${debug_stdout}" 2>> "${debug_stderr}"
}

## Remove ${temp_dir} unless the debug mode is enabled.
cleanup_temp_dir() {
  is_debug_enabled || rm -rf -- "${temp_dir:?}"
}
# }}}

# Create log files
touch "${debug_stdout}" "${debug_stderr}"

# Manage only files probe {{{
if [ "${BBSVCNAME}" != "files" ]; then
  debug_message "${BBHOSTNAME} — ${BBSVCNAME} probe is not managed." "${debug_stderr}"
  cleanup_temp_dir
  exit 0
fi
debug_message "${BBHOSTNAME} — ${BBSVCNAME} error"
# }}}

# Check if a file exceeds its modification time {{{
# printf is used instead of echo to pass the message text through unchanged.
if printf '%s\n' "${BBALPHAMSG}" | grep -qE "File was modified.*ago - should be.*" ; then
  debug_message "Test file — Some files exceeds their modification time."
  # First restart Rsyslog service {{{
  debug_message "Test service — First restart rsyslog service."
  remote_restart "Test service" "rsyslog"
  # }}}

  ## Get the list of files path, for both red and yellow colors
  printf '%s\n' "${BBALPHAMSG}" \
    | sed -n -e 's;^&red \(.*\);\1;p' -e 's;^&yellow \(.*\);\1;p' \
    > "${file_list}"

  while IFS= read -r file_path; do
    debug_message "while file_path loop — ${file_path} exceeds it's modification time."
    # Match files path and services name {{{
    case "${file_path}" in
      # Cron
      '/var/log/cron.log' )
        service_name="cron"
        ;;
      # default
      * )
        service_name="NOT.MANAGED"
        ;;
    esac
    # }}}

    # Restart service if needed {{{
    if [ "${service_name}" = "NOT.MANAGED" ]; then
      debug_message "while file_path loop — service for ${file_path} is not managed."
      continue
    fi

    # Restart each service only once, wherever its files are in the list.
    case " ${restarted_services} " in
      *" ${service_name} "* )
        debug_message "while file_path loop — ${service_name} was already restarted."
        ;;
      * )
        debug_message "while file_path loop — ${service_name} need to be restarted."
        remote_restart "while file_path loop" "${service_name}"
        restarted_services="${restarted_services} ${service_name}"
        ;;
    esac
    # }}}
  done < "${file_list}"

  # Also restart xymon-client service {{{
  debug_message "Test service — xymon-client also need to be restarted."
  remote_restart "Test service" "xymon-client"
  # }}}
else
  debug_message "Test file — All files seems up to date."
fi
# }}}

# Remove empty error file
[ -s "${debug_stderr}" ] || rm -f -- "${debug_stderr}"

# Remove temp_dir if DEBUG is disabled
cleanup_temp_dir

exit 0
#!/bin/sh
# Purpose {{{
## If Xymon server says that a service uses old libraries on a remote host,
## try to restart this service.
## 1. Create a ssh keyring for xymon user {{{
# sudo mkdir -p -- /var/lib/xymon/.ssh/
# sudo ssh-keygen -f /var/lib/xymon/.ssh/id_rsa -N '' -q
# sudo chown -R xymon:xymon /var/lib/xymon/.ssh/
## }}}
## 2. Remote user {{{
# Ensure to have the ${REMOTE_SSH_USER} available on remote hosts and allowed
# to connect with SSH.
# Restrict the SSH access to a single SSH key from the Xymon server IP
# (~${REMOTE_SSH_USER}/.ssh/authorized_keys) :
## from="IP.SRV.XYM.ON" ssh-rsa AAAAA…
# Allow sudo commands to restart services (/etc/sudoers.d/xymon-ssh) :
## xymon-ssh ALL=(root:root) NOPASSWD: /bin/systemctl restart *
## }}}
## 3. Environment {{{
# The script does not set these variables itself, it reads them from its
# environment :
## BBSVCNAME  : name of the probe (only "libs" is handled)
## BBHOSTNAME : host to connect to with SSH
## BBALPHAMSG : text of the alert, parsed to find the binaries in error
## }}}

# }}}
# Vars {{{
# DEBUG=0 enables the debug messages and keeps ${temp_dir} after the run.
# Any other integer disables the messages and removes ${temp_dir} on exit.
DEBUG=1

REMOTE_SSH_USER="xymon-ssh"

# Every later redirection targets a file inside ${temp_dir}, so there is
# nothing useful to do without it.
temp_dir=$(mktemp -d -t xymon-libs-alert-XXXXXX.tmp) || exit 1
debug_stdout="${temp_dir}/debug.stdout"
debug_stderr="${temp_dir}/debug.stderr"
service_list="${temp_dir}/services.error.list"

# Space separated list of the services already restarted during this run
restarted_services=""
# }}}

# Functions {{{
## Succeed only if the debug mode is enabled (DEBUG=0).
is_debug_enabled() {
  [ "${DEBUG}" -eq "0" ]
}

## Append a debug message to a log file if the debug mode is enabled.
## $1 : text of the message
## $2 : (optional) destination file, default to ${debug_stdout}
debug_message() {
  is_debug_enabled || return 0
  # Use \033 and not \e : POSIX printf does not define the \e escape.
  printf '\033[1;35m%-6s\033[m\n' "DEBUG : ${1}" >> "${2:-${debug_stdout}}"
}

## Restart a systemd service on ${BBHOSTNAME} with SSH and sudo.
## Output of the remote command goes to ${debug_stdout}/${debug_stderr}.
## $1 : context used as a prefix for the debug message
## $2 : name of the service, without the ".service" suffix
remote_restart() {
  debug_message "${1} — ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${REMOTE_SSH_USER}@${BBHOSTNAME} sudo systemctl restart ${2}.service"
  # -n : ssh must not read stdin, which is the service list inside the loop.
  ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
    "${REMOTE_SSH_USER}@${BBHOSTNAME}" "sudo systemctl restart ${2}.service" \
    >> "${debug_stdout}" 2>> "${debug_stderr}"
}

## Remove ${temp_dir} unless the debug mode is enabled.
cleanup_temp_dir() {
  is_debug_enabled || rm -rf -- "${temp_dir:?}"
}
# }}}

# Create log files
touch "${debug_stdout}" "${debug_stderr}"

# Manage only libs probe {{{
if [ "${BBSVCNAME}" != "libs" ]; then
  debug_message "${BBHOSTNAME} — ${BBSVCNAME} probe is not managed." "${debug_stderr}"
  cleanup_temp_dir
  exit 0
fi
debug_message "${BBHOSTNAME} — ${BBSVCNAME} error"
# }}}

# Check if host need to reboot {{{
# Only reported in the debug log, the host is never rebooted from here.
if printf '%s\n' "${BBALPHAMSG}" | grep -qE "\\&yellow Machine should be reboot.*" ; then
  debug_message "Test kernel — The host need to be rebooted."
fi
# }}}

# Check if a service need to restart {{{
if printf '%s\n' "${BBALPHAMSG}" | grep -qE "\\&yellow The following processes.*" ; then
  debug_message "Test service — Some services need to be restarted."
  # Get the list of binaries path
  # NOTE(review): the leading whitespace of the pattern must match the real
  # alert text — confirm against a sample message.
  printf '%s\n' "${BBALPHAMSG}" | sed -n 's/^ \(\/.*\) (.*)/\1/p' > "${service_list}"

  while IFS= read -r bin_path; do
    debug_message "while bin_path loop — ${bin_path} use old libs."
    # Match binaries path and services name {{{
    case "${bin_path}" in
      # Apache2
      '/usr/sbin/apache2' )
        service_name="apache2"
        ;;
      # Arpwatch
      '/usr/sbin/arpwatch' )
        service_name="arpwatch"
        ;;
      # Dbus
      '/usr/bin/dbus-daemon' )
        service_name="dbus"
        ;;
      # lvmetad − LVM metadata cache daemon
      '/sbin/lvmetad' )
        service_name="lvm2-lvmetad"
        ;;
      # Netdata
      '/usr/sbin/netdata' )
        service_name="netdata"
        ;;
      # blkmapd − pNFS block layout mapping daemon
      '/usr/sbin/blkmapd' )
        service_name="nfs-utils"
        ;;
      # Mumble-server
      '/usr/sbin/murmurd' )
        service_name="mumble-server"
        ;;
      # Nginx
      '/usr/sbin/nginx' )
        service_name="nginx"
        ;;
      # Ntp
      '/usr/sbin/ntpd' )
        service_name="ntp"
        ;;
      # Nslcd
      '/usr/sbin/nslcd' )
        service_name="nslcd"
        ;;
      # PHP-FPM 7.0
      '/usr/sbin/php-fpm7.0' )
        service_name="php7.0-fpm"
        ;;
      # PHP-FPM 7.3
      '/usr/sbin/php-fpm7.3' )
        service_name="php7.3-fpm"
        ;;
      # Postfix
      '/usr/lib/postfix/sbin/pickup' | '/usr/lib/postfix/qmgr' | '/usr/lib/postfix/sbin/tlsmgr' | '/usr/lib/postfix/sbin/qmgr' )
        service_name="postfix"
        ;;
      # Rdnssd
      '/sbin/rdnssd' )
        service_name="rdnssd"
        ;;
      # Systemd-journald
      '/lib/systemd/systemd-journald' )
        service_name="systemd-journald"
        ;;
      # Systemd-logind
      '/lib/systemd/systemd-logind' )
        service_name="systemd-logind"
        ;;
      # default
      * )
        service_name="NOT.MANAGED"
        ;;
    esac
    # }}}

    # Restart service if needed {{{
    if [ "${service_name}" = "NOT.MANAGED" ]; then
      debug_message "while bin_path loop — service for ${bin_path} is not managed."
      continue
    fi

    # Several binaries map to the same service (e.g. Postfix) and they are
    # not always adjacent in the list : restart each service only once.
    case " ${restarted_services} " in
      *" ${service_name} "* )
        debug_message "while bin_path loop — ${service_name} was already restarted."
        ;;
      * )
        debug_message "while bin_path loop — ${service_name} need to be restarted."
        remote_restart "while bin_path loop" "${service_name}"
        restarted_services="${restarted_services} ${service_name}"
        ;;
    esac
    # }}}
  done < "${service_list}"

  # Also restart xymon-client service {{{
  debug_message "Test service — xymon-client also need to be restarted."
  remote_restart "Test service" "xymon-client"
  # }}}
fi
# }}}

# Remove empty error file
[ -s "${debug_stderr}" ] || rm -f -- "${debug_stderr}"

# Remove temp_dir if DEBUG is disabled
cleanup_temp_dir

exit 0
#!/bin/sh
# Purpose {{{
## If Xymon server says that a service is in error on a remote host, try to
## restart this service.
## 1. Create a ssh keyring for xymon user {{{
# sudo mkdir -p -- /var/lib/xymon/.ssh/
# sudo ssh-keygen -f /var/lib/xymon/.ssh/id_rsa -N '' -q
# sudo chown -R xymon:xymon /var/lib/xymon/.ssh/
## }}}
## 2. Remote user {{{
# Ensure to have the ${REMOTE_SSH_USER} available on remote hosts and allowed
# to connect with SSH.
# Restrict the SSH access to a single SSH key from the Xymon server IP
# (~${REMOTE_SSH_USER}/.ssh/authorized_keys) :
## from="IP.SRV.XYM.ON" ssh-rsa AAAAA…
# Allow sudo commands to restart services (/etc/sudoers.d/xymon-ssh) :
## xymon-ssh ALL=(root:root) NOPASSWD: /bin/systemctl restart *
## }}}
## 3. Xymon Configuration {{{
# PROC monitoring needs to display the real service name in its description :
## PROC %^/sbin/rpcbind MIN=1 MAX=1 COLOR=red "TEXT=rpcbind"
# You can add more information about this proc if you use an underscore "_" :
## PROC %^/usr/sbin/rpc.idmapd MIN=1 MAX=1 COLOR=red "TEXT=NFS-server_rpc.idmapd"
## This way, the script will only take the text before the underscore "_" as the
## service name to be restarted.
# Don't add whitespaces in the description of a process.
## }}}
## 4. Environment {{{
# The script does not set these variables itself, it reads them from its
# environment :
## BBSVCNAME  : name of the probe (only "procs" is handled)
## BBHOSTNAME : host to connect to with SSH
## BBALPHAMSG : text of the alert, parsed to find the processes in error
## }}}
# }}}

# Vars {{{
# DEBUG=0 enables the debug messages and keeps ${temp_dir} after the run.
# Any other integer disables the messages and removes ${temp_dir} on exit.
DEBUG=1

REMOTE_SSH_USER="xymon-ssh"

# Every later redirection targets a file inside ${temp_dir}, so there is
# nothing useful to do without it.
temp_dir=$(mktemp -d -t xymon-procs-alert-XXXXXX.tmp) || exit 1
debug_stdout="${temp_dir}/debug.stdout"
debug_stderr="${temp_dir}/debug.stderr"
service_list="${temp_dir}/services.error.list"
# }}}

# Functions {{{
## Succeed only if the debug mode is enabled (DEBUG=0).
is_debug_enabled() {
  [ "${DEBUG}" -eq "0" ]
}

## Append a debug message to a log file if the debug mode is enabled.
## $1 : text of the message
## $2 : (optional) destination file, default to ${debug_stdout}
debug_message() {
  is_debug_enabled || return 0
  # Use \033 and not \e : POSIX printf does not define the \e escape.
  printf '\033[1;35m%-6s\033[m\n' "DEBUG : ${1}" >> "${2:-${debug_stdout}}"
}

## Succeed only if $1 is a non-empty string of decimal digits.
is_unsigned_integer() {
  case "${1}" in
    '' | *[!0-9]* ) return 1 ;;
  esac
  return 0
}

## Succeed only if $1 is non-empty and made of characters that are harmless
## once interpolated in the remote command line.
## The name comes from the (lowercased) alert text, so it is checked before
## being handed to the remote shell.
is_safe_service_name() {
  case "${1}" in
    '' | *[!a-z0-9:@.-]* ) return 1 ;;
  esac
  return 0
}

## Restart a systemd service on ${BBHOSTNAME} with SSH and sudo.
## Output of the remote command goes to ${debug_stdout}/${debug_stderr}.
## $1 : context used as a prefix for the debug message
## $2 : name of the service, without the ".service" suffix
remote_restart() {
  debug_message "${1} — ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${REMOTE_SSH_USER}@${BBHOSTNAME} sudo systemctl restart ${2}.service"
  # -n : ssh must not read stdin, which is the service list inside the loop.
  ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
    "${REMOTE_SSH_USER}@${BBHOSTNAME}" "sudo systemctl restart ${2}.service" \
    >> "${debug_stdout}" 2>> "${debug_stderr}"
}

## Remove ${temp_dir} unless the debug mode is enabled.
cleanup_temp_dir() {
  is_debug_enabled || rm -rf -- "${temp_dir:?}"
}
# }}}

# Create log files
touch "${debug_stdout}" "${debug_stderr}"

# Manage only procs probe {{{
if [ "${BBSVCNAME}" != "procs" ]; then
  debug_message "${BBHOSTNAME} — ${BBSVCNAME} probe is not managed." "${debug_stderr}"
  cleanup_temp_dir
  exit 0
fi
debug_message "${BBHOSTNAME} — ${BBSVCNAME} error"
# }}}

# Get the list of processes with an error
# Keep the red/yellow lines, drop the color field and lowercase the rest.
printf '%s\n' "${BBALPHAMSG}" \
  | grep -E "&(red|yellow)" \
  | cut -d" " -f2- \
  | tr '[:upper:]' '[:lower:]' > "${service_list}"

# If any error on a process
if [ -s "${service_list}" ]; then
  debug_message "process list — Some processes seems to be in error."
  while IFS= read -r line; do
    # Forget the values of the previous line : a line matching none of the
    # patterns below must never restart the previous service again.
    service_name=""
    process_found=""
    process_min=""
    process_max=""

    ## Pattern "req. between" {{{
    if printf '%s\n' "${line}" | grep -q -E -- ".* \\(found .*, req. between .* and .*\\)" ; then
      debug_message "while process loop — Pattern \"req. between\"."
      service_name="$(printf '%s\n' "${line}" | cut -d" " -f1 | sed 's/_.*//')"
      process_found="$(printf '%s\n' "${line}" | cut -d" " -f3 | tr -d ',')"
      process_min="$(printf '%s\n' "${line}" | cut -d" " -f6)"
      process_max="$(printf '%s\n' "${line}" | cut -d" " -f8 | tr -d ')')"
    fi
    ## }}}
    ## Pattern "req. .* or more" {{{
    if printf '%s\n' "${line}" | grep -q -E -- ".* \\(found .*, req. .* or more\\)" ; then
      debug_message "while process loop — Pattern \"req. .* or more\"."
      service_name="$(printf '%s\n' "${line}" | cut -d" " -f1 | sed 's/_.*//')"
      process_found="$(printf '%s\n' "${line}" | cut -d" " -f3 | tr -d ',')"
      process_min="$(printf '%s\n' "${line}" | cut -d" " -f5)"
      process_max="nolimit"
    fi
    ## }}}
    debug_message "while process loop — Found ${process_found} process(es) for ${service_name} service and require between ${process_min} and ${process_max}."

    # Restart service if needed {{{
    # Only when fewer processes than required were found, and only with
    # values that are really integers and a name safe to send remotely.
    if is_unsigned_integer "${process_found}" \
      && is_unsigned_integer "${process_min}" \
      && is_safe_service_name "${service_name}" \
      && [ "${process_found}" -lt "${process_min}" ]; then
      debug_message "while process loop — ${service_name} need to be restarted."
      remote_restart "while process loop" "${service_name}"
    else
      debug_message "while process loop — ${service_name} service is not managed."
    fi
    # }}}
  done < "${service_list}"

  # Also restart xymon-client service {{{
  debug_message "process list — xymon-client also need to be restarted."
  remote_restart "process list" "xymon-client"
  # }}}
else
  debug_message "process list — No error on any process."
fi

# Remove empty error file
[ -s "${debug_stderr}" ] || rm -f -- "${debug_stderr}"

# Remove temp_dir if DEBUG is disabled
cleanup_temp_dir

exit 0