scripts/xymon/xymon.procs.alert.sh
2024-12-01 16:50:27 +01:00

108 lines
5.4 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/sh
# Purpose {{{
## If Xymon server says that a service is in error on a remote host, try to
## restart this service.
## 1. Create a ssh keyring for xymon user {{{
# sudo mkdir -p -- /var/lib/xymon/.ssh/
# sudo ssh-keygen -f /var/lib/xymon/.ssh/id_rsa -N '' -q
# sudo chown -R xymon:xymon /var/lib/xymon/.ssh/
## }}}
## 2. Remote user {{{
# Ensure to have the ${REMOTE_SSH_USER} available on remote hosts and allowed
# to connect with SSH.
# Restrict the SSH access to a single SSH key from the Xymon server IP
# (~${REMOTE_SSH_USER}/.ssh/authorized_keys):
## from="IP.SRV.XYM.ON" ssh-rsa AAAAA…
# Allow sudo commands to restart services (/etc/sudoers.d/xymon-ssh):
## xymon-ssh ALL=(root:root) NOPASSWD: /bin/systemctl restart *
## }}}
## 3. Xymon Configuration {{{
# PROC monitoring need to display the real service name in it's description:
## PROC %^/sbin/rpcbind MIN=1 MAX=1 COLOR=red "TEXT=rpcbind"
# You can add more information about this proc if you an underscore "_":
## PROC %^/usr/sbin/rpc.idmapd MIN=1 MAX=1 COLOR=red "TEXT=NFS-server_rpc.idmapd"
## This way, the script will only take the text before the underscore "_" as the
## service name to be restarted.
# Don't add whitespaces in the description of a process.
## }}}
# }}}
# Vars {{{
DEBUG=1
REMOTE_SSH_USER="xymon-ssh"
temp_dir=$(mktemp -d -t xymon-procs-alert-XXXXXX.tmp)
debug_stdout="${temp_dir}/debug.stdout"
debug_stderr="${temp_dir}/debug.stderr"
service_list="${temp_dir}/services.error.list"
# }}}
# Create log files
touch "${debug_stdout}" "${debug_stderr}"
# Manage only procs probe {{{
if [ "${BBSVCNAME}" = "procs" ]; then
[ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG: ${BBHOSTNAME}${BBSVCNAME} error" >> "${debug_stdout}"
else
[ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG: ${BBHOSTNAME}${BBSVCNAME} probe is not managed." >> "${debug_stderr}"
[ "${DEBUG}" -eq "0" ] || rm -rf -- "${temp_dir}"
exit 0
fi
# }}}
# Get the list of processes with an error
echo "${BBALPHAMSG}" | grep -E "&(red|yellow)" | cut -d" " -f2- | tr '[:upper:]' '[:lower:]' > "${service_list}"
# If any error on a process
if [ -s "${service_list}" ]; then
[ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG: process list — Some processes seems to be in error." >> "${debug_stdout}"
while IFS= read -r line; do
## Pattern "req. between" {{{
if echo "${line}" | grep -q -E -- ".* \\(found .*, req. between .* and .*\\)" ; then
[ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG: while process loop — Pattern \"req. between\"." >> "${debug_stdout}"
service_name="$(echo "${line}" | cut -d" " -f1 | sed 's/_.*//')"
process_found="$(echo "${line}" | cut -d" " -f3 | tr -d ',')"
process_min="$(echo "${line}" | cut -d" " -f6)"
process_max="$(echo "${line}" | cut -d" " -f8 | tr -d ')')"
fi
## }}}
## Pattern "req. .* or more" {{{
if echo "${line}" | grep -q -E -- ".* \\(found .*, req. .* or more\\)" ; then
[ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG: while process loop — Pattern \"req. .* or more\"." >> "${debug_stdout}"
service_name="$(echo "${line}" | cut -d" " -f1 | sed 's/_.*//')"
process_found="$(echo "${line}" | cut -d" " -f3 | tr -d ',')"
process_min="$(echo "${line}" | cut -d" " -f5)"
process_max="nolimit"
fi
## }}}
[ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG: while process loop — Found ${process_found} process(es) for ${service_name} service and require between ${process_min} and ${process_max}." >> "${debug_stdout}"
# Restart service if needed {{{
if [ "${process_found}" ] && [ "${process_min}" ] && [ "${process_found}" -lt "${process_min}" ]; then
[ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG: while process loop — ${service_name} need to be restarted." >> "${debug_stdout}"
[ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG: while process loop — ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${REMOTE_SSH_USER}@${BBHOSTNAME} sudo systemctl restart ${service_name}.service" >> "${debug_stdout}"
ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${REMOTE_SSH_USER}"@"${BBHOSTNAME}" "sudo systemctl restart ${service_name}.service" >> "${debug_stdout}" 2>> "${debug_stderr}"
else
[ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG: while process loop — ${service_name} service is not managed." >> "${debug_stdout}"
fi
# }}}
done < "${service_list}"
# Also restart xymon-client service {{{
[ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG: process list — xymon-client also need to be restarted." >> "${debug_stdout}"
[ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG: process list — ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${REMOTE_SSH_USER}@${BBHOSTNAME} sudo systemctl restart xymon-client.service" >> "${debug_stdout}"
ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${REMOTE_SSH_USER}"@"${BBHOSTNAME}" "sudo systemctl restart xymon-client.service" >> "${debug_stdout}" 2>> "${debug_stderr}"
# }}}
else
[ "${DEBUG}" -eq "0" ] && printf '\e[1;35m%-6s\e[m\n' "DEBUG: process list — No error on any process." >> "${debug_stdout}"
fi
# Remove empty error file
[ -s "${debug_stderr}" ] || rm -f "${debug_stderr}"
# Remove temp_dir if DEBUG is disable
[ "${DEBUG}" -eq "0" ] || rm -rf -- "${temp_dir}"
exit 0