Another freeze break request - this one adds a Nagios check for connectivity to memcached. The check attempts to call the daemon's stats command, which, if the daemon is broken, might hang and cause NRPE to time out. We want that, as it will give us a clue as to what might be causing some other app to fail. This is in part what happened this morning: the memcached process was present, and it was even accepting TCP connections, but it would not respond to commands. --- .../files/scripts/check_memcache_connect | 24 ++++++++++++++++++++ roles/nagios_client/tasks/main.yml | 1 + .../nagios_client/templates/check_memcache.cfg.j2 | 2 +- .../files/nagios/services/memcached.cfg | 16 +++++++++++- roles/nagios_server/files/nrpe.cfg | 1 + 5 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 roles/nagios_client/files/scripts/check_memcache_connect diff --git a/roles/nagios_client/files/scripts/check_memcache_connect b/roles/nagios_client/files/scripts/check_memcache_connect new file mode 100644 index 0000000..7c472e3 --- /dev/null +++ b/roles/nagios_client/files/scripts/check_memcache_connect @@ -0,0 +1,24 @@ +#!/bin/bash +# +# 2014-12-19 +# Author: Ralph Bean <rbean@xxxxxxxxxx> + +# exit codes +ok=0 +warn=1 +crit=2 +unkn=3 + +# Right now we just check to see if we can even run this command without +# hanging and timing out. In the future, we could parse stdout for more +# fine-grained information. +echo stats | nc 127.0.0.1 11211 > /dev/null +status=$? 
+ +if [ $status -ne 0 ]; then + echo "CRIT: stats command got status code $status" + exit $crit +else + echo "OK: stats command got status code $status" + exit $ok +fi diff --git a/roles/nagios_client/tasks/main.yml b/roles/nagios_client/tasks/main.yml index 6c91dda..aa9b6c2 100644 --- a/roles/nagios_client/tasks/main.yml +++ b/roles/nagios_client/tasks/main.yml @@ -31,6 +31,7 @@ - check_fedmsg_producers_consumers.py - check_supybot_plugin - check_datanommer_timesince.py + - check_memcache_connect when: not inventory_hostname.startswith('noc') tags: - nagios_client diff --git a/roles/nagios_client/templates/check_memcache.cfg.j2 b/roles/nagios_client/templates/check_memcache.cfg.j2 index b350a65..b0ec100 100644 --- a/roles/nagios_client/templates/check_memcache.cfg.j2 +++ b/roles/nagios_client/templates/check_memcache.cfg.j2 @@ -1,2 +1,2 @@ command[check_memcache]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -a '/usr/bin/memcached' -u memcached - +command[check_memcache_connect]=/usr/lib64/nagios/plugins/check_memcache_connect diff --git a/roles/nagios_server/files/nagios/services/memcached.cfg b/roles/nagios_server/files/nagios/services/memcached.cfg index 9f497b5..814a5a8 100644 --- a/roles/nagios_server/files/nagios/services/memcached.cfg +++ b/roles/nagios_server/files/nagios/services/memcached.cfg @@ -1,12 +1,24 @@ define service { host_name memcached01 - service_description Check memcached daemon + service_description Check for the presence of the memcached daemon check_command check_by_nrpe!check_memcache use defaulttemplate } define service { host_name memcached02 - service_description Check memcached daemon + service_description Check for the presence of the memcached daemon check_command check_by_nrpe!check_memcache use defaulttemplate } +define service { + host_name memcached01 + service_description Check for connectivity to the memcached daemon + check_command check_by_nrpe!check_memcache_connect + use defaulttemplate +} +define service { + host_name 
memcached02 + service_description Check for connectivity to the memcached daemon + check_command check_by_nrpe!check_memcache_connect + use defaulttemplate +} diff --git a/roles/nagios_server/files/nrpe.cfg b/roles/nagios_server/files/nrpe.cfg index 86af64b..4fb1cdb 100644 --- a/roles/nagios_server/files/nrpe.cfg +++ b/roles/nagios_server/files/nrpe.cfg @@ -238,6 +238,7 @@ command[check_fcomm_queue]=/usr/lib64/nagios/plugins/check_fcomm_queue command[check_redis_proc]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -C 'redis-server' -u redis command[check_openvpn_link]=/usr/lib64/nagios/plugins/check_ping -H 192.168.1.58 -w 375.0,20% -c 500,60% command[check_memcache]=/usr/lib64/nagios/plugins/check_procs -c 1:1 -a '/usr/bin/memcached' -u memcached +command[check_memcache_connect]=/usr/lib64/nagios/plugins/check_memcache_connect # The following are fedmsg/datanommer checks to be run on busgateway01. # They check for the time since the latest message in any particular category. -- 1.7.2.1
Attachment:
pgphx2PrEuL8W.pgp
Description: PGP signature
_______________________________________________ infrastructure mailing list infrastructure@xxxxxxxxxxxxxxxxxxxxxxx https://admin.fedoraproject.org/mailman/listinfo/infrastructure