From 1bb3df6c6eeca7bf31ee2a73d0101eeeb4ba31dd Mon Sep 17 00:00:00 2001
From: Patrick East
Date: Mon, 11 May 2015 13:46:52 -0700
Subject: [PATCH] Add some Nagios check scripts for some services involved with CI systems

These can be set up for nrpe or ssh checks on jenkins master nodes and
cover a few basic checks like:

- Ensuring a nodepool image age is not greater than some threshold
- Checking a specific job is registered in gearman
- Checking the health score for a specific jenkins job

All of these have tons of room for improvement and are just barebones
check scripts (closer to idiot checks) but can help detect some common
problems with 3rd party ci systems.

Change-Id: Ic6fbcfbdbf577646ebebf133b492f35535f37b00
---
 monitoring/nagios/checks/check_gearman.py     | 39 +++++++++++
 monitoring/nagios/checks/check_jenkins.py     | 66 +++++++++++++++++++
 .../nagios/checks/check_nodepool_image.py     | 59 +++++++++++++++++
 monitoring/nagios/checks/utils.py             | 24 +++++++
 4 files changed, 188 insertions(+)
 create mode 100755 monitoring/nagios/checks/check_gearman.py
 create mode 100755 monitoring/nagios/checks/check_jenkins.py
 create mode 100755 monitoring/nagios/checks/check_nodepool_image.py
 create mode 100644 monitoring/nagios/checks/utils.py

diff --git a/monitoring/nagios/checks/check_gearman.py b/monitoring/nagios/checks/check_gearman.py
new file mode 100755
index 0000000..685ae07
--- /dev/null
+++ b/monitoring/nagios/checks/check_gearman.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+"""Nagios check: verify that a job is registered with gearman."""
+
+import argparse
+import sys
+
+import utils
+
+
+def check_gearman_status(job_name):
+    """Returns a tuple of exit code and message string
+
+    Exit codes are either 2 -> critical or 0 -> OK
+    There are no warnings with gearman job checker
+
+    The job counts as registered when job_name appears anywhere in the
+    'status' output of the gearman server listening on 127.0.0.1:4730.
+    """
+    try:
+        gearadmin_status = utils.run_command_local(
+            '(echo status ; sleep 0.1) | netcat 127.0.0.1 4730 -w 1')
+    except Exception as e:
+        return 2, 'Failed to check gearman status: ' + str(e)
+
+    if job_name not in gearadmin_status:
+        return 2, ('Failed to find job registered with gearman!\n'
+                   'status:\n%s' % gearadmin_status)
+
+    return 0, job_name + ' is registered with gearman'
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Check gearman job status.')
+    parser.add_argument('--job', required=True, type=str,
+                        help='the job name to check for')
+    args = parser.parse_args()
+    code, message = check_gearman_status(args.job)
+    print(message)
+    sys.exit(code)
diff --git a/monitoring/nagios/checks/check_jenkins.py b/monitoring/nagios/checks/check_jenkins.py
new file mode 100755
index 0000000..c49042e
--- /dev/null
+++ b/monitoring/nagios/checks/check_jenkins.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+"""Nagios check: report on the health score of a jenkins job."""
+
+import argparse
+import ast
+import sys
+
+try:
+    from urllib.request import urlopen  # Python 3
+except ImportError:
+    from urllib import urlopen  # Python 2
+
+import utils
+
+
+def check_jenkins_status(job_name, warning_threshold, critial_threshold):
+    """Returns a tuple of exit code and message string
+
+    Exit codes are either 2 -> critical, 1 -> warning, or 0 -> OK
+    The code is determined based on the job health score and thresholds
+    passed into the script: a score at or below the critical threshold is
+    critical, otherwise a score at or below the warning threshold is a
+    warning.
+    """
+    target_url = 'http://localhost:8080/job/%s/api/python' % job_name
+    try:
+        response = urlopen(target_url)
+        try:
+            raw_job = response.read()
+        finally:
+            response.close()
+
+        # The python flavour of the jenkins API returns a python literal.
+        # Parse it with literal_eval rather than eval so that the response
+        # can never execute code on the monitoring host.
+        jenkins_job = ast.literal_eval(raw_job.decode('utf-8'))
+        if not jenkins_job:
+            return 2, 'Jenkins returned no data for job ' + job_name
+
+        health_reports = jenkins_job.get('healthReport')
+        if not health_reports:
+            return 2, 'Jenkins job %s has no health report' % job_name
+        health_score = health_reports[0]['score']
+    except Exception as e:
+        return 2, 'Error checking jenkins job status: ' + str(e)
+
+    exit_code = 0
+    if health_score <= critial_threshold:
+        exit_code = 2
+    elif health_score <= warning_threshold:
+        exit_code = 1
+    return exit_code, 'Jenkins job health score is ' + str(health_score)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Check jenkins job status.')
+    parser.add_argument('--job', required=True, type=str,
+                        help='the job name to check for')
+    parser.add_argument('-w', required=True, type=int,
+                        help='warning threshold of health score')
+    parser.add_argument('-c', required=True, type=int,
+                        help='critical threshold of health score')
+    args = parser.parse_args()
+    code, message = check_jenkins_status(args.job, args.w, args.c)
+    print(message)
+    sys.exit(code)
diff --git a/monitoring/nagios/checks/check_nodepool_image.py b/monitoring/nagios/checks/check_nodepool_image.py
new file mode 100755
index 0000000..97b2c0d
--- /dev/null
+++ b/monitoring/nagios/checks/check_nodepool_image.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+"""Nagios check: alert when the newest ready nodepool image is too old."""
+
+import argparse
+import re
+import sys
+
+import utils
+
+# Matches the last two columns (state and age in hours) of an image-list row.
+IMAGE_ROW_RE = re.compile(r'\|\s+(\w+)\s+\|\s+(\d+\.\d+)\s+\|$')
+
+
+def check_nodepool_image_status(warning_threshold, critial_threshold):
+    """Returns a tuple of exit code and message string
+
+    Exit codes are either 2 -> critical, 1 -> warning, or 0 -> OK
+    The code is determined by comparing the age (in hours) of the newest
+    image in the 'ready' state against the thresholds passed in. Finding
+    no ready image at all is critical.
+    """
+    try:
+        image_list_raw = utils.run_command_local(
+            'sudo /usr/local/bin/nodepool image-list')
+        ready_image_ages = []
+        for line in image_list_raw.split('\n'):
+            match = IMAGE_ROW_RE.search(line.rstrip())
+            if match and match.group(1) == 'ready':
+                ready_image_ages.append(float(match.group(2)))
+
+        # Test for "no ready image" explicitly instead of relying on the
+        # truthiness of the age: a freshly built image has a valid age of 0.0.
+        if not ready_image_ages:
+            return 2, ('No ready nodepool image found, output: ' +
+                       image_list_raw)
+
+        newest_image_age = min(ready_image_ages)
+        exit_code = 0
+        if newest_image_age > critial_threshold:
+            exit_code = 2
+        elif newest_image_age > warning_threshold:
+            exit_code = 1
+        message = 'Nodepool image age (hours): ' + str(newest_image_age)
+        return exit_code, message
+    except Exception as e:
+        return 2, 'Error checking nodepool images: %s' % e
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Check nodepool image status.')
+    parser.add_argument('-w', required=True, type=float,
+                        help='warning threshold for age of the image in hours')
+    parser.add_argument('-c', required=True, type=float,
+                        help='critical threshold for age of the image in hours')
+    args = parser.parse_args()
+    code, message = check_nodepool_image_status(args.w, args.c)
+    print(message)
+    sys.exit(code)
diff --git a/monitoring/nagios/checks/utils.py b/monitoring/nagios/checks/utils.py
new file mode 100644
index 0000000..2d432b9
--- /dev/null
+++ b/monitoring/nagios/checks/utils.py
@@ -0,0 +1,24 @@
+"""Helpers shared by the nagios check scripts."""
+
+import subprocess
+
+
+def run_command_local(command):
+    """Run command through the shell and return its output as text
+
+    stderr is folded into the returned output. This never raises: when
+    the command exits non-zero whatever it printed is returned (or a
+    description of the failure if it printed nothing), and any other
+    error is returned as its string form, so callers can always include
+    the result in their check message.
+    """
+    try:
+        return subprocess.check_output(command, shell=True,
+                                       stderr=subprocess.STDOUT,
+                                       universal_newlines=True)
+    except subprocess.CalledProcessError as e:
+        # CalledProcessError carries no message of its own; the output of
+        # the failed command is what explains the failure.
+        return e.output or str(e)
+    except Exception as e:
+        return str(e)