#!/usr/bin/env python2.7
"""MongoDB Nagios check script

This program is part of $PROJECT_NAME$
License: GPL License (see COPYING)

Author David Murphy
Copyright 2014-2015 Percona LLC and/or its affiliates
"""

# Provenance: recovered from the patch "initial commit of source fetch"
# (commit 1e2387474a449452b78520b9ad96a8b4b5e99722, Harald Pfeiffer,
# Wed, 17 Apr 2019 19:07:19 +0200), which created this file as
# nagios-plugins-contrib-24.20190301~bpo9+1/percona-nagios-plugins/nagios/bin/pmp-check-mongo.py

import json
import sys
import time
import optparse
import os
import re
import stat
import pickle
import traceback
import pprint

from types import FunctionType
# Not yet implemented
# import DeepDiff

try:
    import pymongo
except ImportError as e:
    print(e)
    sys.exit(2)

# As of pymongo v 1.9 the SON API is part of the BSON package, therefore attempt
# to import from there and fall back to pymongo in cases of older pymongo.
# The import itself is probed instead of comparing version strings, because a
# lexicographic comparison ranks "1.10" below "1.9".
try:
    import bson.son as son
except ImportError:
    import pymongo.son as son


# Nagios plugin exit statuses, keyed by the lower-case result type used
# throughout this script.
EXIT_CODES = {"ok": 0, "warning": 1, "critical": 2, "unknown": 3}


# Adding special behavior for optparse
class OptionParsingError(RuntimeError):
    """Raised by ModifiedOptionParser instead of letting optparse exit."""

    def __init__(self, msg):
        self.msg = msg


class ModifiedOptionParser(optparse.OptionParser):
    """OptionParser whose error() raises so the caller chooses the exit status."""

    def error(self, msg):
        raise OptionParsingError(msg)


def unicode_truncate(s, length, encoding='utf-8'):
    """Return `s` cut to at most `length` encoded bytes, as decoded text.

    A multi-byte character split by the cut is dropped rather than left
    half-encoded.  Byte strings are decoded first so that non-ASCII input
    does not trigger an implicit ASCII decode.
    """
    if isinstance(s, bytes):
        s = s.decode(encoding, 'ignore')
    encoded = s.encode(encoding)[:length]
    return encoded.decode(encoding, 'ignore')


def _parse_version(version_string):
    """Turn a dotted version string into a 3-tuple of ints.

    Missing components count as 0 and any non-numeric suffix of a component
    (for example "0-rc1") is ignored, so the result can be compared
    numerically against tuples such as (2, 7, 0).
    """
    parts = []
    for piece in version_string.split('.')[:3]:
        digits = re.match(r'\d+', piece)
        parts.append(int(digits.group(0)) if digits else 0)
    while len(parts) < 3:
        parts.append(0)
    return tuple(parts)


def _anchor_relative_path(path):
    """Prefix a relative path with os.curdir; leave absolute paths untouched."""
    if os.path.isabs(path):
        return path
    return "%s/%s" % (os.curdir, path)


def parse_options(args):
    """Parse the command line in `args` and return optparse's (options, args).

    The valid values of --action are discovered from the check_* methods of
    NagiosMongoChecks.  Parsing problems end the program with an UNKNOWN
    status instead of returning.
    """
    funcList = sorted(
        item_name for item_name, item_type in NagiosMongoChecks.__dict__.items()
        if type(item_type) is FunctionType and item_name.startswith("check_") and item_name != 'check_levels'
    )
    p = ModifiedOptionParser()

    p.add_option('-H', '--host', action='store', type='string', dest='host', default='127.0.0.1', help='The hostname you want to connect to')
    p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is running on')
    p.add_option('-u', '--user', action='store', type='string', dest='user', default=None, help='The username you want to login as')
    p.add_option('-p', '--password', action='store', type='string', dest='passwd', default=None, help='The password you want to use for that user')
    p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold you want to set')
    p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold you want to set')
    p.add_option('-A', '--action', action='store', type='choice', dest='action', default='check_connect',
                 choices=funcList, help="The action you want to take. Valid choices are (%s) Default: %s" % (", ".join(funcList), 'check_connect'))
    p.add_option('-s', '--ssl', dest='ssl', default=False, help='Connect using SSL')
    p.add_option('-r', '--replicaset', dest='replicaset', default=None, help='Connect to replicaset')
    p.add_option('-c', '--collection', action='store', dest='collection', default='foo', help='Specify the collection in check_cannary_test')
    p.add_option('-d', '--database', action='store', dest='database', default='tmp', help='Specify the database in check_cannary_test')
    p.add_option('-q', '--query', action='store', dest='query', default='{"_id":1}', help='Specify the query in check_cannary_test')
    p.add_option('--statusfile', action='store', dest='status_filename', default='status.dat', help='File to current store state data in for delta checks')
    p.add_option('--backup-statusfile', action='store', dest='status_filename_backup', default='status_backup.dat',
                 help='File to previous store state data in for delta checks')
    p.add_option('--max-stale', action='store', dest='max_stale', type='int', default=60, help='Age of status file to make new checks (seconds)')
    # Add options for output stat file
    try:
        return p.parse_args(args)
    except OptionParsingError as e:
        if 'no such option' in e.msg:
            return_result("unknown", "No such options of %s" % e.msg.split(":")[1])
        if 'invalid choice' in e.msg:
            error_item = e.msg.split(":")[2].split("'")[1]
            return_result("unknown", 'No such action of %s found!' % error_item)
        # Any other parse error (bad integer, missing argument, ...)
        return_result("unknown", e.msg)


def return_result(result_type, message):
    """Print the Nagios status line on stdout and exit with the matching code.

    `result_type` is one of "ok", "warning", "critical"; anything else is
    reported as UNKNOWN (exit status 3).  This function never returns.
    """
    label = result_type if result_type in EXIT_CODES else "unknown"
    print("%s - %s" % (label.upper(), message))
    sys.exit(EXIT_CODES[label])


def main(argv):
    """Parse `argv` and run the selected check; exits via return_result()."""
    options, arguments = parse_options(argv)
    check(options, options.action)


def check(args, check_name):
    """Build the checker, run the method named `check_name` and report it.

    Any exception raised while connecting or checking is reported as
    CRITICAL; the traceback goes to stderr so that the status line remains
    the first line on stdout.
    """
    try:
        checksObj = NagiosMongoChecks(args)
        run_check = getattr(checksObj, check_name)
        result_type, message = run_check(args, args.warning, args.critical)
    except Exception as e:
        traceback.print_exc(file=sys.stderr)
        return_result("critical", str(e))
    return_result(result_type, message)


class NagiosMongoChecks:
    """Connects to MongoDB and implements the individual check_* actions.

    Every public check_* method takes (args, warning_level, critical_level)
    and returns a (result_type, message) pair for return_result().
    """

    # need to initialize variables and such still
    def __init__(self, args):
        """Copy the parsed options onto the instance, connect and load status.

        Raises pymongo.errors.ConnectionFailure when a fresh serverStatus is
        needed but no connection could be established.
        """
        # setup inital values from optParse
        self.host = '127.0.0.1'
        self.port = 27017
        self.user = None
        self.passwd = None
        self.password = None
        self.warning = None
        self.critical = None
        self.action = 'check_connect'
        self.ssl = False
        self.replicaset = None
        self.collection = 'foo'
        self.database = 'tmp'
        self.query = '{"_id":1}'
        self.status_filename = 'status.dat'
        self.status_filename_backup = 'status_backup.dat'
        self.max_stale = 60

        for option in vars(args):
            setattr(self, option, getattr(args, option))

        # Anchor relative status file names to the current directory, but
        # leave absolute paths exactly as the user gave them.
        self.status_filename_backup = _anchor_relative_path(self.status_filename_backup)
        self.status_filename = _anchor_relative_path(self.status_filename)

        # ammend known intenal values we will need
        self.current_status = {}
        self.last_status = {}
        self.connection = None
        self.connection_time = None
        self.mongo_version = None
        self.pyMongoError = None

        self.connect()

        if self.file_age(self.status_filename) <= self.max_stale:
            # Save status_file contents status as current_status
            self.get_last_status(True)
            # Save status_filename_backup contents as last_status
            self.get_last_status(False, self.status_filename_backup)
        else:
            if self.connection is None:
                raise pymongo.errors.ConnectionFailure(self.pyMongoError or "No connection Found, did connect fail?")
            # Get fresh current_status from server
            self.current_status = self.sanatize(self.get_server_status())
            # user last status_filename contents as last_status
            self.get_last_status(False, self.status_filename)
            # Not yet implemented
            # self.compute_deltas()

        # get last status
        # check if needs refresh, refresh if needed
        # set last/current to self.current_status

    def get_last_status(self, returnAsCurrent, forceFile=None):
        """Load a pickled status file into current_status or last_status.

        Reads `forceFile` when given, otherwise self.status_filename.  The
        data is stored as last_status when `returnAsCurrent` is None or
        False and as current_status otherwise.  Returns True on success and
        False when the file cannot be read or unpickled.
        """
        file_name = forceFile if forceFile is not None else self.status_filename
        try:
            # NOTE: pickle executes code embedded in the file; the status
            # files must only be writable by the user running this check.
            with open(file_name, 'rb') as fileObject:
                status = pickle.load(fileObject)
        except Exception:
            return False
        if returnAsCurrent is None or returnAsCurrent is False:
            self.last_status = status
        else:
            self.current_status = status
        return True

    def get_server_status(self):
        """Run serverStatus on the admin database and return its result.

        Exits with UNKNOWN when the command is refused or fails.
        """
        try:
            data = self.connection['admin'].command(son.SON([('serverStatus', 1)]))
        except Exception as e:
            if isinstance(e, pymongo.errors.OperationFailure):
                return_result("unknown", "Not authorized!")
            return_result("unknown", "Unable to run serverStatus: %s::%s" % (type(e).__name__, unicode_truncate(str(e), 45)))

        if self.current_status is None:
            self.current_status = data

        return data

    # figure out how to use this one later
    def rotate_files(self):
        """Persist the status snapshots for later delta checks.

        With no previous status, the current status seeds the backup file,
        an empty current file starts the aging clock, and the program exits
        UNKNOWN.  Otherwise last_status is written to the backup file and
        current_status to the status file.
        """
        if self.last_status == {}:
            # Build the last status file for future deltas from current data
            self.save_file(self.status_filename_backup, self.current_status)
            # Set the current status file to empty to set the aging clock
            self.save_file(self.status_filename, {})
            return_result("unknown", "No status data present, please try again in %s seconds" % self.max_stale)
        else:
            self.save_file(self.status_filename_backup, self.last_status)
            self.save_file(self.status_filename, self.current_status)

    def save_file(self, filename, contents):
        """Pickle `contents` into `filename`; exit UNKNOWN if that fails."""
        try:
            with open(filename, "wb") as status_file:
                pickle.dump(contents, status_file)
        except Exception as e:
            return_result("unknown", "Error saving stat file %s: %s" % (filename, e))

    # TODO - Fill in all check defaults
    def get_default(self, key, level):
        """Return the built-in `level` ('warning'/'critical') threshold for `key`.

        Exits UNKNOWN when the check has no built-in default.
        """
        # NOTE(review): check_connections compares a percentage against these
        # values, so 15000/19000 look like absolute counts - confirm intent.
        defaults = {
            'check_connections': {'warning': 15000, 'critical': 19000},
            'check_connect': {'warning': 50, 'critical': 100},
            'check_queues': {'warning': 30, 'critical': 100},
            'check_lock_pct': {'warning': 30, 'critical': 50},
            'check_repl_lag': {'warning': 200, 'critical': 500},
            # 'check_flushing': {'warning':XX, 'critical': XX},
            'check_total_indexes': {'warning': 100, 'critical': 300},
            'check_cannary_test': {'warning': 30, 'critical': 50},
            'check_oplog': {'warning': 36, 'critical': 24},
            'check_index_ratio': {'warning': .9, 'critical': .8},
        }
        try:
            return defaults[key][level]
        except KeyError:
            return_result("unknown", "Missing defaults found for %s please use -W and -C" % key)

    # Not yet implemented
    # def compute_deltas(self):
    #     deltas = []
    #     for item in DeepDiff(self.last_status, self.current_status)['values_changed']:
    #         name = item.split(":")[0].split("root")[1].replace("['", "").replace("']", ".")[:-1]
    #         if 'time' not in item.lower():
    #             values = item.split(":")[1]
    #             print(values)
    #             old, new = values.split("===>")
    #             print("%s: %s - %s = %s" % (name, new, old, float(new)-float(old)))
    #             deltas[name] = float(new) - float(old)
    #     self.delta_data = deltas
    #     return True

    def file_age(self, filename):
        """Return seconds since `filename`'s ctime, or 999999 if it is missing."""
        try:
            age = time.time() - os.stat(filename)[stat.ST_CTIME]
        except OSError:
            age = 999999
        return age

    # TODO - Add meat to this if needed, here for future planning
    def sanatize(self, status_output):
        """Return `status_output` unchanged (placeholder for future cleanup)."""
        return status_output

    def connect(self):
        """Open the MongoDB connection and record timing and server version.

        On success sets self.connection, self.connection_time (milliseconds,
        matching the unit reported by check_connect) and self.mongo_version
        (tuple of ints).  A connection error is remembered in
        self.pyMongoError and leaves self.connection as None.  Exits directly
        for bad credentials, for an arbiter, or when ping cannot be run.
        """
        start_time = time.time()
        try:
            # ssl connection for pymongo > 2.3
            client_options = {'ssl': self.ssl, 'serverSelectionTimeoutMS': 2500}
            if self.replicaset is not None:
                client_options['replicaSet'] = self.replicaset
            con = pymongo.MongoClient(self.host, self.port, **client_options)
            if (self.user and self.passwd) and not con['admin'].authenticate(self.user, self.passwd):
                return_result("critical", "Username and password incorrect")
        except Exception as e:
            if isinstance(e, pymongo.errors.AutoReconnect) and " is an arbiter" in str(e):
                # We got a pymongo AutoReconnect exception that tells us we connected to an Arbiter Server
                # This means: Arbiter is reachable and can answer requests/votes - this is all we need to know from an arbiter
                return_result("ok", "State: 7 (Arbiter)")
            con = None
            self.pyMongoError = str(e)
        if con is not None:
            try:
                con['admin'].command(son.SON([('ping', 1)]))
            except Exception as e:
                return_result("unknown", "Unable to run commands, possible auth issue: %s" % e)
            self.connection_time = round((time.time() - start_time) * 1000, 2)
            self.mongo_version = _parse_version(con.server_info()['version'])
            self.connection = con

    def check_levels(self, check_result, warning_level, critical_level, message):
        """Classify `check_result` against the two thresholds.

        All three values are converted to float first, because thresholds
        given on the command line arrive as strings.  When the warning
        threshold is above the critical one the metric is treated as
        "lower is worse" (for example the oplog window); otherwise higher is
        worse.  A value equal to a threshold triggers that threshold.
        Returns (result_type, message).
        """
        try:
            value = float(check_result)
            warning = float(warning_level)
            critical = float(critical_level)
        except (TypeError, ValueError):
            return "unknown", "Unable to parse %s into a result" % check_result

        if warning > critical:
            # Inverted thresholds: smaller readings are the dangerous ones.
            if value <= critical:
                return "critical", message
            if value <= warning:
                return "warning", message
            return "ok", message

        if value >= critical:
            return "critical", message
        if value >= warning:
            return "warning", message
        return "ok", message

    def check_connect(self, args, warning_level, critical_level):
        """Report how long connecting and pinging the server took (ms)."""
        warning_level = warning_level or self.get_default('check_connect', 'warning')
        critical_level = critical_level or self.get_default('check_connect', 'critical')
        con_time = self.connection_time
        if con_time is None:
            return "critical", self.pyMongoError or "No connection Found, did connect fail?"
        message = "Connection time %.2f ms" % con_time
        return self.check_levels(con_time, warning_level, critical_level, message)

    def check_connections(self, args, warning_level, critical_level):
        """Report the percentage of available connections currently in use."""
        warning_level = warning_level or self.get_default('check_connections', 'warning')
        critical_level = critical_level or self.get_default('check_connections', 'critical')
        connections = self.current_status['connections']
        current = connections['current']
        total = connections['available'] + current
        # float() avoids Python 2 integer division truncating the ratio to 0
        used_percent = int(float(current) / total * 100) if total else 0
        message = "%i%% connections used ( %d of %d )" % (used_percent, current, total)
        return self.check_levels(used_percent, warning_level, critical_level, message)

    def check_lock_pct(self, args, warning_level, critical_level):
        """Report the global lock percentage between the last two snapshots."""
        warning_level = warning_level or self.get_default('check_lock_pct', 'warning')
        critical_level = critical_level or self.get_default('check_lock_pct', 'critical')
        if self.mongo_version is not None and self.mongo_version >= (2, 7, 0):
            return "ok", "Mongo 3.0 and above do not have lock %"
        try:
            current_lock = self.current_status['globalLock']
            previous_lock = self.last_status['globalLock']
            lockTime = current_lock['lockTime'] - previous_lock['lockTime']
            totalTime = current_lock['totalTime'] - previous_lock['totalTime']
        except KeyError as e:
            return "unknown", "Missing status data for lock percentage: %s" % e
        if not totalTime:
            return "unknown", "No time elapsed between status samples"
        lock_percent = int(float(lockTime) / totalTime * 100)
        message = "%i%% locking found (over 100%% is possible)" % (lock_percent)
        return self.check_levels(lock_percent, warning_level, critical_level, message)

    def check_flushing(self, args, warning_level, critical_level):
        """Report the last (or, if args.average is set, average) flush time."""
        warning_level = warning_level or self.get_default('check_flushing', 'warning')
        critical_level = critical_level or self.get_default('check_flushing', 'critical')
        flushData = self.current_status['backgroundFlushing']
        # No command-line option defines `average`, so read it defensively.
        if getattr(args, 'average', False):
            flush_time = flushData['average_ms']
            stat_type = "Average"
        else:
            flush_time = flushData['last_ms']
            stat_type = "Last"

        message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
        return self.check_levels(flush_time, warning_level, critical_level, message)

    def check_index_ratio(self, args, warning_level, critical_level):
        """Report the index miss ratio from serverStatus' indexCounters."""
        warning_level = warning_level or self.get_default('check_index_ratio', 'warning')
        critical_level = critical_level or self.get_default('check_index_ratio', 'critical')

        indexCounters = self.current_status.get('indexCounters')
        if indexCounters is None:
            return "unknown", "indexCounters not reported by this server"
        if 'note' in indexCounters:
            # The server reports a note instead of counters; nothing to grade.
            return "ok", "not supported defaulting to 1.0 ratio"
        if self.mongo_version is not None and self.mongo_version >= (2, 4, 0):
            ratio = indexCounters['missRatio']
        else:
            ratio = indexCounters['btree']['missRatio']
        message = "Miss Ratio: %.2f" % ratio
        # A higher miss ratio is worse, so grade against ascending thresholds
        # regardless of the order in which they were supplied.
        try:
            low, high = sorted((float(warning_level), float(critical_level)))
        except (TypeError, ValueError):
            return "unknown", "Unable to parse %s into a result" % ratio
        return self.check_levels(ratio, low, high, message)

    def check_have_primary(self, args, warning_level, critical_level):
        """Report whether any replica set member is in state 1 (primary)."""
        replset_status = self.connection['admin'].command("replSetGetStatus")
        for member in replset_status['members']:
            if member['state'] == 1:
                return "ok", "Cluster has primary"
        return "critical", "Cluster has no primary!"

    def check_total_indexes(self, args, warning_level, critical_level):
        """Count system.indexes entries over all databases but admin/local."""
        warning_level = warning_level or self.get_default('check_total_indexes', 'warning')
        critical_level = critical_level or self.get_default('check_total_indexes', 'critical')
        index_count = 0
        database_count = 0
        for database in self.connection.database_names():
            if database not in ["admin", "local"]:
                database_count += 1
                index_count += self.connection[database]['system.indexes'].count()
        message = "Found %d indexes in %d databases" % (index_count, database_count)
        return self.check_levels(index_count, warning_level, critical_level, message)

    def check_queues(self, args, warning_level, critical_level):
        """Report the number of queued readers plus writers."""
        warning_level = warning_level or self.get_default('check_queues', 'warning')
        critical_level = critical_level or self.get_default('check_queues', 'critical')
        currentQueue = self.current_status['globalLock']['currentQueue']
        readers = currentQueue['readers']
        writers = currentQueue['writers']
        total = readers + writers
        message = "Queue Sizes: read (%d) write(%d) total (%d)" % (readers, writers, total)
        return self.check_levels(total, warning_level, critical_level, message)

    def check_oplog(self, args, warning_level, critical_level):
        """Report the time span, in hours, covered by local.oplog.rs."""
        warning_level = warning_level or self.get_default('check_oplog', 'warning')
        critical_level = critical_level or self.get_default('check_oplog', 'critical')
        if 'local' not in self.connection.database_names() or 'oplog.rs' not in self.connection['local'].collection_names():
            return "critical", "We do not seem to be in a replset!"
        oplog = self.connection['local']['oplog.rs']
        first_ts = oplog.find().sort("$natural", pymongo.ASCENDING).limit(1)[0]['ts']
        last_ts = oplog.find().sort("$natural", pymongo.DESCENDING).limit(1)[0]['ts']
        oplog_range = (last_ts.as_datetime() - first_ts.as_datetime())
        oplog_range_hours = oplog_range.total_seconds() / 60 / 60
        message = "Oplog Time is %d hours" % (oplog_range_hours)
        return self.check_levels(int(oplog_range_hours), warning_level, critical_level, message)

    def check_election(self, args, warning_level, critical_level):
        """Report the replica set's primary, failing when there is none."""
        replset_status = self.connection['admin'].command("replSetGetStatus")
        primaries = [member['name'] for member in replset_status['members'] if member['stateStr'] == "PRIMARY"]
        if not primaries:
            return "critical", "No PRIMARY found in replset status"
        # NOTE(review): both names come from the same replSetGetStatus
        # snapshot, so they always match; detecting an election needs the
        # previous primary to be persisted between runs.
        last_primary = primaries[-1]
        current_primary = primaries[-1]
        message = "Old PRI: %s New PRI: %s" % (last_primary, current_primary)
        if current_primary == last_primary:
            return "ok", message
        else:
            return "critical", message

    def is_balanced(self):
        """Return True when every namespace's chunks are spread evenly.

        For each namespace, each shard's chunk count must lie strictly within
        `threshold` of the per-shard average, where the threshold depends on
        the total number of chunks in the cluster.
        """
        chunks = {}
        config = self.connection["config"]

        # Loop through each of the chunks, tallying things up
        for chunk in config["chunks"].find():
            namespace = chunk['ns']
            shard = chunk['shard']
            if namespace not in chunks:
                chunks[namespace] = {'shards': {}, 'total': 0}
            if shard not in chunks[namespace]['shards']:
                chunks[namespace]['shards'][shard] = 0
            chunks[namespace]['shards'][shard] += 1
            chunks[namespace]['total'] += 1

        shardsCount = config["shards"].count()
        chunksCount = config["chunks"].count()

        # Different migration thresholds depending on cluster size
        # http://docs.mongodb.org/manual/core/sharding-internals/#sharding-migration-thresholds
        if chunksCount < 20:
            threshold = 2
        elif chunksCount < 80:
            threshold = 4
        else:
            threshold = 8

        # Default to balanced state, any failure will then mark it as False forevermore
        isBalanced = True
        # Loop through each ns and determine if it's balanced or not
        for ns in chunks:
            balanced = float(chunks[ns]['total']) / shardsCount
            for shard in chunks[ns]['shards']:
                # Compare the shard's chunk count, not its name.
                shard_chunks = chunks[ns]['shards'][shard]
                if not (balanced - threshold < shard_chunks < balanced + threshold):
                    isBalanced = False

        return isBalanced

    def check_balance(self, args, warning_level, critical_level):
        """Report whether is_balanced() considers the cluster balanced."""
        if self.is_balanced() is True:
            return "ok", "Shards are balanced by chunk counts"
        else:
            return "critical", "Shards are not balanced by chunk and need review"

    def check_cannary_test(self, args, warning_level, critical_level):
        """Time a find_one() of self.query on self.database.self.collection."""
        warning_level = warning_level or self.get_default('check_cannary_test', 'warning')
        critical_level = critical_level or self.get_default('check_cannary_test', 'critical')
        # this does not check for a timeout, we assume NRPE or Nagios will alert on that timeout.
        try:
            query = self.query
            if not isinstance(query, dict):
                # The --query option is a JSON document given as text.
                query = json.loads(query)
            start = time.time()
            self.connection[self.database][self.collection].find_one(query)
            time_range = time.time() - start
            message = "Collection %s.%s query took: %d s" % (self.database, self.collection, time_range)
            return self.check_levels(time_range, warning_level, critical_level, message)
        except Exception as e:
            message = "Collection %s.%s query FAILED: %s" % (self.database, self.collection, e)
            return "critical", message

    def check_repl_lag(self, args, warning_level, critical_level):
        """Report how far, in hours, this member is behind the primary.

        Without a primary the member with the newest optimeDate is used as
        the reference.
        """
        warning_level = warning_level or self.get_default('check_repl_lag', 'warning')
        critical_level = critical_level or self.get_default('check_repl_lag', 'critical')

        # make a write incase the client is not writing, but us an update to avoid wasting space
        # The value changes on every run so the update is never a no-op, and
        # upsert creates the document on first use.  The write is best effort:
        # it cannot succeed when connected directly to a secondary.
        try:
            self.connection['test']['lag_check'].update({"_id": 1}, {"_id": 1, "x": int(time.time())}, upsert=True)
        except pymongo.errors.PyMongoError:
            pass
        # get a fresh status for the replset
        try:
            replset_status = self.connection['admin'].command("replSetGetStatus")
        except Exception as e:
            return "critical", "Are your running with --replset? - %s" % (e)

        primary = None
        hostOptimeDate = None
        for member in replset_status['members']:
            if member['stateStr'] == "PRIMARY":
                primary = member
            if 'self' in member and member['self'] is True:
                hostOptimeDate = member['optimeDate']

        if hostOptimeDate is None:
            return "unknown", "Unable to find this host in the replset status"

        if primary is not None:
            highest_optimeDate = primary['optimeDate']
            highest_name = primary['name']
        else:
            # find the most current secondary as there is not primary
            highest_optimeDate = None
            highest_name = None
            for member in replset_status['members']:
                optimeDate = member.get('optimeDate')
                if optimeDate is None:
                    continue
                if highest_optimeDate is None or optimeDate > highest_optimeDate:
                    highest_optimeDate = optimeDate
                    highest_name = member['name']

        # total_seconds() keeps whole days, which .seconds silently drops
        rep_lag_seconds = max(0.0, (highest_optimeDate - hostOptimeDate).total_seconds())
        rep_lag_hours = round(rep_lag_seconds / 3600.0, 4)
        message = "Lagging %s by %.4f hours" % (highest_name, rep_lag_hours)
        return self.check_levels(rep_lag_hours, warning_level, critical_level, message)


#
# main app
#
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

# ############################################################################
# Documentation
# ############################################################################
"""
=pod

=head1 NAME

pmp-check-mongo.py - MongoDB Nagios check script.

=head1 SYNOPSIS

  Usage: pmp-check-mongo.py [options]

  Options:
    -h, --help            show this help message and exit
    -H HOST, --host=HOST  The hostname you want to connect to
    -P PORT, --port=PORT  The port mongodb is running on
    -u USER, --user=USER  The username you want to login as
    -p PASSWD, --password=PASSWD
                          The password you want to use for that user
    -W WARNING, --warning=WARNING
                          The warning threshold you want to set
    -C CRITICAL, --critical=CRITICAL
                          The critical threshold you want to set
    -A ACTION, --action=ACTION
                          The action you want to take. Valid choices are
                          (check_connections, check_election, check_lock_pct,
                          check_repl_lag, check_flushing, check_total_indexes,
                          check_balance, check_queues, check_cannary_test,
                          check_have_primary, check_oplog, check_index_ratio,
                          check_connect) Default: check_connect
    -s SSL, --ssl=SSL     Connect using SSL
    -r REPLICASET, --replicaset=REPLICASET
                          Connect to replicaset
    -c COLLECTION, --collection=COLLECTION
                          Specify the collection in check_cannary_test
    -d DATABASE, --database=DATABASE
                          Specify the database in check_cannary_test
    -q QUERY, --query=QUERY
                          Specify the query in check_cannary_test
    --statusfile=STATUS_FILENAME
                          File to current store state data in for delta checks
    --backup-statusfile=STATUS_FILENAME_BACKUP
                          File to previous store state data in for delta checks
    --max-stale=MAX_STALE
                          Age of status file to make new checks (seconds)

=head1 COPYRIGHT, LICENSE, AND WARRANTY

This program is copyright 2014 Percona LLC and/or its affiliates.
Feedback and improvements are welcome.

THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, version 2.  You should have received a copy of the GNU General
Public License along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.

=head1 VERSION

$PROJECT_NAME$ pmp-check-mongo.py $VERSION$

=cut

"""