Revert "Create NRPE check to verify ceph daemons versions"

This reverts commit dfbda68e1add1e8a31ef0e14c043b584532fcd03.

Reason for revert:

The Ceph version check does not take into account which user executes
the NRPE check. It fails to find a keyring when running the command
because the check is run by a non-root user.

$ juju run-action --wait nrpe/0 run-nrpe-check name=check-ceph-daemons-versions
unit-nrpe-0:
  UnitId: nrpe/0
  id: "20"
  results:
    Stderr: |
      2023-02-01T03:03:09.556+0000 7f4677361700 -1 auth: unable to find
      a keyring on
      /etc/ceph/ceph.client.admin.keyring,/etc/ceph/ceph.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin:
      (2) No such file or directory
      2023-02-01T03:03:09.556+0000 7f4677361700 -1
      AuthRegistry(0x7f467005f540) no keyring found at
      /etc/ceph/ceph.client.admin.keyring,/etc/ceph/ceph.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin,
      disabling cephx
      2023-02-01T03:03:09.556+0000 7f4677361700 -1 auth: unable to find
      a keyring on
      /etc/ceph/ceph.client.admin.keyring,/etc/ceph/ceph.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin:
      (2) No such file or directory
      2023-02-01T03:03:09.556+0000 7f4677361700 -1
      AuthRegistry(0x7f4670064d88) no keyring found at
      /etc/ceph/ceph.client.admin.keyring,/etc/ceph/ceph.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin,
      disabling cephx
      2023-02-01T03:03:09.560+0000 7f4677361700 -1 auth: unable to find
      a keyring on
      /etc/ceph/ceph.client.admin.keyring,/etc/ceph/ceph.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin:
      (2) No such file or directory
      2023-02-01T03:03:09.560+0000 7f4677361700 -1
      AuthRegistry(0x7f4677360000) no keyring found at
      /etc/ceph/ceph.client.admin.keyring,/etc/ceph/ceph.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin,
      disabling cephx
      [errno 2] RADOS object not found (error connecting to the cluster)
    check-output: 'UNKNOWN: could not determine OSDs versions, error: Command ''[''ceph'',
      ''versions'']'' returned non-zero exit status 1.'
  status: completed
  timing:
    completed: 2023-02-01 03:03:10 +0000 UTC
    enqueued: 2023-02-01 03:03:09 +0000 UTC
    started: 2023-02-01 03:03:09 +0000 UTC

Related-Bug: #1943628
Change-Id: I84b306e84661e6664e8a69fa93dfdb02fa4f1e7e
This commit is contained in:
Nobuto Murata 2023-02-01 03:11:22 +00:00
parent 87600a9c31
commit c9389a8cd0
11 changed files with 0 additions and 357 deletions

View File

@ -4,8 +4,6 @@ resume-health:
description: "Resume ceph health operations across the entire ceph cluster"
get-health:
description: "Output the current cluster health reported by `ceph health`"
get-versions-report:
description: "Outputs running daemon versions for all cluster members"
create-cache-tier:
description: "Create a new cache tier"
params:

View File

@ -23,11 +23,6 @@ from charmhelpers.contrib.storage.linux.ceph import pool_set, \
set_pool_quota, snapshot_pool, remove_pool_snapshot
class CephReportError(Exception):
    """Indicate a critical error while producing a Ceph report."""
def list_pools():
"""Return a list of all Ceph pools."""
try:
@ -37,52 +32,6 @@ def list_pools():
action_fail(str(e))
def _collect_daemon_versions(report, daemon_type, hosts):
    """Append the version of every ``daemon_type`` daemon to ``report``.

    :param report: dict mapping a host name to a list of version strings;
        updated in place
    :param daemon_type: daemon kind used to build the ``ceph tell`` target,
        e.g. ``'osd'`` or ``'mon'``
    :param hosts: dict mapping a host name to the daemon ids it runs
    :raises: CephReportError if a daemon version cannot be queried
    """
    for host, daemons in hosts.items():
        versions = report.setdefault(host, [])
        for daemon in daemons:
            target = "{}.{}".format(daemon_type, daemon)
            try:
                output = check_output(
                    ['ceph', 'tell', target, 'version']).decode('UTF-8')
            except CalledProcessError as e:
                # Every failure path reports the underlying command error
                # before raising, whatever the daemon type.
                action_fail(str(e))
                raise CephReportError(
                    "Getting {} version fail".format(target)) from e
            versions.append(json.loads(output)['version'])


def get_versions_report():
    """
    Return a mapping of hosts and their related ceph daemon versions.

    The node list comes from ``ceph node ls``; each OSD and then each MON
    listed there is asked for its version with ``ceph tell``.

    :returns: JSON document (indented by 4 spaces) mapping each host name
        to the list of versions of the daemons it runs
    :raises: CephReportError if any of the ceph commands fails
    """
    try:
        output = check_output(['ceph', 'node', 'ls']).decode('UTF-8')
    except CalledProcessError as e:
        action_fail(str(e))
        raise CephReportError("Getting nodes list fail") from e
    nodes_list = json.loads(output)

    report = {}
    # OSD hosts are reported first, then MON hosts.
    for daemon_type in ('osd', 'mon'):
        _collect_daemon_versions(report, daemon_type,
                                 nodes_list[daemon_type])
    return json.dumps(report, indent=4)
def pool_get():
"""
Returns a key from a pool using 'ceph osd pool get'.

View File

@ -1 +0,0 @@
get_versions_report.py

View File

@ -1,26 +0,0 @@
#!/usr/bin/env python3
#
# Copyright 2022 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ceph_ops import get_versions_report, CephReportError
from charmhelpers.core.hookenv import log, action_set, action_fail
if __name__ == '__main__':
    # Publish the versions report as the action result, or mark the
    # action as failed when the report cannot be built.
    try:
        versions_report = get_versions_report()
        action_set({'message': versions_report})
    except CephReportError as err:
        log(err)
        failure = "get versions report failed with message: {}".format(
            str(err))
        action_fail(failure)

View File

@ -86,32 +86,6 @@ def get_ceph_version():
return out_version
def get_daemons_versions():
    """
    Use the CLI to collect the versions of all daemons in the cluster.

    Runs ``ceph versions`` and parses the keys of the ``overall`` section
    of its JSON output.

    :returns: set containing tuples of integers, one per distinct version
              encountered in the cluster, e.g. ``{(16, 2, 7)}``
    :raises: UnknownError if the command fails or its output cannot be
             parsed
    """
    # Imported locally so the function does not rely on a module-level
    # import of ``re``.
    import re

    try:
        raw_output = subprocess.check_output(
            ['ceph', 'versions']).decode('UTF-8')
    except subprocess.CalledProcessError as e:
        raise UnknownError(
            "UNKNOWN: could not determine OSDs versions, error: {}".format(e))

    # Report unparsable output as UNKNOWN instead of letting a raw
    # ValueError/KeyError escape from the check.
    try:
        overall = json.loads(raw_output)['overall']
    except (ValueError, KeyError, TypeError) as e:
        raise UnknownError(
            "UNKNOWN: could not parse ceph versions output, "
            "error: {!r}".format(e))

    # The third space-separated field of each key carries the dotted
    # version number; only its leading digits-and-dots part is used so a
    # suffix such as "-123-gabcdef" does not break the integer conversion.
    leading_version = re.compile(r'\d+(?:\.\d+)*')
    daemons_versions = set()
    for description in overall:
        fields = description.split(' ')
        match = leading_version.match(fields[2]) if len(fields) > 2 else None
        if match is None:
            raise UnknownError(
                "UNKNOWN: unexpected ceph version string: "
                "{!r}".format(description))
        daemons_versions.add(
            tuple(int(part) for part in match.group(0).split('.')))
    return daemons_versions
def get_status_and_messages(status_data):
"""
Used to get general status of a Ceph cluster as well as a list of
@ -161,50 +135,6 @@ def check_ceph_status(args):
"""
status_critical = False
# if it is just --check_daemons_versions_consistency,
# deal with it and ignore overall health
if args.check_daemons_versions_consistency:
daemons_versions = get_daemons_versions()
# we check that the osds have same versions
num_of_versions = len(daemons_versions)
if num_of_versions == 1:
message_ok = "OK: All versions alligned"
return message_ok
else:
# version diverged
# we check if major release are the same
# by parsing version number in the daemon_version set
# and keeping major version number or coverting the minor
# version number if major version is 0
num_of_releases = set(map(lambda x: x[0], daemons_versions))
if len(num_of_releases) == 1:
msg = 'WARNING: Components minor versions diverged.'
'Run get-versions-report to know more'
raise WarnError(msg)
else:
# Releases diverged
major, _minor, _patch = get_ceph_version()
release_versions_diff = list(map(lambda x: major - x,
num_of_releases))
if max(release_versions_diff) >= 2:
msg = "CRITICAL: A component is " \
"{} version behind osd leader" \
". Run get-versions-report to know more".format(
max(release_versions_diff))
raise CriticalError(msg)
if min(release_versions_diff) <= -1:
msg = "CRITICAL: A component is " \
"{} version ahead osd leader" \
". Run get-versions-report to know more".format(
abs(min(release_versions_diff)))
raise CriticalError(msg)
if max(release_versions_diff) == 1:
msg = "WARNING: A component is " \
"{} version behind osd leader" \
". Run get-versions-report to know more".format(
max(release_versions_diff))
raise WarnError(msg)
if args.status_file:
check_file_freshness(args.status_file)
with open(args.status_file) as f:
@ -357,11 +287,6 @@ def parse_args(args):
dest='check_num_osds', default=False,
action='store_true',
help="Check whether all OSDs are up and in")
parser.add_argument('--check_daemons_versions_consistency',
dest='check_daemons_versions_consistency',
default=False,
action='store_true',
help="Check all OSDs versions")
return parser.parse_args(args)

View File

@ -1185,14 +1185,6 @@ def update_nrpe_config():
description='Check whether all OSDs are up and in',
check_cmd=check_cmd
)
if is_leader():
check_cmd = 'check_ceph_status.py -f {}' \
' --check_daemons_versions'.format(STATUS_FILE)
nrpe_setup.add_check(
shortname='ceph_daemons_versions',
description='Check wheter all ceph daemons versions are alligned',
check_cmd=check_cmd
)
nrpe_setup.write()

View File

@ -1,35 +0,0 @@
{
"mon": {
"juju-c8b0a2-3-lxd-0": [
"juju-c8b0a2-3-lxd-0"
],
"juju-c8b0a2-4-lxd-0": [
"juju-c8b0a2-4-lxd-0"
],
"juju-c8b0a2-5-lxd-0": [
"juju-c8b0a2-5-lxd-0"
]
},
"osd": {
"aware-bee": [
1
],
"grand-ape": [
0
],
"lucky-muskox": [
2
]
},
"mgr": {
"juju-c8b0a2-3-lxd-0": [
"juju-c8b0a2-3-lxd-0"
],
"juju-c8b0a2-4-lxd-0": [
"juju-c8b0a2-4-lxd-0"
],
"juju-c8b0a2-5-lxd-0": [
"juju-c8b0a2-5-lxd-0"
]
}
}

View File

@ -1,15 +0,0 @@
{
"mon": {
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3
},
"mgr": {
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3
},
"osd": {
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 2
},
"mds": {},
"overall": {
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 8
}
}

View File

@ -1,19 +0,0 @@
{
"mon": {
"ceph version 15.2.16 (d46a73d6d0a67a79558054a3a5a72cb561724974) octopus (stable)": 1,
"ceph version 17.2.0 (43e2e60a7559d3f46c9d53f1ca875fd499a1e35e) quincy (stable)": 2
},
"mgr": {
"ceph version 15.2.16 (d46a73d6d0a67a79558054a3a5a72cb561724974) octopus (stable)": 3
},
"osd": {
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3,
"ceph version 17.2.0 (43e2e60a7559d3f46c9d53f1ca875fd499a1e35e) quincy (stable)": 2
},
"mds": {},
"overall": {
"ceph version 15.2.16 (d46a73d6d0a67a79558054a3a5a72cb561724974) octopus (stable)": 4,
"ceph version 16.2.7 (dd0603118f56ab514f133c8d2e3adfc983942503) pacific (stable)": 3,
"ceph version 17.2.0 (43e2e60a7559d3f46c9d53f1ca875fd499a1e35e) quincy (stable)": 4
}
}

View File

@ -13,7 +13,6 @@
import json
import sys
import unittest.mock as mock
from subprocess import CalledProcessError
from test_utils import CharmTestCase
@ -49,45 +48,6 @@ class OpsTestCase(CharmTestCase):
"action_fail",
"open"])
def test_get_version_report_ok(self):
    """get_versions_report() lists one version per OSD and MON host."""
    with open('unit_tests/ceph_ls_node.json') as f:
        node_ls_output = f.read().encode('UTF-8')
    tell_version_output = json.dumps({
        "version": "16.2.7",
        "release": "pacific",
        "release_type": "stable",
    }).encode('UTF-8')

    def _call_rslt():
        # The first call returns the node list; every later call returns
        # a daemon version.
        yield node_ls_output
        while True:
            yield tell_version_output

    self.check_output.side_effect = _call_rslt()
    result = actions.get_versions_report()

    # Build the expected document from data rather than a hand-indented
    # literal, so it always matches the 4-space JSON layout. OSD hosts
    # come first, then MON hosts.
    hosts = [
        "aware-bee",
        "grand-ape",
        "lucky-muskox",
        "juju-c8b0a2-3-lxd-0",
        "juju-c8b0a2-4-lxd-0",
        "juju-c8b0a2-5-lxd-0",
    ]
    expected = json.dumps({host: ["16.2.7"] for host in hosts}, indent=4)
    self.assertEqual(expected, result)
def test_get_version_report_fail(self):
    """A failing check_output call surfaces as CephReportError."""
    self.check_output.side_effect = CalledProcessError(1, 'ceph node ls')
    with self.assertRaises(actions.CephReportError):
        actions.get_versions_report()
@mock.patch('socket.gethostname')
def test_get_quorum_status(self, mock_hostname):
mock_hostname.return_value = 'mockhost'

View File

@ -17,7 +17,6 @@ import os
import sys
from unittest.mock import patch
from subprocess import CalledProcessError
# import the module we want to test
os.sys.path.insert(1, os.path.join(sys.path[0], 'files/nagios'))
@ -26,90 +25,6 @@ import check_ceph_status
@patch('subprocess.check_output')
class NagiosTestCase(unittest.TestCase):
def test_get_daemons_versions_alligned(self, mock_subprocess):
    """The aligned fixture yields a single (16, 2, 7) version tuple."""
    with open('unit_tests/ceph_versions_alligned.json', 'rb') as fixture:
        mock_subprocess.return_value = fixture.read()
    found = check_ceph_status.get_daemons_versions()
    self.assertEqual(found, {(16, 2, 7)})
def test_get_daemons_versions_diverged(self, mock_subprocess):
    """The diverged fixture yields one tuple per distinct version."""
    with open('unit_tests/ceph_versions_diverged.json', 'rb') as fixture:
        mock_subprocess.return_value = fixture.read()
    found = check_ceph_status.get_daemons_versions()
    expected = {(16, 2, 7), (17, 2, 0), (15, 2, 16)}
    self.assertEqual(found, expected)
def test_get_daemons_versions_exeption(self, mock_subprocess):
    """A failing command is reported as UnknownError."""
    mock_subprocess.side_effect = CalledProcessError(1, 'ceph versions')
    with self.assertRaises(check_ceph_status.UnknownError):
        check_ceph_status.get_daemons_versions()
# Versions aligned
@patch('check_ceph_status.get_daemons_versions')
def test_versions_alligned(self, mock_daemons_versions, mock_subprocess):
    """A single daemon version makes the check return the OK message."""
    mock_subprocess.return_value = (
        b'ceph version 16.2.7 '
        b'(dd0603118f56ab514f133c8d2e3adfc983942503)')
    mock_daemons_versions.return_value = {(16, 2, 7)}
    cli_args = ['--check_daemons_versions_consistency']
    args = check_ceph_status.parse_args(cli_args)
    check_output = check_ceph_status.check_ceph_status(args)
    self.assertRegex(check_output, r"^OK: All versions alligned$")
# Minor versions diverged
@patch('check_ceph_status.get_daemons_versions')
def test_min_versions_diverged(self, mock_daemons_versions,
                               mock_subprocess):
    """Same major release but different minors raises WarnError."""
    mock_subprocess.return_value = (
        b'ceph version 16.2.7 '
        b'(dd0603118f56ab514f133c8d2e3adfc983942503)')
    mock_daemons_versions.return_value = {(16, 2, 7), (16, 1, 7)}
    cli_args = ['--check_daemons_versions_consistency']
    args = check_ceph_status.parse_args(cli_args)
    with self.assertRaises(check_ceph_status.WarnError):
        check_ceph_status.check_ceph_status(args)
# One major version ahead
@patch('check_ceph_status.get_daemons_versions')
def test_one_version_ahead(self, mock_daemons_versions, mock_subprocess):
    """A 17.x daemon with a mocked 16.2.7 version raises CriticalError."""
    mock_subprocess.return_value = (
        b'ceph version 16.2.7 '
        b'(dd0603118f56ab514f133c8d2e3adfc983942503)')
    mock_daemons_versions.return_value = {(16, 2, 7), (17, 2, 0)}
    cli_args = ['--check_daemons_versions_consistency']
    args = check_ceph_status.parse_args(cli_args)
    with self.assertRaises(check_ceph_status.CriticalError):
        check_ceph_status.check_ceph_status(args)
# Two major versions ahead
@patch('check_ceph_status.get_daemons_versions')
def test_two_version_ahead(self, mock_daemons_versions, mock_subprocess):
    """A 17.x daemon with a mocked 15.2.16 version raises CriticalError."""
    mock_subprocess.return_value = (
        b'ceph version 15.2.16 '
        b'(d46a73d6d0a67a79558054a3a5a72cb561724974)')
    mock_daemons_versions.return_value = {(15, 2, 16), (17, 2, 0)}
    cli_args = ['--check_daemons_versions_consistency']
    args = check_ceph_status.parse_args(cli_args)
    with self.assertRaises(check_ceph_status.CriticalError):
        check_ceph_status.check_ceph_status(args)
# One major version behind
@patch('check_ceph_status.get_daemons_versions')
def test_version_behind(self, mock_daemons_versions, mock_subprocess):
    """A 15.x daemon with a mocked 16.2.7 version raises WarnError."""
    mock_subprocess.return_value = (
        b'ceph version 16.2.7 '
        b'(dd0603118f56ab514f133c8d2e3adfc983942503)')
    mock_daemons_versions.return_value = {(15, 2, 16), (16, 2, 7)}
    cli_args = ['--check_daemons_versions_consistency']
    args = check_ceph_status.parse_args(cli_args)
    with self.assertRaises(check_ceph_status.WarnError):
        check_ceph_status.check_ceph_status(args)
# Two major versions behind
@patch('check_ceph_status.get_daemons_versions')
def test_two_version_behind(self, mock_daemons_versions, mock_subprocess):
    """A 15.x daemon with a mocked 17.2.0 version raises CriticalError."""
    mock_subprocess.return_value = (
        b'ceph version 17.2.0 '
        b'(43e2e60a7559d3f46c9d53f1ca875fd499a1e35e)')
    mock_daemons_versions.return_value = {(15, 2, 16), (17, 2, 0)}
    cli_args = ['--check_daemons_versions_consistency']
    args = check_ceph_status.parse_args(cli_args)
    with self.assertRaises(check_ceph_status.CriticalError):
        check_ceph_status.check_ceph_status(args)
def test_get_ceph_version(self, mock_subprocess):
mock_subprocess.return_value = 'ceph version 10.2.9 ' \