From 393cda3e8b9882e69a8f3d17418a47ff2ed00140 Mon Sep 17 00:00:00 2001 From: Ovidiu Poncea Date: Wed, 25 Apr 2018 01:01:01 +0300 Subject: [PATCH] Fix periodic thread that monitors Ceph A timeout between ceph-manager and sysinv is causing ceph-manager to stop responding. When ceph-manager detects that the 'require_jewel_osds' flag needs to be set, it queries sysinv, which for whatever reason (most likely dealing with some final operations) fails to respond in a reasonable amount of time. This causes an exception in ceph-manager, which breaks the execution of one of the periodic threads. Change-Id: If49f5ffbce4aeac3d50d52f526d1ce905be3cecb Signed-off-by: Kristine Bujold Signed-off-by: Scott Little --- .../ceph-manager/ceph_manager/monitor.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py b/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py index 941e5fc0..51308240 100644 --- a/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py +++ b/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py @@ -112,7 +112,7 @@ class HandleUpgradesMixin(object): "Getting software upgrade status failed " "with: %s. 
Skip auto-heal attempt " "(will retry on next ceph status poll).") % str(ex)) - return + return health state = upgrade.get('state') # surpress require_jewel_osds in case upgrade is # in progress but not completed or aborting @@ -181,15 +181,23 @@ class Monitor(HandleUpgradesMixin): def run(self): # Wait until Ceph cluster is up and we can get the fsid while True: - self.ceph_get_fsid() + try: + self.ceph_get_fsid() + except Exception: + LOG.exception("Error getting fsid, " + "will retry in %ss" % constants.CEPH_HEALTH_CHECK_INTERVAL) if self.service.entity_instance_id: break time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL) # Start monitoring ceph status while True: - self.ceph_poll_status() - self.ceph_poll_quotas() + try: + self.ceph_poll_status() + self.ceph_poll_quotas() + except Exception: + LOG.exception("Error running periodic monitoring of ceph status, " + "will retry in %ss" % constants.CEPH_HEALTH_CHECK_INTERVAL) time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL) def ceph_get_fsid(self):