Fix periodic thread that monitors Ceph

A timeout between ceph-manager and sysinv causes ceph-manager to stop responding. When ceph-manager detects that the 'require_jewel_osds' flag needs to be set, it queries sysinv, which for whatever reason (most likely because it is still dealing with some final operations) fails to respond in a reasonable amount of time. This raises an exception in ceph-manager that breaks the execution of one of the periodic threads. The fix wraps the fsid lookup and the periodic status/quota polling in try/except blocks that log the error and retry after the next health-check interval, and returns the health value (instead of None) when the software upgrade status query fails.

Change-Id: If49f5ffbce4aeac3d50d52f526d1ce905be3cecb
Signed-off-by: Kristine Bujold <kristine.bujold@windriver.com>
Signed-off-by: Scott Little <scott.little@windriver.com>
2018-04-25 01:01:01 +03:00 · 2018-04-25 01:01:01 +03:00 · 393cda3e8b
commit 393cda3e8b
parent e778c76ea4
1 changed file with 12 additions and 4 deletions
--- a/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py
+++ b/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py
@@ -112,7 +112,7 @@ class HandleUpgradesMixin(object):
                    "Getting software upgrade status failed "
                    "with: %s. Skip auto-heal attempt "
                    "(will retry on next ceph status poll).") % str(ex))
-                return
+                return health
            state = upgrade.get('state')
            # surpress require_jewel_osds in case upgrade is
            # in progress but not completed or aborting
@@ -181,15 +181,23 @@ class Monitor(HandleUpgradesMixin):
    def run(self):
        # Wait until Ceph cluster is up and we can get the fsid
        while True:
-            self.ceph_get_fsid()
+            try:
+                self.ceph_get_fsid()
+            except Exception:
+                LOG.exception("Error getting fsid, "
+                              "will retry in %ss" % constants.CEPH_HEALTH_CHECK_INTERVAL)
            if self.service.entity_instance_id:
                break
            time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL)

        # Start monitoring ceph status
        while True:
-            self.ceph_poll_status()
-            self.ceph_poll_quotas()
+            try:
+                self.ceph_poll_status()
+                self.ceph_poll_quotas()
+            except Exception:
+                LOG.exception("Error running periodic monitoring of ceph status, "
+                              "will retry in %ss" % constants.CEPH_HEALTH_CHECK_INTERVAL)
            time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL)

    def ceph_get_fsid(self):