From 227eed55f761f3e7e9dc42864cfe69714b65d414 Mon Sep 17 00:00:00 2001
From: Krishna Venkata
Date: Mon, 21 Jan 2019 04:10:16 -0600
Subject: [PATCH] Create scripts to remove failed OSDs

Support to reweight OSDs

Change-Id: I84bd92c01a35701e596d7a917c71715043bf6c2a
---
 .../bin/utility/_osd-maintenance.tpl          | 110 ++++++++++++++++++
 ceph-utility/templates/configmap-bin.yaml     |   3 +
 .../templates/deployment-utility.yaml         |   4 +
 ceph-utility/values.yaml                      |   3 +-
 docs/ceph_maintenance.md                      |  64 ++++++++++
 5 files changed, 183 insertions(+), 1 deletion(-)
 create mode 100644 ceph-utility/templates/bin/utility/_osd-maintenance.tpl
 create mode 100644 docs/ceph_maintenance.md

diff --git a/ceph-utility/templates/bin/utility/_osd-maintenance.tpl b/ceph-utility/templates/bin/utility/_osd-maintenance.tpl
new file mode 100644
index 00000000..3925341f
--- /dev/null
+++ b/ceph-utility/templates/bin/utility/_osd-maintenance.tpl
@@ -0,0 +1,110 @@
+#!/bin/bash
+{{/*
+Copyright 2017 The Openstack-Helm Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+
+set -ex
+
+function check_osd_status () {
+  OSD_ID=$(nccli ceph osd tree -f json-pretty | jq '.nodes[]|select(.type=="osd")|select(.status == "down")|.id')
+  if [ "${OSD_ID}" != '' ]; then
+    for i in $OSD_ID; do
+      echo "OSD id $i is in down status"
+    done
+  else
+    echo "All OSDs are good"
+    exit
+  fi
+}
+
+function osd_remove () {
+  check_osd_status
+  for id in $OSD_ID; do
+    read -p "Enter 'yes' to purge OSD=$id or 'no' to skip: " YN
+    if [[ $YN == "y" || $YN == "Y" || $YN == "yes" || $YN == "YES" ]]; then
+      echo "Purging OSD=$id"
+      nccli ceph osd purge $id --yes-i-really-mean-it
+      sleep 3
+    elif [[ $YN == "n" || $YN == "N" || $YN == "no" || $YN == "NO" ]]; then
+      echo "Not purging OSD=$id"
+    else
+      echo "Invalid option"
+    fi
+  done
+}
+
+function osd_remove_by_id () {
+  OSDID=$1
+  OSD_STATUS=$(nccli ceph osd tree -f json-pretty | jq '.nodes[]|select(.type=="osd")|select(.id == '$OSDID')|.status')
+  if [ "$OSD_STATUS" == '"down"' ]; then
+    echo "OSD id $OSDID is in down status, purging it"
+    nccli ceph osd purge $OSDID --yes-i-really-mean-it
+  elif [[ -z "$OSD_STATUS" ]]; then
+    echo "OSD id $OSDID not found, please enter a correct OSD id"
+    exit
+  else
+    echo "OSD id $OSDID is not in down status, not purging it"
+    exit
+  fi
+}
+
+function reweight_osds () {
+  for OSD_ID in $(nccli ceph osd df | awk '$3 == "0" {print $1}'); do
+    OSD_WEIGHT=$(nccli ceph osd df --format json-pretty | grep -A7 "\bosd.${OSD_ID}\b" | awk '/"kb"/{ gsub(",",""); d = $2/1073741824 ; r = sprintf("%.2f", d); print r }');
+    nccli ceph osd crush reweight osd.${OSD_ID} ${OSD_WEIGHT};
+  done
+}
+
+usage() {
+  set +ex
+  echo "Usage: nccli osd-maintenance check_osd_status"
+  echo "       nccli osd-maintenance osd_remove"
+  echo "       nccli osd-maintenance osd_remove_by_id --osd-id <OSD_ID>"
+  echo "       nccli osd-maintenance reweight_osds"
+  exit 1
+}
+
+if [ $# -eq 0 ]; then
+  usage
+else
+  OSDID=""
+  case $1 in
+    osd_remove_by_id )
+          shift
+          if [ "$1" == "--osd-id" ]; then
+            shift
+            if [ "$1" == "" ]; then
+              usage
+              exit 1
+            fi
+            OSDID=$1
+            osd_remove_by_id $OSDID
+          else
+            usage
+            exit 1
+          fi
+          ;;
+    osd_remove ) osd_remove
+          ;;
+    check_osd_status ) check_osd_status
+          ;;
+    reweight_osds ) reweight_osds
+          ;;
+    *)
+      usage
+      exit 1
+      ;;
+  esac
+fi
diff --git a/ceph-utility/templates/configmap-bin.yaml b/ceph-utility/templates/configmap-bin.yaml
index baf37eee..c3bcd42c 100644
--- a/ceph-utility/templates/configmap-bin.yaml
+++ b/ceph-utility/templates/configmap-bin.yaml
@@ -41,4 +41,7 @@ data:
   nccli: |
 {{ tuple "bin/utility/_nccli.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
+  osd-maintenance: |
+{{ tuple "bin/utility/_osd-maintenance.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
+
 {{- end }}
diff --git a/ceph-utility/templates/deployment-utility.yaml b/ceph-utility/templates/deployment-utility.yaml
index 12ac4e4e..13bf9452 100644
--- a/ceph-utility/templates/deployment-utility.yaml
+++ b/ceph-utility/templates/deployment-utility.yaml
@@ -67,6 +67,10 @@ spec:
               mountPath: /usr/local/bin/ceph-utility-rootwrap
               subPath: ceph-utility-rootwrap
               readOnly: true
+            - name: ceph-utility-bin
+              mountPath: /tmp/osd-maintenance
+              subPath: osd-maintenance
+              readOnly: true
             - name: ceph-utility-sudoers
               mountPath: /etc/sudoers.d/nccli-sudo
               subPath: nccli-sudo
diff --git a/ceph-utility/values.yaml b/ceph-utility/values.yaml
index 476c5e7e..7b2999de 100644
--- a/ceph-utility/values.yaml
+++ b/ceph-utility/values.yaml
@@ -91,6 +91,7 @@ conf:
       rados: CommandFilter, rados, root
       radosgw-admin: CommandFilter, radosgw-admin, root
       rbd: CommandFilter, rbd, root
+      osd-maintenance: CommandFilter, osd-maintenance, root
       # Below are examples of RegExpFilter. This will restict access to ceph cluster even with admin user
       #rbd00: RegExpFilter, rbd, root, rbd, (^((?!clone|copy|cp|create|export|export-diff|flatten|import|import-diff|map|merge-diff|pool|remove|rm|rename|mv|resize|unmap).)*$)
       #rbd01: RegExpFilter, rbd, root, rbd, image-meta, (^((?!get|remove|set).)*$)
@@ -113,7 +114,7 @@ conf:
     # explicitely specify a full path (separated by ',')
     # If not specified, defaults to system PATH environment variable.
     # These directories MUST all be only writeable by root !
-    exec_dirs: /sbin,/usr/sbin,/bin,/usr/bin,/usr/local/bin,/usr/local/sbin
+    exec_dirs: /sbin,/usr/sbin,/bin,/usr/bin,/usr/local/bin,/usr/local/sbin,/tmp
     # Enable logging to syslog
     # Default value is False
     use_syslog: True
diff --git a/docs/ceph_maintenance.md b/docs/ceph_maintenance.md
new file mode 100644
index 00000000..45811d2c
--- /dev/null
+++ b/docs/ceph_maintenance.md
@@ -0,0 +1,64 @@
+# Ceph Maintenance
+
+This MOP covers maintenance activities related to Ceph.
+
+## Table of Contents ##
+
+- Table of Contents
+  - 1. Generic commands
+  - 2. Replace failed OSD
+
+## 1. Generic Commands ##
+
+### Check OSD Status
+To check the current status of OSDs, execute the following:
+
+```
+nccli osd-maintenance check_osd_status
+```
+
+### OSD Removal
+To purge all OSDs that are in down state, execute the following:
+
+```
+nccli osd-maintenance osd_remove
+```
+
+### OSD Removal By OSD ID
+To purge a specific OSD in down state by its ID, execute the following:
+
+```
+nccli osd-maintenance osd_remove_by_id --osd-id <OSD_ID>
+```
+
+### Reweight OSDs
+To adjust the CRUSH weight of OSDs in the CRUSH map of a running cluster, execute the following:
+
+```
+nccli osd-maintenance reweight_osds
+```
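+
+For reference, a sketch of how `reweight_osds` derives the new weight: for every OSD whose
+CRUSH weight is reported as `0`, it reads the OSD size in KiB from `ceph osd df` and sets the
+CRUSH weight to the size in TiB (KiB divided by 2^30). The OSD id `4` and the 1 TiB size in the
+example below are illustrative values only, not output from a real cluster:
+
+```
+# List OSDs whose CRUSH weight is currently 0 (weight column of `ceph osd df`)
+nccli ceph osd df | awk '$3 == "0" {print $1}'
+
+# For a hypothetical 1 TiB OSD (1073741824 KiB) the computed weight is 1.00,
+# so the script effectively runs:
+nccli ceph osd crush reweight osd.4 1.00
+```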
+
+## 2. Replace failed OSD ##
+
+In the event of a failed drive, please follow the procedure below. The following commands should be run from the utility container.
+
+Capture the failed OSD ID by checking for OSDs with status `down`:
+
+    nccli ceph osd tree
+
+Remove the OSD from the cluster. Replace `<OSD_ID>` with the failed OSD ID captured above:
+
+    nccli osd-maintenance osd_remove_by_id --osd-id <OSD_ID>
+
+Remove the failed drive and replace it with a new one without bringing down the node.
+
+Once the new drive is in place, delete the affected OSD pod, which will be in `Error` or `CrashLoopBackOff` state. Replace `<POD_NAME>` with the name of the failed OSD pod:
+
+    kubectl delete pod <POD_NAME> -n ceph
+
+Once the pod is deleted, Kubernetes will spin up a new pod for the OSD. Once the pod is up, the OSD joins the Ceph cluster with a weight of `0`, so it needs to be reweighted:
+
+    nccli osd-maintenance reweight_osds
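+
+As a final check, the replacement can be verified from the utility container; a minimal sketch,
+with `<OSD_ID>` again standing in for the replaced OSD:
+
+    # the OSD should show as up with a non-zero CRUSH weight
+    nccli ceph osd tree | grep "osd.<OSD_ID>"
+
+    # cluster health should return to HEALTH_OK once backfilling completes
+    nccli ceph -s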