Create scripts to remove Failed OSD

Support reweighting OSDs

Change-Id: I84bd92c01a35701e596d7a917c71715043bf6c2a
This commit is contained in:
Krishna Venkata 2019-01-21 04:10:16 -06:00
parent fcaf62a7fa
commit 227eed55f7
5 changed files with 183 additions and 1 deletions

View File

@ -0,0 +1,110 @@
#!/bin/bash
{{/*
Copyright 2017 The Openstack-Helm Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}
set -ex
# Report every OSD that the cluster currently marks as "down".
# Globals:   OSD_ID (written) - ids of the down OSDs as printed by jq, one per
#            line; osd_remove reads this value after calling the function.
# Outputs:   one "OSD id N is in Down Status" line per down OSD, or
#            "All OSDs are Good" when there is none.
# Exits:     status 0 when no OSD is down (the whole script ends here);
#            status 1 when the OSD tree cannot be queried.
# Returns:   0 when at least one OSD is down.
function check_osd_status () {
  local tree_json i
  # Run the query on its own: inside a pipeline only the status of the last
  # command (jq) is reported unless pipefail is set, so a failed query would
  # otherwise look like "no OSD is down".
  if ! tree_json=$(nccli ceph osd tree -f json-pretty); then
    echo "Unable to query the OSD tree" >&2
    exit 1
  fi
  OSD_ID=$(jq '.nodes[]|select(.type=="osd")|select(.status == "down")|.id' <<<"${tree_json}")
  if [ "${OSD_ID}" != '' ]; then
    # Intentionally unquoted: OSD_ID holds one id per line and is split here.
    for i in $OSD_ID; do
      echo "OSD id $i is in Down Status"
    done
  else
    echo "All OSDs are Good"
    exit
  fi
}
# Interactively purge the OSDs that check_osd_status reported as down.
# check_osd_status fills OSD_ID and ends the script itself when nothing is
# down, so the loop below only runs when there is at least one candidate.
# For each candidate the operator is asked once; y/Y/yes/YES purges it,
# n/N/no/NO skips it, anything else is reported as invalid and skipped.
function osd_remove () {
  check_osd_status
  for osd in $OSD_ID; do
    read -p "Enter 'yes' to purge OSD=$osd and 'no' to skip=" reply
    case "$reply" in
      y|Y|yes|YES)
        echo "Purging OSD=$osd"
        nccli ceph osd purge $osd --yes-i-really-mean-it
        # Short pause between consecutive purges.
        sleep 3
        ;;
      n|N|no|NO)
        echo "Not purging OSD=$osd"
        ;;
      *)
        echo "Invalid Option"
        ;;
    esac
  done
}
# Purge a single OSD, but only if the cluster reports it as "down".
# Arguments: $1 - id of the OSD to purge (e.g. 5)
# Outputs:   a one-line description of what was done, on stdout.
# Exits:     status 1 when the OSD tree cannot be queried; otherwise the
#            script ends (bare exit) after reporting an unknown or
#            not-down OSD.
function osd_remove_by_id () {
  local OSDID=$1
  local tree_json OSD_STATUS
  # Query separately so a failing nccli is not masked by jq's exit status.
  if ! tree_json=$(nccli ceph osd tree -f json-pretty); then
    echo "Unable to query the OSD tree" >&2
    exit 1
  fi
  # The id is handed to jq as data (--arg), never spliced into the filter
  # text, so an argument such as '1) or (true' cannot rewrite the filter.
  # It is compared as a string, which also means a non-numeric argument
  # simply matches nothing. -r prints the status without JSON quotes.
  OSD_STATUS=$(jq -r --arg id "${OSDID}" \
    '.nodes[]|select(.type=="osd")|select((.id|tostring) == $id)|.status' \
    <<<"${tree_json}")
  if [[ "${OSD_STATUS}" == "down" ]]; then
    echo "OSD id $OSDID is in Down Status, So purging it"
    nccli ceph osd purge "${OSDID}" --yes-i-really-mean-it
  elif [[ -z "${OSD_STATUS}" ]]; then
    echo "OSD id $OSDID is not found, Please enter correct OSD id"
    exit
  else
    echo "OSD id $OSDID is not in Down Status, Not purging it"
    exit
  fi
}
# Give every OSD whose weight column in `ceph osd df` reads "0" a CRUSH
# weight derived from its size.
# Outputs:   whatever the reweight commands print; a warning on stderr for
#            each OSD whose size cannot be found.
# Exits:     status 1 when the JSON utilisation report cannot be fetched.
function reweight_osds () {
  local df_json osd_id osd_weight
  # The JSON report does not depend on the loop variable, so fetch it once.
  if ! df_json=$(nccli ceph osd df --format json-pretty); then
    echo "Unable to query OSD utilisation" >&2
    exit 1
  fi
  # Column 3 of the plain-text report is compared as a string, so only a
  # literal "0" selects the OSD in column 1.
  for osd_id in $(nccli ceph osd df | awk '$3 == "0" {print $1}'); do
    # Locate the entry by its quoted name ("osd.N") as a fixed string, then
    # take the "kb" value from the following lines and divide it by 2^30
    # (presumably KiB -> TiB, the usual CRUSH weight unit - TODO confirm).
    osd_weight=$(grep -A7 -F "\"osd.${osd_id}\"" <<<"${df_json}" \
      | awk '/"kb"/{ gsub(",",""); d= $2/1073741824 ; r = sprintf("%.2f", d); print r; exit }')
    # Without a weight the reweight command would be malformed; skip instead.
    if [[ -z "${osd_weight}" ]]; then
      echo "Could not determine the size of osd.${osd_id}, skipping reweight" >&2
      continue
    fi
    nccli ceph osd crush reweight "osd.${osd_id}" "${osd_weight}"
  done
}
# Print the supported osd-maintenance invocations on stdout and end the
# script with status 1.
usage() {
  # Disable errexit and xtrace before printing the help text.
  set +ex
  printf '%s\n' \
    "Usage: nccli osd-maintenance check_osd_status" \
    " nccli osd-maintenance osd_remove" \
    " nccli osd-maintenance osd_remove_by_id --osd-id <OSDID>" \
    " nccli osd-maintenance reweight_osds"
  exit 1
}
# Entry point: the first argument selects the maintenance action. With no
# argument, or an unrecognised one, the usage text is printed; usage ends the
# script with status 1, so nothing after a call to it is executed.
if [ $# -eq 0 ]; then
  usage
else
  OSDID=""
  case "$1" in
    check_osd_status)
      check_osd_status
      ;;
    osd_remove)
      osd_remove
      ;;
    osd_remove_by_id)
      # Only the form "osd_remove_by_id --osd-id <OSDID>" with a non-empty
      # id is accepted.
      if [ "$2" != "--osd-id" ] || [ "$3" == "" ]; then
        usage
      fi
      OSDID=$3
      osd_remove_by_id $OSDID
      ;;
    reweight_osds)
      reweight_osds
      ;;
    *)
      usage
      ;;
  esac
fi

View File

@ -41,4 +41,7 @@ data:
nccli: |
{{ tuple "bin/utility/_nccli.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
osd-maintenance: |
{{ tuple "bin/utility/_osd-maintenance.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
{{- end }}

View File

@ -67,6 +67,10 @@ spec:
mountPath: /usr/local/bin/ceph-utility-rootwrap
subPath: ceph-utility-rootwrap
readOnly: true
- name: ceph-utility-bin
mountPath: /tmp/osd-maintenance
subPath: osd-maintenance
readOnly: true
- name: ceph-utility-sudoers
mountPath: /etc/sudoers.d/nccli-sudo
subPath: nccli-sudo

View File

@ -91,6 +91,7 @@ conf:
rados: CommandFilter, rados, root
radosgw-admin: CommandFilter, radosgw-admin, root
rbd: CommandFilter, rbd, root
osd-maintenance: CommandFilter, osd-maintenance, root
# Below are examples of RegExpFilter. This will restrict access to ceph cluster even with admin user
#rbd00: RegExpFilter, rbd, root, rbd, (^((?!clone|copy|cp|create|export|export-diff|flatten|import|import-diff|map|merge-diff|pool|remove|rm|rename|mv|resize|unmap).)*$)
#rbd01: RegExpFilter, rbd, root, rbd, image-meta, (^((?!get|remove|set).)*$)
@ -113,7 +114,7 @@ conf:
# explicitly specify a full path (separated by ',')
# If not specified, defaults to system PATH environment variable.
# These directories MUST all be only writeable by root !
exec_dirs: /sbin,/usr/sbin,/bin,/usr/bin,/usr/local/bin,/usr/local/sbin
exec_dirs: /sbin,/usr/sbin,/bin,/usr/bin,/usr/local/bin,/usr/local/sbin,/tmp
# Enable logging to syslog
# Default value is False
use_syslog: True

64
docs/ceph_maintenance.md Normal file
View File

@ -0,0 +1,64 @@
# Ceph Maintenance
This MOP covers Maintenance Activities related to Ceph.
## Table of Contents ##
<!-- TOC depthFrom:1 depthTo:6 withLinks:1 updateOnSave:1 orderedList:0 -->
- Table of Contents
- 1. Generic commands
- 2. Replace failed OSD
## 1. Generic Commands ##
### Check OSD Status
To check the current status of OSDs, execute the following:
```
nccli osd-maintenance check_osd_status
```
### OSD Removal
To purge OSDs in down state, execute the following:
```
nccli osd-maintenance osd_remove
```
### OSD Removal By OSD ID
To purge OSDs by OSD ID in down state, execute the following:
```
nccli osd-maintenance osd_remove_by_id --osd-id <OSDID>
```
### Reweight OSDs
To adjust an OSD's CRUSH weight in the CRUSH map of a running cluster, execute the following:
```
nccli osd-maintenance reweight_osds
```
## 2. Replace failed OSD ##
In the case of a failed drive, follow the procedure below. Run the following commands from the utility container.
Capture the failed OSD ID. Check for status `down`
nccli ceph osd tree
Remove the OSD from Cluster. Replace `<OSD_ID>` with above captured failed OSD ID
nccli osd-maintenance osd_remove_by_id --osd-id <OSD_ID>
Remove the failed drive and replace it with a new one without bringing down the node.
Once the new drive is in place, delete the affected OSD pod that is in `error` or `CrashLoopBackOff` state. Replace `<pod_name>` with the failed OSD pod name.
kubectl delete pod <pod_name> -n ceph
Once the pod is deleted, Kubernetes will spin up a new pod for the OSD. Once the pod is up, the OSD is added to the Ceph cluster with a weight equal to `0`, so it must be re-weighted:
nccli osd-maintenance reweight_osds