Create scripts to remove Failed OSD
Support to reweight OSDs Change-Id: I84bd92c01a35701e596d7a917c71715043bf6c2a
This commit is contained in:
parent
fcaf62a7fa
commit
227eed55f7
110
ceph-utility/templates/bin/utility/_osd-maintenance.tpl
Normal file
110
ceph-utility/templates/bin/utility/_osd-maintenance.tpl
Normal file
@ -0,0 +1,110 @@
|
||||
#!/bin/bash
|
||||
{{/*
|
||||
Copyright 2017 The Openstack-Helm Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/}}
|
||||
|
||||
# -e: abort on the first failing command; -x: trace every command as it runs.
set -ex
|
||||
|
||||
#######################################
# Report every OSD that the cluster currently marks as "down".
# Globals:   OSD_ID (written) - whitespace-separated ids of the down OSDs;
#            osd_remove iterates over this value after calling this function.
# Arguments: none
# Outputs:   one "OSD id <n> is in Down Status" line per down OSD, or
#            "All OSDs are Good" when none is down. Errors go to stderr.
# Exits:     0 (ending the whole script) when no OSD is down,
#            1 when the OSD tree cannot be queried or parsed.
#######################################
function check_osd_status () {
  local osd_tree i

  # Fetch the tree on its own instead of inside a pipeline: without pipefail a
  # failed query would leave OSD_ID empty and be reported as "All OSDs are Good".
  if ! osd_tree=$(nccli ceph osd tree -f json-pretty) || [[ -z "${osd_tree}" ]]; then
    echo "Unable to query the OSD tree" >&2
    exit 1
  fi

  if ! OSD_ID=$(jq '.nodes[]|select(.type=="osd")|select(.status == "down")|.id' <<< "${osd_tree}"); then
    echo "Unable to parse the OSD tree" >&2
    exit 1
  fi

  if [[ -z "${OSD_ID}" ]]; then
    echo "All OSDs are Good"
    exit 0
  fi

  # Intentionally unquoted: OSD_ID holds one id per line and is split on whitespace.
  for i in ${OSD_ID}; do
    echo "OSD id $i is in Down Status"
  done
}
|
||||
|
||||
#######################################
# Interactively purge the OSDs that check_osd_status reports as down.
# Globals:   OSD_ID (read) - ids populated by check_osd_status.
# Arguments: none
# Input:     one answer per down OSD on stdin: y/yes purges it, n/no skips it
#            (case-insensitive); anything else prints "Invalid Option" and
#            leaves the OSD untouched.
# Outputs:   progress messages on stdout.
# Exits:     1 if stdin ends before an answer was given.
#######################################
function osd_remove () {
  local id answer

  check_osd_status

  # Intentionally unquoted: OSD_ID is a whitespace-separated list of ids.
  for id in ${OSD_ID}; do
    answer=""
    # -r keeps backslashes literal. A failed read with no data means EOF, so
    # stop instead of treating every remaining OSD as an invalid answer.
    if ! read -r -p "Enter 'yes' to purge OSD=$id and 'no' to skip=" answer \
        && [[ -z "${answer}" ]]; then
      echo "No answer received, stopping" >&2
      exit 1
    fi

    case "${answer}" in
      [Yy]|[Yy][Ee][Ss])
        echo "Purging OSD=$id"
        nccli ceph osd purge "$id" --yes-i-really-mean-it
        # Pause between purges, as the original implementation did.
        sleep 3
        ;;
      [Nn]|[Nn][Oo])
        echo "Not purging OSD=$id"
        ;;
      *)
        echo "Invalid Option"
        ;;
    esac
  done
}
|
||||
|
||||
#######################################
# Purge a single OSD, but only if the cluster reports it as "down".
# Arguments: $1 - numeric OSD id
# Outputs:   a status message on stdout; errors on stderr.
# Exits:     1 when the id is not numeric or the OSD tree cannot be read,
#            0 (ending the script) when the OSD is unknown or not down.
#######################################
function osd_remove_by_id () {
  local osd_id=$1
  local osd_tree osd_status

  # Reject anything that is not a plain number before it gets near jq or ceph.
  if [[ ! "${osd_id}" =~ ^[0-9]+$ ]]; then
    echo "OSD id '${osd_id}' is not valid, Please enter a numeric OSD id" >&2
    exit 1
  fi

  # Query separately so a failed call is not mistaken for "OSD not found".
  if ! osd_tree=$(nccli ceph osd tree -f json-pretty) || [[ -z "${osd_tree}" ]]; then
    echo "Unable to query the OSD tree" >&2
    exit 1
  fi

  # The id is handed to jq as data (--arg), never spliced into the filter text.
  if ! osd_status=$(jq -r --arg id "${osd_id}" \
      '.nodes[]|select(.type=="osd")|select((.id|tostring) == $id)|.status' \
      <<< "${osd_tree}"); then
    echo "Unable to parse the OSD tree" >&2
    exit 1
  fi

  if [[ "${osd_status}" == "down" ]]; then
    echo "OSD id ${osd_id} is in Down Status, So purging it"
    nccli ceph osd purge "${osd_id}" --yes-i-really-mean-it
  elif [[ -z "${osd_status}" ]]; then
    echo "OSD id ${osd_id} is not found, Please enter correct OSD id"
    exit 0
  else
    echo "OSD id ${osd_id} is not in Down Status, Not purging it"
    exit 0
  fi
}
|
||||
|
||||
#######################################
# Give every OSD whose CRUSH weight is 0 a weight derived from its capacity.
# The weight is the "kb" value reported by `ceph osd df` divided by
# 1073741824 (2^30), rounded to two decimals.
# Arguments: none
# Outputs:   output of the reweight commands; skipped OSDs are noted on stderr.
# Exits:     1 when the OSD utilisation data cannot be queried or parsed.
#######################################
function reweight_osds () {
  local osd_df osd_rows osd_id osd_kb osd_weight

  # One JSON query replaces the text-table scan plus a per-OSD re-query.
  if ! osd_df=$(nccli ceph osd df --format json) || [[ -z "${osd_df}" ]]; then
    echo "Unable to query OSD utilisation" >&2
    exit 1
  fi

  # Emit "<id> <kb>" for each zero-weight OSD.
  if ! osd_rows=$(jq -r '.nodes[]|select(.crush_weight == 0)|"\(.id) \(.kb)"' <<< "${osd_df}"); then
    echo "Unable to parse OSD utilisation" >&2
    exit 1
  fi

  # Read the rows on fd 3 so the commands run in the loop cannot swallow them
  # through stdin.
  while read -r osd_id osd_kb <&3; do
    [[ -n "${osd_id}" ]] || continue

    # Without a positive capacity there is no meaningful weight to assign.
    if [[ ! "${osd_kb}" =~ ^[1-9][0-9]*$ ]]; then
      echo "Skipping osd.${osd_id}: no capacity reported" >&2
      continue
    fi

    osd_weight=$(LC_ALL=C awk -v kb="${osd_kb}" 'BEGIN { printf "%.2f", kb / 1073741824 }')
    nccli ceph osd crush reweight "osd.${osd_id}" "${osd_weight}"
  done 3<<< "${osd_rows}"
}
|
||||
|
||||
#######################################
# Print the supported sub-commands and terminate the script.
# Arguments: none
# Outputs:   usage text on stderr (this is only reached on a bad invocation,
#            so stdout stays free of diagnostics).
# Exits:     always, with status 1.
#######################################
usage() {
  # Turn tracing off so the help text is not interleaved with "+ echo" lines.
  set +ex
  {
    echo "Usage: nccli osd-maintenance check_osd_status"
    echo " nccli osd-maintenance osd_remove"
    echo " nccli osd-maintenance osd_remove_by_id --osd-id <OSDID>"
    echo " nccli osd-maintenance reweight_osds"
  } >&2
  exit 1
}
|
||||
|
||||
# Entry point: dispatch the first argument to the matching maintenance function.
# Any missing or unknown sub-command prints the usage text; usage itself ends
# the script with status 1, so no separate exit is needed after calling it.
if [[ $# -eq 0 ]]; then
  usage
else
  OSDID=""
  case "$1" in
    osd_remove_by_id)
      shift
      # Only the exact form "--osd-id <OSDID>" with a non-empty id is accepted.
      if [[ "${1:-}" == "--osd-id" && -n "${2:-}" ]]; then
        OSDID=$2
        # Quoted so the id reaches the function as a single, unsplit argument.
        osd_remove_by_id "${OSDID}"
      else
        usage
      fi
      ;;
    osd_remove)
      osd_remove
      ;;
    check_osd_status)
      check_osd_status
      ;;
    reweight_osds)
      reweight_osds
      ;;
    *)
      usage
      ;;
  esac
fi
|
@ -41,4 +41,7 @@ data:
|
||||
nccli: |
|
||||
{{ tuple "bin/utility/_nccli.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
|
||||
|
||||
osd-maintenance: |
|
||||
{{ tuple "bin/utility/_osd-maintenance.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
|
||||
|
||||
{{- end }}
|
||||
|
@ -67,6 +67,10 @@ spec:
|
||||
mountPath: /usr/local/bin/ceph-utility-rootwrap
|
||||
subPath: ceph-utility-rootwrap
|
||||
readOnly: true
|
||||
- name: ceph-utility-bin
|
||||
mountPath: /tmp/osd-maintenance
|
||||
subPath: osd-maintenance
|
||||
readOnly: true
|
||||
- name: ceph-utility-sudoers
|
||||
mountPath: /etc/sudoers.d/nccli-sudo
|
||||
subPath: nccli-sudo
|
||||
|
@ -91,6 +91,7 @@ conf:
|
||||
rados: CommandFilter, rados, root
|
||||
radosgw-admin: CommandFilter, radosgw-admin, root
|
||||
rbd: CommandFilter, rbd, root
|
||||
osd-maintenance: CommandFilter, osd-maintenance, root
|
||||
# Below are examples of RegExpFilter. This will restrict access to ceph cluster even with admin user
|
||||
#rbd00: RegExpFilter, rbd, root, rbd, (^((?!clone|copy|cp|create|export|export-diff|flatten|import|import-diff|map|merge-diff|pool|remove|rm|rename|mv|resize|unmap).)*$)
|
||||
#rbd01: RegExpFilter, rbd, root, rbd, image-meta, (^((?!get|remove|set).)*$)
|
||||
@ -113,7 +114,7 @@ conf:
|
||||
# explicitly specify a full path (separated by ',')
|
||||
# If not specified, defaults to system PATH environment variable.
|
||||
# These directories MUST all be only writeable by root !
|
||||
exec_dirs: /sbin,/usr/sbin,/bin,/usr/bin,/usr/local/bin,/usr/local/sbin
|
||||
exec_dirs: /sbin,/usr/sbin,/bin,/usr/bin,/usr/local/bin,/usr/local/sbin,/tmp
|
||||
# Enable logging to syslog
|
||||
# Default value is False
|
||||
use_syslog: True
|
||||
|
64
docs/ceph_maintenance.md
Normal file
64
docs/ceph_maintenance.md
Normal file
@ -0,0 +1,64 @@
|
||||
# Ceph Maintenance
|
||||
|
||||
This MOP covers Maintenance Activities related to Ceph.
|
||||
|
||||
## Table of Contents ##
|
||||
|
||||
<!-- TOC depthFrom:1 depthTo:6 withLinks:1 updateOnSave:1 orderedList:0 -->
|
||||
|
||||
- Table of Contents
|
||||
- 1. Generic commands
|
||||
- 2. Replace failed OSD
|
||||
|
||||
## 1. Generic Commands ##
|
||||
|
||||
### Check OSD Status
|
||||
To check the current status of OSDs, execute the following:
|
||||
|
||||
```
|
||||
nccli osd-maintenance check_osd_status
|
||||
```
|
||||
|
||||
### OSD Removal
|
||||
To purge OSDs in down state, execute the following:
|
||||
|
||||
```
|
||||
nccli osd-maintenance osd_remove
|
||||
```
|
||||
|
||||
### OSD Removal By OSD ID
|
||||
To purge a single OSD that is in down state, identified by its OSD ID, execute the following:
|
||||
|
||||
```
|
||||
nccli osd-maintenance osd_remove_by_id --osd-id <OSDID>
|
||||
```
|
||||
|
||||
### Reweight OSDs
|
||||
To adjust an OSD’s crush weight in the CRUSH map of a running cluster, execute the following:
|
||||
|
||||
```
|
||||
nccli osd-maintenance reweight_osds
|
||||
```
|
||||
|
||||
## 2. Replace failed OSD ##
|
||||
|
||||
When a drive has failed, follow the procedure below. All commands must be run from the utility container.
|
||||
|
||||
Capture the failed OSD ID. Check for status `down`
|
||||
|
||||
nccli ceph osd tree
|
||||
|
||||
Remove the OSD from Cluster. Replace `<OSD_ID>` with above captured failed OSD ID
|
||||
|
||||
nccli osd-maintenance osd_remove_by_id --osd-id <OSD_ID>
|
||||
|
||||
Remove the failed drive and replace it with a new one without bringing down the node.
|
||||
|
||||
Once the new drive is in place, delete the affected OSD pod that is in `error` or `CrashLoopBackOff` state. Replace `<pod_name>` with the failed OSD pod name.
|
||||
|
||||
kubectl delete pod <pod_name> -n ceph
|
||||
|
||||
Once the pod is deleted, Kubernetes will spin up a new pod for the OSD. Once the pod is up, the OSD is added to the Ceph cluster with a weight equal to `0`, so the OSD needs to be re-weighted:
|
||||
|
||||
nccli osd-maintenance reweight_osds
|
||||
|
Loading…
x
Reference in New Issue
Block a user