Move STX specific files from stx-ceph to stx-integ
By moving STX-specific files from stx-ceph to stx-integ, we decouple STX code from the upstream Ceph repo. When making changes to those STX files, we no longer need to open a pull request against the stx-ceph repo.

Change-Id: Ifaaae452798561ddfa7557cf59b072535bec7687
Story: 2002844
Task: 28993
Signed-off-by: Wei Zhou <wei.zhou@windriver.com>
parent e12b3a436f
commit ed8655fa77
@@ -1,4 +1,5 @@
SRC_DIR="$CGCS_BASE/git/ceph"
COPY_LIST="files/*"
TIS_BASE_SRCREV=3f07f7ff1a5c7bfa8d0de12c966594d5fb7cf4ec
TIS_PATCH_VER=GITREVCOUNT
BUILD_IS_BIG=40

@@ -1 +0,0 @@
../../../../git/ceph/ceph.spec
ceph/ceph/centos/ceph.spec (new file, 1884 lines)
File diff suppressed because it is too large
ceph/ceph/files/ceph-init-wrapper.sh (new executable file, 282 lines)
@@ -0,0 +1,282 @@
#!/bin/bash
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This script is a helper wrapper for pmon monitoring of ceph
# processes. The "/etc/init.d/ceph" script does not know if ceph is
# running on the node. For example when the node is locked, ceph
# processes are not running. In that case we do not want pmond to
# monitor these processes.
#
# The script "/etc/services.d/<node>/ceph.sh" will create the file
# "/var/run/.ceph_started" when ceph is running and remove it when
# it is not.
#
# The script also extracts one or more ceph process names that are
# reported as 'not running' or 'dead' or 'failed' by '/etc/init.d/ceph status'
# and writes the names to a text file: /tmp/ceph_status_failure.txt for
# pmond to access. The pmond adds the text to logs and alarms. Examples of
# text written to this file by the script are:
# 'osd.1'
# 'osd.1, osd.2'
# 'mon.storage-0'
# 'mon.storage-0, osd.2'
#
# Moreover, for processes that are reported as 'hung' by '/etc/init.d/ceph status'
# the script will try to increase their logging to 'debug' for a configurable interval.
# With logging increased it will output a few stack traces and then, at the end of this
# interval, it dumps the process core and kills it.
#
# Return values:
# zero - /etc/init.d/ceph returned success or ceph is not running on the node
# non-zero /etc/init.d/ceph returned a failure or invalid syntax
#

source /usr/bin/tsconfig
source /etc/platform/platform.conf

CEPH_SCRIPT="/etc/init.d/ceph"
CEPH_FILE="$VOLATILE_PATH/.ceph_started"
CEPH_RESTARTING_FILE="$VOLATILE_PATH/.ceph_restarting"
CEPH_GET_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_status"
CEPH_STATUS_FAILURE_TEXT_FILE="/tmp/ceph_status_failure.txt"

BINDIR=/usr/bin
SBINDIR=/usr/sbin
LIBDIR=/usr/lib64/ceph
ETCDIR=/etc/ceph
source $LIBDIR/ceph_common.sh

LOG_PATH=/var/log/ceph
LOG_FILE=$LOG_PATH/ceph-process-states.log
LOG_LEVEL=NORMAL # DEBUG
verbose=0

DATA_PATH=$VOLATILE_PATH/ceph_hang # folder where we keep state information
mkdir -p $DATA_PATH # make sure folder exists

MONITORING_INTERVAL=15
TRACE_LOOP_INTERVAL=5
GET_STATUS_TIMEOUT=120
CEPH_STATUS_TIMEOUT=20

WAIT_FOR_CMD=1

RC=0

args=("$@")

if [ ! -z $ARGS ]; then
    IFS=";" read -r -a new_args <<< "$ARGS"
    args+=("${new_args[@]}")
fi

wait_for_status ()
{
    timeout=$GET_STATUS_TIMEOUT # wait for status no more than $timeout seconds
    while [ -f ${CEPH_GET_STATUS_FILE} ] && [ $timeout -gt 0 ]; do
        sleep 1
        let timeout-=1
    done
    if [ $timeout -eq 0 ]; then
        wlog "-" "WARN" "Getting status takes more than ${GET_STATUS_TIMEOUT}s, continuing"
        rm -f $CEPH_GET_STATUS_FILE
    fi
}

start ()
{
    if [ -f ${CEPH_FILE} ]; then
        wait_for_status
        ${CEPH_SCRIPT} start $1
        RC=$?
    else
        # Ceph is not running on this node, return success
        exit 0
    fi
}

stop ()
{
    wait_for_status
    ${CEPH_SCRIPT} stop $1
}

restart ()
{
    if [ -f ${CEPH_FILE} ]; then
        wait_for_status
        touch $CEPH_RESTARTING_FILE
        ${CEPH_SCRIPT} restart $1
        rm -f $CEPH_RESTARTING_FILE
    else
        # Ceph is not running on this node, return success
        exit 0
    fi

}

log_and_restart_blocked_osds ()
{
    # Log info about the blocked osd daemons and then restart it
    local names=$1
    for name in $names; do
        wlog $name "INFO" "Restarting OSD with blocked operations"
        ${CEPH_SCRIPT} restart $name
    done
}

log_and_kill_hung_procs ()
{
    # Log info about the hung processes and then kill them; later on pmon will restart them
    local names=$1
    for name in $names; do
        type=`echo $name | cut -c 1-3` # e.g. 'mon', if $item is 'mon1'
        id=`echo $name | cut -c 4- | sed 's/^\\.//'`
        get_conf run_dir "/var/run/ceph" "run dir"
        get_conf pid_file "$run_dir/$type.$id.pid" "pid file"
        pid=$(cat $pid_file)
        wlog $name "INFO" "Dealing with hung process (pid:$pid)"

        # monitoring interval
        wlog $name "INFO" "Increasing log level"
        execute_ceph_cmd ret $name "ceph daemon $name config set debug_$type 20/20"
        monitoring=$MONITORING_INTERVAL
        while [ $monitoring -gt 0 ]; do
            if [ $(($monitoring % $TRACE_LOOP_INTERVAL)) -eq 0 ]; then
                date=$(date "+%Y-%m-%d_%H-%M-%S")
                log_file="$LOG_PATH/hang_trace_${name}_${pid}_${date}.log"
                wlog $name "INFO" "Dumping stack trace to: $log_file"
                $(pstack $pid >$log_file) &
            fi
            let monitoring-=1
            sleep 1
        done
        wlog $name "INFO" "Trigger core dump"
        kill -ABRT $pid &>/dev/null
        rm -f $pid_file # process is dead, core dump is archiving, preparing for restart
        # Wait for pending systemd core dumps
        sleep 2 # hope systemd_coredump has started meanwhile
        deadline=$(( $(date '+%s') + 300 ))
        while [[ $(date '+%s') -lt "${deadline}" ]]; do
            systemd_coredump_pid=$(pgrep -f "systemd-coredump.*${pid}.*ceph-${type}")
            [[ -z "${systemd_coredump_pid}" ]] && break
            wlog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}"
            sleep 2
        done
        kill -KILL $pid &>/dev/null
    done
}


status ()
{
    if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$1" == "osd" ]]; then
        timeout $CEPH_STATUS_TIMEOUT ceph -s
        if [ "$?" -ne 0 ]; then
            # Ceph cluster is not accessible. Don't panic, controller swact
            # may be in progress.
            wlog "-" INFO "Ceph is down, ignoring OSD status."
            exit 0
        fi
    fi

    if [ -f ${CEPH_RESTARTING_FILE} ]; then
        # Ceph is restarting, we don't report state changes on the first pass
        rm -f ${CEPH_RESTARTING_FILE}
        exit 0
    fi
    if [ -f ${CEPH_FILE} ]; then
        # Make sure the script does not 'exit' between here and the 'rm -f' below
        # or the checkpoint file will be left behind
        touch -f ${CEPH_GET_STATUS_FILE}
        result=`${CEPH_SCRIPT} status $1`
        RC=$?
        if [ "$RC" -ne 0 ]; then
            erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
            hung_procs=`echo "$result" | sort | uniq | awk ' /hung/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
            blocked_ops_procs=`echo "$result" | sort | uniq | awk ' /blocked ops/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
            invalid=0
            host=`hostname`
            if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
                # On 2 node configuration we have a floating monitor
                host="controller"
            fi
            for i in $(echo $erred_procs $hung_procs); do
                if [[ "$i" =~ osd.?[0-9]?[0-9]|mon.$host ]]; then
                    continue
                else
                    invalid=1
                fi
            done

            log_and_restart_blocked_osds $blocked_ops_procs
            log_and_kill_hung_procs $hung_procs

            hung_procs_text=""
            for i in $(echo $hung_procs); do
                hung_procs_text+="$i(process hung) "
            done

            rm -f $CEPH_STATUS_FAILURE_TEXT_FILE
            if [ $invalid -eq 0 ]; then
                text=""
                for i in $erred_procs; do
                    text+="$i, "
                done
                for i in $hung_procs; do
                    text+="$i (process hang), "
                done
                echo "$text" | tr -d '\n' > $CEPH_STATUS_FAILURE_TEXT_FILE
            else
                echo "$host: '${CEPH_SCRIPT} status $1' result contains invalid process names: $erred_procs"
                echo "Undetermined osd or monitor id" > $CEPH_STATUS_FAILURE_TEXT_FILE
            fi
        fi

        rm -f ${CEPH_GET_STATUS_FILE}

        if [[ $RC == 0 ]] && [[ "$1" == "mon" ]] && [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
            # SM needs exit code != 0 from 'status mon' argument of the init script on
            # standby controller otherwise it thinks that the monitor is running and
            # tries to stop it.
            # '/etc/init.d/ceph status mon' checks the status of monitors configured in
            # /etc/ceph/ceph.conf and if it should be running on current host.
            # If it should not be running it just exits with code 0. This is what
            # happens on the standby controller.
            # When floating monitor is running on active controller /var/lib/ceph/mon of
            # standby is not mounted (Ceph monitor partition is DRBD synced).
            test -e "/var/lib/ceph/mon/ceph-controller"
            if [ "$?" -ne 0 ]; then
                exit 3
            fi
        fi
    else
        # Ceph is not running on this node, return success
        exit 0
    fi
}


case "${args[0]}" in
    start)
        start ${args[1]}
        ;;
    stop)
        stop ${args[1]}
        ;;
    restart)
        restart ${args[1]}
        ;;
    status)
        status ${args[1]}
        ;;
    *)
        echo "Usage: $0 {start|stop|restart|status} [{mon|osd|osd.<number>|mon.<hostname>}]"
        exit 1
        ;;
esac

exit $RC
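To make the wrapper's contract concrete, here is a minimal sketch (an illustration only, not part of this commit and not pmond's actual code) of how a monitor is expected to drive it, based on the header comments above and on the ceph.conf.pmon entry added later in this commit: poll the 'status' action and, when it fails, read the process names the wrapper left in /tmp/ceph_status_failure.txt before restarting.

#!/bin/bash
# Hypothetical monitoring pass over the OSD processes.
if ! /etc/init.d/ceph-init-wrapper status osd; then
    if [ -f /tmp/ceph_status_failure.txt ]; then
        echo "alarm text: $(cat /tmp/ceph_status_failure.txt)"
    fi
    /etc/init.d/ceph-init-wrapper start osd
fi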
@@ -1,6 +1,6 @@
#!/usr/bin/python
#
# Copyright (c) 2016 Wind River Systems, Inc.
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -12,6 +12,7 @@ import re
import subprocess
import sys

DEVICE_NAME_NVME = "nvme"

#########
# Utils #
@@ -85,7 +86,11 @@ def is_partitioning_correct(disk_path, partition_sizes):
    partition_index = 1
    for size in partition_sizes:
        # Check that each partition size matches the one in input
        partition_node = disk_node + str(partition_index)
        if DEVICE_NAME_NVME in disk_node:
            partition_node = '{}p{}'.format(disk_node, str(partition_index))
        else:
            partition_node = '{}{}'.format(disk_node, str(partition_index))

        output, _, _ = command(["udevadm", "settle", "-E", partition_node])
        cmd = ["parted", "-s", partition_node, "unit", "MiB", "print"]
        output, _, _ = command(cmd)
@@ -118,7 +123,7 @@ def create_partitions(disk_path, partition_sizes):
    # GPT partitions on the storage node so nothing to remove in this case
    links = []
    if os.path.isdir(DISK_BY_PARTUUID):
        links = [ os.path.join(DISK_BY_PARTUUID,l) for l in os.listdir(DISK_BY_PARTUUID)
        links = [ os.path.join(DISK_BY_PARTUUID,l) for l in os.listdir(DISK_BY_PARTUUID)
                  if os.path.islink(os.path.join(DISK_BY_PARTUUID, l)) ]

    # Erase all partitions on current node by creating a new GPT table
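As a side note on the hunk above: the added branch accounts for the kernel's NVMe naming scheme, where the partition index is separated from the disk name by a 'p'. A small bash illustration of the same rule (not from this commit; the device paths are examples only):

#!/bin/bash
# Mirrors the partition-node naming logic added above.
partition_node () {
    local disk_node=$1 index=$2
    case "$disk_node" in
        *nvme*) echo "${disk_node}p${index}" ;;   # /dev/nvme0n1 -> /dev/nvme0n1p1
        *)      echo "${disk_node}${index}" ;;    # /dev/sdb     -> /dev/sdb1
    esac
}
partition_node /dev/nvme0n1 1
partition_node /dev/sdb 1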
ceph/ceph/files/ceph-radosgw.service (new file, 18 lines)
@@ -0,0 +1,18 @@
[Unit]
Description=radosgw RESTful rados gateway
After=network.target
#After=remote-fs.target nss-lookup.target network-online.target time-sync.target
#Wants=network-online.target

[Service]
Type=forking
Restart=no
KillMode=process
RemainAfterExit=yes
ExecStart=/etc/rc.d/init.d/ceph-radosgw start
ExecStop=/etc/rc.d/init.d/ceph-radosgw stop
ExecReload=/etc/rc.d/init.d/ceph-radosgw reload

[Install]
WantedBy=multi-user.target
ceph/ceph/files/ceph-rest-api (new file, 92 lines)
@@ -0,0 +1,92 @@
#!/bin/sh

### BEGIN INIT INFO
# Provides: ceph-rest-api
# Required-Start: $ceph
# Required-Stop: $ceph
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: Ceph REST API daemon
# Description: Ceph REST API daemon
### END INIT INFO

DESC="ceph-rest-api"
DAEMON="/usr/bin/ceph-rest-api"
RUNDIR="/var/run/ceph"
PIDFILE="${RUNDIR}/ceph-rest-api.pid"

start()
{
    if [ -e $PIDFILE ]; then
        PIDDIR=/proc/$(cat $PIDFILE)
        if [ -d ${PIDDIR} ]; then
            echo "$DESC already running."
            exit 0
        else
            echo "Removing stale PID file $PIDFILE"
            rm -f $PIDFILE
        fi
    fi

    echo -n "Starting $DESC..."
    mkdir -p $RUNDIR
    start-stop-daemon --start --quiet --background \
        --pidfile ${PIDFILE} --make-pidfile --exec ${DAEMON}

    if [ $? -eq 0 ]; then
        echo "done."
    else
        echo "failed."
        exit 1
    fi
}

stop()
{
    echo -n "Stopping $DESC..."
    start-stop-daemon --stop --quiet --pidfile $PIDFILE
    if [ $? -eq 0 ]; then
        echo "done."
    else
        echo "failed."
    fi
    rm -f $PIDFILE
}

status()
{
    pid=`cat $PIDFILE 2>/dev/null`
    if [ -n "$pid" ]; then
        if ps -p $pid &>/dev/null ; then
            echo "$DESC is running"
            exit 0
        else
            echo "$DESC is not running but has pid file"
            exit 1
        fi
    fi
    echo "$DESC is not running"
    exit 3
}

case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    restart|force-reload|reload)
        stop
        start
        ;;
    status)
        status
        ;;
    *)
        echo "Usage: $0 {start|stop|force-reload|restart|reload|status}"
        exit 1
        ;;
esac

exit 0
ceph/ceph/files/ceph-rest-api.service (new file, 16 lines)
@@ -0,0 +1,16 @@
[Unit]
Description=Ceph REST API
After=network.target ceph.target

[Service]
Type=forking
Restart=no
KillMode=process
RemainAfterExit=yes
ExecStart=/etc/rc.d/init.d/ceph-rest-api start
ExecStop=/etc/rc.d/init.d/ceph-rest-api stop
ExecReload=/etc/rc.d/init.d/ceph-rest-api reload

[Install]
WantedBy=multi-user.target
ceph/ceph/files/ceph.conf (new file, 50 lines)
@@ -0,0 +1,50 @@
[global]
# Unique ID for the cluster.
fsid = %CLUSTER_UUID%
# Public network where the monitor is connected to, i.e, 128.224.0.0/16
#public network = 127.0.0.1/24
# For version 0.55 and beyond, you must explicitly enable
# or disable authentication with "auth" entries in [global].
auth_cluster_required = cephx
auth_service_required = cephx
auth_client_required = cephx
osd_journal_size = 1024

# Uncomment the following line if you are mounting with ext4
# filestore xattr use omap = true

# Number of replicas of objects. Write an object 2 times.
# Cluster cannot reach an active + clean state until there's enough OSDs
# to handle the number of copies of an object. In this case, it requires
# at least 2 OSDs
osd_pool_default_size = 2

# Allow writing one copy in a degraded state.
osd_pool_default_min_size = 1

# Ensure you have a realistic number of placement groups. We recommend
# approximately 100 per OSD. E.g., total number of OSDs multiplied by 100
# divided by the number of replicas (i.e., osd pool default size). So for
# 2 OSDs and osd pool default size = 2, we'd recommend approximately
# (100 * 2) / 2 = 100.
osd_pool_default_pg_num = 64
osd_pool_default_pgp_num = 64
osd_crush_chooseleaf_type = 1
setuser match path = /var/lib/ceph/$type/$cluster-$id

# Override Jewel default of 2 reporters. StarlingX has replication factor 2
mon_osd_min_down_reporters = 1

# Use Hammer's report interval default value
osd_mon_report_interval_max = 120

[osd]
osd_mkfs_type = xfs
osd_mkfs_options_xfs = "-f"
osd_mount_options_xfs = "rw,noatime,inode64,logbufs=8,logbsize=256k"

[mon]
mon warn on legacy crush tunables = false
# Quiet new warnings on move to Hammer
mon pg warn max per osd = 2048
mon pg warn max object skew = 0
ceph/ceph/files/ceph.conf.pmon (new file, 26 lines)
@@ -0,0 +1,26 @@
[process]
process  = ceph
script   = /etc/init.d/ceph-init-wrapper

style    = lsb
severity = major          ; minor, major, critical
restarts = 3              ; restart retries before error assertion
interval = 30             ; number of seconds to wait between restarts

mode     = status         ; Monitoring mode: passive (default) or active
                          ; passive: process death monitoring (default: always)
                          ; active : heartbeat monitoring, i.e. request / response messaging
                          ; status : determine process health with executing "status" command
                          ;          "start" is used to start the process(es) again
                          ; ignore : do not monitor or stop monitoring

; Status and Active Monitoring Options

period   = 30             ; monitor period in seconds
timeout  = 120            ; for active mode, messaging timeout period in seconds, must be shorter than period
                          ; for status mode, max amount of time for a command to execute

; Status Monitoring Options
start_arg  = start        ; start argument for the script
status_arg = status       ; status argument for the script
status_failure_text = /tmp/ceph_status_failure.txt ; text to be added to alarms or logs, this is optional
ceph/ceph/files/ceph.service (new file, 16 lines)
@@ -0,0 +1,16 @@
[Unit]
Description=StarlingX Ceph Startup
After=network.target

[Service]
Type=forking
Restart=no
KillMode=process
RemainAfterExit=yes
ExecStart=/etc/rc.d/init.d/ceph start
ExecStop=/etc/rc.d/init.d/ceph stop
PIDFile=/var/run/ceph/ceph.pid

[Install]
WantedBy=multi-user.target
ceph/ceph/files/ceph.sh (new executable file, 77 lines)
@@ -0,0 +1,77 @@
#!/bin/bash

INITDIR=/etc/init.d
LOGFILE=/var/log/ceph/ceph-init.log
CEPH_FILE=/var/run/.ceph_started

# Get our nodetype
. /etc/platform/platform.conf

# Exit immediately if ceph not configured (i.e. no mon in the config file)
if ! grep -q "mon\." /etc/ceph/ceph.conf
then
    exit 0
fi

logecho ()
{
    echo $1
    date >> ${LOGFILE}
    echo $1 >> ${LOGFILE}
}

start ()
{
    if [[ "$nodetype" == "controller" ]] || [[ "$nodetype" == "storage" ]]; then
        logecho "Starting ceph services..."
        ${INITDIR}/ceph start >> ${LOGFILE} 2>&1
        RC=$?

        if [ ! -f ${CEPH_FILE} ]; then
            touch ${CEPH_FILE}
        fi
    else
        logecho "No ceph services on ${nodetype} node"
        exit 0
    fi
}

stop ()
{
    if [[ "$nodetype" == "controller" ]] || [[ "$nodetype" == "storage" ]]; then
        if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" == "simplex" ]]; then
            logecho "Ceph services will continue to run on node"
            exit 0
        fi

        logecho "Stopping ceph services..."

        if [ -f ${CEPH_FILE} ]; then
            rm -f ${CEPH_FILE}
        fi

        ${INITDIR}/ceph stop >> ${LOGFILE} 2>&1
        RC=$?
    else
        logecho "No ceph services on ${nodetype} node"
        exit 0
    fi
}

RC=0

case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    *)
        echo "Usage: $0 {start|stop}"
        exit 1
        ;;
esac

logecho "RC was: $RC"
exit $RC
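For orientation, a short sketch (hypothetical invocations, not part of this commit) of how this helper and ceph-init-wrapper fit together: ceph.sh creates or removes the /var/run/.ceph_started marker, and the wrapper only does real work while that marker exists. The <node> directory name is a placeholder taken from the wrapper's header comment.

#!/bin/bash
# Bring Ceph up via the services helper, then query it via the pmon wrapper.
/etc/services.d/<node>/ceph.sh start        # creates /var/run/.ceph_started
/etc/init.d/ceph-init-wrapper status mon    # now actually checks monitor status
/etc/services.d/<node>/ceph.sh stop         # removes the marker; wrapper then exits 0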
ceph/ceph/files/osd-wait-status.py (new file, 246 lines)
@@ -0,0 +1,246 @@
#!/usr/bin/python
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# Wait for one or a group of OSDs to match one or a group of statuses
# as reported by "ceph osd tree".
#
# Examples:
# - wait for osd 0 to be up:
#   osd-wait-status -o 0 -s up
#
# - wait for osd 0 and osd 1 to be up:
#   osd-wait-status -o 0 1 -s up
#
# The amount of time spent waiting for OSDs to match a status can
# be limited by specifying:
#
# - the maximum retry count; the script will stop retrying if the status doesn't
#   match the desired one after more than retry count attempts.
#   The interval between attempts is controlled by the "-i" flag.
#   Example:
#     osd-wait-status -o 0 -s up -c 2 -i 3
#   will call "ceph osd tree" once to get the status of osd 0 and if
#   it's not "up" then it will try one more time after 3 seconds.
#
# - a deadline as the maximum interval of time the script is looping
#   waiting for OSDs to match status. The interval between attempts
#   is controlled by the "-i" flag.
#   Example:
#     osd-wait-status -o 0 -s up -d 10 -i 3
#   will call "ceph osd tree" until either osd 0 status is "up" or
#   no more than 10 seconds have passed, that's 3-4 attempts depending
#   on how much time it takes to run "ceph osd tree"
#
# Status match can be reversed by using "-n" flag.
#   Example:
#     osd-wait-status -o 0 -n -s up
#   waits until osd 0 status is NOT up.
#
# osd-wait-status does not allow matching arbitrary combinations of
# OSDs and statuses. For example: "osd 0 up and osd 1 down" is not
# supported.
#
# Return code is 0 if OSDs match expected status before the
# retry count*interval / deadline limits are reached.

import argparse
import json
import logging
import retrying
import subprocess
import sys
import time

logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger('osd-wait-status')

CEPH_BINARY_PATH = '/usr/bin/ceph'
RETRY_INTERVAL_SEC = 1
RETRY_FOREVER = 0
NO_DEADLINE = 0


class OsdException(Exception):
    def __init__(self, message, restartable=False):
        super(OsdException, self).__init__(message)
        self.restartable = restartable


def get_osd_tree():
    command = [CEPH_BINARY_PATH,
               'osd', 'tree', '--format', 'json']
    try:
        p = subprocess.Popen(command,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output, error = p.communicate()
        if p.returncode != 0:
            raise OsdException(
                ('Command failed: command="{}", '
                 'returncode={}, output="{}"').format(
                    ' '.join(command),
                    p.returncode,
                    output, error),
                restartable=True)
    except OSError as e:
        raise OsdException(
            ('Command failed: command="{}", '
             'reason="{}"').format(command, str(e)))
    try:
        return json.loads(output)
    except ValueError as e:
        raise OsdException(
            ('JSON decode failed: '
             'data="{}", error="{}"').format(
                output, e))


def osd_match_status(target_osd, target_status,
                     reverse_logic):
    LOG.info(('Match status: '
              'target_osd={}, '
              'target status={}, '
              'reverse_logic={}').format(
                  target_osd, target_status, reverse_logic))
    tree = get_osd_tree()
    osd_status = {}
    for node in tree.get('nodes'):
        name = node.get('name')
        if name in target_osd:
            osd_status[name] = node.get('status')
            if len(osd_status) == len(target_osd):
                break
    LOG.info('Current OSD(s) status: {}'.format(osd_status))
    for name in target_osd:
        if name not in osd_status:
            raise OsdException(
                ('Unable to retrieve status '
                 'for "{}"').format(
                     name))
        if reverse_logic:
            if osd_status[name] not in target_status:
                del osd_status[name]
        else:
            if osd_status[name] in target_status:
                del osd_status[name]
    if len(osd_status) == 0:
        LOG.info('OSD(s) status target reached.')
        return True
    else:
        LOG.info('OSD(s) {}matching status {}: {}'.format(
            '' if reverse_logic else 'not ',
            target_status,
            osd_status.keys()))
        return False


def osd_wait_status(target_osd, target_status,
                    reverse_logic,
                    retry_count, retry_interval,
                    deadline):

    def retry_if_false(result):
        return (result is False)

    def retry_if_restartable(exception):
        return (isinstance(exception, OsdException)
                and exception.restartable)

    LOG.info(('Wait options: '
              'target_osd={}, '
              'target_status={}, '
              'reverse_logic={}, '
              'retry_count={}, '
              'retry_interval={}, '
              'deadline={}').format(
                  target_osd, target_status, reverse_logic,
                  retry_count, retry_interval, deadline))
    kwargs = {
        'retry_on_result': retry_if_false,
        'retry_on_exception': retry_if_restartable}
    if retry_count != RETRY_FOREVER:
        kwargs['stop_max_attempt_number'] = retry_count
    if deadline != NO_DEADLINE:
        kwargs['stop_max_delay'] = deadline * 1000
    if retry_interval != 0:
        kwargs['wait_fixed'] = retry_interval * 1000
    if not len(target_osd):
        return
    retrying.Retrying(**kwargs).call(
        osd_match_status,
        target_osd, target_status,
        reverse_logic)


def non_negative_interger(value):
    value = int(value)
    if value < 0:
        raise argparse.ArgumentTypeError(
            '{} is a negative integer value'.format(value))
    return value


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Wait for OSD status match')
    parser.add_argument(
        '-o', '--osd',
        nargs='*',
        help='osd id',
        type=non_negative_interger,
        required=True)
    parser.add_argument(
        '-n', '--not',
        dest='reverse_logic',
        help='reverse logic: wait for status NOT to match',
        action='store_true',
        default=False)
    parser.add_argument(
        '-s', '--status',
        nargs='+',
        help='status',
        type=str,
        required=True)
    parser.add_argument(
        '-c', '--retry-count',
        help='retry count',
        type=non_negative_interger,
        default=RETRY_FOREVER)
    parser.add_argument(
        '-i', '--retry-interval',
        help='retry interval (seconds)',
        type=non_negative_interger,
        default=RETRY_INTERVAL_SEC)
    parser.add_argument(
        '-d', '--deadline',
        help='deadline (seconds)',
        type=non_negative_interger,
        default=NO_DEADLINE)
    args = parser.parse_args()
    start = time.time()
    try:
        osd_wait_status(
            ['osd.{}'.format(o) for o in args.osd],
            args.status,
            args.reverse_logic,
            args.retry_count,
            args.retry_interval,
            args.deadline)
        LOG.info('Elapsed time: {:.02f} seconds'.format(
            time.time() - start))
        sys.exit(0)
    except retrying.RetryError as e:
        LOG.warn(
            ('Retry error: {}. '
             'Elapsed time: {:.02f} seconds'.format(
                e, time.time() - start)))
    except OsdException as e:
        LOG.warn(
            ('OSD wait error: {}. '
             'Elapsed time: {:.02f} seconds').format(
                e, time.time() - start))
        sys.exit(1)
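For quick reference, the invocations documented in the script's header, written out as shell commands (assuming the script is on PATH; adjust the path to wherever it is installed):

# Wait for osd 0 to be up, retrying forever at the default 1s interval:
osd-wait-status.py -o 0 -s up
# Wait for osd 0 and osd 1 to be up:
osd-wait-status.py -o 0 1 -s up
# At most 2 attempts, 3 seconds apart:
osd-wait-status.py -o 0 -s up -c 2 -i 3
# Give up after a 10 second deadline, polling every 3 seconds:
osd-wait-status.py -o 0 -s up -d 10 -i 3
# Reverse logic: wait until osd 0 is NOT up:
osd-wait-status.py -o 0 -n -s up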
ceph/ceph/files/stx_git_version (new file, 2 lines)
@@ -0,0 +1,2 @@
656b5b63ed7c43bd014bcafd81b001959d5f089f
v10.2.6