#!/bin/bash
################################################################################
#
# Copyright (c) 2016 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
################################################################################
#
# Description: Displays memory usage information to check for memory leaks.
#
# Behaviour  : The script takes a list of commands whose processes you want
#              monitored, finds their process IDs, reads each process's current
#              Resident Set Size (RSS) using ps, and looks up its Proportional
#              Set Size (PSS) in /proc/<pid>/smaps. Only the initial process
#              run by the system is monitored; child processes and other
#              instances of these processes are ignored.
#
# This script is to be run on a controller node and must be run with sudo
# privileges, otherwise it may not have access to /proc/<pid>/smaps for each
# of the desired processes.
#
# The script should be run with the following options:
#     sudo ./memchk -t [#] --C [commands]
# -t is followed by the period, in seconds, at which the script repeats its
# sampling. If no time is specified the default is 3600 seconds (1 hour).
# All arguments following --C are treated as commands to be monitored.
# Each command should be separated by a space.
#     e.g. sudo ./memchk -t 1800 --C command1 command2 command3
#
# Error logs can be found in /tmp/memchk_err.log
# Standard output can be found in /tmp/memchk_out.log
#
################################################################################

# Global state. Every array below is indexed by process ID (PID).
changeP=()      # Accumulated sample-to-sample change in PSS
changeR=()      # Accumulated sample-to-sample change in the tracked RSS value
rss=()          # Tracked RSS value; raised whenever a larger sample is read
firstP=()       # PSS read on the first sample of the PID
lastP=()        # PSS read on the previous sample ("NA" before a second sample exists)
firstR=()       # RSS read on the first sample of the PID
lastR=()        # Previous tracked RSS value ("NA" before a second sample exists)
leaking=()      # Command name of every PID flagged as a possible leak in the current pass
leak=()         # Marker ("X" or empty) shown in the 'Leak' column for the current pass
leakFlag=0      # Set to 1 when at least one PID was flagged in the current pass
flag=0          # Set to 1 once the first sampling pass has completed
period=0        # Seconds between sampling passes (-t option)
hours=0         # Number of completed sampling passes; used when calculating the rate of change
s1=()           # Holds behaviour of most recent sample (does not change if present behaviour continues)
s2=()           # Holds type of the previously observed behaviour that is different from the present behaviour
commands=()     # Commands to monitor (arguments following --C)
trend=()        # Number of samples counted towards establishing a trend
baseline=()     # Sum of all RSS values for a given PID
count=()        # Number of times RSS values have been sampled for a given PID
increasing=()   # Samples above the baseline average while a trend is being established
decreasing=()   # Samples below the baseline average while a trend is being established
stable=()       # Samples equal to the baseline average
pattern=()      # Stores a string indicating the present pattern

# Signal handler: reports on stderr that a signal was received and terminates
# the script (which also ends the otherwise endless sampling loop).
function trapCalled {
    printf '\n%s\n' 'Received trap signal' >&2
    exit
}

# Route hang-up, interrupt (Ctrl-C) and terminate requests through the handler.
trap trapCalled SIGHUP SIGINT SIGTERM

# Prints the usage text on stdout and exits the script with status 0.
# Takes no arguments.
function helpMessage {
    # Quoted delimiter: the text is emitted literally, nothing is expanded.
    cat <<'EOF'
--------------------------------------------------------------------------------------
Memory Leak And Information Tracking Tool

Usage:

sudo ./memchk.sh [-t seconds] --C [commands]

 -t ... time in seconds with which to run this script. No time
 specified will result in the default of 3600s (1 hour).
 --C ... space delimited list of commands to monitor. This option
 must be the last one entered.
 --help | -h ... this info

Note: This script must be run using sudo. If it is not, access to the memory information
 of a given process may not be allowed by the system. PSS info is obtained from
 /proc/<pid>/smaps

 Error logs can be found in /tmp/memchk_err.log
 Standard output can be found in /tmp/memchk_out.log


Examples:

sudo memchk -t 60 --C mtcClient mtcAgent ... Check PSS and RSS values of the processes belonging to mtcClient
 and mtcAgent every 60 seconds (1 minute)
sudo memchk -t 3600 --C pmond rmond hwmond ... Check PSS and RSS values of pmond, rmond and hwmond every 3600s (1h)
sudo memchk --C pmond rmond hwmond ... Check PSS and RSS values of commands using default period of 3600s (1h)
--------------------------------------------------------------------------------------
EOF
    exit 0
}

# Prints information on suspected leaking processes.
#
# Writes to stderr one line per entry of the global 'leaking' array (keyed by
# PID, value is the command name), framed by two rows of 150 asterisks. The
# RSS/PSS figures are read from the global rss, firstR, lastP and firstP arrays.
# Any arguments passed are ignored.
function memLeak {
    local proc
    local banner
    # "*"{1..150} expands to 150 words; %0.1s keeps the first character of each.
    banner=$(printf '%0.1s' "*"{1..150})

    {
        printf "\n"
        printf '%s' "$banner"
        # Iterates over all keys (PIDs) in the array.
        for proc in "${!leaking[@]}"; do
            # Every argument is quoted so an empty value cannot shift the
            # remaining fields; padding that ps may leave in the RSS values
            # is stripped instead of relying on word splitting.
            printf "\nPossible mem leak in: %s PID: %s Current RSS: %s Orig RSS: %s Current PSS: %s Orig PSS: %s\n" \
                "${leaking[proc]}" "$proc" \
                "${rss[proc]//[[:space:]]/}" "${firstR[proc]//[[:space:]]/}" \
                "${lastP[proc]}" "${firstP[proc]}"
        done
        printf '%s' "$banner"
        printf "\n"
    } >&2
}

# Serve a help request before the privilege check: otherwise a non-root user
# asking for -h is refused and told to "use the -h option". Scanning stops at
# --C because everything after it is a command name, not an option.
for arg in "$@"; do
    case "$arg" in
        -h|--help) helpMessage ;;
        --C) break ;;
    esac
done

# PSS figures are read from /proc/<pid>/smaps, hence the requirement for root.
if (( UID != 0 )); then
    echo $'\nWarning: Memchk must be run as \'root\' user to access PSS memory information'
    echo $'Use the -h option for help\n'
    exit 1
fi

# Without arguments there is nothing to monitor.
if (( $# == 0 )); then
    echo $'\nNo commands specified\nPlease try again and enter a command whose memory you would like to monitor'
    echo $'Use the -h option for help\n'
    exit 1
fi

# From here on, duplicate stdout and stderr into log files while still showing
# them on the terminal. The file names match the locations given in the usage
# text (/tmp/memchk_out.log and /tmp/memchk_err.log); tee truncates both files
# at start-up.
exec > >(tee /tmp/memchk_out.log) 2> >(tee /tmp/memchk_err.log >&2)

# Cycles through the command line arguments to make sure valid input was
# received and to assign values correctly.
#
# Arguments: the script's command line.
# Globals written:
#   period   - value following -t (validated separately, after parsing)
#   commands - every argument following --C is appended
# Exits with status 1 on an unknown option, on -t without a value, or on --C
# without any command. -h/--help calls helpMessage.
function parseArgs {
    local key

    while (( $# > 0 )); do
        key="$1"

        case "$key" in
            # To make this more user-friendly, instead of having the user enter the period in seconds, consider using
            # 'shopt -s extglob' and 'if [[ $2 = +([0-9])m ]];' to check if the user entered 15m as a period to indicate
            # 15 minutes etc. Modify the regex for seconds and hours as well, then multiply the value as necessary to
            # convert it into seconds for the script.
            -t)
                # A trailing -t used to leave the period empty without any diagnostic.
                if (( $# < 2 )); then
                    printf "Error: -t requires a period in seconds.\n"
                    printf "Use the -h option for help\n"
                    exit 1
                fi
                period="$2"
                shift
                ;;

            --C)
                shift
                if (( $# == 0 )); then
                    printf "Error: No commands specified.\n"
                    exit 1
                fi
                # --C must be the last option: everything left is a command.
                commands+=("$@")
                return 0
                ;;

            -h|--help)
                helpMessage
                ;;

            *)
                printf "\nUnknown argument passed: %s\n" "$key"
                printf "Use the -h option for help\n"
                exit 1
                ;;
        esac
        shift
    done
}

parseArgs "$@"

# Makes sure period has a positive integer value.
#
# Globals read/written: period
# Anything that is not a positive whole number of seconds (empty, negative,
# zero, fractional, or text such as "15m") is replaced by the default of 3600
# seconds and reported on stdout. Leading zeros are accepted and removed.
function validatePeriod {
    # 10# forces base 10 so a value such as 08 is not rejected as invalid octal.
    if [[ ! "$period" =~ ^[0-9]+$ ]] || (( 10#$period <= 0 )); then
        period=3600
        printf "You have entered an invalid period. Period has been set to 3600 seconds.\n"
        return 0
    fi

    period=$(( 10#$period ))

    # The table always expresses the rate of change in kB/h, whatever the period is, so remind the user of that
    # when samples are taken more often than once an hour.
    if (( period < 3600 )); then
        printf "\nWARNING: You have chosen a period that is less than 1 hour. The rate of change in the table is displayed in kB/h, keep this in mind when reviewing results.\n"
    fi
}

validatePeriod

# Converts an accumulated change into a rate per hour.
#   $1 - accumulated change in kB
#   $2 - number of sampling passes the change was accumulated over
#   $3 - length of one sampling pass in seconds
# Prints the rate with three decimals, or 0.000 when no time has elapsed
# (which also avoids a division by zero).
function ratePerHour {
    awk -v ch="$1" -v n="$2" -v t="$3" \
        'BEGIN { if (n * t > 0) printf "%.3f", (ch / (n * t)) * 3600; else printf "0.000" }'
}

# An empty command list would make the loop below print bare headers forever.
if (( ${#commands[@]} == 0 )); then
    printf "Error: No commands specified.\n"
    exit 1
fi

hours=${hours:-0}   # Number of completed sampling passes
firstPass=()        # Pass number on which each PID was first sampled

# Main sampling loop: once per period, samples RSS and PSS of every monitored
# process, prints one table row per process and reports possible leaks. Runs
# until the script is terminated.
while true; do
    # Prints header for columns
    printf "\n%15s | %8s | Leak | %10s | %13s | %8s | %8s | %8s | %13s | %8s | %8s | %8s | Period: %-${#period}ss\n" \
        "Cmd" "PID" "Trend" "Change in RSS" "RSS" "Orig RSS" "Prev RSS" "Change in PSS" "PSS" "Orig PSS" "Prev PSS" "$period"
    padding=$(printf '%0.1s' "-"{1..180})
    # Prints line of hyphens of variable size depending on the number of characters in period.
    printf '%*.*s' 0 "$((156 + ${#period}))" "$padding"

    # Cycles through each of the originally entered commands. This list does not change.
    for cmd in "${commands[@]}"; do
        # Finds all the PIDs associated with each command (commands may have more than one instance).
        # The number of processes may change on each loop. pgrep prints one numeric PID per line, so
        # splitting its output into words is intended here.
        for pid in $(pgrep -- "$cmd"); do
            # Sums the PSS of every mapping in smaps whose header line matches the command. A mapping starts
            # with an "<start>-<end> ..." address line; its Pss field is looked up by name because the number
            # of fields preceding it differs between kernel versions. Nothing is printed when no mapping
            # matches, which makes the emptiness check below skip the process.
            pss=$(awk -v comm="$cmd" '
                /^[0-9a-f]+-[0-9a-f]+[ \t]/ { inMap = ($0 ~ comm) }
                inMap && $1 == "Pss:"       { sum += $2 }
                END                         { print sum }' "/proc/$pid/smaps")

            # Obtains the RSS value of the indicated process; ps may pad the column, so strip whitespace.
            rssCurrent=$(ps -p "$pid" --no-header -o rss)
            rssCurrent=${rssCurrent//[[:space:]]/}

            # Child processes may exist ephemerally, as a result they may be added to our list of PIDs, but no
            # longer exist when we try to read their associated files in /proc/. This makes sure the file exists
            # and that the parent process is 1. If the parent process ID is not 1 then the process in question is
            # a child process and we do not care about its memory usage (for the purposes of this specific script).
            if [[ -f "/proc/$pid/status" ]]; then
                ppid=$(awk '$1 == "PPid:" {print $2}' "/proc/$pid/status")
                if [[ -n "$ppid" ]] && (( ppid != 1 )); then continue; fi
            fi

            # This checks that neither rssCurrent nor pss have empty values due to a child process being generated
            # and then killed off before its values could be read. Root occasionally generates a child process of
            # one of the monitored commands so the above if-statement doesn't exclude it because the PPID is 1.
            if [[ -z "$rssCurrent" || -z "$pss" ]]; then continue; fi

            lastR[pid]="${rss[pid]}"

            # Sets initial values for PSS and RSS the first time a PID is sampled. This is decided per PID rather
            # than once for the first pass, so a process that shows up later (e.g. after a restart) gets a clean
            # starting point too. NA is set instead of 0 because using numbers could lead to false or inaccurate
            # information.
            if [[ -z "${firstPass[pid]:-}" ]]; then
                firstPass[pid]="$hours"
                firstP[pid]="$pss"
                lastP[pid]="NA"
                rss[pid]="$rssCurrent"
                firstR[pid]="$rssCurrent"
                lastR[pid]="NA"
                changeP[pid]=0
                changeR[pid]=0
                s1[pid]=""
                s2[pid]=""
                trend[pid]=0
                increasing[pid]=0
                decreasing[pid]=0
                stable[pid]=0
                count[pid]=0
                baseline[pid]=0
                leak[pid]=""
            fi

            # In the event of a memory leak (the RSS value increasing), an X is placed in the 'Leak' column of the
            # printed table. The PID of the process is also added to an array that the memLeak function reads once
            # all of the commands' processes have been checked. A flag indicating that a possible leak has been
            # detected is also set.
            # NOTE(review): rss[pid] is only ever raised here, so it tracks the largest RSS seen rather than the
            # latest sample. It can therefore never drop below the running average, which means the 'decreasing'
            # branches further down cannot fire - confirm whether that is intended.
            if (( rss[pid] < rssCurrent )); then
                lastR[pid]="${rss[pid]}"
                rss[pid]="$rssCurrent"
                leaking[pid]="$cmd"
                leak[pid]="X"
                leakFlag=1
            fi

            # Calculates the changes in PSS and RSS usage over time. If this is the first sample of the PID and
            # there is no previous value with which to compare against, delta is set to 0, where delta is the
            # change over time. The elapsed time is counted from the pass on which the PID was first seen.
            elapsed=$(( hours - firstPass[pid] ))

            if [[ "${lastP[pid]}" == "NA" ]]; then
                changeP[pid]=0
                deltaP=0.000
            else
                changeP[pid]=$(( changeP[pid] + pss - lastP[pid] ))
                deltaP=$(ratePerHour "${changeP[pid]}" "$elapsed" "$period")
            fi

            if [[ "${lastR[pid]}" == "NA" ]]; then
                changeR[pid]=0
                deltaR=0.000
            else
                changeR[pid]=$(( changeR[pid] + rss[pid] - lastR[pid] ))
                deltaR=$(ratePerHour "${changeR[pid]}" "$elapsed" "$period")
            fi

            # The block below seeks to determine gradual sustained patterns of RSS usage over time, to tell whether
            # memory usage is gradually increasing throughout the lifespan of the process (possible memory leak) or
            # not. Non-gradual usage changes can be due to dynamic reallocation and such erratic behaviour is not
            # indicative of any overall trend.
            # NOTE: To do this properly across the entire lifespan of the script, consider fitting a least-squares
            # parabola to the (RSS, time) pairs (e.g. with numpy) and using its first derivative as the rate of
            # change. The difficulty lies in passing the data to such a helper and back into this script.
            #-------------------------------------------------------------------------------------------------------
            # The baseline average is the sum of the RSS samples of a PID divided by the number of samples taken.
            (( count[pid] += 1 ))
            (( baseline[pid] += rssCurrent ))
            avg=$(awk -v b="${baseline[pid]}" -v c="${count[pid]}" 'BEGIN {printf "%.0f", (b/c)}')

            if (( trend[pid] >= 3 )); then
                # A trend has been established; check that it is being maintained.
                #  - The trend continues while the same behaviour keeps occurring without the opposite behaviour
                #    happening. A trend cannot change immediately from increasing to decreasing (or back): it must
                #    first change to 'none' so that erratic behaviour is not presented as a long-term pattern.
                #  - While increasing or decreasing, the RSS value may equal the baseline average for two
                #    consecutive samples without the trend being broken. A third consecutive equal sample makes
                #    the trend 'stable'; from then on only 'stable' is permitted, and an increase or decrease
                #    resets everything so that a new trend has to be established from scratch.
                mayRise=0
                if [[ "${s1[pid]}" == "increasing" ]] ||
                    [[ "${s1[pid]}" != "decreasing" && "${s2[pid]}" != "decreasing" ]]; then
                    if (( stable[pid] < 3 )); then mayRise=1; fi
                fi

                mayFall=0
                if [[ "${s1[pid]}" == "decreasing" ]] ||
                    [[ "${s1[pid]}" != "increasing" && "${s2[pid]}" != "increasing" ]]; then
                    if (( stable[pid] < 3 )); then mayFall=1; fi
                fi

                if (( rss[pid] > avg && mayRise )); then
                    if [[ "${s1[pid]}" != "increasing" ]]; then
                        s2[pid]="${s1[pid]}"
                        s1[pid]="increasing"
                    fi
                elif (( rss[pid] == avg )); then
                    if [[ "${s1[pid]}" != "stable" ]]; then
                        stable[pid]=0
                        s2[pid]="${s1[pid]}"
                        s1[pid]="stable"
                    fi
                    # Counts consecutive stable samples: exactly one step per sample.
                    (( stable[pid] += 1 ))
                elif (( rss[pid] < avg && mayFall )); then
                    if [[ "${s1[pid]}" != "decreasing" ]]; then
                        s2[pid]="${s1[pid]}"
                        s1[pid]="decreasing"
                    fi
                else
                    # The trend has been broken: forget it and start over.
                    s1[pid]=""
                    s2[pid]=""
                    trend[pid]=0
                    increasing[pid]=0
                    decreasing[pid]=0
                    stable[pid]=0
                fi
            else
                # No trend yet. Waits for three samples to be counted before a trend is declared; seeing both an
                # increase and a decrease while counting restarts the count. This is to avoid viewing erratic
                # increases and decreases in RSS as gradual changes in the process' RSS usage.
                if (( count[pid] > 0 )); then
                    if (( rss[pid] > avg )); then
                        (( trend[pid] += 1 ))
                        (( increasing[pid] += 1 ))
                        s1[pid]="increasing"
                    elif (( rss[pid] == avg )); then
                        (( trend[pid] += 1 ))
                        (( stable[pid] += 1 ))
                        s1[pid]="stable"
                    elif (( rss[pid] < avg )); then
                        (( trend[pid] += 1 ))
                        (( decreasing[pid] += 1 ))
                        s1[pid]="decreasing"
                    fi

                    if (( increasing[pid] > 0 && decreasing[pid] > 0 )); then
                        increasing[pid]=0
                        decreasing[pid]=0
                        stable[pid]=0
                        trend[pid]=0
                    fi
                fi
            fi

            # Sets the trend variable for printing if a trend exists
            if (( trend[pid] >= 3 )); then pattern[pid]="${s1[pid]}"; else pattern[pid]="none"; fi

            # Every field is quoted so that an empty value (e.g. the leak marker) cannot shift the columns.
            printf "\n%15s | %8s | %2s | %10s | %8s kB/h | %8s | %8s | %8s | %8s kB/h | %8s | %8s | %8s |" \
                "$cmd" "$pid" "${leak[pid]}" "${pattern[pid]}" "$deltaR" "${rss[pid]}" "${firstR[pid]}" \
                "${lastR[pid]}" "$deltaP" "$pss" "${firstP[pid]}" "${lastP[pid]}"

            lastP[pid]="$pss"
            leak[pid]=""    # Resets the indicator in the 'Leak' column
        done
    done

    # Reports the flagged processes, if any; memLeak reads the global 'leaking' array.
    if (( leakFlag == 1 )); then memLeak; fi
    leaking=()              # Clear the array holding PIDs of processes with potential leaks
    leakFlag=0
    (( hours += 1 ))        # Pass counter used in calculating delta
    flag=1                  # The first pass has completed (initial values are now decided per PID, see above)
    echo $'\n'
    sleep "$period"
done