swift/bin/swift-drive-audit
Andy McCrae 463da7e170 Adds Error Handling to swift-drive-audit for missing or unreadable /var/log/kern.log
Fixes Bug 1049081

Change-Id: If977080350cc5cdb6bc633b6af7d3c490ed23d46
2012-09-11 16:23:32 +00:00

138 lines
5.0 KiB
Python
Executable File

#!/usr/bin/env python
# Copyright (c) 2010-2012 OpenStack, LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import os
import re
import subprocess
import sys
from ConfigParser import ConfigParser
from swift.common.utils import get_logger
# To search for more types of errors, add the regex to the list below
error_re = [
re.compile(r'\berror\b.*\b(sd[a-z]{1,2}\d?)\b'),
re.compile(r'\b(sd[a-z]{1,2}\d?)\b.*\berror\b'),
]
def get_devices(device_dir, logger):
devices = []
for line in open('/proc/mounts').readlines():
data = line.strip().split()
block_device = data[0]
mount_point = data[1]
if mount_point.startswith(device_dir):
device = {}
device['mount_point'] = mount_point
device['block_device'] = block_device
try:
device_num = os.stat(block_device).st_rdev
except OSError, e:
# If we can't stat the device, then something weird is going on
logger.error("Error: Could not stat %s!" %
block_device)
continue
device['major'] = str(os.major(device_num))
device['minor'] = str(os.minor(device_num))
devices.append(device)
for line in open('/proc/partitions').readlines()[2:]:
major, minor, blocks, kernel_device = line.strip().split()
device = [d for d in devices
if d['major'] == major and d['minor'] == minor]
if device:
device[0]['kernel_device'] = kernel_device
return devices
def get_errors(minutes):
errors = {}
start_time = datetime.datetime.now() - datetime.timedelta(minutes=minutes)
try:
for line in open('/var/log/kern.log'):
if '[ 0.000000]' in line:
# Ignore anything before the last boot
errors = {}
continue
log_time_string = '%s %s' % (start_time.year,
' '.join(line.split()[:3]))
log_time = datetime.datetime.strptime(
log_time_string, '%Y %b %d %H:%M:%S')
if log_time > start_time:
for err in error_re:
for device in err.findall(line):
errors[device] = errors.get(device, 0) + 1
return errors
except IOError:
logger.error("Error: Unable to open /var/log/kern.log")
print("Unable to open /var/log/kern.log")
sys.exit(1)
def comment_fstab(mount_point):
with open('/etc/fstab', 'r') as fstab:
with open('/etc/fstab.new', 'w') as new_fstab:
for line in fstab:
parts = line.split()
if len(parts) > 2 and line.split()[1] == mount_point:
new_fstab.write('#' + line)
else:
new_fstab.write(line)
os.rename('/etc/fstab.new', '/etc/fstab')
if __name__ == '__main__':
c = ConfigParser()
try:
conf_path = sys.argv[1]
except Exception:
print "Usage: %s CONF_FILE" % sys.argv[0].split('/')[-1]
sys.exit(1)
if not c.read(conf_path):
print "Unable to read config file %s" % conf_path
sys.exit(1)
conf = dict(c.items('drive-audit'))
device_dir = conf.get('device_dir', '/srv/node')
minutes = int(conf.get('minutes', 60))
error_limit = int(conf.get('error_limit', 1))
conf['log_name'] = conf.get('log_name', 'drive-audit')
logger = get_logger(conf, log_route='drive-audit')
devices = get_devices(device_dir, logger)
logger.debug("Devices found: %s" % str(devices))
if not devices:
logger.error("Error: No devices found!")
errors = get_errors(minutes)
logger.debug("Errors found: %s" % str(errors))
unmounts = 0
for kernel_device, count in errors.items():
if count >= error_limit:
device = \
[d for d in devices if d['kernel_device'] == kernel_device]
if device:
mount_point = device[0]['mount_point']
if mount_point.startswith(device_dir):
logger.info("Unmounting %s with %d errors" %
(mount_point, count))
subprocess.call(['umount', '-fl', mount_point])
logger.info("Commenting out %s from /etc/fstab" %
(mount_point))
comment_fstab(mount_point)
unmounts += 1
if unmounts == 0:
logger.info("No drives were unmounted")