added: new logs collector; some fixes and reviews

This commit is contained in:
adobdin 2016-04-07 11:38:58 +00:00
parent 926a824023
commit 66990e579d
6 changed files with 133 additions and 168 deletions

13
conf.py
View File

@ -11,15 +11,15 @@ class Conf(object):
'vars': 'OPENRC=/root/openrc IPTABLES_STR="iptables -nvL"'}
cluster = None
fuelip = 'localhost'
outdir = '/tmp/timmy-gen/info'
outdir = '/tmp/timmy/info'
timeout = 15
logs_archive = '/tmp/timmy-logs.tar'
rqdir = './rq'
logdir = './info'
compress_timeout = 3600
archives = '/tmp/timmy/archives'
find = {'template': "-name '*.gz' -o -name '*.log' -o -name '*-[0-9]4'",
'path': '/var/log/'}
cmds_archive = ''
log_files = {}
log_files['filter'] = {'default': {'include': '.log', 'exclude': None}}
log_files['path'] = '/var/log/'
def __init__(self, **entries):
self.__dict__.update(entries)
@ -39,6 +39,9 @@ class Conf(object):
except ValueError:
logging.error("Could not convert data")
sys.exit(1)
except yaml.parser.ParserError as e:
logging.error("Could not parse %s:\n%s" %(filename, str(e)))
sys.exit(1)
except:
logging.error("Unexpected error: %s" % sys.exc_info()[0])
sys.exit(1)

View File

@ -3,20 +3,19 @@ ssh:
vars: OPENRC=/root/openrc IPTABLES_STR="iptables -nvL"
fuelip: 127.0.0.1
rqdir: ./rq
logdir: ./logs
outdir: ../timmy-ng/info
soft_filter:
status: ['ready']
timeout: 15
find:
template: -name '*.log'
log_files:
path: /var/log
default: -name '*.log'
by_role:
compute: -name '*.log'
controller: -name '*.log' -name '*.log' -o -name '*.log.1' -o -name '*' ! -path '/var/log/atop*'
mongo: -name '*.log'
ceph: -name '*.log'
by_node_id:
0: -name '*.log' -o -name '*.log.1'
filter:
default:
include: '\.log$'
exclude: 'atop-'
by_role:
compute:
include: '\.log$'
controller:
include: '\.log\.1$'
by_node_id:
0: '\.log$'

234
nodes.py
View File

@ -26,6 +26,7 @@ import os
import logging
import sys
import threading
import re
from tools import *
@ -38,7 +39,7 @@ varlogdir = '/var/log'
class Node(object):
def __init__(self, node_id, mac, cluster, roles, os_platform,
online, status, ip, flogs=False):
online, status, ip):
self.node_id = node_id
self.mac = mac
self.cluster = cluster
@ -50,8 +51,7 @@ class Node(object):
self.files = {}
self.data = {}
self.logsize = 0
# include logs from the command 'find /var/log/ ...'
self.flogs = flogs
self.flogs = {}
self.mapcmds = {}
def set_files(self, dirname, key, ds, version):
@ -82,7 +82,7 @@ class Node(object):
(self.node_id, filename))
if bname[0] == '.':
if self.os_platform in bname:
logging.info('os %s in filename %s' %
logging.debug('os %s in filename %s' %
(self.os_platform, filename))
return True
else:
@ -132,9 +132,9 @@ class Node(object):
with open(dfile, 'w') as df:
df.write(outs)
except:
logging.error("Can't write to file %s" % dfile)
logging.error("exec_cmd: can't write to file %s" % dfile)
def exec_simple_cmd(self, cmd, outfile, sshvars, sshopts, timeout=15, fake=False):
def exec_simple_cmd(self, cmd, infile, outfile, sshvars, sshopts, timeout=15, fake=False):
logging.info('node:%s(%s), exec: %s' % (self.node_id, self.ip, cmd))
if not fake:
outs, errs, code = ssh_node(ip=self.ip,
@ -142,7 +142,8 @@ class Node(object):
sshvars=sshvars,
sshopts=sshopts,
timeout=timeout,
outputfile=outfile)
outputfile=outfile,
inputfile=infile)
if code != 0:
logging.warning("node: %s, ip: %s, cmdfile: %s,"
" code: %s, error message: %s" %
@ -176,14 +177,13 @@ class Node(object):
logging.info("node: %s, ip: %s, size: %s" %
(self.node_id, self.ip, self.logsize))
def get_files(self, label, logdir, sshopts, odir='info', timeout=15):
def get_files(self, label, sshopts, odir='info', timeout=15):
logging.info('node:%s(%s), filelist: %s' %
(self.node_id, self.ip, label))
sn = 'node-%s' % self.node_id
cl = 'cluster-%s' % self.cluster
ddir = os.path.join(odir, label, cl, sn)
mdir(ddir)
# logging.info(self.data)
outs, errs, code = get_files_rsync(ip=self.ip,
data=self.data[label],
sshopts=sshopts,
@ -207,10 +207,27 @@ class Node(object):
logging.debug('node: %s, key: %s, data:\n%s' %
(self.node_id, key, self.data[key]))
def log_size_from_find(self, template, sshopts, odir, timeout=5):
logging.info('template find: %s' % template)
cmd = ("find '%s' -type f \( %s \) -exec du -b {} +" %
(varlogdir, str(template)))
def logs_filter(self, lfilter):
    """Select log files from the cached ``du`` listing with a regex filter.

    ``self.dulogs`` is expected to hold ``du -b`` output, one
    ``<size>\\t<path>`` record per line (it is stored by
    ``log_size_from_find``).  Every path that matches the ``include``
    pattern and does not match the optional ``exclude`` pattern is merged
    into ``self.flogs`` as ``{path: size_in_bytes}``.

    :param lfilter: dict with an ``include`` regex and an optional
        ``exclude`` regex.  A missing or empty ``include`` selects
        nothing; a missing, empty or ``None`` ``exclude`` excludes
        nothing.
    :raises SystemExit: exit code 5 when a pattern is not a valid regex.
    """
    logging.info('logs_filter: node: %s, filter: %s' %
                 (self.node_id, lfilter))
    include = lfilter.get('include')
    exclude = lfilter.get('exclude')
    try:
        # Compile once instead of once per listed file; falsy patterns
        # (absent key, None, '') simply disable that side of the filter.
        include_re = re.compile(include) if include else None
        exclude_re = re.compile(exclude) if exclude else None
    except re.error as e:
        logging.error('logs_include_filter: filter: %s, re.error: %s' %
                      (lfilter, str(e)))
        sys.exit(5)
    flogs = {}
    # dulogs is only assigned after a successful listing, so a node whose
    # listing failed must be treated as having no logs rather than crash.
    dulogs = getattr(self, 'dulogs', None) or ''
    for line in dulogs.splitlines():
        size, sep, path = line.partition("\t")
        if not sep or not path:
            logging.debug("logs_filter: skip malformed line %r" % line)
            continue
        try:
            nbytes = int(size)
        except ValueError:
            logging.debug("logs_filter: skip malformed line %r" % line)
            continue
        # Patterns are applied to the path only, never to the size column.
        selected = include_re is not None and include_re.search(path)
        if selected and exclude_re is not None and exclude_re.search(path):
            selected = False
        if selected:
            flogs[path] = nbytes
        else:
            logging.debug("filter %s by %s" % (line, lfilter))
    self.flogs.update(flogs)
def log_size_from_find(self, path, sshopts, timeout=5):
cmd = ("find '%s' -type f -exec du -b {} +" %(path))
logging.info('log_size_from_find: node: %s, logs du-cmd: %s' % (self.node_id, cmd))
outs, errs, code = ssh_node(ip=self.ip,
command=cmd,
@ -221,15 +238,10 @@ class Node(object):
logging.error("node: %s, ip: %s, command: %s, "
"timeout code: %s, error message: %s" %
(self.node_id, self.ip, cmd, code, errs))
self.logsize = -1
return -1
size = 0
for s in outs.splitlines():
size += int(s.split()[0])
self.logsize = size
logging.info("log size from find: node: %s, ip: %s, size: %s bytes" %
(self.node_id, self.ip, self.logsize))
return self.logsize
return False
self.dulogs = outs
logging.info('log_size_from_find: dulogs: %s' %(self.dulogs))
return True
def print_files(self):
for k in self.files.keys():
@ -266,7 +278,6 @@ class Nodes(object):
self.destdir = destdir
self.get_version()
self.cluster = cluster
self.logdir = conf.logdir
self.extended = extended
logging.info('extended: %s' % self.extended)
if filename is not None:
@ -391,7 +402,7 @@ class Nodes(object):
for role in node.roles:
if role not in roles:
roles.append(role)
logging.info('role: %s, node: %s' %
logging.debug('role: %s, node: %s' %
(role, node.node_id))
node.add_files(self.dirname, key, self.files)
node.exclude_non_os()
@ -433,36 +444,28 @@ class Nodes(object):
t.join()
lock.unlock()
def calculate_log_size(self, template, timeout=15):
label = lkey
threads = []
def filter_logs(self):
for node in self.nodes.values():
if (self.cluster and str(self.cluster) != str(node.cluster) and
node.cluster != 0):
continue
if node.status in self.conf.soft_filter.status and node.online and node.fltemplate:
t = threading.Thread(target=node.du_logs,
args=(label,
self.sshopts,
5,))
threads.append(t)
t.start()
for t in threads:
t.join()
node.logs_filter(self.conf.log_files['filter']['default'])
for role in node.roles:
if ('by_role' in self.conf.log_files['filter'] and
role in self.conf.log_files['filter']['by_role'].keys()
):
node.logs_filter(self.conf.log_files['filter']['by_role'][role])
logging.debug('filter logs: node-%s: filtered logs: %s' %
(node.node_id, node.flogs))
def calculate_log_size(self, timeout=15):
lsize = 0
for node in self.nodes.values():
lsize += node.logsize
logging.info('Full log size on nodes: %s bytes' % lsize)
#fuelnode = self.nodes[self.fuelip]
#if fuelnode.log_size_from_find(template,
# self.sshopts,
# 5) > 0:
# lsize += fuelnode.logsize
for node in self.nodes.values():
if node.fltemplate and node.log_size_from_find(node.fltemplate,
if not node.log_size_from_find(self.conf.log_files['path'],
self.sshopts,
5) > 0:
lsize += node.logsize
5):
logging.warning("can't get log file list from node %s" %node.node_id)
self.filter_logs()
for node in self.nodes.values():
for f in node.flogs:
lsize += node.flogs[f]
logging.info('Full log size on nodes(with fuel): %s bytes' % lsize)
self.alogsize = lsize / 1024
@ -472,16 +475,17 @@ class Nodes(object):
logging.error("Can't get free space: %s" % errs)
return False
fs = int(outs.rstrip('\n'))
logging.info('logsize: %s, free space: %s Kb' % (self.alogsize, fs))
logging.info('logsize: %s Kb, free space: %s Kb' % (self.alogsize, fs))
if (self.alogsize*coefficient > fs):
logging.error('Not enough space on device')
return False
else:
return True
def create_archive_general(self, outdir, outfile, timeout):
cmd = "tar jcf '%s' %s" % (outfile, outdir)
logging.info(cmd)
def create_archive_general(self, directory, outfile, timeout):
cmd = "tar jcf '%s' -C %s %s" % (outfile, directory, ".")
mdir(self.conf.archives)
logging.debug("create_archive_general: cmd: %s" %cmd)
outs, errs, code = ssh_node(ip='localhost',
command=cmd,
sshopts=self.sshopts,
@ -491,27 +495,35 @@ class Nodes(object):
if code != 0:
logging.error("Can't create archive %s" % (errs))
def create_archive_logs(self, outdir, timeout):
#fuelnode = self.nodes[self.fuelip]
def create_log_archives(self, outdir, timeout):
threads = []
txtfl = []
for node in self.nodes.values():
if (self.cluster and str(self.cluster) != str(node.cluster) and
node.cluster != 0):
continue
if node.status in self.conf.soft_filter.status and node.online and node.fltemplate:
if node.status in self.conf.soft_filter.status and node.online:
tstr = ''
cl = 'cluster-%s' % self.cluster
node.archivelogsfile = os.path.join(outdir, 'node-'+str(node.node_id) + '.tar')
node.archivelogsfile = os.path.join(outdir, 'logs-node-'+str(node.node_id) + '.tar.bz2')
mdir(outdir)
logslistfile = node.archivelogsfile + '.txt'
txtfl.append(logslistfile)
try:
with open(logslistfile, 'w') as llf:
for line in node.flogs:
llf.write(line+"\0")
except:
logging.error("create_archive_logs: Can't write to file %s" % logslistfile)
if str(node.node_id) == '0':
tstr = '--transform \\"flags=r;s|^|logs/fuel/|\\"'
cmd = ("find %s -type f \( %s \) -print0 "
"| tar --create %s --file - "
cmd = ("tar --bzip2 --create %s --file - "
"--null --files-from -" %
(node.flpath, node.fltemplate, tstr))
(tstr))
t = threading.Thread(target=node.exec_simple_cmd,
args=(cmd,
logslistfile,
node.archivelogsfile,
self.sshvars,
self.sshopts,
@ -521,6 +533,8 @@ class Nodes(object):
t.start()
for t in threads:
t.join()
for tfile in txtfl:
os.remove(tfile)
def add_logs_archive(self, directory, key, outfile, timeout):
cmd = ("tar --append --file=%s --directory %s %s" %
@ -532,6 +546,16 @@ class Nodes(object):
if code != 2 and code != 0:
logging.warning("stderr from tar: %s" % (errs))
def compress_logs(self, timeout):
    """Compress the per-node log archive of every eligible node.

    A node is eligible when it belongs to the selected cluster (nodes
    with cluster ``0`` are always kept), its status is listed in
    ``self.conf.soft_filter.status`` and it is online.  Archives are
    compressed one after another through ``self.compress_archive``.

    :param timeout: timeout passed to ``compress_archive`` for each file.
    """
    for node in self.nodes.values():
        if (self.cluster and str(self.cluster) != str(node.cluster) and
                node.cluster != 0):
            continue
        if not (node.status in self.conf.soft_filter.status and
                node.online):
            continue
        # archivelogsfile is assigned while the log archive is created;
        # a node that was never archived has nothing to compress.
        archive = getattr(node, 'archivelogsfile', None)
        if not archive:
            logging.warning("compress_logs: node %s has no log archive" %
                            getattr(node, 'node_id', None))
            continue
        self.compress_archive(archive, timeout)
def compress_archive(self, filename, timeout):
cmd = 'bzip2 -f %s' % filename
outs, errs, code = launch_cmd(command=cmd,
@ -540,18 +564,20 @@ class Nodes(object):
logging.warning("Can't compress archive %s" % (errs))
def set_template_for_find(self):
'''Obsolete'''
for node in self.nodes.values():
node.flpath = self.conf.log_files['path']
node.fltemplate = self.conf.log_files['default']
node.fltemplate = self.conf.log_files['filter']['default']
for role in node.roles:
if role in self.conf.log_files['by_role'].keys():
node.fltemplate = self.conf.log_files['by_role'][role]
logging.info('set_template_for_find: break on role %s' %role)
if role in self.conf.log_files['filter']['by_role'].keys():
node.fltemplate = self.conf.log_files['filter']['by_role'][role]
logging.debug('set_template_for_find: break on role %s' %role)
break
if (self.conf.log_files['by_node_id'] and
node.node_id in self.conf.log_files['by_node_id'].keys()):
if (self.conf.log_files['filter']['by_node_id'] and
node.node_id in self.conf.log_files['filter']['by_node_id'].keys()):
node.fltemplate = self.conf.log_files['by_node_id'][node.node_id]
logging.info('set_template_for_find: node: %s, template: %s' %(node.node_id, node.fltemplate) )
logging.debug('set_template_for_find: node: %s, template: %s' %
(node.node_id, node.fltemplate) )
def get_conf_files(self, odir=fkey, timeout=15):
if fkey not in self.files:
@ -570,7 +596,6 @@ class Nodes(object):
if node.status in self.conf.soft_filter.status and node.online:
t = threading.Thread(target=node.get_files,
args=(label,
self.logdir,
self.sshopts,
odir,
self.timeout,))
@ -581,10 +606,6 @@ class Nodes(object):
lock.unlock()
def get_log_files(self, odir=lkey, timeout=15):
# lock = flock.FLock('/tmp/timmy-logs.lock')
# if not lock.lock():
# logging.warning('Unable to obtain lock, skipping "logs"-part')
# return ''
if lkey not in self.files:
logging.warning("get_log_files: %s directory does not exist" %(lkey))
return
@ -598,7 +619,6 @@ class Nodes(object):
node.online and str(node.node_id) != '0'):
t = threading.Thread(target=node.get_files,
args=(label,
self.logdir,
self.sshopts,
odir,
self.timeout,))
@ -606,7 +626,6 @@ class Nodes(object):
t.start()
for t in threads:
t.join()
# lock.unlock()
def print_nodes(self):
"""print nodes"""
@ -621,70 +640,7 @@ class Nodes(object):
def main(argv=None):
if argv is None:
argv = sys.argv
parser = argparse.ArgumentParser(description='need to add description')
parser.add_argument('-a', '--dest-dir', default='/tmp/',
help='directory with output archive')
parser.add_argument('-f', '--nodes',
help='nodes file', default='nodes.json')
parser.add_argument('-t', '--timeout',
help='timeout for command', type=int, default=15)
parser.add_argument('-l', '--log-dir',
help='log directory', default='./logs/')
parser.add_argument('-o', '--ssh-vars',
help='ssh variables',
default=("OPENRC=/root/openrc "
"IPTABLES_STR=\"iptables -nvL\""))
parser.add_argument('-p', '--ssh-opts',
help='ssh options',
default=("-oConnectTimeout=2 "
"-oStrictHostKeyChecking=no "
"-oUserKnownHostsFile=/dev/null "
"-oLogLevel=error "
"-lroot -oBatchMode=yes"))
parser.add_argument('-r', '--rq-dir',
help='rq directrory', default='./rq')
parser.add_argument('-e', '--extended', default="0",
help='exec once by role cmdfiles')
parser.add_argument('-c', '--cluster', help='cluster id')
parser.add_argument('-i', '--fuel-ip',
help='Fuel admin ip address', default="localhost")
parser.add_argument('-s', '--out-dir', default='info',
help='output directory')
parser.add_argument('-d', '--debug',
help="Print lots of debugging statements",
action="store_const", dest="loglevel",
const=logging.DEBUG,
default=logging.WARNING,)
parser.add_argument('-v', '--verbose',
help="Be verbose",
action="store_const", dest="loglevel",
const=logging.INFO,)
args = parser.parse_args(argv[1:])
logging.basicConfig(level=args.loglevel)
args.extended = args.extended == "1"
nodes = Nodes(filesd=args.rq_dir,
logdir=args.log_dir,
extended=args.extended,
fuelip=args.fuel_ip,
cluster=args.cluster,
sshopts=args.ssh_opts,
sshvars=args.ssh_vars,
timeout=args.timeout,
destdir=args.dest_dir)
# nodes.print_nodes()
nodes.get_node_file_list()
nodes.calculate_log_size(conf.find['template'])
if nodes.is_enough_space():
nodes.get_log_files(args.out_dir)
nodes.launch_ssh(args.out_dir)
nodes.get_conf_files(args.out_dir)
nodes.print_nodes()
return 0
return 0
if __name__ == '__main__':
exit(main(sys.argv))

View File

@ -0,0 +1 @@
su postgres -c 'pg_dumpall --clean'

View File

@ -19,6 +19,7 @@ import argparse
import nodes
import logging
import sys
import os
from conf import Conf
import flock
@ -70,22 +71,25 @@ def main(argv=None):
n.get_node_file_list()
n.launch_ssh(config.outdir)
n.get_conf_files(config.outdir)
n.create_archive_general(config.outdir, '/tmp/timmy-gen.tar.bz2', 60)
n.create_archive_general(config.outdir,
os.path.join(config.archives, 'general.tar.bz2'),
60)
if args.only_logs or args.getlogs:
lock = flock.FLock('/tmp/timmy-logs.lock')
if not lock.lock():
logging.warning('Unable to obtain lock, skipping "logs"-part')
return 1
n.get_node_file_list()
n.set_template_for_find()
n.calculate_log_size(config.find['template'])
#n.set_template_for_find()
n.calculate_log_size()
if n.is_enough_space():
n.get_log_files(config.outdir)
n.create_archive_logs(config.archives,
#n.get_log_files(config.outdir)
n.create_log_archives(config.archives,
config.compress_timeout)
n.add_logs_archive(config.outdir, nodes.lkey,
config.logs_archive, 120)
n.compress_archive(config.logs_archive, config.compress_timeout)
#n.add_logs_archive(config.outdir, nodes.lkey,
# config.logs_archive, 120)
#n.compress_archive(config.logs_archive, config.compress_timeout)
#n.compress_logs(config.compress_timeout)
n.print_nodes()
return 0

View File

@ -75,11 +75,11 @@ def launch_cmd(command, timeout):
def ssh_node(ip, command, sshopts='', sshvars='', timeout=15, filename=None,
outputfile=None, prefix='nice -n 19 ionice -c 3'):
inputfile=None, outputfile=None, prefix='nice -n 19 ionice -c 3'):
if (ip in ['localhost', '127.0.0.1']) or ip.startswith('127.'):
logging.info("skip ssh")
bstr = "%s timeout '%s' %s bash -c " % (
sshvars, timeout, prefix)
bstr = "%s timeout '%s' bash -c " % (
sshvars, timeout)
else:
logging.info("exec ssh")
# base cmd str
@ -89,7 +89,9 @@ def ssh_node(ip, command, sshopts='', sshvars='', timeout=15, filename=None,
cmd = bstr + '"' + prefix + ' ' + command + '"'
else:
cmd = bstr + " '%s bash -s' < '%s'" % (prefix, filename)
# logging.info(cmd)
if inputfile is not None:
cmd = bstr + '"' + prefix + " " + command + '" < ' + inputfile
logging.info("ssh_node: inputfile selected, cmd: %s" %cmd)
if outputfile is not None:
cmd += ' > "' + outputfile + '"'
outs, errs, code = launch_cmd(cmd, timeout)