timmy/nodes.py

647 lines
27 KiB
Python

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""
main module
"""
import argparse
import flock
import json
import os
import logging
import sys
import threading
import re
from tools import *
ckey = 'cmds'
fkey = 'files'
lkey = 'logs'
varlogdir = '/var/log'
class Node(object):
def __init__(self, node_id, mac, cluster, roles, os_platform,
online, status, ip):
self.node_id = node_id
self.mac = mac
self.cluster = cluster
self.roles = roles
self.os_platform = os_platform
self.online = online
self.status = status
self.ip = ip
self.files = {}
self.data = {}
self.logsize = 0
self.flogs = {}
self.mapcmds = {}
def set_files(self, dirname, key, ds, version):
files = []
for role in self.roles:
if 'by-role' in ds[key] and role in ds[key]['by-role'].keys():
for f in ds[key]['by-role'][role]:
files += [os.path.join(dirname, key, 'by-role', role, f)]
if (('release-'+version in ds[key].keys()) and
(role in ds[key]['release-'+version].keys())):
for f in ds[key]['release-'+version][role]:
files += [os.path.join(dirname, key,
'release-'+version, role, f)]
if 'by-os' in ds[key]:
for f in ds[key]['by-os'][self.os_platform].keys():
files += [os.path.join(dirname, key, 'by-os',
self.os_platform, f)]
if 'default' in ds[key] and 'default' in ds[key]['default']:
for f in ds[key]['default']['default'].keys():
files += [os.path.join(dirname, key, 'default', 'default', f)]
self.files[key] = sorted(set(files))
logging.debug('set_files:\nkey: %s, node: %s, file_list: %s' %
(key, self.node_id, self.files[key]))
def checkos(self, filename):
bname = str(os.path.basename(filename))
logging.debug('check os: node: %s, filename %s' %
(self.node_id, filename))
if bname[0] == '.':
if self.os_platform in bname:
logging.debug('os %s in filename %s' %
(self.os_platform, filename))
return True
else:
return False
return True
def exclude_non_os(self):
for key in self.files.keys():
self.files[key] = [f for f in self.files[key] if self.checkos(f)]
def add_files(self, dirname, key, ds):
for role in self.roles:
if 'once-by-role' in ds[key] and role in ds[key]['once-by-role'].keys():
for f in ds[key]['once-by-role'][role]:
self.files[key] += [os.path.join(dirname, key,
'once-by-role', role, f)]
self.files[key] = sorted(set(self.files[key]))
logging.debug('add files:\nnode: %s, key: %s, files:\n%s' %
(self.node_id, key, self.files[key]))
def exec_cmd(self, label, sshvars, sshopts, odir='info', timeout=15, fake=False):
sn = 'node-%s' % self.node_id
cl = 'cluster-%s' % self.cluster
logging.debug('%s/%s/%s/%s' % (odir, label, cl, sn))
ddir = os.path.join(odir, label, cl, sn)
mdir(ddir)
for f in self.files[label]:
logging.info('node:%s(%s), exec: %s' % (self.node_id, self.ip, f))
if not fake:
outs, errs, code = ssh_node(ip=self.ip,
filename=f,
sshvars=sshvars,
sshopts=sshopts,
timeout=timeout,
command=''
)
if code != 0:
logging.error("node: %s, ip: %s, cmdfile: %s,"
" code: %s, error message: %s" %
(self.node_id, self.ip, f, code, errs))
dfile = os.path.join(ddir, 'node-%s-%s-%s' %
(self.node_id, self.ip, os.path.basename(f)))
logging.info('outfile: %s' % dfile)
self.mapcmds[os.path.basename(f)] = dfile
if not fake:
try:
with open(dfile, 'w') as df:
df.write(outs)
except:
logging.error("exec_cmd: can't write to file %s" % dfile)
def exec_simple_cmd(self, cmd, infile, outfile, sshvars, sshopts, timeout=15, fake=False):
logging.info('node:%s(%s), exec: %s' % (self.node_id, self.ip, cmd))
if not fake:
outs, errs, code = ssh_node(ip=self.ip,
command=cmd,
sshvars=sshvars,
sshopts=sshopts,
timeout=timeout,
outputfile=outfile,
inputfile=infile)
if code != 0:
logging.warning("node: %s, ip: %s, cmdfile: %s,"
" code: %s, error message: %s" %
(self.node_id, self.ip, cmd, code, errs))
def du_logs(self, label, sshopts, odir='info', timeout=15):
logging.info('node:%s(%s), filelist: %s' %
(self.node_id, self.ip, label))
cmd = 'du -b %s' % self.data[label].replace('\n', ' ')
logging.info('node: %s, logs du-cmd: %s' % (self.node_id, cmd))
outs, errs, code = ssh_node(ip=self.ip,
command=cmd,
sshopts=sshopts,
sshvars='',
timeout=timeout)
if code != 0:
logging.warning("node: %s, ip: %s, cmdfile: %s, "
"code: %s, error message: %s" %
(self.node_id, self.ip, label, code, errs))
if code == 124:
logging.error("node: %s, ip: %s, command: %s, "
"timeout code: %s, error message: %s" %
(self.node_id, self.ip, label, code, errs))
# mark node as offline
self.online = False
if self.online:
size = 0
for s in outs.splitlines():
size += int(s.split()[0])
self.logsize = size
logging.info("node: %s, ip: %s, size: %s" %
(self.node_id, self.ip, self.logsize))
def get_files(self, label, sshopts, odir='info', timeout=15):
logging.info('node:%s(%s), filelist: %s' %
(self.node_id, self.ip, label))
sn = 'node-%s' % self.node_id
cl = 'cluster-%s' % self.cluster
ddir = os.path.join(odir, label, cl, sn)
mdir(ddir)
outs, errs, code = get_files_rsync(ip=self.ip,
data=self.data[label],
sshopts=sshopts,
dpath=ddir,
timeout=timeout)
if code != 0:
logging.warning("get_files: node: %s, ip: %s, label: %s, "
"code: %s, error message: %s" %
(self.node_id, self.ip, label, code, errs))
def get_data_from_files(self, key):
self.data[key] = ''
for fname in self.files[key]:
try:
with open(fname, 'r') as dfile:
self.data[key] += '\n'+"".join(line for line in dfile
if (not line.isspace() and
line[0] != '#'))
except:
logging.error('could not read file: %s' % fname)
logging.debug('node: %s, key: %s, data:\n%s' %
(self.node_id, key, self.data[key]))
def logs_filter(self, lfilter):
flogs = {}
logging.info('logs_filter: node: %s, filter: %s' %(self.node_id, lfilter))
for f in self.dulogs.splitlines():
try:
if (('include' in lfilter and re.search(lfilter['include'], f)) and
('exclude' in lfilter and not re.search(lfilter['exclude'], f))
):
flogs[f.split("\t")[1]] = int(f.split("\t")[0])
else:
logging.debug("filter %s by %s" %(f, lfilter))
except re.error as e:
logging.error('logs_include_filter: filter: %s, str: %s, re.error: %s' %
(lfilter, f, str(e)))
sys.exit(5)
#logging.debug('logs_include_filter: %s, filter: %s' %(flogs, lfilter))
self.flogs.update(flogs)
def log_size_from_find(self, path, sshopts, timeout=5):
cmd = ("find '%s' -type f -exec du -b {} +" %(path))
logging.info('log_size_from_find: node: %s, logs du-cmd: %s' % (self.node_id, cmd))
outs, errs, code = ssh_node(ip=self.ip,
command=cmd,
sshopts=sshopts,
sshvars='',
timeout=timeout)
if code == 124:
logging.error("node: %s, ip: %s, command: %s, "
"timeout code: %s, error message: %s" %
(self.node_id, self.ip, cmd, code, errs))
return False
self.dulogs = outs
logging.info('log_size_from_find: dulogs: %s' %(self.dulogs))
return True
def print_files(self):
for k in self.files.keys():
print('key: %s' % (k))
for f in self.files[k]:
print(f)
print('\n')
def __str__(self):
if self.status in ['ready', 'discover'] and self.online:
my_id = self.node_id
else:
my_id = '#' + str(self.node_id)
templ = '{0} {1.cluster} {1.ip} {1.mac} {1.os_platform} '
templ += '{2} {1.online} {1.status}'
return templ.format(my_id, self, ','.join(self.roles))
class Nodes(object):
"""Class nodes """
def __init__(self, cluster, extended, conf, destdir, filename=None):
self.dirname = conf.rqdir.rstrip('/')
if (not os.path.exists(self.dirname)):
logging.error("directory %s doesn't exist" % (self.dirname))
sys.exit(1)
self.files = get_dir_structure(conf.rqdir)[os.path.basename(self.dirname)]
self.fuelip = conf.fuelip
self.sshopts = conf.ssh['opts']
self.sshvars = conf.ssh['vars']
self.timeout = conf.timeout
self.conf = conf
self.destdir = destdir
self.get_version()
self.cluster = cluster
self.extended = extended
logging.info('extended: %s' % self.extended)
if filename is not None:
try:
with open(filename, 'r') as json_data:
self.njdata = json.load(json_data)
except:
logging.error("Can't load data from file %s" % filename)
sys.exit(6)
else:
self.njdata = json.loads(self.get_nodes())
self.load_nodes()
def get_nodes(self):
fuel_node_cmd = 'fuel node list --json'
nodes_json, err, code = ssh_node(ip=self.fuelip,
command=fuel_node_cmd,
sshopts=self.sshopts,
sshvars='DUMMY=""',
timeout=self.timeout,
filename=None)
if code != 0:
logging.error("Can't get fuel node list %s" % err)
sys.exit(4)
return nodes_json
def pass_hard_filter(self, node):
if self.conf.hard_filter:
if self.conf.hard_filter.status and (node.status not in self.conf.hard_filter.status):
logging.info("hard filter by status: excluding node-%s" % node.node_id)
return False
if isinstance(self.conf.hard_filter.online, bool) and (bool(node.online) != bool(self.conf.hard_filter.online)):
logging.info("hard filter by online: excluding node-%s" % node.node_id)
return False
if self.conf.hard_filter.node_ids and ((int(node.node_id) not in self.conf.hard_filter.node_ids) and (str(node.node_id) not in self.conf.hard_filter.node_ids)):
logging.info("hard filter by ids: excluding node-%s" % node.node_id)
return False
if self.conf.hard_filter.roles:
ok_roles = []
for role in node.roles:
if role in self.conf.hard_filter.roles:
ok_roles.append(role)
if not ok_roles:
logging.info("hard filter by roles: excluding node-%s" % node.node_id)
return False
return True
def load_nodes(self):
node = Node(node_id=0,
cluster=0,
mac='n/a',
os_platform='centos',
roles=['fuel'],
status='ready',
online=True,
ip=self.fuelip)
self.nodes = {}
if self.pass_hard_filter(node):
self.nodes = {self.fuelip: node}
for node in self.njdata:
node_roles = node.get('roles')
if not node_roles:
roles = ['None']
elif isinstance(node_roles, list):
roles = node_roles
else:
roles = str(node_roles).split(', ')
node_ip = str(node['ip'])
keys = "cluster mac os_platform status online".split()
params = {'node_id': node['id'],
'roles': roles,
'ip': node_ip}
for key in keys:
params[key] = node[key]
nodeobj = Node(**params)
if self.pass_hard_filter(nodeobj):
self.nodes[node_ip] = nodeobj
def get_version(self):
cmd = "awk -F ':' '/release/ {print \$2}' /etc/nailgun/version.yaml"
release, err, code = ssh_node(ip=self.fuelip,
command=cmd,
sshopts=self.sshopts,
sshvars='',
timeout=self.timeout,
filename=None)
if code != 0:
logging.error("Can't get fuel version %s" % err)
sys.exit(3)
self.version = release.rstrip('\n').strip(' ').strip('"')
logging.info('release:%s' % (self.version))
def get_release(self):
cmd = "awk -F ':' '/fuel_version/ {print \$2}' /etc/astute.yaml"
for node in self.nodes.values():
# skip master
if node.node_id == 0:
node.release = self.version
if (node.node_id != 0) and ( node.status == 'ready'):
release, err, code = ssh_node(ip=node.ip,
command=cmd,
sshopts=self.sshopts,
sshvars='',
timeout=self.timeout,
filename=None)
if code != 0:
logging.warning("get_release: node: %s: Can't get node release" % (node.node_id))
node.release = self.version
continue
node.release = release.strip('\n "\'')
logging.info("get_release: node: %s, release: %s" % (node.node_id, node.release))
def get_node_file_list(self):
for key in self.files.keys():
# ### case
roles = []
for node in self.nodes.values():
node.set_files(self.dirname, key, self.files, self.version)
# once-by-role functionality
if self.extended and key == ckey and node.online:
for role in node.roles:
if role not in roles:
roles.append(role)
logging.debug('role: %s, node: %s' %
(role, node.node_id))
node.add_files(self.dirname, key, self.files)
node.exclude_non_os()
if key == ckey:
logging.info('node: %s, os: %s, key: %s, files: %s' %
(node.node_id,
node.os_platform,
key,
node.files[key]))
for key in [fkey, lkey]:
if key in self.files.keys():
for node in self.nodes.values():
node.get_data_from_files(key)
for node in self.nodes.values():
logging.debug('%s' % node.files[ckey])
def launch_ssh(self, odir='info', timeout=15, fake=False):
lock = flock.FLock('/tmp/timmy-cmds.lock')
if not lock.lock():
logging.warning('Unable to obtain lock, skipping "cmds"-part')
return ''
label = ckey
threads = []
for node in self.nodes.values():
if (self.cluster and str(self.cluster) != str(node.cluster) and
node.cluster != 0):
continue
if node.status in self.conf.soft_filter.status and node.online:
t = threading.Thread(target=node.exec_cmd,
args=(label,
self.sshvars,
self.sshopts,
odir,
self.timeout,
fake))
threads.append(t)
t.start()
for t in threads:
t.join()
lock.unlock()
def filter_logs(self):
for node in self.nodes.values():
node.logs_filter(self.conf.log_files['filter']['default'])
for role in node.roles:
if ('by_role' in self.conf.log_files['filter'] and
role in self.conf.log_files['filter']['by_role'].keys()
):
node.logs_filter(self.conf.log_files['filter']['by_role'][role])
logging.debug('filter logs: node-%s: filtered logs: %s' %
(node.node_id, node.flogs))
def calculate_log_size(self, timeout=15):
lsize = 0
for node in self.nodes.values():
if not node.log_size_from_find(self.conf.log_files['path'],
self.sshopts,
5):
logging.warning("can't get log file list from node %s" %node.node_id)
self.filter_logs()
for node in self.nodes.values():
for f in node.flogs:
lsize += node.flogs[f]
logging.info('Full log size on nodes(with fuel): %s bytes' % lsize)
self.alogsize = lsize / 1024
def is_enough_space(self, coefficient=2.2):
outs, errs, code = free_space(self.destdir, timeout=1)
if code != 0:
logging.error("Can't get free space: %s" % errs)
return False
fs = int(outs.rstrip('\n'))
logging.info('logsize: %s Kb, free space: %s Kb' % (self.alogsize, fs))
if (self.alogsize*coefficient > fs):
logging.error('Not enough space on device')
return False
else:
return True
def create_archive_general(self, directory, outfile, timeout):
cmd = "tar jcf '%s' -C %s %s" % (outfile, directory, ".")
mdir(self.conf.archives)
logging.debug("create_archive_general: cmd: %s" %cmd)
outs, errs, code = ssh_node(ip='localhost',
command=cmd,
sshopts=self.sshopts,
sshvars='',
timeout=timeout,
outputfile=outfile)
if code != 0:
logging.error("Can't create archive %s" % (errs))
def create_log_archives(self, outdir, timeout):
threads = []
txtfl = []
for node in self.nodes.values():
if (self.cluster and str(self.cluster) != str(node.cluster) and
node.cluster != 0):
continue
if node.status in self.conf.soft_filter.status and node.online:
tstr = ''
cl = 'cluster-%s' % self.cluster
node.archivelogsfile = os.path.join(outdir, 'logs-node-'+str(node.node_id) + '.tar.bz2')
mdir(outdir)
logslistfile = node.archivelogsfile + '.txt'
txtfl.append(logslistfile)
try:
with open(logslistfile, 'w') as llf:
for line in node.flogs:
llf.write(line+"\0")
except:
logging.error("create_archive_logs: Can't write to file %s" % logslistfile)
if str(node.node_id) == '0':
tstr = '--transform \\"flags=r;s|^|logs/fuel/|\\"'
cmd = ("tar --bzip2 --create %s --file - "
"--null --files-from -" %
(tstr))
t = threading.Thread(target=node.exec_simple_cmd,
args=(cmd,
logslistfile,
node.archivelogsfile,
self.sshvars,
self.sshopts,
timeout)
)
threads.append(t)
t.start()
for t in threads:
t.join()
for tfile in txtfl:
os.remove(tfile)
def add_logs_archive(self, directory, key, outfile, timeout):
cmd = ("tar --append --file=%s --directory %s %s" %
(outfile, directory, key))
outs, errs, code = ssh_node(ip='localhost', command=cmd,
sshopts=self.sshopts,
sshvars='',
timeout=timeout)
if code != 2 and code != 0:
logging.warning("stderr from tar: %s" % (errs))
def compress_logs(self, timeout):
threads = []
for node in self.nodes.values():
if (self.cluster and str(self.cluster) != str(node.cluster) and
node.cluster != 0):
continue
if node.status in self.conf.soft_filter.status and node.online:
self.compress_archive(node.archivelogsfile, timeout)
def compress_archive(self, filename, timeout):
cmd = 'bzip2 -f %s' % filename
outs, errs, code = launch_cmd(command=cmd,
timeout=timeout)
if code != 0:
logging.warning("Can't compress archive %s" % (errs))
def set_template_for_find(self):
'''Obsolete'''
for node in self.nodes.values():
node.flpath = self.conf.log_files['path']
node.fltemplate = self.conf.log_files['filter']['default']
for role in node.roles:
if role in self.conf.log_files['filter']['by_role'].keys():
node.fltemplate = self.conf.log_files['filter']['by_role'][role]
logging.debug('set_template_for_find: break on role %s' %role)
break
if (self.conf.log_files['filter']['by_node_id'] and
node.node_id in self.conf.log_files['filter']['by_node_id'].keys()):
node.fltemplate = self.conf.log_files['by_node_id'][node.node_id]
logging.debug('set_template_for_find: node: %s, template: %s' %
(node.node_id, node.fltemplate) )
def get_conf_files(self, odir=fkey, timeout=15):
if fkey not in self.files:
logging.warning("get_conf_files: %s directory does not exist" %(fkey))
return
lock = flock.FLock('/tmp/timmy-files.lock')
if not lock.lock():
logging.warning('Unable to obtain lock, skipping "files"-part')
return ''
label = fkey
threads = []
for node in self.nodes.values():
if (self.cluster and str(self.cluster) != str(node.cluster) and
node.cluster != 0):
continue
if node.status in self.conf.soft_filter.status and node.online:
t = threading.Thread(target=node.get_files,
args=(label,
self.sshopts,
odir,
self.timeout,))
threads.append(t)
t.start()
for t in threads:
t.join()
lock.unlock()
def get_log_files(self, odir=lkey, timeout=15):
if lkey not in self.files:
logging.warning("get_log_files: %s directory does not exist" %(lkey))
return
label = lkey
threads = []
for node in self.nodes.values():
if (self.cluster and str(self.cluster) != str(node.cluster) and
node.cluster != 0):
continue
if (node.status in self.conf.soft_filter.status and
node.online and str(node.node_id) != '0'):
t = threading.Thread(target=node.get_files,
args=(label,
self.sshopts,
odir,
self.timeout,))
threads.append(t)
t.start()
for t in threads:
t.join()
def print_nodes(self):
"""print nodes"""
print('#node-id, cluster, admin-ip, mac, os, roles, online, status')
for node in sorted(self.nodes.values(), key=lambda x: x.node_id):
if (self.cluster and
(str(self.cluster) != str(node.cluster)) and
node.cluster != 0):
print("#"+str(node))
else:
print(str(node))
def main(argv=None):
return 0
if __name__ == '__main__':
exit(main(sys.argv))