Merge "Support configurable retries on orchestration layer"

This commit is contained in:
Jenkins 2016-01-26 15:46:57 +00:00 committed by Gerrit Code Review
commit 34b5783b20
7 changed files with 146 additions and 21 deletions

View File

@ -1042,6 +1042,7 @@ class Task(Model):
args = Field(list) args = Field(list)
errmsg = Field(basestring, default=str) errmsg = Field(basestring, default=str)
timelimit = Field(int, default=int) timelimit = Field(int, default=int)
retry = Field(int, default=int)
execution = IndexedField(basestring) execution = IndexedField(basestring)
parents = ParentField(default=list) parents = ParentField(default=list)

View File

@ -40,7 +40,8 @@ def save_graph(graph):
'task_type': graph.node[n].get('type', ''), 'task_type': graph.node[n].get('type', ''),
'args': graph.node[n].get('args', []), 'args': graph.node[n].get('args', []),
'errmsg': graph.node[n].get('errmsg', '') or '', 'errmsg': graph.node[n].get('errmsg', '') or '',
'timelimit': graph.node[n].get('timelimit', 0)}) 'timelimit': graph.node[n].get('timelimit', 0),
'retry': graph.node[n].get('retry', 0)})
graph.node[n]['task'] = t graph.node[n]['task'] = t
for pred in graph.predecessors(n): for pred in graph.predecessors(n):
pred_task = graph.node[pred]['task'] pred_task = graph.node[pred]['task']
@ -49,12 +50,13 @@ def save_graph(graph):
t.save() t.save()
def update_graph(graph): def update_graph(graph, force=False):
for n in graph: for n in graph:
task = graph.node[n]['task'] task = graph.node[n]['task']
task.status = graph.node[n]['status'] task.status = graph.node[n]['status']
task.errmsg = graph.node[n]['errmsg'] or '' task.errmsg = graph.node[n]['errmsg'] or ''
task.save() task.retry = graph.node[n].get('retry', 0)
task.save(force=force)
def set_states(uid, tasks): def set_states(uid, tasks):
@ -79,7 +81,8 @@ def get_graph(uid):
target=t.target or None, target=t.target or None,
errmsg=t.errmsg or None, errmsg=t.errmsg or None,
task=t, task=t,
timelimit=t.timelimit or 0) timelimit=t.timelimit or 0,
retry=t.retry)
for u in t.parents.all_names(): for u in t.parents.all_names():
dg.add_edge(u, t.name) dg.add_edge(u, t.name)
return dg return dg

View File

@ -21,6 +21,7 @@ task can be scheduled for execution if it is not yet visited, and state
not in SKIPPED, INPROGRESS not in SKIPPED, INPROGRESS
PENDING - task that is scheduled to be executed PENDING - task that is scheduled to be executed
ERROR_RETRY - task was in error but will be re-executed
ERROR - visited node, but failed, can be failed by timeout ERROR - visited node, but failed, can be failed by timeout
SUCCESS - visited node, successfull SUCCESS - visited node, successfull
INPROGRESS - task already scheduled, can be moved to ERROR or SUCCESS INPROGRESS - task already scheduled, can be moved to ERROR or SUCCESS
@ -30,7 +31,9 @@ NOOP - task wont be executed, but should be treated as visited
from enum import Enum from enum import Enum
states = Enum('States', 'SUCCESS ERROR NOOP INPROGRESS SKIPPED PENDING') states = Enum(
'States',
'SUCCESS ERROR NOOP INPROGRESS SKIPPED PENDING ERROR_RETRY')
VISITED = (states.SUCCESS.name, states.NOOP.name) VISITED = (states.SUCCESS.name, states.NOOP.name)
BLOCKED = (states.INPROGRESS.name, states.SKIPPED.name, states.ERROR.name) BLOCKED = (states.INPROGRESS.name, states.SKIPPED.name, states.ERROR.name)

View File

@ -18,7 +18,9 @@ from solar.core.log import log
from solar.dblayer.locking import Lock from solar.dblayer.locking import Lock
from solar.orchestration import graph from solar.orchestration import graph
from solar.orchestration import limits from solar.orchestration import limits
from solar.orchestration.traversal import states
from solar.orchestration.traversal import traverse from solar.orchestration.traversal import traverse
from solar.orchestration.traversal import VISITED
from solar.orchestration.workers import base from solar.orchestration.workers import base
from solar.utils import get_current_ident from solar.utils import get_current_ident
@ -33,7 +35,7 @@ class Scheduler(base.Worker):
tasks = traverse(dg) tasks = traverse(dg)
filtered_tasks = list(limits.get_default_chain( filtered_tasks = list(limits.get_default_chain(
dg, dg,
[t for t in dg if dg.node[t]['status'] == 'INPROGRESS'], [t for t in dg if dg.node[t]['status'] == states.INPROGRESS.name],
tasks)) tasks))
return filtered_tasks return filtered_tasks
@ -45,8 +47,8 @@ class Scheduler(base.Worker):
for task_name in rst: for task_name in rst:
task_id = '{}:{}'.format(dg.graph['uid'], task_name) task_id = '{}:{}'.format(dg.graph['uid'], task_name)
task_type = dg.node[task_name]['type'] task_type = dg.node[task_name]['type']
dg.node[task_name]['status'] = 'INPROGRESS'
timelimit = dg.node[task_name].get('timelimit', 0) timelimit = dg.node[task_name].get('timelimit', 0)
dg.node[task_name]['status'] = states.INPROGRESS.name
ctxt = { ctxt = {
'task_id': task_id, 'task_id': task_id,
'task_name': task_name, 'task_name': task_name,
@ -68,10 +70,28 @@ class Scheduler(base.Worker):
with Lock(plan_uid, str(get_current_ident()), retries=20, wait=1): with Lock(plan_uid, str(get_current_ident()), retries=20, wait=1):
dg = graph.get_graph(plan_uid) dg = graph.get_graph(plan_uid)
for n in dg: for n in dg:
if dg.node[n]['status'] == 'PENDING': if dg.node[n]['status'] in (
dg.node[n]['status'] = 'SKIPPED' states.PENDING.name, states.PENDING_RETRY.name):
dg.node[n]['status'] = states.SKIPPED.name
graph.update_graph(dg) graph.update_graph(dg)
def _handle_update(self, plan, task_name, status, errmsg=''):
old_status = plan.node[task_name]['status']
if old_status in VISITED:
return
retries_count = plan.node[task_name]['retry']
if status == states.ERROR.name and retries_count > 0:
retries_count -= 1
status = states.ERROR_RETRY.name
log.debug('Retry task %s in plan, retries left %s',
task_name, plan.graph['uid'], retries_count)
else:
plan.node[task_name]['end_time'] = time.time()
plan.node[task_name]['status'] = status
plan.node[task_name]['errmsg'] = errmsg
plan.node[task_name]['retry'] = retries_count
def update_next(self, ctxt, status, errmsg): def update_next(self, ctxt, status, errmsg):
log.debug( log.debug(
'Received update for TASK %s - %s %s', 'Received update for TASK %s - %s %s',
@ -79,15 +99,13 @@ class Scheduler(base.Worker):
plan_uid, task_name = ctxt['task_id'].rsplit(':', 1) plan_uid, task_name = ctxt['task_id'].rsplit(':', 1)
with Lock(plan_uid, str(get_current_ident()), retries=20, wait=1): with Lock(plan_uid, str(get_current_ident()), retries=20, wait=1):
dg = graph.get_graph(plan_uid) dg = graph.get_graph(plan_uid)
dg.node[task_name]['status'] = status self._handle_update(dg, task_name, status, errmsg=errmsg)
dg.node[task_name]['errmsg'] = errmsg
dg.node[task_name]['end_time'] = time.time()
rst = self._next(dg) rst = self._next(dg)
for task_name in rst: for task_name in rst:
task_id = '{}:{}'.format(dg.graph['uid'], task_name) task_id = '{}:{}'.format(dg.graph['uid'], task_name)
task_type = dg.node[task_name]['type'] task_type = dg.node[task_name]['type']
dg.node[task_name]['status'] = 'INPROGRESS'
timelimit = dg.node[task_name].get('timelimit', 0) timelimit = dg.node[task_name].get('timelimit', 0)
dg.node[task_name]['status'] = states.INPROGRESS.name
ctxt = { ctxt = {
'task_id': task_id, 'task_id': task_id,
'task_name': task_name, 'task_name': task_name,
@ -111,7 +129,7 @@ class SchedulerCallbackClient(object):
self.client = client self.client = client
def update(self, ctxt, result, *args, **kwargs): def update(self, ctxt, result, *args, **kwargs):
self.client.update_next(ctxt, 'SUCCESS', '') self.client.update_next(ctxt, states.SUCCESS.name, '')
def error(self, ctxt, result, *args, **kwargs): def error(self, ctxt, result, *args, **kwargs):
self.client.update_next(ctxt, 'ERROR', repr(result)) self.client.update_next(ctxt, states.ERROR.name, repr(result))

View File

@ -30,7 +30,7 @@ def tasks_worker():
@pytest.fixture @pytest.fixture
def tasks_for_scheduler(tasks_worker, address): def tasks_for_scheduler(request, tasks_worker, address):
address = address + 'tasks' address = address + 'tasks'
executor = zerorpc_executor.Executor(tasks_worker, address) executor = zerorpc_executor.Executor(tasks_worker, address)
gevent.spawn(executor.run) gevent.spawn(executor.run)
@ -38,8 +38,8 @@ def tasks_for_scheduler(tasks_worker, address):
@pytest.fixture @pytest.fixture
def scheduler(tasks_for_scheduler, address): def scheduler(tasks_for_scheduler, scheduler_address):
address = address + 'scheduler' address = scheduler_address
worker = wscheduler.Scheduler(tasks_for_scheduler) worker = wscheduler.Scheduler(tasks_for_scheduler)
def session_end(ctxt): def session_end(ctxt):

View File

@ -0,0 +1,97 @@
# Copyright 2015 Mirantis, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import time
import gevent
import mock
import pytest
from solar.core.log import log
from solar.dblayer.model import ModelMeta
from solar.orchestration.executors import zerorpc_executor
from solar.orchestration import graph
from solar.orchestration.traversal import states
from solar.orchestration.workers import scheduler as wscheduler
from solar.orchestration.workers import tasks as wtasks
@pytest.fixture
def simple_plan_retries(simple_plan):
simple_plan.node['just_fail']['retry'] = 1
graph.update_graph(simple_plan, force=True)
return simple_plan
@pytest.fixture
def scheduler(request, scheduler_address):
tasks_client = None
if 'tasks' in request.node.fixturenames:
tasks_client = zerorpc_executor.Client(
request.getfuncargvalue('tasks_address'))
worker = wscheduler.Scheduler(tasks_client)
def session_end(ctxt):
log.debug('Session end ID %s', id(gevent.getcurrent()))
ModelMeta.session_end()
def session_start(ctxt):
log.debug('Session start ID %s', id(gevent.getcurrent()))
ModelMeta.session_start()
worker.for_all.before(session_start)
worker.for_all.after(session_end)
executor = zerorpc_executor.Executor(worker, scheduler_address)
gevent.spawn(executor.run)
return worker, zerorpc_executor.Client(scheduler_address)
@pytest.fixture
def tasks(request, tasks_address):
worker = wtasks.Tasks()
executor = zerorpc_executor.Executor(worker, tasks_address)
if 'scheduler' in request.node.fixturenames:
scheduler_client = wscheduler.SchedulerCallbackClient(
zerorpc_executor.Client(request.getfuncargvalue(
'scheduler_address')))
worker.for_all.on_success(scheduler_client.update)
worker.for_all.on_error(scheduler_client.error)
gevent.spawn(executor.run)
return worker, zerorpc_executor.Client(tasks_address)
def test_retry_just_fail(scheduler, tasks, simple_plan_retries):
timeout = 3
plan = simple_plan_retries
worker, client = scheduler
tracer = mock.Mock()
worker.for_all.on_success(tracer.update)
def wait_function(timeout):
for summary in graph.wait_finish(plan.graph['uid'], timeout):
assert summary[states.ERROR.name] <= 1
time.sleep(0.5)
return summary
client.next({}, plan.graph['uid'])
waiter = gevent.spawn(wait_function, timeout)
waiter.join(timeout=timeout)
assert len(tracer.update.call_args_list) == 4
for call in tracer.update.call_args_list[2:]:
args, _ = call
ctxt, rst, status, msg = args
assert ctxt['task_name'] == 'just_fail'
assert status == states.ERROR.name

View File

@ -62,7 +62,8 @@ def test_wait_finish(simple):
'NOOP': 0, 'NOOP': 0,
'ERROR': 0, 'ERROR': 0,
'INPROGRESS': 0, 'INPROGRESS': 0,
'PENDING': 0 'PENDING': 0,
'ERROR_RETRY': 0,
} }
@ -76,7 +77,8 @@ def test_several_updates(simple):
'NOOP': 0, 'NOOP': 0,
'ERROR': 1, 'ERROR': 1,
'INPROGRESS': 0, 'INPROGRESS': 0,
'PENDING': 1 'PENDING': 1,
'ERROR_RETRY': 0,
} }
simple.node['echo_stuff']['status'] = states.ERROR.name simple.node['echo_stuff']['status'] = states.ERROR.name
@ -88,5 +90,6 @@ def test_several_updates(simple):
'NOOP': 0, 'NOOP': 0,
'ERROR': 2, 'ERROR': 2,
'INPROGRESS': 0, 'INPROGRESS': 0,
'PENDING': 0 'PENDING': 0,
'ERROR_RETRY': 0,
} }