# ![ricolin](/assets/img/avatar_default.png)
# A scenario for self-healing and auto-scaling with Heat, Mistral, Zaqar,
# and Aodh.
# Change-Id: I652c3b0e0caa433bfd08c0cb35b21507f0897889
# 200 lines, 5.3 KiB, YAML
# Heat Orchestration Template for one load-balanced server. Besides the
# server itself, the resources section declares a pool member, a floating IP,
# Aodh event alarms, a Zaqar queue, and a Mistral workflow.
heat_template_version: rocky

description: A load-balancer server

# Inputs supplied by the caller. Only root_stack_id declares a default, so
# every other parameter must be provided.
parameters:
  image:
    type: string
    description: Image used for servers
  key_name:
    type: string
    description: SSH key to connect to the servers
  flavor:
    type: string
    description: flavor used by the servers
  security_group:
    type: string
    description: security_group used by the web servers
  pool_id:
    type: string
    description: Pool to contact
  user_data:
    type: string
    description: Server user_data
  # Passed unchanged to the `metadata` property of the `server` resource.
  metadata:
    type: json
  network:
    type: string
    description: Network used by the server
  subnet:
    type: string
    description: Subnet used by the server
  external_network:
    type: string
    description: UUID or Name of a Neutron external network
  # Leaving this at the empty default makes the `is_standalone` condition
  # true, and this stack's own id is then used as the root stack id.
  root_stack_id:
    type: string
    default: ""

conditions:
  # True when root_stack_id is the empty string (its default). The
  # alarm_subscription resource uses this to decide which stack id it hands
  # to the workflow as root_stack_id.
  is_standalone: {equals: [{get_param: root_stack_id}, ""]}

resources:
  # Script-type software config whose body is read from nginx-script.sh.
  # It declares two inputs (host, version) and one output (result).
  config:
    type: OS::Heat::SoftwareConfig
    properties:
      group: script
      inputs:
        - name: host
        - name: version
      outputs:
        - name: result
      config:
        get_file: nginx-script.sh

  # Applies `config` to `server`, feeding it the server's first address and
  # a fixed version string.
  deployment:
    type: OS::Heat::SoftwareDeployment
    properties:
      config:
        get_resource: config
      server:
        get_resource: server
      input_values:
        host: {get_attr: [server, first_address]}
        version: "v1.0.0"

  # The compute instance, built entirely from template parameters.
  server:
    type: OS::Nova::Server
    properties:
      flavor: {get_param: flavor}
      security_groups: [{get_param: security_group}]
      image: {get_param: image}
      key_name: {get_param: key_name}
      metadata: {get_param: metadata}
      user_data: {get_param: user_data}
      user_data_format: SOFTWARE_CONFIG
      networks:
        - {network: {get_param: network}}

  # Registers the server's first address on port 80 in the given pool.
  member:
    # Alternative resource type, left disabled:
    # type: OS::Neutron::LBaaS::PoolMember
    type: OS::Octavia::PoolMember
    properties:
      pool: {get_param: pool_id}
      address: {get_attr: [server, first_address]}
      protocol_port: 80
      subnet: {get_param: subnet}

  # Binds `floating_ip` to the port of the server's first address on the
  # `network` parameter's network.
  server_floating_ip_assoc:
    type: OS::Neutron::FloatingIPAssociation
    properties:
      floatingip_id: {get_resource: floating_ip}
      port_id: {get_attr: [server, addresses, {get_param: network}, 0, port]}

  floating_ip:
    type: OS::Neutron::FloatingIP
    properties:
      floating_network: {get_param: external_network}

  # Queue listed in `alarm_queues` by all three event alarms below and read
  # by `alarm_subscription`.
  alarm_queue:
    type: OS::Zaqar::Queue

  # Matches compute.instance.update events for this server with state
  # "stopped".
  stop_event_alarm:
    type: OS::Aodh::EventAlarm
    properties:
      event_type: compute.instance.update
      query:
        - field: traits.instance_id
          value: {get_resource: server}
          op: eq
        - field: traits.state
          value: stopped
          op: eq
      alarm_queues:
        - {get_resource: alarm_queue}

  # Matches compute.instance.update events for this server with state
  # "error".
  error_event_alarm:
    type: OS::Aodh::EventAlarm
    properties:
      event_type: compute.instance.update
      query:
        - field: traits.instance_id
          value: {get_resource: server}
          op: eq
        - field: traits.state
          value: error
          op: eq
      alarm_queues:
        - {get_resource: alarm_queue}

  # Matches any compute.instance.delete.* event for this server.
  deleted_event_alarm:
    type: OS::Aodh::EventAlarm
    properties:
      event_type: compute.instance.delete.*
      query:
        - field: traits.instance_id
          value: {get_resource: server}
          op: eq
      alarm_queues:
        - {get_resource: alarm_queue}

  # The Aodh event alarm does not take effect immediately; it may take up to
  # 60s (by default) for the event_alarm_cache_ttl to expire and the tenant's
  # alarm data to be loaded. This resource ensures the stack is not completed
  # until the alarm is active. See https://bugs.launchpad.net/aodh/+bug/1651273
  alarm_cache_wait:
    type: OS::Heat::TestResource
    properties:
      action_wait_secs:
        create: 60
        update: 60
      # Referencing all three alarms here makes this resource depend on them.
      value:
        list_join:
          - ''
          - - {get_attr: [stop_event_alarm, show]}
            - {get_attr: [error_event_alarm, show]}
            - {get_attr: [deleted_event_alarm, show]}

  # Connects `alarm_queue` to the `autoheal` workflow and supplies the
  # workflow's two inputs.
  alarm_subscription:
    type: OS::Zaqar::MistralTrigger
    properties:
      queue_name: {get_resource: alarm_queue}
      workflow_id: {get_resource: autoheal}
      input:
        stack_id: {get_param: "OS::stack_id"}
        # Standalone: this stack is its own root. Otherwise use the
        # root_stack_id parameter passed in by the caller.
        root_stack_id:
          if:
            - is_standalone
            - {get_param: "OS::stack_id"}
            - {get_param: "root_stack_id"}

  autoheal:
    type: OS::Mistral::Workflow
    properties:
      description: >
        Mark a server as unhealthy and commence a stack update to replace it.
      # Explicit nulls: no default value is given for either input here
      # (presumably both must be supplied by the trigger; confirm).
      input:
        stack_id: null
        root_stack_id: null
      type: direct
      tasks:
        # Step 1: mark the resource unhealthy in stack_id. The resource name
        # is the third element of the notification trait whose first element
        # is 'instance_id'.
        - name: resources_mark_unhealthy
          action:
            list_join:
              - ' '
              - - heat.resources_mark_unhealthy
                - stack_id=<% $.stack_id %>
                - resource_name=<% env().notification.body.reason_data.event.traits.where($[0] = 'instance_id').select($[2]).first() %>
                - mark_unhealthy=true
                - resource_status_reason='Marked by alarm'
          on_success:
            - stacks_update
        # Step 2: update the root stack (existing=true) so the resource
        # marked unhealthy gets replaced.
        - name: stacks_update
          action: heat.stacks_update stack_id=<% $.root_stack_id %> existing=true

# Values exposed to whoever creates this stack.
outputs:
  server_ip:
    description: IP Address of the load-balanced server.
    value: {get_attr: [server, first_address]}
  lb_member:
    description: LB member details.
    value: {get_attr: [member, show]}