91 lines
5.0 KiB
Markdown
Executable File
91 lines
5.0 KiB
Markdown
Executable File
## Testing node evacuation
|
|
|
|
```sh
|
|
# create guest VM
|
|
cd;source ~/overcloudrc
|
|
openstack server create --image cirros-0.5.1 --flavor m1.small --network internal test-failover
|
|
openstack server list -c Name -c Status
|
|
|
|
+---------------+--------+
|
|
| Name | Status |
|
|
+---------------+--------+
|
|
| test-failover | ACTIVE |
|
|
+---------------+--------+
|
|
|
|
# find the compute node that the guest VM is running upon
|
|
openstack server show test-failover -f json | jq -r '."OS-EXT-SRV-ATTR:host"'
|
|
overcloud-novacomputeiha-3.localdomain
|
|
|
|
# login to the compute node hosting the guest VM, crash the host
|
|
cd;source ~/stackrc
|
|
ssh heat-admin@overcloud-novacomputeiha-3.ctlplane.localdomain
|
|
sudo su -
|
|
echo c > /proc/sysrq-trigger
|
|
# this terminal will fail after a few minutes, the dashboard console view of the guest VM will hang
|
|
# node hard poweroff will achieve the same effect
|
|
|
|
# check nova services
|
|
cd;source ~/overcloudrc
|
|
nova service-list
|
|
|
|
| 0ad301e3-3420-4d5d-a2fb-2f00ba80a00f | nova-compute | overcloud-novacomputeiha-3.localdomain | nova | disabled | down | 2022-05-19T11:49:40.000000 | - | True |
|
|
|
|
# check guest VM is still running, after a few minutes it should be running on another compute node
|
|
openstack server list -c Name -c Status
|
|
openstack server show test-failover -f json | jq -r .status
|
|
# VM instance has not yet been registered as being on a down compute node
|
|
ACTIVE
|
|
# OpenStack has detected a down compute node and is moving the instance; "rebuilding" refers to the QEMU domain - the VM itself is not rebuilt and the active OS state is preserved
|
|
REBUILDING
|
|
# if you see an error state, either the IPMI interfaces cannot be contacted by the controllers or there is a storage migration issue; check with 'openstack server show test-failover'
|
|
ERROR
|
|
# you probably won't see this unless you recover from an ERROR state with 'openstack server stop test-failover'
|
|
SHUTOFF
|
|
|
|
# check VM instance is on a new node
|
|
openstack server show test-failover -f json | jq -r '."OS-EXT-SRV-ATTR:host"'
|
|
overcloud-novacomputeiha-1.localdomain
|
|
|
|
# Provided the compute node comes back up, you should see it automatically rejoin the cluster
|
|
# If it does not rejoin the cluster try a reboot and wait a good 10 minutes
|
|
# If a node still does not come back you will have to remove it and redeploy from the undercloud - hassle
|
|
nova service-list
|
|
| 1be7bc8f-2769-4986-ac5e-686859779bca | nova-compute | overcloud-novacomputeiha-0.localdomain | nova | enabled | up | 2022-05-19T12:03:27.000000 | - | False |
|
|
| 0ad301e3-3420-4d5d-a2fb-2f00ba80a00f | nova-compute | overcloud-novacomputeiha-3.localdomain | nova | enabled | up | 2022-05-19T12:03:28.000000 | - | False |
|
|
| c8d3cfd8-d639-49a2-9520-5178bc5a426b | nova-compute | overcloud-novacomputeiha-2.localdomain | nova | enabled | up | 2022-05-19T12:03:26.000000 | - | False |
|
|
| 3c918b5b-36a6-4e63-b4de-1b584171a0c0 | nova-compute | overcloud-novacomputeiha-1.localdomain | nova | enabled | up | 2022-05-19T12:03:27.000000 | - | False |
|
|
```
|
|
|
|
Other commands to assist in debugging failover behaviour.
|
|
|
|
> https://access.redhat.com/documentation/en-us/red_hat_openstack_platform/16.2/html/command_line_interface_reference/server#server_migrate # great CLI reference
|
|
> https://docs.openstack.org/nova/rocky/admin/evacuate.html # older reference, prefer openstack CLI commands that act as a wrapper to nova CLI
|
|
|
|
```sh
|
|
# test that the controller nodes can run ipmitool against the compute nodes
|
|
ipmitool -I lanplus -H 10.0.9.45 -p 2000 -U USERID -P PASSW0RD chassis status
|
|
|
|
# list physical nodes
|
|
openstack host list
|
|
nova hypervisor-list
|
|
|
|
# list VMs, get compute node for an instance
|
|
openstack server list
|
|
openstack server list -c Name -c Status
|
|
nova list
|
|
openstack server show <server> -f json | jq -r '."OS-EXT-SRV-ATTR:host"'
|
|
|
|
# if a VM instance gets stuck in a power on/off state and you can't evacuate it from a failed node, issue 'openstack server stop <server>'
|
|
nova reset-state --active <server> # set to active state even if it was in error state
|
|
nova reset-state --all-tenants # seems to set node back to error state if it was in active state but failed and powered off
|
|
nova stop [--all-tenants] <server>
|
|
openstack server stop <server> # new command line reference method, puts node in poweroff state, use for ERROR in migration
|
|
|
|
# evacuate single VM server instance to a different compute node
|
|
# not preferred, older command syntax for direct nova service control
|
|
nova evacuate <server> overcloud-novacomputeiha-3.localdomain # moves VM - pauses but doesn't shut down
|
|
nova evacuate --on-shared-storage test-1 overcloud-novacomputeiha-0.localdomain # live migration
|
|
# preferred openstack CLI native commands
|
|
openstack server migrate --live-migration <server> # moves VM - pauses but doesn't shut down, state is preserved (presumably this only works owing to ceph/shared storage)
|
|
openstack server migrate --shared-migration <server> # requires manual confirmation in web console, stops/starts VM, state not preserved
|
|
```