diff --git a/CHANGES.txt b/CHANGES.txt index 113724a..d722a03 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -4,8 +4,25 @@ Release notes 1.4.1 (unreleased) ------------------ -- Nothing changed yet. +- Live migration: fix crash due to failed heartbeats not propagating. +- Tests: reduce test flakiness. +- Introduce `pytest-patterns` as a replacement for the current + use of Ellipsis. + +- Increase general test timeouts so tests are a bit less flaky. + +- Improve some lock logging to make it easier to understand and + debug when reading the locks. This also fixes some race + conditions. + + This also removes a file descriptor leak that was found in testing. + +- Fix VM kill handling to include our little supervisor process and + selectively decide whether to kill it or not. + +- Add more testing output about the environment in case something goes + wrong in the tests and we need to debug it. 1.4.0 (2023-10-12) ------------------ diff --git a/pytest.ini b/pytest.ini index 7c0c157..fde24ec 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,5 +1,5 @@ [pytest] -addopts = --tb=native --timeout=3 --cov=src --cov-report=html --junitxml=/tmp/fc.qemu-report.xml +addopts = --tb=native --timeout=20 --cov=src --cov-report=html --junitxml=/tmp/fc.qemu-report.xml markers = slow: mark test as slow. 
diff --git a/setup.py b/setup.py index 12bafef..fe212ce 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,9 @@ "setuptools", "structlog>=16.1.0", ], + tests_require=[ + "pytest-patterns", + ], entry_points={ "console_scripts": [ "fc-qemu = fc.qemu.main:main", diff --git a/src/fc/qemu/agent.py b/src/fc/qemu/agent.py index fc021a0..028e1cf 100644 --- a/src/fc/qemu/agent.py +++ b/src/fc/qemu/agent.py @@ -203,22 +203,24 @@ def locked_func(self, *args, **kw): mode="nonblocking", ) return os.EX_TEMPFAIL + self._lock_count += 1 self.log.debug( - "acquire-lock", target=self.lockfile, result="locked" + "acquire-lock", + target=self.lockfile, + result="locked", + count=self._lock_count, ) - - self._lock_count += 1 - self.log.debug("lock-status", count=self._lock_count) try: return f(self, *args, **kw) finally: self._lock_count -= 1 - self.log.debug("lock-status", count=self._lock_count) - + self.log.debug( + "release-lock", + target=self.lockfile, + count=self._lock_count, + ) if self._lock_count == 0: - # New try: - self.log.debug("release-lock", target=self.lockfile) fcntl.flock(self._lockfile_fd, fcntl.LOCK_UN) self.log.debug( "release-lock", @@ -232,7 +234,8 @@ def locked_func(self, *args, **kw): target=self.lockfile, result="error", ) - pass + os.close(self._lockfile_fd) + self._lockfile_fd = None return locked_func @@ -1005,7 +1008,7 @@ def ensure_(self): self.log.error( "inconsistent-state", action="destroy", exc_info=True ) - self.qemu.destroy() + self.qemu.destroy(kill_supervisor=True) def ensure_offline(self): if self.qemu.is_running(): @@ -1335,7 +1338,7 @@ def restart(self): def kill(self): self.log.info("kill-vm") timeout = TimeOut(15, interval=1, raise_on_timeout=True) - self.qemu.destroy() + self.qemu.destroy(kill_supervisor=True) while timeout.tick(): if not self.qemu.is_running(): self.log.info("killed-vm") @@ -1396,6 +1399,8 @@ def outmigrate(self): client = Outgoing(self) exitcode = client() if not exitcode: + # XXX I think this can lead to inconsistent 
behaviour + # if the local VM is destroyed during migration? self.consul_deregister() self.log.info("outmigrate-finished", exitcode=exitcode) return exitcode diff --git a/src/fc/qemu/conftest.py b/src/fc/qemu/conftest.py index d57299f..69e7e93 100644 --- a/src/fc/qemu/conftest.py +++ b/src/fc/qemu/conftest.py @@ -2,6 +2,7 @@ import shutil import subprocess import sys +import time import traceback import pkg_resources @@ -38,7 +39,7 @@ def setup_structlog(): from . import util # set to True to temporarily get detailed tracebacks - log_exceptions = False + log_exceptions = True def test_logger(logger, method_name, event): stack = event.pop("stack", None) @@ -119,16 +120,70 @@ def pytest_assertrepr_compare(op, left, right): @pytest.fixture def clean_environment(): def clean(): - subprocess.call("pkill -f qemu", shell=True) + subprocess.call(["pkill", "-f", "supervised-qemu"]) + subprocess.call(["pkill", "-f", "qemu-system-x86_64"]) subprocess.call("rbd rm rbd.ssd/simplevm.swap", shell=True) subprocess.call("rbd snap purge rbd.ssd/simplevm.root", shell=True) subprocess.call("rbd rm rbd.ssd/simplevm.root", shell=True) subprocess.call("rbd rm rbd.ssd/simplevm.tmp", shell=True) + time.sleep(1) clean() + + print( + subprocess.check_output( + "rbd ls rbd.ssd", + shell=True, + ).decode("ascii") + ) + yield clean() + print( + subprocess.check_output( + "free && sync && echo 3 > /proc/sys/vm/drop_caches && free", + shell=True, + ).decode("ascii") + ) + print( + subprocess.check_output( + "ceph df", + shell=True, + ).decode("ascii") + ) + print( + subprocess.check_output( + "ps auxf", + shell=True, + ).decode("ascii") + ) + print( + subprocess.check_output( + "df -h", + shell=True, + ).decode("ascii") + ) + print( + subprocess.check_output( + "rbd showmapped", + shell=True, + ).decode("ascii") + ) + + +@pytest.fixture(autouse=True) +def clean_tmpdir_with_flakefinder(tmpdir, pytestconfig): + """The `tmpdir` is normally not cleaned out for debugging purposes. 
+ + Running with flakefinder causes the tmpdir to quickly grow too big. + So, if flakefinder is active, we clean it out to avoid running out of + disk space. + """ + yield + if pytestconfig.getoption("flake_finder_enable") > 0: + shutil.rmtree(tmpdir) + @pytest.fixture def vm(clean_environment, monkeypatch, tmpdir): @@ -146,7 +201,7 @@ def vm(clean_environment, monkeypatch, tmpdir): vm.qemu.vm_expected_overhead = 128 for snapshot in vm.ceph.root.snapshots: snapshot.remove() - vm.qemu.destroy() + vm.qemu.destroy(kill_supervisor=True) vm.unlock() get_log() yield vm diff --git a/src/fc/qemu/ellipsis.py b/src/fc/qemu/ellipsis.py index a63865a..0a4ef47 100644 --- a/src/fc/qemu/ellipsis.py +++ b/src/fc/qemu/ellipsis.py @@ -3,7 +3,6 @@ class Report(object): - matches = None def __init__(self): @@ -49,7 +48,6 @@ def match(pattern, line): class Ellipsis(object): - # other = other.replace('\t', ' '*8) oder allgemein white-space unsensibel # multi-line support diff --git a/src/fc/qemu/hazmat/qemu.py b/src/fc/qemu/hazmat/qemu.py index 08de25a..c43198c 100644 --- a/src/fc/qemu/hazmat/qemu.py +++ b/src/fc/qemu/hazmat/qemu.py @@ -53,6 +53,7 @@ def detect_current_machine_type( def locked_global(f): LOCK = "/run/fc-qemu.lock" + # This is thread-safe *AS LONG* as every thread uses a separate instance # of the agent. Using multiple file descriptors will guarantee that the # lock can only be held once even within a single process. @@ -85,7 +86,6 @@ def locked(self, *args, **kw): class Qemu(object): - prefix = "" executable = "qemu-system-x86_64" @@ -576,20 +576,43 @@ def graceful_shutdown(self): ], ) - def destroy(self): + def destroy(self, kill_supervisor=False): # We use this destroy command in "fire-and-forget"-style because # sometimes the init script will complain even if we achieve what # we want: that the VM isn't running any longer. We check this # by contacting the monitor instead. 
- timeout = TimeOut(100, interval=1, raise_on_timeout=True) p = self.proc() if not p: return + + # Check whether the parent is the supervising process. + # Kill that one first so we avoid immediate restarts. + if kill_supervisor: + parent = p.parent() + if "supervised-qemu-wrapped" in parent.cmdline()[1]: + # Do not raise on timeout so we get a chance to actually kill + # the VM even if killing the supervisor fails. + timeout = TimeOut(100, interval=2, raise_on_timeout=False) + attempt = 0 + while parent.is_running() and timeout.tick(): + attempt += 1 + self.log.debug( + "vm-destroy-kill-supervisor", attempt=attempt + ) + try: + parent.terminate() + except psutil.NoSuchProcess: + break + + timeout = TimeOut(100, interval=2, raise_on_timeout=True) + attempt = 0 while p.is_running() and timeout.tick(): + attempt += 1 + self.log.debug("vm-destroy-kill-vm", attempt=attempt) try: p.terminate() except psutil.NoSuchProcess: - pass + break def resize_root(self, size): self.qmp.command("block_resize", device="virtio0", size=size) diff --git a/src/fc/qemu/incoming.py b/src/fc/qemu/incoming.py index 32630b8..771c090 100644 --- a/src/fc/qemu/incoming.py +++ b/src/fc/qemu/incoming.py @@ -139,6 +139,9 @@ def run(self): if self.finished == "success": return 0 else: + # Do not kill the supervisor. We will be giving up here, but the + # supervisor will restart the process, potentially with updated + # config data so we get a "free" retry. self.qemu.destroy() return 1 @@ -229,6 +232,9 @@ def destroy(self): log.info("destroying", machine=self.name) self.finished = "destroyed" try: + # Do not kill the supervisor. We will be giving up here, but the + # supervisor will restart the process, potentially with updated + # config data so we get a "free" retry. 
self.qemu.destroy() except QemuNotRunning: pass diff --git a/src/fc/qemu/outgoing.py b/src/fc/qemu/outgoing.py index 10e3528..3963dce 100644 --- a/src/fc/qemu/outgoing.py +++ b/src/fc/qemu/outgoing.py @@ -19,6 +19,7 @@ class Heartbeat(object): """ cookie = None + failed = False def __init__(self, log): self.thread = threading.Thread(target=self.run) @@ -48,13 +49,18 @@ def run(self): self.connection.ping(self.cookie) except Exception: self.log.exception("ping-failed", exc_info=True) + self.failed = True time.sleep(10) finally: self.log.debug("stopped-heartbeat-ping") + def propagate(self): + if self.failed: + self.log.warning("heartbeat-propagate") + raise RuntimeError("Heartbeat failed.") -class Outgoing(object): +class Outgoing(object): migration_exitcode = None target = None cookie = None @@ -209,6 +215,7 @@ def acquire_migration_locks(self): if self.agent.has_new_config(): self.target.cancel(self.cookie) raise ConfigChanged() + self.heartbeat.propagate() # Try to acquire local lock if self.agent.qemu.acquire_migration_lock(): @@ -230,6 +237,7 @@ def acquire_migration_locks(self): self.log.debug( "acquire-remote-migration-lock", result="success" ) + self.heartbeat.propagate() except Exception: self.log.exception( "acquire-remote-migration-lock", @@ -264,6 +272,7 @@ def migrate(self): mbps=mbps, output=pprint.pformat(stat), ) + self.heartbeat.propagate() except Exception: self.log.exception("error-waiting-for-migration", exc_info=True) raise @@ -272,12 +281,13 @@ def migrate(self): assert not status["running"], status assert status["status"] == "postmigrate", status self.log.info("finish-migration") + self.destroy(kill_supervisor=True) try: - self.heartbeat.stop() + self.log.info("finish-remote") self.target.finish_incoming(self.cookie) except Exception: self.log.exception("error-finish-remote", exc_info=True) - self.agent.qemu.destroy() + self.heartbeat.stop() def rescue(self): """Outgoing rescue: try to rescue the remote side first.""" @@ -287,7 +297,7 @@ def 
rescue(self): self.target.rescue(self.cookie) self.target.finish_incoming(self.cookie) self.log.info("rescue-remote-success", action="destroy local") - self.destroy() + self.destroy(kill_supervisor=True) # We managed to rescue on the remote side - hooray! self.migration_exitcode = 0 return @@ -317,6 +327,6 @@ def rescue(self): else: self.log.info("continue-locally", result="success") - def destroy(self): - self.agent.qemu.destroy() + def destroy(self, kill_supervisor=False): + self.agent.qemu.destroy(kill_supervisor) self.agent.qemu.clean_run_files() diff --git a/src/fc/qemu/tests/test_consul.py b/src/fc/qemu/tests/test_consul.py index 182f313..ec00d02 100644 --- a/src/fc/qemu/tests/test_consul.py +++ b/src/fc/qemu/tests/test_consul.py @@ -43,12 +43,21 @@ def test_empty_event(): def test_no_key_event(): stdin = StringIO('[{"ModifyIndex": 123, "Value": ""}]') Agent.handle_consul_event(stdin) - assert util.log_data == [ - "start-consul-events count=1", - "handle-key-failed exc_info=True key=None", - "finish-handle-key key=None", - "finish-consul-events", - ] + + assert ( + Ellipsis( + """\ +start-consul-events count=1 +handle-key-failed key=None +Traceback (most recent call last): + File "/nix/store/...-python3-...-env/lib/python3.8/site-packages/fc/qemu/agent.py", line ..., in handle + log.debug("handle-key", key=event["Key"]) +KeyError: 'Key' +finish-handle-key key=None +finish-consul-events""" + ) + == get_log() + ) @pytest.fixture @@ -260,9 +269,24 @@ def test_snapshot_online_vm(vm): freeze-failed action=continue machine=simplevm reason=Unable to sync with guest agent after 10 tries. snapshot-ignore machine=simplevm reason=not frozen ensure-thawed machine=simplevm volume=root -guest-fsfreeze-thaw-failed exc_info=True machine=simplevm subsystem=qemu +guest-fsfreeze-thaw-failed machine=simplevm subsystem=qemu +Traceback (most recent call last): +... + raise RuntimeError("VM not frozen, not making snapshot.") +RuntimeError: VM not frozen, not making snapshot. 
+ +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): +... + raise ClientError( +fc.qemu.hazmat.guestagent.ClientError: Unable to sync with guest agent after 10 tries. ensure-thawed-failed machine=simplevm reason=Unable to sync with guest agent after 10 tries. -handle-key-failed exc_info=True key=snapshot/7468743 +handle-key-failed key=snapshot/7468743 +Traceback (most recent call last): +... + raise RuntimeError("VM not frozen, not making snapshot.") +RuntimeError: VM not frozen, not making snapshot. finish-consul-events""" ) == get_log() @@ -290,15 +314,14 @@ def test_snapshot_nonexisting_vm(): @pytest.mark.live @pytest.mark.timeout(60) def test_snapshot_offline_vm(vm): - util.test_log_options["show_events"] = ["consul", "snapshot"] - vm.enc["parameters"]["kvm_host"] = "foobar" vm.stage_new_config() vm.activate_new_config() vm.ceph.ensure_root_volume() vm.ensure_offline() - get_log() + + print(get_log()) snapshot = {"vm": "simplevm", "snapshot": "backy-1234"} event = prepare_consul_event("snapshot/7468743", snapshot, 123) @@ -307,8 +330,15 @@ def test_snapshot_offline_vm(vm): get_log() == """\ start-consul-events count=1 +handle-key key=snapshot/7468743 +connect-rados machine=simplevm subsystem=ceph snapshot machine=simplevm snapshot=backy-1234 +acquire-lock machine=simplevm target=/run/qemu.simplevm.lock +acquire-lock count=1 machine=simplevm result=locked target=/run/qemu.simplevm.lock snapshot expected=VM running machine=simplevm +release-lock count=0 machine=simplevm target=/run/qemu.simplevm.lock +release-lock machine=simplevm result=unlocked target=/run/qemu.simplevm.lock +finish-handle-key key=snapshot/7468743 finish-consul-events""" ) diff --git a/src/fc/qemu/tests/vm_test.py b/src/fc/qemu/tests/vm_test.py index d8727be..11c725a 100644 --- a/src/fc/qemu/tests/vm_test.py +++ b/src/fc/qemu/tests/vm_test.py @@ -28,7 +28,7 @@ def clean_output(output): return output -@pytest.mark.timeout(60) 
+@pytest.mark.timeout(90) @pytest.mark.live def test_simple_vm_lifecycle_start_stop(vm): util.test_log_options["show_events"] = ["vm-status", "rbd-status"] @@ -64,8 +64,7 @@ def test_simple_vm_lifecycle_start_stop(vm): assert out == Ellipsis( """\ acquire-lock machine=simplevm target=/run/qemu.simplevm.lock -acquire-lock machine=simplevm result=locked target=/run/qemu.simplevm.lock -lock-status count=1 machine=simplevm +acquire-lock count=1 machine=simplevm result=locked target=/run/qemu.simplevm.lock generate-config machine=simplevm ensure-root machine=simplevm subsystem=ceph create-vm machine=simplevm subsystem=ceph @@ -148,8 +147,7 @@ def test_simple_vm_lifecycle_start_stop(vm): block_set_io_throttle arguments={'device': 'virtio2', 'iops': 10000, 'iops_rd': 0, 'iops_wr': 0, 'bps': 0, 'bps_wr': 0, 'bps_rd': 0} id=None machine=simplevm subsystem=qemu/qmp ensure-watchdog action=none machine=simplevm human-monitor-command arguments={'command-line': 'watchdog_action action=none'} id=None machine=simplevm subsystem=qemu/qmp -lock-status count=0 machine=simplevm -release-lock machine=simplevm target=/run/qemu.simplevm.lock +release-lock count=0 machine=simplevm target=/run/qemu.simplevm.lock release-lock machine=simplevm result=unlocked target=/run/qemu.simplevm.lock""" ) @@ -188,6 +186,11 @@ def test_simple_vm_lifecycle_start_stop(vm): @pytest.mark.timeout(60) @pytest.mark.live def test_simple_vm_lifecycle_ensure_going_offline(vm, capsys, caplog): + print( + subprocess.check_output( + "top -b -n 1 -o %MEM | head -n 20", shell=True + ).decode("ascii") + ) util.test_log_options["show_events"] = [ "vm-status", "rbd-status", @@ -360,6 +363,10 @@ def test_crashed_vm_clean_restart(vm): graceful-shutdown machine=simplevm graceful-shutdown-failed machine=simplevm reason=timeout kill-vm machine=simplevm +vm-destroy-kill-supervisor attempt=1 machine=simplevm subsystem=qemu +vm-destroy-kill-supervisor attempt=2 machine=simplevm subsystem=qemu +vm-destroy-kill-vm attempt=1 
machine=simplevm subsystem=qemu +vm-destroy-kill-vm attempt=2 machine=simplevm subsystem=qemu killed-vm machine=simplevm unlock machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root unlock machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap @@ -379,8 +386,9 @@ def test_crashed_vm_clean_restart(vm): def test_do_not_clean_up_crashed_vm_that_doesnt_get_restarted(vm): vm.ensure() assert vm.qemu.is_running() is True - vm.qemu.proc().kill() - vm.qemu.proc().wait(2) + proc = vm.qemu.proc() + proc.kill() + proc.wait(2) assert vm.ceph.locked_by_me() is True vm.enc["parameters"]["online"] = False vm.enc["consul-generation"] += 1 @@ -428,7 +436,26 @@ def test_vm_snapshot_with_missing_guest_agent(vm, monkeypatch): freeze-failed action=continue machine=simplevm reason=Unable to sync with guest agent after 10 tries. snapshot-ignore machine=simplevm reason=not frozen ensure-thawed machine=simplevm volume=root -guest-fsfreeze-thaw-failed exc_info=True machine=simplevm subsystem=qemu +guest-fsfreeze-thaw-failed machine=simplevm subsystem=qemu +Traceback (most recent call last): + File "/nix/store/...-python3-.../site-packages/fc/qemu/agent.py", line ..., in frozen_vm + yield frozen + File "/nix/store/...-python3-.../site-packages/fc/qemu/agent.py", line ..., in snapshot + raise RuntimeError("VM not frozen, not making snapshot.") +RuntimeError: VM not frozen, not making snapshot. 
+ +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/qemu.py", line ..., in thaw + self._thaw_via_guest_agent() + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/qemu.py", line ..., in _thaw_via_guest_agent + with self.guestagent as guest: + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/guestagent.py", line ..., in __enter__ + self.sync() + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/guestagent.py", line ..., in sync + raise ClientError( +fc.qemu.hazmat.guestagent.ClientError: Unable to sync with guest agent after 10 tries. ensure-thawed-failed machine=simplevm reason=Unable to sync with guest agent after 10 tries.""" ) == get_log() @@ -444,7 +471,26 @@ def test_vm_snapshot_with_missing_guest_agent(vm, monkeypatch): freeze-failed action=continue machine=simplevm reason=... snapshot-ignore machine=simplevm reason=not frozen ensure-thawed machine=simplevm volume=root -guest-fsfreeze-thaw-failed exc_info=True machine=simplevm subsystem=qemu +guest-fsfreeze-thaw-failed machine=simplevm subsystem=qemu +Traceback (most recent call last): + File "/nix/store/...-python3-.../site-packages/fc/qemu/agent.py", line ..., in frozen_vm + yield frozen + File "/nix/store/...-python3-.../site-packages/fc/qemu/agent.py", line ..., in snapshot + raise RuntimeError("VM not frozen, not making snapshot.") +RuntimeError: VM not frozen, not making snapshot. 
+ +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/qemu.py", line ..., in thaw + self._thaw_via_guest_agent() + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/qemu.py", line ..., in _thaw_via_guest_agent + with self.guestagent as guest: + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/guestagent.py", line ..., in __enter__ + self.sync() + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/guestagent.py", line ..., in sync + raise ClientError( +fc.qemu.hazmat.guestagent.ClientError: Unable to sync with guest agent after 10 tries. ensure-thawed-failed machine=simplevm reason=Unable to sync with guest agent after 10 tries.""" ) == get_log() @@ -560,95 +606,116 @@ def _kill_vms(): _kill_vms() -def assert_outmigrate_log(log): - filtered_log = [] - for line in log.split("\n"): - # The heartbeat pings show up in varying places. We need to filter - # those out. Note the leading space to avoid eliminating the - # heartbeat-initialized message. 
- if " heartbeat-ping" in line: - continue - if "stopped-heartbeat-ping" in line: - continue - filtered_log.append(line) - filtered_log = "\n".join(filtered_log) - - assert filtered_log == Ellipsis( - """\ +@pytest.fixture +def outmigrate_pattern(patterns): + outmigrate = patterns.outmigrate + outmigrate.in_order( + """ /nix/store/.../bin/fc-qemu -v outmigrate simplevm load-system-config simplevm connect-rados subsystem='ceph' simplevm acquire-lock target='/run/qemu.simplevm.lock' -simplevm acquire-lock result='locked' target='/run/qemu.simplevm.lock' -simplevm lock-status count=1 +simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' simplevm qmp_capabilities arguments={} id=None subsystem='qemu/qmp' simplevm query-status arguments={} id=None subsystem='qemu/qmp' + simplevm outmigrate simplevm consul-register -simplevm heartbeat-initialized simplevm locate-inmigration-service simplevm check-staging-config result='none' simplevm located-inmigration-service url='http://host2.mgm.test.gocept.net:...' -simplevm started-heartbeat-ping + simplevm acquire-migration-locks simplevm check-staging-config result='none' simplevm acquire-migration-lock result='success' subsystem='qemu' simplevm acquire-local-migration-lock result='success' simplevm acquire-remote-migration-lock simplevm acquire-remote-migration-lock result='success' + simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.root' simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.swap' simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.tmp' + simplevm prepare-remote-environment simplevm start-migration target='tcp:192.168.4.7:...' 
simplevm migrate subsystem='qemu' simplevm migrate-set-capabilities arguments={'capabilities': [{'capability': 'xbzrle', 'state': False}, {'capability': 'auto-converge', 'state': True}]} id=None subsystem='qemu/qmp' simplevm migrate-set-parameters arguments={'compress-level': 0, 'downtime-limit': 4000, 'max-bandwidth': 22500} id=None subsystem='qemu/qmp' simplevm migrate arguments={'uri': 'tcp:192.168.4.7:...'} id=None subsystem='qemu/qmp' + simplevm query-migrate-parameters arguments={} id=None subsystem='qemu/qmp' simplevm migrate-parameters announce-initial=50 announce-max=550 announce-rounds=5 announce-step=100 block-incremental=False compress-level=0 compress-threads=8 compress-wait-thread=True cpu-throttle-increment=10 cpu-throttle-initial=20 cpu-throttle-tailslow=False decompress-threads=2 downtime-limit=4000 max-bandwidth=22500 max-cpu-throttle=99 max-postcopy-bandwidth=0 multifd-channels=2 multifd-compression='none' multifd-zlib-level=1 multifd-zstd-level=1 subsystem='qemu' throttle-trigger-threshold=50 tls-authz='' tls-creds='' tls-hostname='' x-checkpoint-delay=20000 xbzrle-cache-size=67108864 + simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' simplevm migration-status mbps='-' remaining='0' status='setup' -simplevm> {'blocked': False, 'status': 'setup'} -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=... remaining='...' status='active' -simplevm> {'blocked': False, -simplevm> 'expected-downtime': 4000, -... -simplevm> 'status': 'active', -simplevm> 'total-time': ...} -... + simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' simplevm migration-status mbps=... remaining='...' status='active' -... + simplevm migration-status mbps=... remaining='...' status='completed' -simplevm> {'blocked': False, -simplevm> 'downtime': ..., -... 
-simplevm> 'status': 'completed', -simplevm> 'total-time': ...} + simplevm query-status arguments={} id=None subsystem='qemu/qmp' simplevm finish-migration + +simplevm vm-destroy-kill-supervisor attempt=1 subsystem='qemu' +simplevm vm-destroy-kill-supervisor attempt=2 subsystem='qemu' +simplevm vm-destroy-kill-vm attempt=1 subsystem='qemu' +simplevm vm-destroy-kill-vm attempt=2 subsystem='qemu' +simplevm clean-run-files subsystem='qemu' +simplevm finish-remote simplevm consul-deregister simplevm outmigrate-finished exitcode=0 -simplevm lock-status count=0 -simplevm release-lock target='/run/qemu.simplevm.lock' +simplevm release-lock count=0 target='/run/qemu.simplevm.lock' simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' """ ) + # There are a couple of steps in the migration process that may repeat due to + # timings,so this may or may not appear more often: + outmigrate.optional( + """ +simplevm waiting interval=3 remaining=... +simplevm check-staging-config result='none' +simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' +simplevm migration-status mbps=... remaining='...' status='active' +simplevm vm-destroy-kill-vm attempt=... subsystem='qemu' + """ + ) + # Expect debug output that doesn't matter as much + patterns.debug.optional("simplevm> ...") + + # This part of the heartbeats must show up + patterns.heartbeat.in_order( + """\ +simplevm heartbeat-initialized +simplevm started-heartbeat-ping +simplevm heartbeat-ping +""" + ) + # The pings may happen more times and sometimes the stopping part + # isn't visible because we terminate too fast. + patterns.heartbeat.optional( + """ +simplevm heartbeat-ping +simplevm stopped-heartbeat-ping +""" + ) + + outmigrate.merge("heartbeat", "debug") + + return outmigrate -def test_vm_migration_pattern(): +def test_vm_migration_pattern(outmigrate_pattern): # This is a variety of outputs we have seen that are valid and where we want to be # sure that the Ellipsis matches them properly. 
- assert_outmigrate_log( - """\ + assert ( + outmigrate_pattern + == """\ /nix/store/c5kfr1yx6920s7lcswsv9dn61g8dc5xk-python3.8-fc.qemu-dev/bin/fc-qemu -v outmigrate simplevm load-system-config simplevm connect-rados subsystem='ceph' simplevm acquire-lock target='/run/qemu.simplevm.lock' -simplevm acquire-lock result='locked' target='/run/qemu.simplevm.lock' -simplevm lock-status count=1 +simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' simplevm qmp_capabilities arguments={} id=None subsystem='qemu/qmp' simplevm query-status arguments={} id=None subsystem='qemu/qmp' simplevm outmigrate @@ -669,7 +736,6 @@ def test_vm_migration_pattern(): simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.swap' simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.tmp' simplevm prepare-remote-environment -simplevm heartbeat-ping simplevm start-migration target='tcp:192.168.4.7:2345' simplevm migrate subsystem='qemu' simplevm migrate-set-capabilities arguments={'capabilities': [{'capability': 'xbzrle', 'state': False}, {'capability': 'auto-converge', 'state': True}]} id=None subsystem='qemu/qmp' @@ -898,11 +964,16 @@ def test_vm_migration_pattern(): simplevm> 'total-time': 68420} simplevm query-status arguments={} id=None subsystem='qemu/qmp' simplevm finish-migration +simplevm vm-destroy-kill-supervisor attempt=1 subsystem='qemu' +simplevm vm-destroy-kill-supervisor attempt=2 subsystem='qemu' +simplevm vm-destroy-kill-vm attempt=1 subsystem='qemu' +simplevm vm-destroy-kill-vm attempt=2 subsystem='qemu' +simplevm clean-run-files subsystem='qemu' +simplevm finish-remote simplevm stopped-heartbeat-ping simplevm consul-deregister simplevm outmigrate-finished exitcode=0 -simplevm lock-status count=0 -simplevm release-lock target='/run/qemu.simplevm.lock' +simplevm release-lock count=0 target='/run/qemu.simplevm.lock' simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' """ ) @@ -910,14 +981,14 @@ def 
test_vm_migration_pattern(): # This one is missing the "stopped-heartbeat-ping". This can happen # because the heartbeat has a sleep cycle of 10s and only finishes with # this log output when it actually triggers at the right moment. - assert_outmigrate_log( - """\ + assert ( + outmigrate_pattern + == """\ /nix/store/kj63j38ji0c8yk037yvzj9c5f27mzh58-python3.8-fc.qemu-d26a0eae29efd95fe5c328d8a9978eb5a6a4529e/bin/fc-qemu -v outmigrate simplevm load-system-config simplevm connect-rados subsystem='ceph' simplevm acquire-lock target='/run/qemu.simplevm.lock' -simplevm acquire-lock result='locked' target='/run/qemu.simplevm.lock' -simplevm lock-status count=1 +simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' simplevm qmp_capabilities arguments={} id=None subsystem='qemu/qmp' simplevm query-status arguments={} id=None subsystem='qemu/qmp' simplevm outmigrate @@ -1167,18 +1238,23 @@ def test_vm_migration_pattern(): simplevm> 'total-time': 68526} simplevm query-status arguments={} id=None subsystem='qemu/qmp' simplevm finish-migration +simplevm vm-destroy-kill-supervisor attempt=1 subsystem='qemu' +simplevm vm-destroy-kill-supervisor attempt=2 subsystem='qemu' +simplevm vm-destroy-kill-vm attempt=1 subsystem='qemu' +simplevm vm-destroy-kill-vm attempt=2 subsystem='qemu' +simplevm clean-run-files subsystem='qemu' +simplevm finish-remote simplevm consul-deregister simplevm outmigrate-finished exitcode=0 -simplevm lock-status count=0 -simplevm release-lock target='/run/qemu.simplevm.lock' +simplevm release-lock count=0 target='/run/qemu.simplevm.lock' simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' """ ) @pytest.mark.live -@pytest.mark.timeout(240) -def test_vm_migration(vm, kill_vms): +@pytest.mark.timeout(300) +def test_vm_migration(vm, kill_vms, outmigrate_pattern, patterns): def call(cmd, wait=True, host=None): for ssh_cmd in ["scp", "ssh"]: if not cmd.startswith(ssh_cmd): @@ -1226,37 +1302,31 @@ def 
communicate_progress(p): inmigrate = call("ssh host2 'fc-qemu -v inmigrate simplevm'", wait=False) outmigrate = call("fc-qemu -v outmigrate simplevm", wait=False) - outmigrate_result = communicate_progress(outmigrate) - assert_outmigrate_log(outmigrate_result) + assert outmigrate_pattern == communicate_progress(outmigrate) assert outmigrate.returncode == 0 inmigrate_result = communicate_progress(inmigrate) - assert inmigrate_result == Ellipsis( - """\ + inmigrate_pattern = patterns.inmigrate + inmigrate_pattern.in_order( + """ /nix/store/.../bin/fc-qemu -v inmigrate simplevm load-system-config simplevm connect-rados subsystem='ceph' simplevm acquire-lock target='/run/qemu.simplevm.lock' -simplevm acquire-lock result='locked' target='/run/qemu.simplevm.lock' -simplevm lock-status count=1 +simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' + simplevm inmigrate simplevm start-server type='incoming' url='http://host2.mgm.test.gocept.net:.../' simplevm setup-incoming-api cookie='...' 
simplevm consul-register-inmigrate -simplevm received-ping timeout=60 -simplevm waiting interval=0 remaining=59 + simplevm received-acquire-migration-lock simplevm acquire-migration-lock result='success' subsystem='qemu' -simplevm reset-timeout -simplevm waiting interval=0 remaining=59 -simplevm received-ping timeout=60 -simplevm waiting interval=0 remaining=59 simplevm received-acquire-ceph-locks simplevm lock subsystem='ceph' volume='rbd.ssd/simplevm.root' simplevm lock subsystem='ceph' volume='rbd.ssd/simplevm.swap' simplevm lock subsystem='ceph' volume='rbd.ssd/simplevm.tmp' -simplevm reset-timeout -simplevm waiting interval=0 remaining=59 + simplevm received-prepare-incoming simplevm acquire-global-lock subsystem='qemu' target='/run/fc-qemu.lock' simplevm global-lock-acquire result='locked' subsystem='qemu' target='/run/fc-qemu.lock' @@ -1266,32 +1336,34 @@ def communicate_progress(p): simplevm qemu-system-x86_64 additional_args=['-incoming tcp:192.168.4.7:...'] local_args=['-nodefaults', '-only-migratable', '-cpu qemu64,enforce', '-name simplevm,process=kvm.simplevm', '-chroot /srv/vm/simplevm', '-runas nobody', '-serial file:/var/log/vm/simplevm.log', '-display vnc=127.0.0.1:2345', '-pidfile /run/qemu.simplevm.pid', '-vga std', '-m 256', '-readconfig /run/qemu.simplevm.cfg'] subsystem='qemu' simplevm exec cmd='supervised-qemu qemu-system-x86_64 -nodefaults -only-migratable -cpu qemu64,enforce -name simplevm,process=kvm.simplevm -chroot /srv/vm/simplevm -runas nobody -serial file:/var/log/vm/simplevm.log -display vnc=127.0.0.1:2345 -pidfile /run/qemu.simplevm.pid -vga std -m 256 -readconfig /run/qemu.simplevm.cfg -incoming tcp:192.168.4.7:2345 -D /var/log/vm/simplevm.qemu.internal.log simplevm /var/log/vm/simplevm.supervisor.log' subsystem='qemu' simplevm supervised-qemu-stdout subsystem='qemu' -simplevm> simplevm supervised-qemu-stderr subsystem='qemu' -simplevm> + simplevm global-lock-status count=0 subsystem='qemu' target='/run/fc-qemu.lock' simplevm 
global-lock-release subsystem='qemu' target='/run/fc-qemu.lock' simplevm global-lock-release result='unlocked' subsystem='qemu' simplevm qmp_capabilities arguments={} id=None subsystem='qemu/qmp' simplevm query-status arguments={} id=None subsystem='qemu/qmp' -simplevm reset-timeout -simplevm waiting interval=0 remaining=59 -simplevm received-ping timeout=60 -simplevm waiting interval=0 remaining=59 -simplevm received-ping timeout=60 -... + simplevm received-finish-incoming simplevm query-status arguments={} id=None subsystem='qemu/qmp' -simplevm reset-timeout simplevm consul-deregister-inmigrate simplevm stop-server result='success' type='incoming' simplevm consul-register simplevm inmigrate-finished exitcode=0 -simplevm lock-status count=0 -simplevm release-lock target='/run/qemu.simplevm.lock' +simplevm release-lock count=0 target='/run/qemu.simplevm.lock' simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' """ ) + inmigrate_pattern.optional( + """ +simplevm> +simplevm received-ping timeout=60 +simplevm reset-timeout +simplevm waiting interval=0 remaining=... +""" + ) + + assert inmigrate_pattern == inmigrate_result assert inmigrate.returncode == 0 # The consul check is a bit flaky as it only checks every 5 seconds